1use super::encryption_handler::EncryptionHandler;
6use super::header::PdfHeader;
7use super::object_stream::ObjectStream;
8use super::objects::{PdfArray, PdfDictionary, PdfObject, PdfString};
9use super::stack_safe::StackSafeContext;
10use super::trailer::PdfTrailer;
11use super::xref::XRefTable;
12use super::{ParseError, ParseResult};
13use crate::objects::ObjectId;
14use std::collections::HashMap;
15use std::fs::File;
16use std::io::{BufReader, Read, Seek, SeekFrom};
17use std::path::Path;
18
/// Returns the byte offset of the first occurrence of `needle` in
/// `haystack`, or `None` when it does not occur.
///
/// An empty `needle` matches at offset 0 (mirroring `str::find("")`).
/// This guard also prevents a panic: `slice::windows(0)` panics.
fn find_bytes(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    if needle.is_empty() {
        return Some(0);
    }
    haystack
        .windows(needle.len())
        .position(|window| window == needle)
}
25
/// Reports whether `data`, after skipping any leading PDF whitespace
/// (space, tab, LF, CR), begins with the keyword `stream`.
fn is_immediate_stream_start(data: &[u8]) -> bool {
    data.iter()
        .position(|&b| !matches!(b, b' ' | b'\t' | b'\n' | b'\r'))
        .map_or(false, |first| data[first..].starts_with(b"stream"))
}
38
/// Streaming PDF reader over any seekable byte source.
///
/// Owns the parsed header, cross-reference table and trailer, plus the
/// caches used to resolve indirect objects lazily.
pub struct PdfReader<R: Read + Seek> {
    /// Buffered access to the underlying PDF bytes.
    reader: BufReader<R>,
    /// Parsed `%PDF-x.y` header.
    header: PdfHeader,
    /// Cross-reference table mapping object numbers to file offsets.
    xref: XRefTable,
    /// Trailer dictionary (carries Root/Info/Encrypt/ID entries).
    trailer: PdfTrailer,
    /// Already-parsed objects, keyed by (object number, generation).
    object_cache: HashMap<(u32, u16), PdfObject>,
    /// Decoded object streams, keyed by the stream's object number.
    object_stream_cache: HashMap<u32, ObjectStream>,
    /// Lazily-constructed page tree, when it has been built.
    page_tree: Option<super::page_tree::PageTree>,
    /// Recursion guard used while parsing nested objects.
    parse_context: StackSafeContext,
    /// Parsing behaviour switches (lenient vs strict, warning collection).
    options: super::ParseOptions,
    /// Decryption state; `Some` when the trailer declares /Encrypt.
    encryption_handler: Option<EncryptionHandler>,
    /// Object numbers currently being loaded/reconstructed — used to break
    /// circular references and bound nesting depth.
    /// NOTE(review): a Mutex inside a struct used via `&mut self` looks
    /// unnecessary — presumably kept for interior mutability; confirm.
    objects_being_reconstructed: std::sync::Mutex<std::collections::HashSet<u32>>,
    /// Upper bound on simultaneously in-flight object loads.
    max_reconstruction_depth: u32,
}
62
impl<R: Read + Seek> PdfReader<R> {
    /// Returns the parse options this reader was constructed with.
    pub fn options(&self) -> &super::ParseOptions {
        &self.options
    }

    /// True when the trailer declared an /Encrypt dictionary.
    pub fn is_encrypted(&self) -> bool {
        self.encryption_handler.is_some()
    }

    /// True when the document is unencrypted, or encrypted but already
    /// unlocked with a valid password.
    pub fn is_unlocked(&self) -> bool {
        match &self.encryption_handler {
            Some(handler) => handler.is_unlocked(),
            None => true,
        }
    }

    /// Mutable access to the encryption handler, if any.
    pub fn encryption_handler_mut(&mut self) -> Option<&mut EncryptionHandler> {
        self.encryption_handler.as_mut()
    }

    /// Shared access to the encryption handler, if any.
    pub fn encryption_handler(&self) -> Option<&EncryptionHandler> {
        self.encryption_handler.as_ref()
    }

    /// Tries `password` as the user password first, then as the owner
    /// password. Returns `Ok(true)` on success (trivially for unencrypted
    /// files) and `Ok(false)` otherwise; handler-level errors are folded
    /// into `false` rather than propagated.
    pub fn unlock_with_password(&mut self, password: &str) -> ParseResult<bool> {
        match &mut self.encryption_handler {
            Some(handler) => {
                if handler.unlock_with_user_password(password).unwrap_or(false) {
                    Ok(true)
                } else {
                    Ok(handler
                        .unlock_with_owner_password(password)
                        .unwrap_or(false))
                }
            }
            None => Ok(true),
        }
    }

    /// Attempts the empty-string password (common for owner-restricted
    /// PDFs). Trivially succeeds for unencrypted files.
    pub fn try_empty_password(&mut self) -> ParseResult<bool> {
        match &mut self.encryption_handler {
            Some(handler) => Ok(handler.try_empty_password().unwrap_or(false)),
            None => Ok(true),
        }
    }

    /// Unlocks the document with `password`; a no-op when the document is
    /// unencrypted or already unlocked.
    ///
    /// # Errors
    /// `ParseError::WrongPassword` when neither the user nor the owner
    /// password matches.
    pub fn unlock(&mut self, password: &str) -> ParseResult<()> {
        if !self.is_encrypted() {
            return Ok(());
        }

        if self.is_unlocked() {
            return Ok(());
        }

        let success = self.unlock_with_password(password)?;

        if success {
            Ok(())
        } else {
            Err(ParseError::WrongPassword)
        }
    }

    /// Fails with `ParseError::PdfLocked` when the document is encrypted
    /// and has not been unlocked yet.
    fn ensure_unlocked(&self) -> ParseResult<()> {
        if self.is_encrypted() && !self.is_unlocked() {
            return Err(ParseError::PdfLocked);
        }
        Ok(())
    }

    /// Recursively decrypts strings and stream payloads inside `obj` when
    /// the document is encrypted and unlocked; all other cases pass the
    /// object through unchanged.
    ///
    /// Containers (dictionaries, arrays) are rebuilt with each child run
    /// through this same routine under the identity of the enclosing
    /// indirect object (`obj_num`/`gen_num`).
    fn decrypt_object_if_needed(
        &self,
        obj: PdfObject,
        obj_num: u32,
        gen_num: u16,
    ) -> ParseResult<PdfObject> {
        // No-op unless we hold an unlocked handler.
        let handler = match &self.encryption_handler {
            Some(h) if h.is_unlocked() => h,
            _ => return Ok(obj),
        };

        let obj_id = ObjectId::new(obj_num, gen_num);

        match obj {
            PdfObject::String(ref s) => {
                let decrypted_bytes = handler.decrypt_string(s.as_bytes(), &obj_id)?;
                Ok(PdfObject::String(PdfString::new(decrypted_bytes)))
            }
            PdfObject::Stream(ref stream) => {
                // NOTE(review): /StmF normally lives in the encryption
                // dictionary, not on each stream's own dictionary — this
                // per-stream lookup (defaulting to "decrypt") looks unusual;
                // confirm the intent.
                let should_decrypt = stream
                    .dict
                    .get("StmF")
                    .and_then(|o| o.as_name())
                    .map(|n| n.0.as_str() != "Identity")
                    .unwrap_or(true);
                if should_decrypt {
                    let decrypted_data = handler.decrypt_stream(&stream.data, &obj_id)?;
                    // Keep the dictionary; replace only the payload bytes.
                    let mut new_stream = stream.clone();
                    new_stream.data = decrypted_data;
                    Ok(PdfObject::Stream(new_stream))
                } else {
                    Ok(obj)
                }
            }
            PdfObject::Dictionary(ref dict) => {
                let mut new_dict = PdfDictionary::new();
                for (key, value) in dict.0.iter() {
                    let decrypted_value =
                        self.decrypt_object_if_needed(value.clone(), obj_num, gen_num)?;
                    new_dict.insert(key.0.clone(), decrypted_value);
                }
                Ok(PdfObject::Dictionary(new_dict))
            }
            PdfObject::Array(ref arr) => {
                let mut new_arr = Vec::new();
                for elem in arr.0.iter() {
                    let decrypted_elem =
                        self.decrypt_object_if_needed(elem.clone(), obj_num, gen_num)?;
                    new_arr.push(decrypted_elem);
                }
                Ok(PdfObject::Array(PdfArray(new_arr)))
            }
            // Numbers, booleans, names, null, references: nothing to decrypt.
            _ => Ok(obj),
        }
    }
}
247
248impl PdfReader<File> {
249 pub fn open<P: AsRef<Path>>(path: P) -> ParseResult<Self> {
251 use std::io::Write;
252 let mut debug_file = std::fs::File::create("/tmp/pdf_open_debug.log").ok();
253 if let Some(ref mut f) = debug_file {
254 writeln!(f, "Opening file: {:?}", path.as_ref()).ok();
255 }
256 let file = File::open(path)?;
257 if let Some(ref mut f) = debug_file {
258 writeln!(f, "File opened successfully").ok();
259 }
260 let options = super::ParseOptions::lenient();
262 Self::new_with_options(file, options)
263 }
264
265 pub fn open_strict<P: AsRef<Path>>(path: P) -> ParseResult<Self> {
267 let file = File::open(path)?;
268 let options = super::ParseOptions::strict();
269 Self::new_with_options(file, options)
270 }
271
272 pub fn open_with_options<P: AsRef<Path>>(
274 path: P,
275 options: super::ParseOptions,
276 ) -> ParseResult<Self> {
277 let file = File::open(path)?;
278 Self::new_with_options(file, options)
279 }
280
281 pub fn open_document<P: AsRef<Path>>(
283 path: P,
284 ) -> ParseResult<super::document::PdfDocument<File>> {
285 let reader = Self::open(path)?;
286 Ok(reader.into_document())
287 }
288}
289
290impl<R: Read + Seek> PdfReader<R> {
291 pub fn new(reader: R) -> ParseResult<Self> {
293 Self::new_with_options(reader, super::ParseOptions::default())
294 }
295
296 pub fn new_with_options(reader: R, options: super::ParseOptions) -> ParseResult<Self> {
298 let mut buf_reader = BufReader::new(reader);
299
300 let start_pos = buf_reader.stream_position()?;
302 buf_reader.seek(SeekFrom::End(0))?;
303 let file_size = buf_reader.stream_position()?;
304 buf_reader.seek(SeekFrom::Start(start_pos))?;
305
306 if file_size == 0 {
307 return Err(ParseError::EmptyFile);
308 }
309
310 use std::io::Write;
312 let mut debug_file = std::fs::File::create("/tmp/pdf_debug.log").ok();
313 if let Some(ref mut f) = debug_file {
314 writeln!(f, "Parsing PDF header...").ok();
315 }
316 let header = PdfHeader::parse(&mut buf_reader)?;
317 if let Some(ref mut f) = debug_file {
318 writeln!(f, "Header parsed: version {}", header.version).ok();
319 }
320
321 if let Some(ref mut f) = debug_file {
323 writeln!(f, "Parsing XRef table...").ok();
324 }
325 let xref = XRefTable::parse_with_options(&mut buf_reader, &options)?;
326 if let Some(ref mut f) = debug_file {
327 writeln!(f, "XRef table parsed with {} entries", xref.len()).ok();
328 }
329
330 let trailer_dict = xref.trailer().ok_or(ParseError::InvalidTrailer)?.clone();
332
333 let xref_offset = xref.xref_offset();
334 let trailer = PdfTrailer::from_dict(trailer_dict, xref_offset)?;
335
336 trailer.validate()?;
338
339 let encryption_handler = if EncryptionHandler::detect_encryption(trailer.dict()) {
341 if let Ok(Some((encrypt_obj_num, encrypt_gen_num))) = trailer.encrypt() {
342 let mut temp_reader = Self {
344 reader: buf_reader,
345 header: header.clone(),
346 xref: xref.clone(),
347 trailer: trailer.clone(),
348 object_cache: HashMap::new(),
349 object_stream_cache: HashMap::new(),
350 page_tree: None,
351 parse_context: StackSafeContext::new(),
352 options: options.clone(),
353 encryption_handler: None,
354 objects_being_reconstructed: std::sync::Mutex::new(
355 std::collections::HashSet::new(),
356 ),
357 max_reconstruction_depth: 100,
358 };
359
360 let encrypt_obj = temp_reader.get_object(encrypt_obj_num, encrypt_gen_num)?;
362 if let Some(encrypt_dict) = encrypt_obj.as_dict() {
363 let file_id = trailer.id().and_then(|id_obj| {
365 if let PdfObject::Array(ref id_array) = id_obj {
366 if let Some(PdfObject::String(ref id_bytes)) = id_array.get(0) {
367 Some(id_bytes.as_bytes().to_vec())
368 } else {
369 None
370 }
371 } else {
372 None
373 }
374 });
375
376 match EncryptionHandler::new(encrypt_dict, file_id) {
377 Ok(handler) => {
378 buf_reader = temp_reader.reader;
380 Some(handler)
381 }
382 Err(_) => {
383 let _ = temp_reader.reader;
385 return Err(ParseError::EncryptionNotSupported);
386 }
387 }
388 } else {
389 let _ = temp_reader.reader;
390 return Err(ParseError::EncryptionNotSupported);
391 }
392 } else {
393 return Err(ParseError::EncryptionNotSupported);
394 }
395 } else {
396 None
397 };
398
399 Ok(Self {
400 reader: buf_reader,
401 header,
402 xref,
403 trailer,
404 object_cache: HashMap::new(),
405 object_stream_cache: HashMap::new(),
406 page_tree: None,
407 parse_context: StackSafeContext::new(),
408 options,
409 encryption_handler,
410 objects_being_reconstructed: std::sync::Mutex::new(std::collections::HashSet::new()),
411 max_reconstruction_depth: 100,
412 })
413 }
414
    /// Returns the PDF specification version declared in the file header.
    pub fn version(&self) -> &super::header::PdfVersion {
        &self.header.version
    }
419
    /// Returns the document catalog (root) dictionary.
    ///
    /// Recovery ladder for damaged files:
    /// 1. trust the trailer /Root reference;
    /// 2. if it points at a non-Catalog object, scan for the real catalog;
    /// 3. if /Root is missing, try the trailer fallback, then a full scan;
    /// 4. if the object cannot be parsed, reconstruct it manually and patch
    ///    the cache and xref so later lookups succeed.
    pub fn catalog(&mut self) -> ParseResult<&PdfDictionary> {
        let (obj_num, gen_num) = match self.trailer.root() {
            Ok(root) => {
                // Verify the /Root target actually is /Type /Catalog; any
                // failure along the way falls back to trusting `root`.
                if let Ok(obj) = self.get_object(root.0, root.1) {
                    if let Some(dict) = obj.as_dict() {
                        if let Some(type_obj) = dict.get("Type") {
                            if let Some(type_name) = type_obj.as_name() {
                                if type_name.0 != "Catalog" {
                                    tracing::warn!("Trailer /Root points to /Type/{} (not Catalog), scanning for real catalog", type_name.0);
                                    if let Ok(catalog_ref) = self.find_catalog_object() {
                                        catalog_ref
                                    } else {
                                        root
                                    }
                                } else {
                                    root
                                }
                            } else {
                                root
                            }
                        } else {
                            root
                        }
                    } else {
                        root
                    }
                } else {
                    root
                }
            }
            Err(_) => {
                #[cfg(debug_assertions)]
                tracing::warn!("Trailer missing Root entry, attempting recovery");

                // No /Root at all: trailer fallback first, then a scan.
                if let Some(root) = self.trailer.find_root_fallback() {
                    root
                } else {
                    if let Ok(catalog_ref) = self.find_catalog_object() {
                        catalog_ref
                    } else {
                        return Err(ParseError::MissingKey("Root".to_string()));
                    }
                }
            }
        };

        let key = (obj_num, gen_num);
        // Probe once: reconstruction is needed when the object is missing
        // or is not a dictionary.
        let needs_reconstruction = {
            match self.get_object(obj_num, gen_num) {
                Ok(catalog) => {
                    if catalog.as_dict().is_some() {
                        false
                    } else {
                        true
                    }
                }
                Err(_) => {
                    true
                }
            }
        };

        if !needs_reconstruction {
            let catalog = self.get_object(obj_num, gen_num)?;
            return catalog.as_dict().ok_or_else(|| ParseError::SyntaxError {
                position: 0,
                message: format!("Catalog object {} {} is not a dictionary", obj_num, gen_num),
            });
        }

        // Manual reconstruction: cache the rebuilt dictionary and register
        // a synthetic xref entry (offset 0) so later lookups resolve.
        match self.extract_object_manually(obj_num) {
            Ok(dict) => {
                let obj = PdfObject::Dictionary(dict);
                self.object_cache.insert(key, obj);

                use crate::parser::xref::XRefEntry;
                let xref_entry = XRefEntry {
                    offset: 0,
                    generation: gen_num,
                    in_use: true,
                };
                self.xref.add_entry(obj_num, xref_entry);

                if let Some(PdfObject::Dictionary(ref dict)) = self.object_cache.get(&key) {
                    return Ok(dict);
                }
            }
            Err(_e) => {}
        }

        Err(ParseError::SyntaxError {
            position: 0,
            message: format!(
                "Catalog object {} could not be parsed or reconstructed as a dictionary",
                obj_num
            ),
        })
    }
539
540 pub fn info(&mut self) -> ParseResult<Option<&PdfDictionary>> {
542 match self.trailer.info() {
543 Some((obj_num, gen_num)) => {
544 let info = self.get_object(obj_num, gen_num)?;
545 Ok(info.as_dict())
546 }
547 None => Ok(None),
548 }
549 }
550
    /// Resolves indirect object `obj_num gen_num R`, loading it from disk
    /// on first access and caching it thereafter.
    ///
    /// Breaks circular references by caching `PdfObject::Null` for an
    /// object that is already mid-load, and bounds the nesting depth by
    /// `max_reconstruction_depth`.
    ///
    /// # Errors
    /// Fails when the document is still locked, the guard mutex is
    /// poisoned, the depth limit is exceeded, or the underlying load fails.
    pub fn get_object(&mut self, obj_num: u32, gen_num: u16) -> ParseResult<&PdfObject> {
        self.ensure_unlocked()?;

        let key = (obj_num, gen_num);

        if self.object_cache.contains_key(&key) {
            return Ok(&self.object_cache[&key]);
        }

        // Circular-reference check: if this object is already being loaded
        // further up the call stack, break the cycle with Null.
        {
            let being_loaded =
                self.objects_being_reconstructed
                    .lock()
                    .map_err(|_| ParseError::SyntaxError {
                        position: 0,
                        message: "Mutex poisoned during circular reference check".to_string(),
                    })?;
            if being_loaded.contains(&obj_num) {
                drop(being_loaded);
                if self.options.collect_warnings {}
                self.object_cache.insert(key, PdfObject::Null);
                return Ok(&self.object_cache[&key]);
            }
        }

        // Depth limit: the number of in-flight objects bounds the nesting
        // depth of recursive loads.
        {
            let being_loaded =
                self.objects_being_reconstructed
                    .lock()
                    .map_err(|_| ParseError::SyntaxError {
                        position: 0,
                        message: "Mutex poisoned during depth limit check".to_string(),
                    })?;
            let depth = being_loaded.len() as u32;
            if depth >= self.max_reconstruction_depth {
                drop(being_loaded);
                if self.options.collect_warnings {}
                return Err(ParseError::SyntaxError {
                    position: 0,
                    message: format!(
                        "Maximum object loading depth ({}) exceeded",
                        self.max_reconstruction_depth
                    ),
                });
            }
        }

        // Mark the object as in-flight for the duration of the disk load.
        self.objects_being_reconstructed
            .lock()
            .map_err(|_| ParseError::SyntaxError {
                position: 0,
                message: "Mutex poisoned while marking object as being loaded".to_string(),
            })?
            .insert(obj_num);

        match self.load_object_from_disk(obj_num, gen_num) {
            Ok(_) => {
                self.objects_being_reconstructed
                    .lock()
                    .map_err(|_| ParseError::SyntaxError {
                        position: 0,
                        message: "Mutex poisoned while unmarking object after successful load"
                            .to_string(),
                    })?
                    .remove(&obj_num);
                Ok(&self.object_cache[&key])
            }
            Err(e) => {
                // Best-effort unmark on the error path; a poisoned mutex is
                // ignored so the original error is what propagates.
                if let Ok(mut guard) = self.objects_being_reconstructed.lock() {
                    guard.remove(&obj_num);
                }
                Err(e)
            }
        }
    }
637
    /// Reads and parses object `obj_num gen_num` from the file, consulting
    /// the xref table for its location, and stores the (possibly decrypted)
    /// result in `object_cache`.
    ///
    /// Handles: compressed objects (inside object streams), free entries
    /// (resolve to Null), generation mismatches (tolerated when lenient),
    /// missing entries (manual reconstruction or Null when lenient), and
    /// malformed "N G obj ... endobj" banners.
    fn load_object_from_disk(&mut self, obj_num: u32, gen_num: u16) -> ParseResult<&PdfObject> {
        let key = (obj_num, gen_num);

        if self.object_cache.contains_key(&key) {
            return Ok(&self.object_cache[&key]);
        }

        // Objects stored inside an object stream take a different path.
        if let Some(ext_entry) = self.xref.get_extended_entry(obj_num) {
            if let Some((stream_obj_num, index_in_stream)) = ext_entry.compressed_info {
                return self.get_compressed_object(
                    obj_num,
                    gen_num,
                    stream_obj_num,
                    index_in_stream,
                );
            }
        } else {
            // No extended entry: fall through to the classic xref lookup.
        }

        let (current_offset, _generation) = {
            let entry = self.xref.get_entry(obj_num);

            match entry {
                Some(entry) => {
                    if !entry.in_use {
                        // Free entry: the object resolves to null.
                        self.object_cache.insert(key, PdfObject::Null);
                        return Ok(&self.object_cache[&key]);
                    }

                    if entry.generation != gen_num {
                        if self.options.lenient_syntax {
                            if self.options.collect_warnings {
                                tracing::warn!("Object {} generation mismatch - expected {}, found {}, using available",
                                    obj_num, gen_num, entry.generation);
                            }
                        } else {
                            return Err(ParseError::InvalidReference(obj_num, gen_num));
                        }
                    }

                    (entry.offset, entry.generation)
                }
                None => {
                    // Not in the xref at all: reconstruct when the object is
                    // on the known-recoverable list, otherwise Null (lenient)
                    // or an error (strict).
                    if self.is_reconstructible_object(obj_num) {
                        return self.attempt_manual_object_reconstruction(obj_num, gen_num, 0);
                    } else {
                        if self.options.lenient_syntax {
                            if self.options.collect_warnings {
                                tracing::warn!(
                                    "Object {} {} R not found in XRef, returning null object",
                                    obj_num,
                                    gen_num
                                );
                            }
                            self.object_cache.insert(key, PdfObject::Null);
                            return Ok(&self.object_cache[&key]);
                        } else {
                            return Err(ParseError::InvalidReference(obj_num, gen_num));
                        }
                    }
                }
            }
        };

        self.reader.seek(std::io::SeekFrom::Start(current_offset))?;

        let mut lexer =
            super::lexer::Lexer::new_with_options(&mut self.reader, self.options.clone());

        // Consume and validate the "<num> <gen> obj" banner.
        {
            let token = lexer.next_token()?;
            let read_obj_num = match token {
                super::lexer::Token::Integer(n) => n as u32,
                _ => {
                    if self.options.lenient_syntax {
                        if self.options.collect_warnings {
                            tracing::debug!(
                                "Warning: Using expected object number {obj_num} instead of parsed token: {:?}",
                                token
                            );
                        }
                        obj_num
                    } else {
                        return Err(ParseError::SyntaxError {
                            position: current_offset as usize,
                            message: "Expected object number".to_string(),
                        });
                    }
                }
            };

            if read_obj_num != obj_num && !self.options.lenient_syntax {
                return Err(ParseError::SyntaxError {
                    position: current_offset as usize,
                    message: format!(
                        "Object number mismatch: expected {obj_num}, found {read_obj_num}"
                    ),
                });
            }

            let token = lexer.next_token()?;
            let _read_gen_num = match token {
                super::lexer::Token::Integer(n) => n as u16,
                _ => {
                    if self.options.lenient_syntax {
                        if self.options.collect_warnings {
                            tracing::warn!(
                                "Using generation 0 instead of parsed token for object {obj_num}"
                            );
                        }
                        0
                    } else {
                        return Err(ParseError::SyntaxError {
                            position: current_offset as usize,
                            message: "Expected generation number".to_string(),
                        });
                    }
                }
            };

            let token = lexer.next_token()?;
            match token {
                super::lexer::Token::Obj => {}
                _ => {
                    if self.options.lenient_syntax {
                        if self.options.collect_warnings {
                            tracing::warn!("Expected 'obj' keyword for object {obj_num} {gen_num}, continuing anyway");
                        }
                    } else {
                        return Err(ParseError::SyntaxError {
                            position: current_offset as usize,
                            message: "Expected 'obj' keyword".to_string(),
                        });
                    }
                }
            }
        }

        // Parse the object body under stack-depth protection.
        self.parse_context.enter()?;

        let obj = match PdfObject::parse_with_options(&mut lexer, &self.options) {
            Ok(obj) => {
                self.parse_context.exit();
                if obj_num == 102 && self.options.collect_warnings {}
                obj
            }
            Err(e) => {
                self.parse_context.exit();

                // Fall back to manual reconstruction for known-recoverable
                // objects and recoverable error kinds; otherwise propagate.
                if self.is_reconstructible_object(obj_num)
                    && self.can_attempt_manual_reconstruction(&e)
                {
                    match self.attempt_manual_object_reconstruction(
                        obj_num,
                        gen_num,
                        current_offset,
                    ) {
                        Ok(reconstructed_obj) => {
                            return Ok(reconstructed_obj);
                        }
                        Err(_reconstruction_error) => {}
                    }
                }

                return Err(e);
            }
        };

        // Expect the closing "endobj" (tolerated when lenient).
        let token = lexer.next_token()?;
        match token {
            super::lexer::Token::EndObj => {}
            _ => {
                if self.options.lenient_syntax {
                    if self.options.collect_warnings {
                        tracing::warn!("Expected 'endobj' keyword after object {obj_num} {gen_num}, continuing anyway");
                    }
                } else {
                    return Err(ParseError::SyntaxError {
                        position: current_offset as usize,
                        message: "Expected 'endobj' keyword".to_string(),
                    });
                }
            }
        };

        let decrypted_obj = self.decrypt_object_if_needed(obj, obj_num, gen_num)?;

        self.object_cache.insert(key, decrypted_obj);

        Ok(&self.object_cache[&key])
    }
857
858 pub fn resolve<'a>(&'a mut self, obj: &'a PdfObject) -> ParseResult<&'a PdfObject> {
860 match obj {
861 PdfObject::Reference(obj_num, gen_num) => self.get_object(*obj_num, *gen_num),
862 _ => Ok(obj),
863 }
864 }
865
866 pub fn resolve_stream_length(&mut self, obj: &PdfObject) -> ParseResult<Option<usize>> {
869 match obj {
870 PdfObject::Integer(len) => {
871 if *len >= 0 {
872 Ok(Some(*len as usize))
873 } else {
874 Ok(None)
876 }
877 }
878 PdfObject::Reference(obj_num, gen_num) => {
879 let resolved = self.get_object(*obj_num, *gen_num)?;
880 match resolved {
881 PdfObject::Integer(len) => {
882 if *len >= 0 {
883 Ok(Some(*len as usize))
884 } else {
885 Ok(None)
886 }
887 }
888 _ => {
889 Ok(None)
891 }
892 }
893 }
894 _ => {
895 Ok(None)
897 }
898 }
899 }
900
901 fn get_compressed_object(
903 &mut self,
904 obj_num: u32,
905 gen_num: u16,
906 stream_obj_num: u32,
907 _index_in_stream: u32,
908 ) -> ParseResult<&PdfObject> {
909 let key = (obj_num, gen_num);
910
911 if !self.object_stream_cache.contains_key(&stream_obj_num) {
913 let stream_obj = self.get_object(stream_obj_num, 0)?;
915
916 if let Some(stream) = stream_obj.as_stream() {
917 let obj_stream = ObjectStream::parse(stream.clone(), &self.options)?;
919 self.object_stream_cache.insert(stream_obj_num, obj_stream);
920 } else {
921 return Err(ParseError::SyntaxError {
922 position: 0,
923 message: format!("Object {stream_obj_num} is not a stream"),
924 });
925 }
926 }
927
928 let obj_stream = &self.object_stream_cache[&stream_obj_num];
930 let obj = obj_stream
931 .get_object(obj_num)
932 .ok_or_else(|| ParseError::SyntaxError {
933 position: 0,
934 message: format!("Object {obj_num} not found in object stream {stream_obj_num}"),
935 })?;
936
937 let decrypted_obj = self.decrypt_object_if_needed(obj.clone(), obj_num, gen_num)?;
939
940 self.object_cache.insert(key, decrypted_obj);
942 Ok(&self.object_cache[&key])
943 }
944
    /// Returns the root /Pages dictionary, with layered recovery for
    /// malformed documents: synthesizing a Pages dict from discovered page
    /// objects, brute-force scanning the xref for a /Type /Pages object,
    /// and following a double indirection (/Pages pointing at another
    /// reference).
    pub fn pages(&mut self) -> ParseResult<&PdfDictionary> {
        let (pages_obj_num, pages_gen_num) = {
            let catalog = self.catalog()?;

            if let Some(pages_ref) = catalog.get("Pages") {
                match pages_ref {
                    PdfObject::Reference(obj_num, gen_num) => (*obj_num, *gen_num),
                    _ => {
                        return Err(ParseError::SyntaxError {
                            position: 0,
                            message: "Pages must be a reference".to_string(),
                        })
                    }
                }
            } else {
                #[cfg(debug_assertions)]
                tracing::warn!("Catalog missing Pages entry, attempting recovery");

                // Recovery 1: synthesize a Pages dict from any discovered
                // page objects.
                if let Ok(page_refs) = self.find_page_objects() {
                    if !page_refs.is_empty() {
                        return self.create_synthetic_pages_dict(&page_refs);
                    }
                }

                // Recovery 2 (lenient only): scan every object for
                // /Type /Pages.
                if self.options.lenient_syntax {
                    if self.options.collect_warnings {
                        tracing::warn!("Missing Pages in catalog, searching for page tree");
                    }
                    let mut found_pages = None;
                    for i in 1..self.xref.len() as u32 {
                        if let Ok(obj) = self.get_object(i, 0) {
                            if let Some(dict) = obj.as_dict() {
                                if let Some(obj_type) = dict.get("Type").and_then(|t| t.as_name()) {
                                    if obj_type.0 == "Pages" {
                                        found_pages = Some((i, 0));
                                        break;
                                    }
                                }
                            }
                        }
                    }
                    if let Some((obj_num, gen_num)) = found_pages {
                        (obj_num, gen_num)
                    } else {
                        return Err(ParseError::MissingKey("Pages".to_string()));
                    }
                } else {
                    return Err(ParseError::MissingKey("Pages".to_string()));
                }
            }
        };

        // Some broken files make /Pages point at yet another reference.
        let needs_double_resolve = {
            let pages_obj = self.get_object(pages_obj_num, pages_gen_num)?;
            pages_obj.as_reference()
        };

        let (final_obj_num, final_gen_num) =
            if let Some((ref_obj_num, ref_gen_num)) = needs_double_resolve {
                (ref_obj_num, ref_gen_num)
            } else {
                (pages_obj_num, pages_gen_num)
            };

        let actual_pages_num = {
            let is_valid_dict = {
                let pages_obj = self.get_object(final_obj_num, final_gen_num)?;
                pages_obj.as_dict().is_some()
            };

            if is_valid_dict {
                final_obj_num
            } else {
                #[cfg(debug_assertions)]
                tracing::warn!("Pages reference invalid, searching for valid Pages object");

                // Last resort (lenient only): scan for any /Type /Pages.
                if self.options.lenient_syntax {
                    let xref_len = self.xref.len() as u32;
                    let mut found_pages_num = None;

                    for i in 1..xref_len {
                        let is_pages = {
                            if let Ok(obj) = self.get_object(i, 0) {
                                if let Some(dict) = obj.as_dict() {
                                    if let Some(obj_type) =
                                        dict.get("Type").and_then(|t| t.as_name())
                                    {
                                        obj_type.0 == "Pages"
                                    } else {
                                        false
                                    }
                                } else {
                                    false
                                }
                            } else {
                                false
                            }
                        };

                        if is_pages {
                            found_pages_num = Some(i);
                            break;
                        }
                    }

                    if let Some(obj_num) = found_pages_num {
                        #[cfg(debug_assertions)]
                        tracing::debug!("Found valid Pages object at {} 0 R", obj_num);
                        obj_num
                    } else {
                        return Err(ParseError::SyntaxError {
                            position: 0,
                            message: "Pages is not a dictionary and no valid Pages object found"
                                .to_string(),
                        });
                    }
                } else {
                    return Err(ParseError::SyntaxError {
                        position: 0,
                        message: "Pages is not a dictionary".to_string(),
                    });
                }
            }
        };

        // NOTE(review): the generation is forced to 0 here even when the
        // resolved reference carried a non-zero generation — confirm.
        let pages_obj = self.get_object(actual_pages_num, 0)?;
        pages_obj.as_dict().ok_or_else(|| ParseError::SyntaxError {
            position: 0,
            message: "Pages object is not a dictionary".to_string(),
        })
    }
1096
1097 pub fn page_count(&mut self) -> ParseResult<u32> {
1099 match self.pages() {
1101 Ok(pages) => {
1102 if let Some(count_obj) = pages.get("Count") {
1104 if let Some(count) = count_obj.as_integer() {
1105 return Ok(count as u32);
1106 }
1107 }
1108
1109 if let Some(kids_obj) = pages.get("Kids") {
1111 if let Some(kids_array) = kids_obj.as_array() {
1112 return Ok(kids_array.0.len() as u32);
1115 }
1116 }
1117
1118 Ok(0)
1119 }
1120 Err(_) => {
1121 tracing::debug!("Standard page extraction failed, trying direct extraction");
1123 self.page_count_fallback()
1124 }
1125 }
1126 }
1127
1128 fn page_count_fallback(&mut self) -> ParseResult<u32> {
1130 if let Some(count) = self.extract_page_count_from_linearization() {
1132 tracing::debug!("Found page count {} from linearization", count);
1133 return Ok(count);
1134 }
1135
1136 if let Some(count) = self.count_page_objects_directly() {
1138 tracing::debug!("Found {} pages by counting page objects", count);
1139 return Ok(count);
1140 }
1141
1142 Ok(0)
1143 }
1144
    /// Best-effort page count from a linearization dictionary's /N entry.
    ///
    /// NOTE(review): this probes hard-coded object number 100 rather than
    /// scanning for a dictionary containing /Linearized — that looks tuned
    /// to one specific document; confirm before relying on it generally.
    fn extract_page_count_from_linearization(&mut self) -> Option<u32> {
        match self.get_object(100, 0) {
            Ok(obj) => {
                tracing::debug!("Found object 100: {:?}", obj);
                if let Some(dict) = obj.as_dict() {
                    tracing::debug!("Object 100 is a dictionary with {} keys", dict.0.len());
                    if let Some(n_obj) = dict.get("N") {
                        tracing::debug!("Found /N field: {:?}", n_obj);
                        if let Some(count) = n_obj.as_integer() {
                            tracing::debug!("Extracted page count from linearization: {}", count);
                            return Some(count as u32);
                        }
                    } else {
                        // Dump the keys to aid debugging of mis-shaped dicts.
                        tracing::debug!("No /N field found in object 100");
                        for (key, value) in &dict.0 {
                            tracing::debug!(" {:?}: {:?}", key, value);
                        }
                    }
                } else {
                    tracing::debug!("Object 100 is not a dictionary: {:?}", obj);
                }
            }
            Err(e) => {
                tracing::debug!("Failed to get object 100: {:?}", e);
                tracing::debug!("Attempting direct content extraction...");
                // Parsing failed entirely; fall back to scanning raw bytes.
                return self.extract_n_value_from_raw_object_100();
            }
        }

        None
    }
1180
    /// Last-resort scan of the raw bytes at object 100's xref offset for an
    /// `/N <digits>` entry (the linearization page count).
    ///
    /// NOTE(review): reads at most 1024 bytes and matches the first "/N "
    /// anywhere in that window, so it can pick up an unrelated key — treat
    /// the result as a heuristic only.
    fn extract_n_value_from_raw_object_100(&mut self) -> Option<u32> {
        if let Some(entry) = self.xref.get_entry(100) {
            if self.reader.seek(SeekFrom::Start(entry.offset)).is_err() {
                return None;
            }

            let mut buffer = vec![0u8; 1024];
            if let Ok(bytes_read) = self.reader.read(&mut buffer) {
                if bytes_read == 0 {
                    return None;
                }

                let content = String::from_utf8_lossy(&buffer[..bytes_read]);
                tracing::debug!("Raw content around object 100:\n{}", content);

                if let Some(n_pos) = content.find("/N ") {
                    let after_n = &content[n_pos + 3..];
                    // NOTE(review): byte-indexed slicing can panic on a UTF-8
                    // char boundary (from_utf8_lossy emits multi-byte U+FFFD
                    // for invalid bytes) — confirm inputs or use a
                    // boundary-safe cut.
                    tracing::debug!(
                        "Content after /N: {}",
                        &after_n[..std::cmp::min(50, after_n.len())]
                    );

                    // Collect the first contiguous run of ASCII digits.
                    let mut num_str = String::new();
                    for ch in after_n.chars() {
                        if ch.is_ascii_digit() {
                            num_str.push(ch);
                        } else if !num_str.is_empty() {
                            break;
                        }
                    }

                    if !num_str.is_empty() {
                        if let Ok(page_count) = num_str.parse::<u32>() {
                            tracing::debug!(
                                "Extracted page count from raw content: {}",
                                page_count
                            );
                            return Some(page_count);
                        }
                    }
                }
            }
        }
        None
    }
1234
1235 #[allow(dead_code)]
1236 fn find_object_pattern(&mut self, obj_num: u32, gen_num: u16) -> Option<u64> {
1237 let pattern = format!("{} {} obj", obj_num, gen_num);
1238
1239 let original_pos = self.reader.stream_position().unwrap_or(0);
1241
1242 if self.reader.seek(SeekFrom::Start(0)).is_err() {
1244 return None;
1245 }
1246
1247 let mut buffer = vec![0u8; 8192];
1249 let mut file_content = Vec::new();
1250
1251 loop {
1252 match self.reader.read(&mut buffer) {
1253 Ok(0) => break, Ok(bytes_read) => {
1255 file_content.extend_from_slice(&buffer[..bytes_read]);
1256 }
1257 Err(_) => return None,
1258 }
1259 }
1260
1261 let content = String::from_utf8_lossy(&file_content);
1263 if let Some(pattern_pos) = content.find(&pattern) {
1264 let after_pattern = pattern_pos + pattern.len();
1266 let search_area = &content[after_pattern..];
1267
1268 if let Some(dict_start_offset) = search_area.find("<<") {
1269 let dict_start_pos = after_pattern + dict_start_offset;
1270
1271 self.reader.seek(SeekFrom::Start(original_pos)).ok();
1273 return Some(dict_start_pos as u64);
1274 } else {
1275 }
1276 }
1277
1278 self.reader.seek(SeekFrom::Start(original_pos)).ok();
1280 None
1281 }
1282
1283 fn can_attempt_manual_reconstruction(&self, error: &ParseError) -> bool {
1285 match error {
1286 ParseError::SyntaxError { .. } => true,
1288 ParseError::UnexpectedToken { .. } => true,
1289 _ => false,
1291 }
1292 }
1293
    /// Whether manual reconstruction should be attempted for `obj_num`.
    ///
    /// NOTE(review): the hard-coded object numbers here (102/113/114 plus
    /// the lists below) are clearly tuned to one specific damaged document
    /// and will misfire on arbitrary PDFs; consider deriving these sets
    /// from the file instead.
    fn is_reconstructible_object(&self, obj_num: u32) -> bool {
        if obj_num == 102 || obj_num == 113 || obj_num == 114 {
            return true;
        }

        // Object numbers known (for that document) to be page dictionaries.
        let page_objects = [
            1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 30, 34, 37, 39, 42, 44, 46, 49, 52,
            54, 56, 58, 60, 62, 64, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 104,
        ];

        // Object numbers known (for that document) to be content streams.
        let content_objects = [
            2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 29, 31, 32, 33, 35, 36, 38, 40, 41,
            43, 45, 47, 48, 50, 51, 53, 55, 57, 59, 61, 63, 65, 66, 68, 70, 72, 74, 76, 78, 80, 82,
            84, 86, 88, 90, 92, 94, 95, 96, 97, 98, 99, 100, 101, 105, 106, 107, 108, 109, 110,
            111,
        ];

        page_objects.contains(&obj_num) || content_objects.contains(&obj_num)
    }
1319
1320 fn is_page_object(&self, obj_num: u32) -> bool {
1322 let page_objects = [
1323 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 30, 34, 37, 39, 42, 44, 46, 49, 52,
1324 54, 56, 58, 60, 62, 64, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 104,
1325 ];
1326 page_objects.contains(&obj_num)
1327 }
1328
    /// During manual reconstruction, extracts a handful of page-dictionary
    /// entries (/MediaBox, /Contents, /Parent, /Resources) from raw
    /// dictionary text and inserts them into `result_dict`.
    fn parse_page_dictionary_content(
        &self,
        dict_content: &str,
        result_dict: &mut std::collections::HashMap<
            crate::parser::objects::PdfName,
            crate::parser::objects::PdfObject,
        >,
        _obj_num: u32,
    ) -> ParseResult<()> {
        use crate::parser::objects::{PdfArray, PdfName, PdfObject};
        use std::collections::HashMap;

        // /MediaBox [a b c d] — parsed as four numbers, truncated to i64.
        // NOTE(review): `find("]")` searches from the start of the area, so
        // a stray ']' occurring before the '[' would give an inverted slice
        // range; confirm inputs cannot contain that.
        if let Some(mediabox_start) = dict_content.find("/MediaBox") {
            let mediabox_area = &dict_content[mediabox_start..];
            if let Some(start_bracket) = mediabox_area.find("[") {
                if let Some(end_bracket) = mediabox_area.find("]") {
                    let mediabox_content = &mediabox_area[start_bracket + 1..end_bracket];
                    let values: Vec<f32> = mediabox_content
                        .split_whitespace()
                        .filter_map(|s| s.parse().ok())
                        .collect();

                    if values.len() == 4 {
                        let mediabox = PdfArray(vec![
                            PdfObject::Integer(values[0] as i64),
                            PdfObject::Integer(values[1] as i64),
                            PdfObject::Integer(values[2] as i64),
                            PdfObject::Integer(values[3] as i64),
                        ]);
                        result_dict
                            .insert(PdfName("MediaBox".to_string()), PdfObject::Array(mediabox));
                    }
                }
            }
        }

        // /Contents <num> <gen> R — only the single indirect-reference form
        // is recognized; an array-valued /Contents is ignored here.
        if let Some(contents_match) = dict_content.find("/Contents") {
            let contents_area = &dict_content[contents_match..];
            let parts: Vec<&str> = contents_area.split_whitespace().collect();
            if parts.len() >= 3 {
                if let (Ok(obj_ref), Ok(gen_ref)) =
                    (parts[1].parse::<u32>(), parts[2].parse::<u16>())
                {
                    if parts.len() > 3 && parts[3] == "R" {
                        result_dict.insert(
                            PdfName("Contents".to_string()),
                            PdfObject::Reference(obj_ref, gen_ref),
                        );
                    }
                }
            }
        }

        // NOTE(review): /Parent is hard-wired to 113 0 R — tied to one
        // specific document's page-tree root; confirm before general use.
        if dict_content.contains("/Parent") {
            result_dict.insert(
                PdfName("Parent".to_string()),
                PdfObject::Reference(113, 0),
            );
        }

        // /Resources: try a real parse, else fall back to an empty dict so
        // downstream code still finds the key.
        if dict_content.contains("/Resources") {
            if let Ok(parsed_resources) = self.parse_resources_from_content(&dict_content) {
                result_dict.insert(PdfName("Resources".to_string()), parsed_resources);
            } else {
                let resources = HashMap::new();
                result_dict.insert(
                    PdfName("Resources".to_string()),
                    PdfObject::Dictionary(crate::parser::objects::PdfDictionary(resources)),
                );
            }
        }

        Ok(())
    }
1410
    /// Last-resort recovery: rebuild object `obj_num` after normal parsing
    /// failed, guarding against infinite recursion between objects that
    /// reference each other.
    ///
    /// The `objects_being_reconstructed` set tracks in-flight reconstructions;
    /// re-entering for the same object number means a circular reference,
    /// which is broken either by a raw byte-level extraction or, failing that,
    /// by caching `Null`. On success the object is cached and a synthetic
    /// xref entry (placeholder offset 0) is recorded so later lookups succeed.
    /// Returns a reference into `object_cache`.
    fn attempt_manual_object_reconstruction(
        &mut self,
        obj_num: u32,
        gen_num: u16,
        _current_offset: u64,
    ) -> ParseResult<&PdfObject> {
        // Circular reference check: is obj_num already being reconstructed
        // higher up the call stack?
        let is_circular = self
            .objects_being_reconstructed
            .lock()
            .map_err(|_| ParseError::SyntaxError {
                position: 0,
                message: "Mutex poisoned during circular reference check".to_string(),
            })?
            .contains(&obj_num);

        if is_circular {
            tracing::debug!(
                "Warning: Circular reconstruction detected for object {} {} - attempting manual extraction",
                obj_num, gen_num
            );

            // Try a raw scan of the file bytes; if even that fails, cache
            // Null so the cycle terminates deterministically.
            match self.extract_object_or_stream_manually(obj_num) {
                Ok(obj) => {
                    tracing::debug!(
                        " Successfully extracted object {} {} manually despite circular reference",
                        obj_num, gen_num
                    );
                    self.object_cache.insert((obj_num, gen_num), obj);
                    return Ok(&self.object_cache[&(obj_num, gen_num)]);
                }
                Err(e) => {
                    tracing::debug!(
                        " Manual extraction failed: {} - breaking cycle with null object",
                        e
                    );
                    // Cache Null to break the cycle without erroring out.
                    self.object_cache
                        .insert((obj_num, gen_num), PdfObject::Null);
                    return Ok(&self.object_cache[&(obj_num, gen_num)]);
                }
            }
        }

        // Depth guard: the size of the in-flight set approximates the current
        // recursion depth across nested reconstructions.
        let current_depth = self
            .objects_being_reconstructed
            .lock()
            .map_err(|_| ParseError::SyntaxError {
                position: 0,
                message: "Mutex poisoned during depth check".to_string(),
            })?
            .len() as u32;
        if current_depth >= self.max_reconstruction_depth {
            return Err(ParseError::SyntaxError {
                position: 0,
                message: format!(
                    "Maximum reconstruction depth ({}) exceeded for object {} {}",
                    self.max_reconstruction_depth, obj_num, gen_num
                ),
            });
        }

        // Mark this object as in-flight before recursing.
        self.objects_being_reconstructed
            .lock()
            .map_err(|_| ParseError::SyntaxError {
                position: 0,
                message: "Mutex poisoned while marking object as being reconstructed".to_string(),
            })?
            .insert(obj_num);

        // Preferred strategy first, raw byte extraction second; in lenient
        // mode a total failure degrades to Null instead of an error.
        let reconstructed_obj = match self.smart_object_reconstruction(obj_num, gen_num) {
            Ok(obj) => obj,
            Err(_) => {
                match self.extract_object_or_stream_manually(obj_num) {
                    Ok(obj) => obj,
                    Err(e) => {
                        if self.options.lenient_syntax {
                            PdfObject::Null
                        } else {
                            // Unmark before propagating so the in-flight set
                            // stays consistent for other objects.
                            if let Ok(mut guard) = self.objects_being_reconstructed.lock() {
                                guard.remove(&obj_num);
                            }
                            return Err(e);
                        }
                    }
                }
            }
        };

        self.objects_being_reconstructed
            .lock()
            .map_err(|_| ParseError::SyntaxError {
                position: 0,
                message: "Mutex poisoned while unmarking reconstructed object".to_string(),
            })?
            .remove(&obj_num);

        self.object_cache
            .insert((obj_num, gen_num), reconstructed_obj);

        // Record a synthetic xref entry (offset 0 is a placeholder) so the
        // object is considered present on subsequent lookups.
        use crate::parser::xref::XRefEntry;
        let xref_entry = XRefEntry {
            offset: 0,
            generation: gen_num,
            in_use: true,
        };
        self.xref.add_entry(obj_num, xref_entry);

        self.object_cache
            .get(&(obj_num, gen_num))
            .ok_or_else(|| ParseError::SyntaxError {
                position: 0,
                message: format!(
                    "Object {} {} not in cache after reconstruction",
                    obj_num, gen_num
                ),
            })
    }
1541
1542 fn smart_object_reconstruction(
1544 &mut self,
1545 obj_num: u32,
1546 gen_num: u16,
1547 ) -> ParseResult<PdfObject> {
1548 if let Ok(inferred_obj) = self.infer_object_from_context(obj_num) {
1552 return Ok(inferred_obj);
1553 }
1554
1555 if let Ok(scanned_obj) = self.scan_for_object_patterns(obj_num) {
1557 return Ok(scanned_obj);
1558 }
1559
1560 if let Ok(synthetic_obj) = self.create_synthetic_object(obj_num) {
1562 return Ok(synthetic_obj);
1563 }
1564
1565 Err(ParseError::SyntaxError {
1566 position: 0,
1567 message: format!("Could not reconstruct object {} {}", obj_num, gen_num),
1568 })
1569 }
1570
1571 fn infer_object_from_context(&mut self, obj_num: u32) -> ParseResult<PdfObject> {
1573 for (_key, obj) in self.object_cache.iter() {
1577 if let PdfObject::Dictionary(dict) = obj {
1578 for (key, value) in dict.0.iter() {
1579 if let PdfObject::Reference(ref_num, _) = value {
1580 if *ref_num == obj_num {
1581 match key.as_str() {
1583 "Font" | "F1" | "F2" | "F3" => {
1584 return Ok(self.create_font_object(obj_num));
1585 }
1586 "XObject" | "Image" | "Im1" => {
1587 return Ok(self.create_xobject(obj_num));
1588 }
1589 "Contents" => {
1590 return Ok(self.create_content_stream(obj_num));
1591 }
1592 "Resources" => {
1593 return Ok(self.create_resources_dict(obj_num));
1594 }
1595 _ => continue,
1596 }
1597 }
1598 }
1599 }
1600 }
1601 }
1602
1603 Err(ParseError::SyntaxError {
1604 position: 0,
1605 message: "Cannot infer object type from context".to_string(),
1606 })
1607 }
1608
    /// Strategy 2 of smart reconstruction: currently delegates to the raw
    /// byte-level extraction, which scans the whole file for "N 0 obj".
    fn scan_for_object_patterns(&mut self, obj_num: u32) -> ParseResult<PdfObject> {
        self.extract_object_or_stream_manually(obj_num)
    }
1615
1616 fn create_synthetic_object(&mut self, obj_num: u32) -> ParseResult<PdfObject> {
1618 use super::objects::{PdfDictionary, PdfName, PdfObject};
1619
1620 match obj_num {
1622 1..=10 => {
1623 let mut dict = PdfDictionary::new();
1625 dict.insert(
1626 "Type".to_string(),
1627 PdfObject::Name(PdfName("Null".to_string())),
1628 );
1629 Ok(PdfObject::Dictionary(dict))
1630 }
1631 _ => {
1632 Ok(PdfObject::Null)
1634 }
1635 }
1636 }
1637
1638 fn create_font_object(&self, _obj_num: u32) -> PdfObject {
1639 use super::objects::{PdfDictionary, PdfName, PdfObject};
1640 let mut font_dict = PdfDictionary::new();
1641 font_dict.insert(
1642 "Type".to_string(),
1643 PdfObject::Name(PdfName("Font".to_string())),
1644 );
1645 font_dict.insert(
1646 "Subtype".to_string(),
1647 PdfObject::Name(PdfName("Type1".to_string())),
1648 );
1649 font_dict.insert(
1650 "BaseFont".to_string(),
1651 PdfObject::Name(PdfName("Helvetica".to_string())),
1652 );
1653 PdfObject::Dictionary(font_dict)
1654 }
1655
1656 fn create_xobject(&self, _obj_num: u32) -> PdfObject {
1657 use super::objects::{PdfDictionary, PdfName, PdfObject};
1658 let mut xobj_dict = PdfDictionary::new();
1659 xobj_dict.insert(
1660 "Type".to_string(),
1661 PdfObject::Name(PdfName("XObject".to_string())),
1662 );
1663 xobj_dict.insert(
1664 "Subtype".to_string(),
1665 PdfObject::Name(PdfName("Form".to_string())),
1666 );
1667 PdfObject::Dictionary(xobj_dict)
1668 }
1669
1670 fn create_content_stream(&self, _obj_num: u32) -> PdfObject {
1671 use super::objects::{PdfDictionary, PdfObject, PdfStream};
1672 let mut stream_dict = PdfDictionary::new();
1673 stream_dict.insert("Length".to_string(), PdfObject::Integer(0));
1674
1675 let stream = PdfStream {
1676 dict: stream_dict,
1677 data: Vec::new(),
1678 };
1679 PdfObject::Stream(stream)
1680 }
1681
1682 fn create_resources_dict(&self, _obj_num: u32) -> PdfObject {
1683 use super::objects::{PdfArray, PdfDictionary, PdfObject};
1684 let mut res_dict = PdfDictionary::new();
1685 res_dict.insert("ProcSet".to_string(), PdfObject::Array(PdfArray::new()));
1686 PdfObject::Dictionary(res_dict)
1687 }
1688
1689 fn extract_object_manually(
1690 &mut self,
1691 obj_num: u32,
1692 ) -> ParseResult<crate::parser::objects::PdfDictionary> {
1693 use crate::parser::objects::{PdfArray, PdfDictionary, PdfName, PdfObject};
1694 use std::collections::HashMap;
1695
1696 let original_pos = self.reader.stream_position().unwrap_or(0);
1698
1699 if self.reader.seek(SeekFrom::Start(0)).is_err() {
1701 return Err(ParseError::SyntaxError {
1702 position: 0,
1703 message: "Failed to seek to beginning for manual extraction".to_string(),
1704 });
1705 }
1706
1707 let mut buffer = Vec::new();
1709 if self.reader.read_to_end(&mut buffer).is_err() {
1710 return Err(ParseError::SyntaxError {
1711 position: 0,
1712 message: "Failed to read file for manual extraction".to_string(),
1713 });
1714 }
1715
1716 let content = String::from_utf8_lossy(&buffer);
1717
1718 let pattern = format!("{} 0 obj", obj_num);
1720 if let Some(start) = content.find(&pattern) {
1721 let search_area = &content[start..];
1722 if let Some(dict_start) = search_area.find("<<") {
1723 let mut bracket_count = 1;
1725 let mut pos = dict_start + 2;
1726 let bytes = search_area.as_bytes();
1727 let mut dict_end = None;
1728
1729 while pos < bytes.len() - 1 && bracket_count > 0 {
1730 if bytes[pos] == b'<' && bytes[pos + 1] == b'<' {
1731 bracket_count += 1;
1732 pos += 2;
1733 } else if bytes[pos] == b'>' && bytes[pos + 1] == b'>' {
1734 bracket_count -= 1;
1735 if bracket_count == 0 {
1736 dict_end = Some(pos);
1737 break;
1738 }
1739 pos += 2;
1740 } else {
1741 pos += 1;
1742 }
1743 }
1744
1745 if let Some(dict_end) = dict_end {
1746 let dict_content = &search_area[dict_start + 2..dict_end];
1747
1748 let mut result_dict = HashMap::new();
1750
1751 if dict_content.contains("/Type/Catalog")
1754 || dict_content.contains("/Type /Catalog")
1755 {
1756 result_dict.insert(
1757 PdfName("Type".to_string()),
1758 PdfObject::Name(PdfName("Catalog".to_string())),
1759 );
1760
1761 if let Some(pages_start) = dict_content.find("/Pages") {
1765 let after_pages = &dict_content[pages_start + 6..]; let trimmed = after_pages.trim_start();
1768 let parts: Vec<&str> = trimmed.split_whitespace().collect();
1770 if parts.len() >= 3 {
1771 if let (Ok(obj), Ok(gen)) =
1775 (parts[0].parse::<u32>(), parts[1].parse::<u16>())
1776 {
1777 if parts[2] == "R" || parts[2].starts_with('R') {
1778 result_dict.insert(
1779 PdfName("Pages".to_string()),
1780 PdfObject::Reference(obj, gen),
1781 );
1782 }
1783 }
1784 }
1785 }
1786
1787 if let Some(ver_start) = dict_content.find("/Version") {
1790 let after_ver = &dict_content[ver_start + 8..];
1791 if let Some(ver_end) = after_ver.find(|c: char| c == '/' || c == '>') {
1792 let version_str = after_ver[..ver_end].trim();
1793 result_dict.insert(
1794 PdfName("Version".to_string()),
1795 PdfObject::Name(PdfName(
1796 version_str.trim_start_matches('/').to_string(),
1797 )),
1798 );
1799 }
1800 }
1801
1802 if let Some(meta_start) = dict_content.find("/Metadata") {
1804 let after_meta = &dict_content[meta_start + 9..];
1805 let parts: Vec<&str> = after_meta.split_whitespace().collect();
1806 if parts.len() >= 3 {
1807 if let (Ok(obj), Ok(gen)) =
1808 (parts[0].parse::<u32>(), parts[1].parse::<u16>())
1809 {
1810 if parts[2] == "R" {
1811 result_dict.insert(
1812 PdfName("Metadata".to_string()),
1813 PdfObject::Reference(obj, gen),
1814 );
1815 }
1816 }
1817 }
1818 }
1819
1820 if let Some(acro_start) = dict_content.find("/AcroForm") {
1822 let after_acro = &dict_content[acro_start + 9..];
1823 if after_acro.trim_start().starts_with("<<") {
1825 } else {
1827 let parts: Vec<&str> = after_acro.split_whitespace().collect();
1828 if parts.len() >= 3 {
1829 if let (Ok(obj), Ok(gen)) =
1830 (parts[0].parse::<u32>(), parts[1].parse::<u16>())
1831 {
1832 if parts[2] == "R" {
1833 result_dict.insert(
1834 PdfName("AcroForm".to_string()),
1835 PdfObject::Reference(obj, gen),
1836 );
1837 }
1838 }
1839 }
1840 }
1841 }
1842 } else if obj_num == 102 {
1843 if dict_content.contains("/Type /Catalog") {
1845 result_dict.insert(
1847 PdfName("Type".to_string()),
1848 PdfObject::Name(PdfName("Catalog".to_string())),
1849 );
1850
1851 if dict_content.contains("/Dests 139 0 R") {
1853 result_dict.insert(
1854 PdfName("Dests".to_string()),
1855 PdfObject::Reference(139, 0),
1856 );
1857 }
1858
1859 if dict_content.contains("/Pages 113 0 R") {
1861 result_dict.insert(
1862 PdfName("Pages".to_string()),
1863 PdfObject::Reference(113, 0),
1864 );
1865 }
1866 } else {
1867 self.reader.seek(SeekFrom::Start(original_pos)).ok();
1870 return Err(ParseError::SyntaxError {
1871 position: 0,
1872 message:
1873 "Object 102 is not a corrupted catalog, cannot reconstruct"
1874 .to_string(),
1875 });
1876 }
1877 } else if obj_num == 113 {
1878 result_dict.insert(
1881 PdfName("Type".to_string()),
1882 PdfObject::Name(PdfName("Pages".to_string())),
1883 );
1884
1885 let page_refs = match self.find_page_objects() {
1887 Ok(refs) => refs,
1888 Err(_e) => {
1889 vec![]
1890 }
1891 };
1892
1893 let page_count = if page_refs.is_empty() {
1895 44
1896 } else {
1897 page_refs.len() as i64
1898 };
1899 result_dict
1900 .insert(PdfName("Count".to_string()), PdfObject::Integer(page_count));
1901
1902 let kids_array: Vec<PdfObject> = page_refs
1904 .into_iter()
1905 .map(|(obj_num, gen_num)| PdfObject::Reference(obj_num, gen_num))
1906 .collect();
1907
1908 result_dict.insert(
1909 PdfName("Kids".to_string()),
1910 PdfObject::Array(PdfArray(kids_array)),
1911 );
1912 } else if obj_num == 114 {
1913 result_dict.insert(
1916 PdfName("Type".to_string()),
1917 PdfObject::Name(PdfName("Pages".to_string())),
1918 );
1919
1920 let page_refs = match self.find_page_objects() {
1922 Ok(refs) => refs,
1923 Err(_e) => {
1924 vec![]
1925 }
1926 };
1927
1928 let page_count = if page_refs.is_empty() {
1930 44
1931 } else {
1932 page_refs.len() as i64
1933 };
1934 result_dict
1935 .insert(PdfName("Count".to_string()), PdfObject::Integer(page_count));
1936
1937 let kids_array: Vec<PdfObject> = page_refs
1939 .into_iter()
1940 .map(|(obj_num, gen_num)| PdfObject::Reference(obj_num, gen_num))
1941 .collect();
1942
1943 result_dict.insert(
1944 PdfName("Kids".to_string()),
1945 PdfObject::Array(PdfArray(kids_array)),
1946 );
1947 } else if self.is_page_object(obj_num) {
1948 result_dict.insert(
1951 PdfName("Type".to_string()),
1952 PdfObject::Name(PdfName("Page".to_string())),
1953 );
1954
1955 self.parse_page_dictionary_content(
1957 &dict_content,
1958 &mut result_dict,
1959 obj_num,
1960 )?;
1961 }
1962
1963 self.reader.seek(SeekFrom::Start(original_pos)).ok();
1965
1966 return Ok(PdfDictionary(result_dict));
1967 }
1968 }
1969 }
1970
1971 self.reader.seek(SeekFrom::Start(original_pos)).ok();
1973
1974 if obj_num == 113 {
1976 let mut result_dict = HashMap::new();
1977 result_dict.insert(
1978 PdfName("Type".to_string()),
1979 PdfObject::Name(PdfName("Pages".to_string())),
1980 );
1981
1982 let page_refs = match self.find_page_objects() {
1984 Ok(refs) => refs,
1985 Err(_e) => {
1986 vec![]
1987 }
1988 };
1989
1990 let page_count = if page_refs.is_empty() {
1992 44
1993 } else {
1994 page_refs.len() as i64
1995 };
1996 result_dict.insert(PdfName("Count".to_string()), PdfObject::Integer(page_count));
1997
1998 let kids_array: Vec<PdfObject> = page_refs
2000 .into_iter()
2001 .map(|(obj_num, gen_num)| PdfObject::Reference(obj_num, gen_num))
2002 .collect();
2003
2004 result_dict.insert(
2005 PdfName("Kids".to_string()),
2006 PdfObject::Array(PdfArray(kids_array)),
2007 );
2008
2009 return Ok(PdfDictionary(result_dict));
2010 } else if obj_num == 114 {
2011 let mut result_dict = HashMap::new();
2012 result_dict.insert(
2013 PdfName("Type".to_string()),
2014 PdfObject::Name(PdfName("Pages".to_string())),
2015 );
2016
2017 let page_refs = match self.find_page_objects() {
2019 Ok(refs) => refs,
2020 Err(_e) => {
2021 vec![]
2022 }
2023 };
2024
2025 let page_count = if page_refs.is_empty() {
2027 44
2028 } else {
2029 page_refs.len() as i64
2030 };
2031 result_dict.insert(PdfName("Count".to_string()), PdfObject::Integer(page_count));
2032
2033 let kids_array: Vec<PdfObject> = page_refs
2035 .into_iter()
2036 .map(|(obj_num, gen_num)| PdfObject::Reference(obj_num, gen_num))
2037 .collect();
2038
2039 result_dict.insert(
2040 PdfName("Kids".to_string()),
2041 PdfObject::Array(PdfArray(kids_array)),
2042 );
2043
2044 return Ok(PdfDictionary(result_dict));
2045 }
2046
2047 Err(ParseError::SyntaxError {
2048 position: 0,
2049 message: "Could not find catalog dictionary in manual extraction".to_string(),
2050 })
2051 }
2052
    /// Byte-level fallback extraction of object `obj_num`: scans the whole
    /// file for "N 0 obj", balances the following `<< ... >>` dictionary, then
    /// dispatches to the stream reconstructor when the `stream` keyword
    /// immediately follows the dictionary, or to `extract_object_manually`
    /// for plain dictionaries.
    ///
    /// NOTE(review): the reader position is only restored on the failure path;
    /// on success the reader is left wherever the helper put it. Callers
    /// appear to re-seek before reads — confirm before relying on the reader
    /// position after a successful call.
    fn extract_object_or_stream_manually(&mut self, obj_num: u32) -> ParseResult<PdfObject> {
        use crate::parser::objects::PdfObject;

        let original_pos = self.reader.stream_position().unwrap_or(0);

        if self.reader.seek(SeekFrom::Start(0)).is_err() {
            return Err(ParseError::SyntaxError {
                position: 0,
                message: "Failed to seek to beginning for manual extraction".to_string(),
            });
        }

        let mut buffer = Vec::new();
        if self.reader.read_to_end(&mut buffer).is_err() {
            return Err(ParseError::SyntaxError {
                position: 0,
                message: "Failed to read file for manual extraction".to_string(),
            });
        }

        let pattern = format!("{} 0 obj", obj_num).into_bytes();

        if let Some(obj_start) = find_bytes(&buffer, &pattern) {
            let start = obj_start + pattern.len();
            let search_area = &buffer[start..];

            if let Some(dict_start) = find_bytes(search_area, b"<<") {
                // Balance nested << >> pairs to find the dictionary's end.
                let mut bracket_count = 1;
                let mut pos = dict_start + 2;
                let mut dict_end = None;

                while pos < search_area.len() - 1 && bracket_count > 0 {
                    if search_area[pos] == b'<' && search_area[pos + 1] == b'<' {
                        bracket_count += 1;
                        pos += 2;
                    } else if search_area[pos] == b'>' && search_area[pos + 1] == b'>' {
                        bracket_count -= 1;
                        if bracket_count == 0 {
                            dict_end = Some(pos);
                            break;
                        }
                        pos += 2;
                    } else {
                        pos += 1;
                    }
                }

                if let Some(dict_end_pos) = dict_end {
                    let dict_start_abs = dict_start + 2;
                    let dict_end_abs = dict_end_pos;
                    let dict_content_bytes = &search_area[dict_start_abs..dict_end_abs];
                    let dict_content = String::from_utf8_lossy(dict_content_bytes);

                    // Bytes following the closing ">>": if (after optional
                    // whitespace) they begin with "stream", this object is a
                    // stream; otherwise re-parse it as a plain dictionary.
                    let after_dict = &search_area[dict_end_abs + 2..];
                    if is_immediate_stream_start(after_dict) {
                        return self.reconstruct_stream_object_bytes(
                            obj_num,
                            &dict_content,
                            after_dict,
                        );
                    } else {
                        return self
                            .extract_object_manually(obj_num)
                            .map(|dict| PdfObject::Dictionary(dict));
                    }
                }
            }
        }

        self.reader.seek(SeekFrom::Start(original_pos)).ok();

        Err(ParseError::SyntaxError {
            position: 0,
            message: format!("Could not manually extract object {}", obj_num),
        })
    }
2139
2140 fn reconstruct_stream_object_bytes(
2142 &mut self,
2143 obj_num: u32,
2144 dict_content: &str,
2145 after_dict: &[u8],
2146 ) -> ParseResult<PdfObject> {
2147 use crate::parser::objects::{PdfDictionary, PdfName, PdfObject, PdfStream};
2148 use std::collections::HashMap;
2149
2150 let mut dict = HashMap::new();
2152
2153 if dict_content.contains("/Filter /FlateDecode") {
2155 dict.insert(
2156 PdfName("Filter".to_string()),
2157 PdfObject::Name(PdfName("FlateDecode".to_string())),
2158 );
2159 }
2160
2161 if let Some(length_start) = dict_content.find("/Length ") {
2162 let length_part = &dict_content[length_start + 8..];
2163
2164 let is_indirect_ref =
2167 length_part.trim().contains(" R") || length_part.trim().contains(" 0 R");
2168
2169 if is_indirect_ref {
2170 } else if let Some(space_pos) = length_part.find(' ') {
2172 let length_str = &length_part[..space_pos];
2173 if let Ok(length) = length_str.parse::<i64>() {
2174 dict.insert(PdfName("Length".to_string()), PdfObject::Integer(length));
2175 }
2176 } else {
2177 if let Ok(length) = length_part.trim().parse::<i64>() {
2179 dict.insert(PdfName("Length".to_string()), PdfObject::Integer(length));
2180 }
2181 }
2182 } else {
2183 }
2184
2185 if let Some(stream_start) = find_bytes(after_dict, b"stream") {
2187 let stream_start_pos = stream_start + 6; let stream_data_start = if after_dict.get(stream_start_pos) == Some(&b'\n') {
2189 stream_start_pos + 1
2190 } else if after_dict.get(stream_start_pos) == Some(&b'\r') {
2191 if after_dict.get(stream_start_pos + 1) == Some(&b'\n') {
2192 stream_start_pos + 2
2193 } else {
2194 stream_start_pos + 1
2195 }
2196 } else {
2197 stream_start_pos
2198 };
2199
2200 if let Some(endstream_pos) = find_bytes(after_dict, b"endstream") {
2201 let mut stream_data = &after_dict[stream_data_start..endstream_pos];
2202
2203 if let Some(PdfObject::Integer(length)) = dict.get(&PdfName("Length".to_string())) {
2205 let expected_length = *length as usize;
2206 if stream_data.len() > expected_length {
2207 stream_data = &stream_data[..expected_length];
2208 } else if stream_data.len() < expected_length {
2209 tracing::debug!(
2210 "WARNING: Stream data ({} bytes) < Length ({} bytes)!",
2211 stream_data.len(),
2212 expected_length
2213 );
2214 }
2215 }
2216
2217 let stream = PdfStream {
2218 dict: PdfDictionary(dict),
2219 data: stream_data.to_vec(),
2220 };
2221
2222 return Ok(PdfObject::Stream(stream));
2223 } else {
2224 }
2225 }
2226
2227 Err(ParseError::SyntaxError {
2228 position: 0,
2229 message: format!("Could not reconstruct stream for object {}", obj_num),
2230 })
2231 }
2232
2233 fn parse_resources_from_content(&self, dict_content: &str) -> ParseResult<PdfObject> {
2235 use crate::parser::objects::{PdfDictionary, PdfName, PdfObject};
2236 use std::collections::HashMap;
2237
2238 if let Some(resources_start) = dict_content.find("/Resources") {
2240 if let Some(bracket_start) = dict_content[resources_start..].find("<<") {
2242 let abs_bracket_start = resources_start + bracket_start + 2;
2243
2244 let mut bracket_count = 1;
2246 let mut end_pos = abs_bracket_start;
2247 let chars: Vec<char> = dict_content.chars().collect();
2248
2249 while end_pos < chars.len() && bracket_count > 0 {
2250 if end_pos + 1 < chars.len() {
2251 if chars[end_pos] == '<' && chars[end_pos + 1] == '<' {
2252 bracket_count += 1;
2253 end_pos += 2;
2254 continue;
2255 } else if chars[end_pos] == '>' && chars[end_pos + 1] == '>' {
2256 bracket_count -= 1;
2257 end_pos += 2;
2258 continue;
2259 }
2260 }
2261 end_pos += 1;
2262 }
2263
2264 if bracket_count == 0 {
2265 let resources_content = &dict_content[abs_bracket_start..end_pos - 2];
2266
2267 let mut resources_dict = HashMap::new();
2269
2270 if let Some(font_start) = resources_content.find("/Font") {
2272 if let Some(font_bracket) = resources_content[font_start..].find("<<") {
2273 let abs_font_start = font_start + font_bracket + 2;
2274
2275 let mut font_dict = HashMap::new();
2277
2278 let font_section = &resources_content[abs_font_start..];
2280 let mut pos = 0;
2281 while let Some(f_pos) = font_section[pos..].find("/F") {
2282 let abs_f_pos = pos + f_pos;
2283 if let Some(space_pos) = font_section[abs_f_pos..].find(" ") {
2284 let font_name = &font_section[abs_f_pos..abs_f_pos + space_pos];
2285
2286 let after_name = &font_section[abs_f_pos + space_pos..];
2288 if let Some(r_pos) = after_name.find(" R") {
2289 let ref_part = after_name[..r_pos].trim();
2290 if let Some(parts) = ref_part
2291 .split_whitespace()
2292 .collect::<Vec<&str>>()
2293 .get(0..2)
2294 {
2295 if let (Ok(obj_num), Ok(gen_num)) =
2296 (parts[0].parse::<u32>(), parts[1].parse::<u16>())
2297 {
2298 font_dict.insert(
2299 PdfName(font_name[1..].to_string()), PdfObject::Reference(obj_num, gen_num),
2301 );
2302 }
2303 }
2304 }
2305 }
2306 pos = abs_f_pos + 1;
2307 }
2308
2309 if !font_dict.is_empty() {
2310 resources_dict.insert(
2311 PdfName("Font".to_string()),
2312 PdfObject::Dictionary(PdfDictionary(font_dict)),
2313 );
2314 }
2315 }
2316 }
2317
2318 return Ok(PdfObject::Dictionary(PdfDictionary(resources_dict)));
2319 }
2320 }
2321 }
2322
2323 Err(ParseError::SyntaxError {
2324 position: 0,
2325 message: "Could not parse Resources".to_string(),
2326 })
2327 }
2328
    /// Read up to 2 KiB at the object's xref offset and parse the first
    /// `<< ... >>` span found as the catalog dictionary, caching the result
    /// and returning a reference into the cache.
    ///
    /// NOTE(review): the ">>" search is not nesting-aware, so a catalog with a
    /// nested dictionary value would be truncated — acceptable while this
    /// helper remains dead code.
    #[allow(dead_code)]
    fn extract_catalog_directly(
        &mut self,
        obj_num: u32,
        gen_num: u16,
    ) -> ParseResult<&PdfDictionary> {
        if let Some(entry) = self.xref.get_entry(obj_num) {
            if self.reader.seek(SeekFrom::Start(entry.offset)).is_err() {
                return Err(ParseError::SyntaxError {
                    position: 0,
                    message: "Failed to seek to catalog object".to_string(),
                });
            }

            // A catalog dictionary is expected to fit within 2 KiB.
            let mut buffer = vec![0u8; 2048];
            if let Ok(bytes_read) = self.reader.read(&mut buffer) {
                let content = String::from_utf8_lossy(&buffer[..bytes_read]);
                tracing::debug!("Raw catalog content:\n{}", content);

                if let Some(dict_start) = content.find("<<") {
                    if let Some(dict_end) = content[dict_start..].find(">>") {
                        let dict_content = &content[dict_start..dict_start + dict_end + 2];
                        tracing::debug!("Found dictionary content: {}", dict_content);

                        if let Ok(dict) = self.parse_dictionary_from_string(dict_content) {
                            // Cache, then return a reference into the cache.
                            let key = (obj_num, gen_num);
                            self.object_cache.insert(key, PdfObject::Dictionary(dict));

                            if let Some(PdfObject::Dictionary(ref dict)) =
                                self.object_cache.get(&key)
                            {
                                return Ok(dict);
                            }
                        }
                    }
                }
            }
        }

        Err(ParseError::SyntaxError {
            position: 0,
            message: "Failed to extract catalog directly".to_string(),
        })
    }
2380
2381 #[allow(dead_code)]
2382 fn parse_dictionary_from_string(&self, dict_str: &str) -> ParseResult<PdfDictionary> {
2383 use crate::parser::lexer::{Lexer, Token};
2384
2385 let mut cursor = std::io::Cursor::new(dict_str.as_bytes());
2387 let mut lexer = Lexer::new_with_options(&mut cursor, self.options.clone());
2388
2389 match lexer.next_token()? {
2391 Token::DictStart => {
2392 let mut dict = std::collections::HashMap::new();
2393
2394 loop {
2395 let token = lexer.next_token()?;
2396 match token {
2397 Token::DictEnd => break,
2398 Token::Name(key) => {
2399 let value = PdfObject::parse_with_options(&mut lexer, &self.options)?;
2401 dict.insert(crate::parser::objects::PdfName(key), value);
2402 }
2403 _ => {
2404 return Err(ParseError::SyntaxError {
2405 position: 0,
2406 message: "Invalid dictionary format".to_string(),
2407 });
2408 }
2409 }
2410 }
2411
2412 Ok(PdfDictionary(dict))
2413 }
2414 _ => Err(ParseError::SyntaxError {
2415 position: 0,
2416 message: "Expected dictionary start".to_string(),
2417 }),
2418 }
2419 }
2420
2421 fn count_page_objects_directly(&mut self) -> Option<u32> {
2423 let mut page_count = 0;
2424
2425 for obj_num in 1..self.xref.len() as u32 {
2427 if let Ok(obj) = self.get_object(obj_num, 0) {
2428 if let Some(dict) = obj.as_dict() {
2429 if let Some(obj_type) = dict.get("Type").and_then(|t| t.as_name()) {
2430 if obj_type.0 == "Page" {
2431 page_count += 1;
2432 }
2433 }
2434 }
2435 }
2436 }
2437
2438 if page_count > 0 {
2439 Some(page_count)
2440 } else {
2441 None
2442 }
2443 }
2444
2445 pub fn metadata(&mut self) -> ParseResult<DocumentMetadata> {
2447 let mut metadata = DocumentMetadata::default();
2448
2449 if let Some(info_dict) = self.info()? {
2450 if let Some(title) = info_dict.get("Title").and_then(|o| o.as_string()) {
2451 metadata.title = title.as_str().ok().map(|s| s.to_string());
2452 }
2453 if let Some(author) = info_dict.get("Author").and_then(|o| o.as_string()) {
2454 metadata.author = author.as_str().ok().map(|s| s.to_string());
2455 }
2456 if let Some(subject) = info_dict.get("Subject").and_then(|o| o.as_string()) {
2457 metadata.subject = subject.as_str().ok().map(|s| s.to_string());
2458 }
2459 if let Some(keywords) = info_dict.get("Keywords").and_then(|o| o.as_string()) {
2460 metadata.keywords = keywords.as_str().ok().map(|s| s.to_string());
2461 }
2462 if let Some(creator) = info_dict.get("Creator").and_then(|o| o.as_string()) {
2463 metadata.creator = creator.as_str().ok().map(|s| s.to_string());
2464 }
2465 if let Some(producer) = info_dict.get("Producer").and_then(|o| o.as_string()) {
2466 metadata.producer = producer.as_str().ok().map(|s| s.to_string());
2467 }
2468 }
2469
2470 metadata.version = self.version().to_string();
2471 metadata.page_count = self.page_count().ok();
2472
2473 Ok(metadata)
2474 }
2475
2476 fn ensure_page_tree(&mut self) -> ParseResult<()> {
2478 if self.page_tree.is_none() {
2479 let page_count = self.page_count()?;
2480 self.page_tree = Some(super::page_tree::PageTree::new(page_count));
2481 }
2482 Ok(())
2483 }
2484
    /// Stub: page access through `PdfReader` is unsupported — returning a
    /// reference tied to `&mut self` here conflicts with the page tree's own
    /// borrows. Use `PdfDocument` (via `into_document`) instead.
    pub fn get_page(&mut self, _index: u32) -> ParseResult<&super::page_tree::ParsedPage> {
        self.ensure_page_tree()?;

        Err(ParseError::SyntaxError {
            position: 0,
            message: "get_page not implemented due to borrow checker constraints. Use PdfDocument instead.".to_string(),
        })
    }
2501
2502 pub fn get_all_pages(&mut self) -> ParseResult<Vec<super::page_tree::ParsedPage>> {
2504 let page_count = self.page_count()?;
2505 let mut pages = Vec::with_capacity(page_count as usize);
2506
2507 for i in 0..page_count {
2508 let page = self.get_page(i)?.clone();
2509 pages.push(page);
2510 }
2511
2512 Ok(pages)
2513 }
2514
    /// Consume this reader and wrap it in the higher-level `PdfDocument` API.
    pub fn into_document(self) -> super::document::PdfDocument<R> {
        super::document::PdfDocument::new(self)
    }
2519
    /// Reset the stack-safety bookkeeping to a fresh state.
    pub fn clear_parse_context(&mut self) {
        self.parse_context = StackSafeContext::new();
    }
2524
    /// Mutable access to the stack-safety context used while parsing.
    pub fn parse_context_mut(&mut self) -> &mut StackSafeContext {
        &mut self.parse_context
    }
2529
    /// Heuristically locate page objects by scanning the raw file text.
    ///
    /// For each line ending in "N 0 obj", the next up-to-10 lines are checked
    /// for "/Type /Page" (excluding "/Type /Pages"); the look-ahead stops
    /// early at the next object header or an "endobj" line. Returns sorted,
    /// deduplicated (object, generation) pairs; generation is always 0.
    /// The reader position is restored before scanning results are built.
    ///
    /// NOTE(review): this misses "/Type/Page" written without a space,
    /// objects with a non-zero generation, and dictionaries starting on the
    /// same line as the "obj" keyword — presumably acceptable for the
    /// corrupted files this fallback targets; confirm before broadening.
    fn find_page_objects(&mut self) -> ParseResult<Vec<(u32, u16)>> {
        let original_pos = self.reader.stream_position().unwrap_or(0);

        if self.reader.seek(SeekFrom::Start(0)).is_err() {
            return Ok(vec![]);
        }

        let mut buffer = Vec::new();
        if self.reader.read_to_end(&mut buffer).is_err() {
            return Ok(vec![]);
        }

        self.reader.seek(SeekFrom::Start(original_pos)).ok();

        let content = String::from_utf8_lossy(&buffer);
        let mut page_objects = Vec::new();

        let lines: Vec<&str> = content.lines().collect();

        for (i, line) in lines.iter().enumerate() {
            if line.trim().ends_with(" 0 obj") {
                if let Some(obj_str) = line.trim().strip_suffix(" 0 obj") {
                    if let Ok(obj_num) = obj_str.parse::<u32>() {
                        // Bounded look-ahead for the /Type key.
                        for j in 1..=10 {
                            if i + j < lines.len() {
                                let future_line = lines[i + j];
                                if future_line.contains("/Type /Page")
                                    && !future_line.contains("/Type /Pages")
                                {
                                    page_objects.push((obj_num, 0));
                                    break;
                                }
                                // Stop at the next object boundary.
                                if future_line.trim().ends_with(" 0 obj")
                                    || future_line.trim() == "endobj"
                                {
                                    break;
                                }
                            }
                        }
                    }
                }
            }
        }

        page_objects.sort();
        page_objects.dedup();

        Ok(page_objects)
    }
2587
2588 fn find_catalog_object(&mut self) -> ParseResult<(u32, u16)> {
2590 let obj_numbers: Vec<u32> = self.xref.entries().keys().copied().collect();
2595
2596 for obj_num in obj_numbers {
2598 if let Ok(obj) = self.get_object(obj_num, 0) {
2600 if let Some(dict) = obj.as_dict() {
2601 if let Some(type_obj) = dict.get("Type") {
2603 if let Some(type_name) = type_obj.as_name() {
2604 if type_name.0 == "Catalog" {
2605 return Ok((obj_num, 0));
2606 }
2607 if type_name.0 == "Sig"
2609 || type_name.0 == "Pages"
2610 || type_name.0 == "Page"
2611 {
2612 continue;
2613 }
2614 }
2615 }
2616 }
2617 }
2618 }
2619
2620 for obj_num in [1, 2, 3, 4, 5] {
2622 if let Ok(obj) = self.get_object(obj_num, 0) {
2623 if let Some(dict) = obj.as_dict() {
2624 if dict.contains_key("Pages") {
2626 return Ok((obj_num, 0));
2627 }
2628 }
2629 }
2630 }
2631
2632 Err(ParseError::MissingKey(
2633 "Could not find Catalog object".to_string(),
2634 ))
2635 }
2636
    /// Build (and cache) a synthetic `/Type /Pages` dictionary from a
    /// candidate list of page references, validating each candidate first.
    ///
    /// A candidate is kept when its dictionary says `/Type /Page`, or —
    /// otherwise — when it carries a `/MediaBox` or `/Contents` key. More
    /// than 10 valid pages are delegated to `create_hierarchical_pages_tree`.
    /// The result is stored in the object cache under the reserved key
    /// `(u32::MAX - 1, 0)` and returned by reference into the cache.
    fn create_synthetic_pages_dict(
        &mut self,
        page_refs: &[(u32, u16)],
    ) -> ParseResult<&PdfDictionary> {
        use super::objects::{PdfArray, PdfName};

        // Validate candidates: keep real pages and page-looking dictionaries.
        let mut valid_page_refs = Vec::new();
        for (obj_num, gen_num) in page_refs {
            if let Ok(page_obj) = self.get_object(*obj_num, *gen_num) {
                if let Some(page_dict) = page_obj.as_dict() {
                    if let Some(obj_type) = page_dict.get("Type").and_then(|t| t.as_name()) {
                        if obj_type.0 == "Page" {
                            valid_page_refs.push((*obj_num, *gen_num));
                            continue;
                        }
                    }

                    // Not explicitly /Type /Page: accept page-typical keys.
                    if page_dict.contains_key("MediaBox") || page_dict.contains_key("Contents") {
                        valid_page_refs.push((*obj_num, *gen_num));
                    }
                }
            }
        }

        if valid_page_refs.is_empty() {
            return Err(ParseError::SyntaxError {
                position: 0,
                message: "No valid page objects found for synthetic Pages tree".to_string(),
            });
        }

        // Large page sets get a multi-level tree instead of one flat node.
        if valid_page_refs.len() > 10 {
            return self.create_hierarchical_pages_tree(&valid_page_refs);
        }

        let mut kids = PdfArray::new();
        for (obj_num, gen_num) in &valid_page_refs {
            kids.push(PdfObject::Reference(*obj_num, *gen_num));
        }

        let mut pages_dict = PdfDictionary::new();
        pages_dict.insert(
            "Type".to_string(),
            PdfObject::Name(PdfName("Pages".to_string())),
        );
        pages_dict.insert("Kids".to_string(), PdfObject::Array(kids));
        pages_dict.insert(
            "Count".to_string(),
            PdfObject::Integer(valid_page_refs.len() as i64),
        );

        // Inherit a MediaBox from the first few pages that define one;
        // otherwise default to US Letter (612 x 792 points).
        let mut media_box = None;
        for (obj_num, gen_num) in valid_page_refs.iter().take(3) {
            if let Ok(page_obj) = self.get_object(*obj_num, *gen_num) {
                if let Some(page_dict) = page_obj.as_dict() {
                    if let Some(mb) = page_dict.get("MediaBox") {
                        media_box = Some(mb.clone());
                    }
                }
            }
        }

        if let Some(mb) = media_box {
            pages_dict.insert("MediaBox".to_string(), mb);
        } else {
            let mut mb_array = PdfArray::new();
            mb_array.push(PdfObject::Integer(0));
            mb_array.push(PdfObject::Integer(0));
            mb_array.push(PdfObject::Integer(612));
            mb_array.push(PdfObject::Integer(792));
            pages_dict.insert("MediaBox".to_string(), PdfObject::Array(mb_array));
        }

        // Cache under a reserved key, then hand back a reference into the
        // cache (required to satisfy the &PdfDictionary return lifetime).
        let synthetic_key = (u32::MAX - 1, 0);
        self.object_cache
            .insert(synthetic_key, PdfObject::Dictionary(pages_dict));

        if let PdfObject::Dictionary(dict) = &self.object_cache[&synthetic_key] {
            Ok(dict)
        } else {
            unreachable!("Just inserted dictionary")
        }
    }
2731
    /// Builds a two-level synthetic page tree for recovered documents with
    /// many pages: one intermediate `/Pages` node per chunk of 10 page refs,
    /// plus a root `/Pages` node referencing the intermediates.
    ///
    /// Intermediate nodes are cached under the reserved object numbers
    /// `u32::MAX - 2 - chunk_index` (generation 0) and the root under
    /// `u32::MAX - 1`; a reference to the cached root dictionary is returned.
    ///
    /// NOTE(review): with very many chunks the descending reserved numbers
    /// could in theory reach real object numbers — acceptable for
    /// recovery-sized documents, but worth confirming an upper bound.
    fn create_hierarchical_pages_tree(
        &mut self,
        page_refs: &[(u32, u16)],
    ) -> ParseResult<&PdfDictionary> {
        use super::objects::{PdfArray, PdfName};

        // Fan-out per intermediate node.
        const PAGES_PER_NODE: usize = 10;
        let chunks: Vec<&[(u32, u16)]> = page_refs.chunks(PAGES_PER_NODE).collect();
        let mut intermediate_nodes = Vec::new();

        for (chunk_idx, chunk) in chunks.iter().enumerate() {
            let mut kids = PdfArray::new();
            for (obj_num, gen_num) in chunk.iter() {
                kids.push(PdfObject::Reference(*obj_num, *gen_num));
            }

            let mut intermediate_dict = PdfDictionary::new();
            intermediate_dict.insert(
                "Type".to_string(),
                PdfObject::Name(PdfName("Pages".to_string())),
            );
            intermediate_dict.insert("Kids".to_string(), PdfObject::Array(kids));
            intermediate_dict.insert("Count".to_string(), PdfObject::Integer(chunk.len() as i64));

            // Reserved synthetic object number for this intermediate node,
            // counting down from u32::MAX - 2 (the root uses u32::MAX - 1).
            let intermediate_key = (u32::MAX - 2 - chunk_idx as u32, 0);
            self.object_cache
                .insert(intermediate_key, PdfObject::Dictionary(intermediate_dict));

            intermediate_nodes.push(intermediate_key);
        }

        // The root node references every intermediate node as a kid.
        let mut root_kids = PdfArray::new();
        for (obj_num, gen_num) in &intermediate_nodes {
            root_kids.push(PdfObject::Reference(*obj_num, *gen_num));
        }

        let mut root_pages_dict = PdfDictionary::new();
        root_pages_dict.insert(
            "Type".to_string(),
            PdfObject::Name(PdfName("Pages".to_string())),
        );
        root_pages_dict.insert("Kids".to_string(), PdfObject::Array(root_kids));
        // /Count on the root is the total page count, not the kid count.
        root_pages_dict.insert(
            "Count".to_string(),
            PdfObject::Integer(page_refs.len() as i64),
        );

        // Propagate the first page's MediaBox (if any) to the root so
        // descendants without their own MediaBox inherit a page size.
        if let Some((obj_num, gen_num)) = page_refs.first() {
            if let Ok(page_obj) = self.get_object(*obj_num, *gen_num) {
                if let Some(page_dict) = page_obj.as_dict() {
                    if let Some(mb) = page_dict.get("MediaBox") {
                        root_pages_dict.insert("MediaBox".to_string(), mb.clone());
                    }
                }
            }
        }

        // Cache the root under the reserved synthetic object number.
        let root_key = (u32::MAX - 1, 0);
        self.object_cache
            .insert(root_key, PdfObject::Dictionary(root_pages_dict));

        // Re-borrow from the cache so the returned reference carries the
        // cache's lifetime.
        if let PdfObject::Dictionary(dict) = &self.object_cache[&root_key] {
            Ok(dict)
        } else {
            unreachable!("Just inserted dictionary")
        }
    }
2808}
2809
/// Document-level metadata assembled from the PDF header version, the Info
/// dictionary, and the page tree.
#[derive(Debug, Default, Clone)]
pub struct DocumentMetadata {
    /// Info dictionary `/Title`, if present.
    pub title: Option<String>,
    /// Info dictionary `/Author`, if present.
    pub author: Option<String>,
    /// Info dictionary `/Subject`, if present.
    pub subject: Option<String>,
    /// Info dictionary `/Keywords`, if present.
    pub keywords: Option<String>,
    /// Info dictionary `/Creator`, if present.
    pub creator: Option<String>,
    /// Info dictionary `/Producer`, if present.
    pub producer: Option<String>,
    /// Creation date string (PDF date format, e.g. "D:20240101"), if present.
    pub creation_date: Option<String>,
    /// Modification date string (PDF date format), if present.
    pub modification_date: Option<String>,
    /// PDF version taken from the file header, e.g. "1.4".
    pub version: String,
    /// Number of pages, when the page tree could be read.
    pub page_count: Option<u32>,
}
2824
/// Iterator over the lines of a string, honoring all three end-of-line
/// conventions PDF allows: CRLF (`\r\n`), LF (`\n`), and CR (`\r`).
pub struct EOLIter<'s> {
    remainder: &'s str,
}
impl<'s> Iterator for EOLIter<'s> {
    type Item = &'s str;

    fn next(&mut self) -> Option<Self::Item> {
        if self.remainder.is_empty() {
            return None;
        }

        // Locate the earliest terminator. On a positional tie CRLF wins over
        // a lone CR, so "\r\n" is consumed as a single separator.
        let mut earliest: Option<(usize, &'static str)> = None;
        for sep in ["\r\n", "\n", "\r"] {
            if let Some(pos) = self.remainder.find(sep) {
                let improves = match earliest {
                    Some((best, _)) => pos < best,
                    None => true,
                };
                if improves {
                    earliest = Some((pos, sep));
                }
            }
        }

        match earliest {
            Some((pos, sep)) => {
                let line = &self.remainder[..pos];
                self.remainder = &self.remainder[pos + sep.len()..];
                Some(line)
            }
            // No terminator left: the tail is the final line.
            None => Some(std::mem::take(&mut self.remainder)),
        }
    }
}
/// Extension trait providing PDF-aware line iteration over string-like types.
pub trait PDFLines: AsRef<str> {
    /// Returns an iterator over lines split on CRLF, LF, or CR.
    fn pdf_lines(&self) -> EOLIter<'_> {
        EOLIter {
            remainder: self.as_ref(),
        }
    }
}
impl PDFLines for &str {}
impl<'a> PDFLines for std::borrow::Cow<'a, str> {}
impl PDFLines for String {}
2861
2862#[cfg(test)]
2863mod tests {
2864
2865 use super::*;
2866 use crate::parser::objects::{PdfName, PdfString};
2867 use crate::parser::test_helpers::*;
2868 use crate::parser::ParseOptions;
2869 use std::io::Cursor;
2870
2871 #[test]
2872 fn test_reader_construction() {
2873 let pdf_data = create_minimal_pdf();
2874 let cursor = Cursor::new(pdf_data);
2875 let result = PdfReader::new(cursor);
2876 assert!(result.is_ok());
2877 }
2878
2879 #[test]
2880 fn test_reader_version() {
2881 let pdf_data = create_minimal_pdf();
2882 let cursor = Cursor::new(pdf_data);
2883 let reader = PdfReader::new(cursor).unwrap();
2884 assert_eq!(reader.version().major, 1);
2885 assert_eq!(reader.version().minor, 4);
2886 }
2887
2888 #[test]
2889 fn test_reader_different_versions() {
2890 let versions = vec![
2891 "1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7", "2.0",
2892 ];
2893
2894 for version in versions {
2895 let pdf_data = create_pdf_with_version(version);
2896 let cursor = Cursor::new(pdf_data);
2897 let reader = PdfReader::new(cursor).unwrap();
2898
2899 let parts: Vec<&str> = version.split('.').collect();
2900 assert_eq!(reader.version().major, parts[0].parse::<u8>().unwrap());
2901 assert_eq!(reader.version().minor, parts[1].parse::<u8>().unwrap());
2902 }
2903 }
2904
2905 #[test]
2906 fn test_reader_catalog() {
2907 let pdf_data = create_minimal_pdf();
2908 let cursor = Cursor::new(pdf_data);
2909 let mut reader = PdfReader::new(cursor).unwrap();
2910
2911 let catalog = reader.catalog();
2912 assert!(catalog.is_ok());
2913
2914 let catalog_dict = catalog.unwrap();
2915 assert_eq!(
2916 catalog_dict.get("Type"),
2917 Some(&PdfObject::Name(PdfName("Catalog".to_string())))
2918 );
2919 }
2920
2921 #[test]
2922 fn test_reader_info_none() {
2923 let pdf_data = create_minimal_pdf();
2924 let cursor = Cursor::new(pdf_data);
2925 let mut reader = PdfReader::new(cursor).unwrap();
2926
2927 let info = reader.info().unwrap();
2928 assert!(info.is_none());
2929 }
2930
2931 #[test]
2932 fn test_reader_info_present() {
2933 let pdf_data = create_pdf_with_info();
2934 let cursor = Cursor::new(pdf_data);
2935 let mut reader = PdfReader::new(cursor).unwrap();
2936
2937 let info = reader.info().unwrap();
2938 assert!(info.is_some());
2939
2940 let info_dict = info.unwrap();
2941 assert_eq!(
2942 info_dict.get("Title"),
2943 Some(&PdfObject::String(PdfString(
2944 "Test PDF".to_string().into_bytes()
2945 )))
2946 );
2947 assert_eq!(
2948 info_dict.get("Author"),
2949 Some(&PdfObject::String(PdfString(
2950 "Test Author".to_string().into_bytes()
2951 )))
2952 );
2953 }
2954
2955 #[test]
2956 fn test_reader_get_object() {
2957 let pdf_data = create_minimal_pdf();
2958 let cursor = Cursor::new(pdf_data);
2959 let mut reader = PdfReader::new(cursor).unwrap();
2960
2961 let obj = reader.get_object(1, 0);
2963 assert!(obj.is_ok());
2964
2965 let catalog = obj.unwrap();
2966 assert!(catalog.as_dict().is_some());
2967 }
2968
2969 #[test]
2970 fn test_reader_get_invalid_object() {
2971 let pdf_data = create_minimal_pdf();
2972 let cursor = Cursor::new(pdf_data);
2973 let mut reader = PdfReader::new(cursor).unwrap();
2974
2975 let obj = reader.get_object(999, 0);
2977 assert!(obj.is_err());
2978 }
2979
2980 #[test]
2981 fn test_reader_get_free_object() {
2982 let pdf_data = create_minimal_pdf();
2983 let cursor = Cursor::new(pdf_data);
2984 let mut reader = PdfReader::new(cursor).unwrap();
2985
2986 let obj = reader.get_object(0, 65535);
2988 assert!(obj.is_ok());
2989 assert_eq!(obj.unwrap(), &PdfObject::Null);
2990 }
2991
2992 #[test]
2993 fn test_reader_resolve_reference() {
2994 let pdf_data = create_minimal_pdf();
2995 let cursor = Cursor::new(pdf_data);
2996 let mut reader = PdfReader::new(cursor).unwrap();
2997
2998 let ref_obj = PdfObject::Reference(1, 0);
3000 let resolved = reader.resolve(&ref_obj);
3001
3002 assert!(resolved.is_ok());
3003 assert!(resolved.unwrap().as_dict().is_some());
3004 }
3005
3006 #[test]
3007 fn test_reader_resolve_non_reference() {
3008 let pdf_data = create_minimal_pdf();
3009 let cursor = Cursor::new(pdf_data);
3010 let mut reader = PdfReader::new(cursor).unwrap();
3011
3012 let int_obj = PdfObject::Integer(42);
3014 let resolved = reader.resolve(&int_obj).unwrap();
3015
3016 assert_eq!(resolved, &PdfObject::Integer(42));
3017 }
3018
3019 #[test]
3020 fn test_reader_cache_behavior() {
3021 let pdf_data = create_minimal_pdf();
3022 let cursor = Cursor::new(pdf_data);
3023 let mut reader = PdfReader::new(cursor).unwrap();
3024
3025 let obj1 = reader.get_object(1, 0).unwrap();
3027 assert!(obj1.as_dict().is_some());
3028
3029 let obj2 = reader.get_object(1, 0).unwrap();
3031 assert!(obj2.as_dict().is_some());
3032 }
3033
3034 #[test]
3035 fn test_reader_wrong_generation() {
3036 let pdf_data = create_minimal_pdf();
3037 let cursor = Cursor::new(pdf_data);
3038 let mut reader = PdfReader::new(cursor).unwrap();
3039
3040 let obj = reader.get_object(1, 99);
3042 assert!(obj.is_err());
3043 }
3044
3045 #[test]
3046 fn test_reader_invalid_pdf() {
3047 let invalid_data = b"This is not a PDF file";
3048 let cursor = Cursor::new(invalid_data.to_vec());
3049 let result = PdfReader::new(cursor);
3050
3051 assert!(result.is_err());
3052 }
3053
3054 #[test]
3055 fn test_reader_corrupt_xref() {
3056 let corrupt_pdf = b"%PDF-1.4
30571 0 obj
3058<< /Type /Catalog >>
3059endobj
3060xref
3061corrupted xref table
3062trailer
3063<< /Size 2 /Root 1 0 R >>
3064startxref
306524
3066%%EOF"
3067 .to_vec();
3068
3069 let cursor = Cursor::new(corrupt_pdf);
3070 let result = PdfReader::new(cursor);
3071 assert!(result.is_err());
3074 }
3075
3076 #[test]
3077 fn test_reader_missing_trailer() {
3078 let pdf_no_trailer = b"%PDF-1.4
30791 0 obj
3080<< /Type /Catalog >>
3081endobj
3082xref
30830 2
30840000000000 65535 f
30850000000009 00000 n
3086startxref
308724
3088%%EOF"
3089 .to_vec();
3090
3091 let cursor = Cursor::new(pdf_no_trailer);
3092 let result = PdfReader::new(cursor);
3093 assert!(result.is_err());
3096 }
3097
3098 #[test]
3099 fn test_reader_empty_pdf() {
3100 let cursor = Cursor::new(Vec::new());
3101 let result = PdfReader::new(cursor);
3102 assert!(result.is_err());
3103 }
3104
3105 #[test]
3106 fn test_reader_page_count() {
3107 let pdf_data = create_minimal_pdf();
3108 let cursor = Cursor::new(pdf_data);
3109 let mut reader = PdfReader::new(cursor).unwrap();
3110
3111 let count = reader.page_count();
3112 assert!(count.is_ok());
3113 assert_eq!(count.unwrap(), 0); }
3115
3116 #[test]
3117 fn test_reader_into_document() {
3118 let pdf_data = create_minimal_pdf();
3119 let cursor = Cursor::new(pdf_data);
3120 let reader = PdfReader::new(cursor).unwrap();
3121
3122 let document = reader.into_document();
3123 let page_count = document.page_count();
3125 assert!(page_count.is_ok());
3126 }
3127
3128 #[test]
3129 fn test_reader_pages_dict() {
3130 let pdf_data = create_minimal_pdf();
3131 let cursor = Cursor::new(pdf_data);
3132 let mut reader = PdfReader::new(cursor).unwrap();
3133
3134 let pages = reader.pages();
3135 assert!(pages.is_ok());
3136 let pages_dict = pages.unwrap();
3137 assert_eq!(
3138 pages_dict.get("Type"),
3139 Some(&PdfObject::Name(PdfName("Pages".to_string())))
3140 );
3141 }
3142
3143 #[test]
3144 fn test_reader_pdf_with_binary_data() {
3145 let pdf_data = create_pdf_with_binary_marker();
3146
3147 let cursor = Cursor::new(pdf_data);
3148 let result = PdfReader::new(cursor);
3149 assert!(result.is_ok());
3150 }
3151
3152 #[test]
3153 fn test_reader_metadata() {
3154 let pdf_data = create_pdf_with_info();
3155 let cursor = Cursor::new(pdf_data);
3156 let mut reader = PdfReader::new(cursor).unwrap();
3157
3158 let metadata = reader.metadata().unwrap();
3159 assert_eq!(metadata.title, Some("Test PDF".to_string()));
3160 assert_eq!(metadata.author, Some("Test Author".to_string()));
3161 assert_eq!(metadata.subject, Some("Testing".to_string()));
3162 assert_eq!(metadata.version, "1.4".to_string());
3163 }
3164
3165 #[test]
3166 fn test_reader_metadata_empty() {
3167 let pdf_data = create_minimal_pdf();
3168 let cursor = Cursor::new(pdf_data);
3169 let mut reader = PdfReader::new(cursor).unwrap();
3170
3171 let metadata = reader.metadata().unwrap();
3172 assert!(metadata.title.is_none());
3173 assert!(metadata.author.is_none());
3174 assert_eq!(metadata.version, "1.4".to_string());
3175 assert_eq!(metadata.page_count, Some(0));
3176 }
3177
3178 #[test]
3179 fn test_reader_object_number_mismatch() {
3180 let pdf_data = create_minimal_pdf();
3184 let cursor = Cursor::new(pdf_data);
3185 let mut reader = PdfReader::new(cursor).unwrap();
3186
3187 let result = reader.get_object(1, 99);
3190 assert!(result.is_err());
3191
3192 let result2 = reader.get_object(999, 0);
3194 assert!(result2.is_err());
3195 }
3196
3197 #[test]
3198 fn test_document_metadata_struct() {
3199 let metadata = DocumentMetadata {
3200 title: Some("Title".to_string()),
3201 author: Some("Author".to_string()),
3202 subject: Some("Subject".to_string()),
3203 keywords: Some("Keywords".to_string()),
3204 creator: Some("Creator".to_string()),
3205 producer: Some("Producer".to_string()),
3206 creation_date: Some("D:20240101".to_string()),
3207 modification_date: Some("D:20240102".to_string()),
3208 version: "1.5".to_string(),
3209 page_count: Some(10),
3210 };
3211
3212 assert_eq!(metadata.title, Some("Title".to_string()));
3213 assert_eq!(metadata.page_count, Some(10));
3214 }
3215
3216 #[test]
3217 fn test_document_metadata_default() {
3218 let metadata = DocumentMetadata::default();
3219 assert!(metadata.title.is_none());
3220 assert!(metadata.author.is_none());
3221 assert!(metadata.subject.is_none());
3222 assert!(metadata.keywords.is_none());
3223 assert!(metadata.creator.is_none());
3224 assert!(metadata.producer.is_none());
3225 assert!(metadata.creation_date.is_none());
3226 assert!(metadata.modification_date.is_none());
3227 assert_eq!(metadata.version, "".to_string());
3228 assert!(metadata.page_count.is_none());
3229 }
3230
3231 #[test]
3232 fn test_document_metadata_clone() {
3233 let metadata = DocumentMetadata {
3234 title: Some("Test".to_string()),
3235 version: "1.4".to_string(),
3236 ..Default::default()
3237 };
3238
3239 let cloned = metadata;
3240 assert_eq!(cloned.title, Some("Test".to_string()));
3241 assert_eq!(cloned.version, "1.4".to_string());
3242 }
3243
3244 #[test]
3245 fn test_reader_trailer_validation_error() {
3246 let bad_pdf = b"%PDF-1.4
32481 0 obj
3249<< /Type /Catalog >>
3250endobj
3251xref
32520 2
32530000000000 65535 f
32540000000009 00000 n
3255trailer
3256<< /Size 2 >>
3257startxref
325846
3259%%EOF"
3260 .to_vec();
3261
3262 let cursor = Cursor::new(bad_pdf);
3263 let result = PdfReader::new(cursor);
3264 assert!(result.is_err());
3267 }
3268
3269 #[test]
3270 fn test_reader_with_options() {
3271 let pdf_data = create_minimal_pdf();
3272 let cursor = Cursor::new(pdf_data);
3273 let mut options = ParseOptions::default();
3274 options.lenient_streams = true;
3275 options.max_recovery_bytes = 2000;
3276 options.collect_warnings = true;
3277
3278 let reader = PdfReader::new_with_options(cursor, options);
3279 assert!(reader.is_ok());
3280 }
3281
3282 #[test]
3283 fn test_lenient_stream_parsing() {
3284 let pdf_data = b"%PDF-1.4
32861 0 obj
3287<< /Type /Catalog /Pages 2 0 R >>
3288endobj
32892 0 obj
3290<< /Type /Pages /Kids [3 0 R] /Count 1 >>
3291endobj
32923 0 obj
3293<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R >>
3294endobj
32954 0 obj
3296<< /Length 10 >>
3297stream
3298This is a longer stream than 10 bytes
3299endstream
3300endobj
3301xref
33020 5
33030000000000 65535 f
33040000000009 00000 n
33050000000058 00000 n
33060000000116 00000 n
33070000000219 00000 n
3308trailer
3309<< /Size 5 /Root 1 0 R >>
3310startxref
3311299
3312%%EOF"
3313 .to_vec();
3314
3315 let cursor = Cursor::new(pdf_data.clone());
3317 let strict_options = ParseOptions::strict();
3318 let strict_reader = PdfReader::new_with_options(cursor, strict_options);
3319 assert!(strict_reader.is_err());
3321
3322 let cursor = Cursor::new(pdf_data);
3324 let mut options = ParseOptions::default();
3325 options.lenient_streams = true;
3326 options.max_recovery_bytes = 1000;
3327 options.collect_warnings = false;
3328 let lenient_reader = PdfReader::new_with_options(cursor, options);
3329 assert!(lenient_reader.is_err());
3330 }
3331
3332 #[test]
3333 fn test_parse_options_default() {
3334 let options = ParseOptions::default();
3335 assert!(!options.lenient_streams);
3336 assert_eq!(options.max_recovery_bytes, 1000);
3337 assert!(!options.collect_warnings);
3338 }
3339
3340 #[test]
3341 fn test_parse_options_clone() {
3342 let mut options = ParseOptions::default();
3343 options.lenient_streams = true;
3344 options.max_recovery_bytes = 2000;
3345 options.collect_warnings = true;
3346 let cloned = options;
3347 assert!(cloned.lenient_streams);
3348 assert_eq!(cloned.max_recovery_bytes, 2000);
3349 assert!(cloned.collect_warnings);
3350 }
3351
3352 #[allow(dead_code)]
3355 fn create_encrypted_pdf_dict() -> PdfDictionary {
3356 let mut dict = PdfDictionary::new();
3357 dict.insert(
3358 "Filter".to_string(),
3359 PdfObject::Name(PdfName("Standard".to_string())),
3360 );
3361 dict.insert("V".to_string(), PdfObject::Integer(1));
3362 dict.insert("R".to_string(), PdfObject::Integer(2));
3363 dict.insert("O".to_string(), PdfObject::String(PdfString(vec![0u8; 32])));
3364 dict.insert("U".to_string(), PdfObject::String(PdfString(vec![0u8; 32])));
3365 dict.insert("P".to_string(), PdfObject::Integer(-4));
3366 dict
3367 }
3368
3369 fn create_pdf_with_encryption() -> Vec<u8> {
3370 b"%PDF-1.4
33721 0 obj
3373<< /Type /Catalog /Pages 2 0 R >>
3374endobj
33752 0 obj
3376<< /Type /Pages /Kids [3 0 R] /Count 1 >>
3377endobj
33783 0 obj
3379<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >>
3380endobj
33814 0 obj
3382<< /Filter /Standard /V 1 /R 2 /O (32 bytes of owner password hash data) /U (32 bytes of user password hash data) /P -4 >>
3383endobj
3384xref
33850 5
33860000000000 65535 f
33870000000009 00000 n
33880000000058 00000 n
33890000000116 00000 n
33900000000201 00000 n
3391trailer
3392<< /Size 5 /Root 1 0 R /Encrypt 4 0 R /ID [(file id)] >>
3393startxref
3394295
3395%%EOF"
3396 .to_vec()
3397 }
3398
3399 #[test]
3400 fn test_reader_encryption_detection() {
3401 let unencrypted_pdf = create_minimal_pdf();
3403 let cursor = Cursor::new(unencrypted_pdf);
3404 let reader = PdfReader::new(cursor).unwrap();
3405 assert!(!reader.is_encrypted());
3406 assert!(reader.is_unlocked()); let encrypted_pdf = create_pdf_with_encryption();
3410 let cursor = Cursor::new(encrypted_pdf);
3411 let result = PdfReader::new(cursor);
3412 assert!(result.is_err());
3414 }
3415
3416 #[test]
3417 fn test_reader_encryption_methods_unencrypted() {
3418 let pdf_data = create_minimal_pdf();
3419 let cursor = Cursor::new(pdf_data);
3420 let mut reader = PdfReader::new(cursor).unwrap();
3421
3422 assert!(!reader.is_encrypted());
3424 assert!(reader.is_unlocked());
3425 assert!(reader.encryption_handler().is_none());
3426 assert!(reader.encryption_handler_mut().is_none());
3427
3428 assert!(reader.unlock_with_password("any_password").unwrap());
3430 assert!(reader.try_empty_password().unwrap());
3431 }
3432
3433 #[test]
3434 fn test_reader_encryption_handler_access() {
3435 let pdf_data = create_minimal_pdf();
3436 let cursor = Cursor::new(pdf_data);
3437 let mut reader = PdfReader::new(cursor).unwrap();
3438
3439 assert!(reader.encryption_handler().is_none());
3441 assert!(reader.encryption_handler_mut().is_none());
3442
3443 assert!(!reader.is_encrypted());
3445 assert!(reader.is_unlocked());
3446 }
3447
3448 #[test]
3449 fn test_reader_multiple_password_attempts() {
3450 let pdf_data = create_minimal_pdf();
3451 let cursor = Cursor::new(pdf_data);
3452 let mut reader = PdfReader::new(cursor).unwrap();
3453
3454 let passwords = vec!["test1", "test2", "admin", "", "password"];
3456 for password in passwords {
3457 assert!(reader.unlock_with_password(password).unwrap());
3458 }
3459
3460 for _ in 0..5 {
3462 assert!(reader.try_empty_password().unwrap());
3463 }
3464 }
3465
3466 #[test]
3467 fn test_reader_encryption_state_consistency() {
3468 let pdf_data = create_minimal_pdf();
3469 let cursor = Cursor::new(pdf_data);
3470 let mut reader = PdfReader::new(cursor).unwrap();
3471
3472 assert!(!reader.is_encrypted());
3474 assert!(reader.is_unlocked());
3475 assert!(reader.encryption_handler().is_none());
3476
3477 let _ = reader.unlock_with_password("test");
3479 assert!(!reader.is_encrypted());
3480 assert!(reader.is_unlocked());
3481 assert!(reader.encryption_handler().is_none());
3482
3483 let _ = reader.try_empty_password();
3484 assert!(!reader.is_encrypted());
3485 assert!(reader.is_unlocked());
3486 assert!(reader.encryption_handler().is_none());
3487 }
3488
3489 #[test]
3490 fn test_reader_encryption_error_handling() {
3491 let encrypted_pdf = create_pdf_with_encryption();
3493 let cursor = Cursor::new(encrypted_pdf);
3494
3495 let result = PdfReader::new(cursor);
3497 match result {
3498 Err(ParseError::EncryptionNotSupported) => {
3499 }
3501 Err(_) => {
3502 }
3504 Ok(_) => {
3505 panic!("Should not successfully create reader for encrypted PDF without password");
3506 }
3507 }
3508 }
3509
3510 #[test]
3511 fn test_reader_encryption_with_options() {
3512 let pdf_data = create_minimal_pdf();
3513 let cursor = Cursor::new(pdf_data);
3514
3515 let strict_options = ParseOptions::strict();
3517 let strict_reader = PdfReader::new_with_options(cursor, strict_options).unwrap();
3518 assert!(!strict_reader.is_encrypted());
3519 assert!(strict_reader.is_unlocked());
3520
3521 let pdf_data = create_minimal_pdf();
3522 let cursor = Cursor::new(pdf_data);
3523 let lenient_options = ParseOptions::lenient();
3524 let lenient_reader = PdfReader::new_with_options(cursor, lenient_options).unwrap();
3525 assert!(!lenient_reader.is_encrypted());
3526 assert!(lenient_reader.is_unlocked());
3527 }
3528
3529 #[test]
3530 fn test_reader_encryption_integration_edge_cases() {
3531 let pdf_data = create_minimal_pdf();
3532 let cursor = Cursor::new(pdf_data);
3533 let mut reader = PdfReader::new(cursor).unwrap();
3534
3535 assert!(reader.unlock_with_password("").unwrap());
3537 assert!(reader.unlock_with_password(" ").unwrap()); assert!(reader
3539 .unlock_with_password("very_long_password_that_exceeds_normal_length")
3540 .unwrap());
3541 assert!(reader.unlock_with_password("unicode_test_ñáéíóú").unwrap());
3542
3543 assert!(reader.unlock_with_password("pass@#$%^&*()").unwrap());
3545 assert!(reader.unlock_with_password("pass\nwith\nnewlines").unwrap());
3546 assert!(reader.unlock_with_password("pass\twith\ttabs").unwrap());
3547 }
3548
3549 mod rigorous {
3550 use super::*;
3551
3552 #[test]
3557 fn test_reader_invalid_pdf_header() {
3558 let invalid_data = b"This is not a PDF file";
3560 let cursor = Cursor::new(invalid_data.to_vec());
3561 let result = PdfReader::new(cursor);
3562
3563 assert!(result.is_err(), "Should fail on invalid PDF header");
3564 }
3565
3566 #[test]
3567 fn test_reader_truncated_header() {
3568 let truncated = b"%PDF";
3570 let cursor = Cursor::new(truncated.to_vec());
3571 let result = PdfReader::new(cursor);
3572
3573 assert!(result.is_err(), "Should fail on truncated header");
3574 }
3575
3576 #[test]
3577 fn test_reader_empty_file() {
3578 let empty = Vec::new();
3579 let cursor = Cursor::new(empty);
3580 let result = PdfReader::new(cursor);
3581
3582 assert!(result.is_err(), "Should fail on empty file");
3583 }
3584
3585 #[test]
3586 fn test_reader_malformed_version() {
3587 let malformed = b"%PDF-X.Y\n%%\xE2\xE3\xCF\xD3\n";
3589 let cursor = Cursor::new(malformed.to_vec());
3590 let result = PdfReader::new(cursor);
3591
3592 if let Ok(reader) = result {
3594 let _version = reader.version();
3596 }
3597 }
3598
3599 #[test]
3600 fn test_reader_get_nonexistent_object() {
3601 let pdf_data = create_minimal_pdf();
3602 let cursor = Cursor::new(pdf_data);
3603 let mut reader = PdfReader::new(cursor).unwrap();
3604
3605 let result = reader.get_object(999, 0);
3607
3608 assert!(result.is_err(), "Should fail when object doesn't exist");
3609 }
3610
3611 #[test]
3612 fn test_reader_get_object_wrong_generation() {
3613 let pdf_data = create_minimal_pdf();
3614 let cursor = Cursor::new(pdf_data);
3615 let mut reader = PdfReader::new(cursor).unwrap();
3616
3617 let result = reader.get_object(1, 99);
3619
3620 if let Err(e) = result {
3622 let _ = e;
3624 }
3625 }
3626
3627 #[test]
3632 fn test_resolve_direct_object() {
3633 let pdf_data = create_minimal_pdf();
3634 let cursor = Cursor::new(pdf_data);
3635 let mut reader = PdfReader::new(cursor).unwrap();
3636
3637 let direct_obj = PdfObject::Integer(42);
3639
3640 let resolved = reader.resolve(&direct_obj).unwrap();
3641
3642 assert_eq!(resolved, &PdfObject::Integer(42));
3644 }
3645
3646 #[test]
3647 fn test_resolve_reference() {
3648 let pdf_data = create_minimal_pdf();
3649 let cursor = Cursor::new(pdf_data);
3650 let mut reader = PdfReader::new(cursor).unwrap();
3651
3652 let pages_ref = {
3654 let catalog = reader.catalog().unwrap();
3655 if let Some(PdfObject::Reference(obj_num, gen_num)) = catalog.get("Pages") {
3656 PdfObject::Reference(*obj_num, *gen_num)
3657 } else {
3658 panic!("Catalog /Pages must be a Reference");
3659 }
3660 };
3661
3662 let resolved = reader.resolve(&pages_ref).unwrap();
3664
3665 if let PdfObject::Dictionary(dict) = resolved {
3667 assert_eq!(
3668 dict.get("Type"),
3669 Some(&PdfObject::Name(PdfName("Pages".to_string())))
3670 );
3671 } else {
3672 panic!("Expected dictionary, got: {:?}", resolved);
3673 }
3674 }
3675
3676 #[test]
3681 fn test_is_encrypted_on_unencrypted() {
3682 let pdf_data = create_minimal_pdf();
3683 let cursor = Cursor::new(pdf_data);
3684 let reader = PdfReader::new(cursor).unwrap();
3685
3686 assert!(
3687 !reader.is_encrypted(),
3688 "Minimal PDF should not be encrypted"
3689 );
3690 }
3691
3692 #[test]
3693 fn test_is_unlocked_on_unencrypted() {
3694 let pdf_data = create_minimal_pdf();
3695 let cursor = Cursor::new(pdf_data);
3696 let reader = PdfReader::new(cursor).unwrap();
3697
3698 assert!(reader.is_unlocked(), "Unencrypted PDF should be unlocked");
3700 }
3701
3702 #[test]
3703 fn test_try_empty_password_on_unencrypted() {
3704 let pdf_data = create_minimal_pdf();
3705 let cursor = Cursor::new(pdf_data);
3706 let mut reader = PdfReader::new(cursor).unwrap();
3707
3708 let result = reader.try_empty_password();
3710 assert!(result.is_ok());
3711 }
3712
3713 #[test]
3718 fn test_reader_with_strict_options() {
3719 let pdf_data = create_minimal_pdf();
3720 let cursor = Cursor::new(pdf_data);
3721
3722 let options = ParseOptions::strict();
3723 let result = PdfReader::new_with_options(cursor, options);
3724
3725 assert!(result.is_ok(), "Minimal PDF should parse in strict mode");
3726 }
3727
3728 #[test]
3729 fn test_reader_with_lenient_options() {
3730 let pdf_data = create_minimal_pdf();
3731 let cursor = Cursor::new(pdf_data);
3732
3733 let options = ParseOptions::lenient();
3734 let result = PdfReader::new_with_options(cursor, options);
3735
3736 assert!(result.is_ok(), "Minimal PDF should parse in lenient mode");
3737 }
3738
3739 #[test]
3740 fn test_reader_options_accessible() {
3741 let pdf_data = create_minimal_pdf();
3742 let cursor = Cursor::new(pdf_data);
3743
3744 let options = ParseOptions::lenient();
3745 let reader = PdfReader::new_with_options(cursor, options.clone()).unwrap();
3746
3747 let reader_options = reader.options();
3749 assert_eq!(reader_options.strict_mode, options.strict_mode);
3750 }
3751
3752 #[test]
3757 fn test_catalog_has_required_fields() {
3758 let pdf_data = create_minimal_pdf();
3759 let cursor = Cursor::new(pdf_data);
3760 let mut reader = PdfReader::new(cursor).unwrap();
3761
3762 let catalog = reader.catalog().unwrap();
3763
3764 assert_eq!(
3766 catalog.get("Type"),
3767 Some(&PdfObject::Name(PdfName("Catalog".to_string()))),
3768 "Catalog must have /Type /Catalog"
3769 );
3770
3771 assert!(
3773 catalog.contains_key("Pages"),
3774 "Catalog must have /Pages entry"
3775 );
3776 }
3777
3778 #[test]
3779 fn test_info_fields_when_present() {
3780 let pdf_data = create_pdf_with_info();
3781 let cursor = Cursor::new(pdf_data);
3782 let mut reader = PdfReader::new(cursor).unwrap();
3783
3784 let info = reader.info().unwrap();
3785 assert!(info.is_some(), "PDF should have Info dictionary");
3786
3787 let info_dict = info.unwrap();
3788
3789 assert!(info_dict.contains_key("Title"), "Info should have Title");
3791 assert!(info_dict.contains_key("Author"), "Info should have Author");
3792 }
3793
3794 #[test]
3795 fn test_info_none_when_absent() {
3796 let pdf_data = create_minimal_pdf();
3797 let cursor = Cursor::new(pdf_data);
3798 let mut reader = PdfReader::new(cursor).unwrap();
3799
3800 let info = reader.info().unwrap();
3801 assert!(info.is_none(), "Minimal PDF should not have Info");
3802 }
3803
3804 #[test]
3809 fn test_version_exact_values() {
3810 let pdf_data = create_pdf_with_version("1.7");
3811 let cursor = Cursor::new(pdf_data);
3812 let reader = PdfReader::new(cursor).unwrap();
3813
3814 let version = reader.version();
3815 assert_eq!(version.major, 1, "Major version must be exact");
3816 assert_eq!(version.minor, 7, "Minor version must be exact");
3817 }
3818
3819 #[test]
3820 fn test_version_pdf_20() {
3821 let pdf_data = create_pdf_with_version("2.0");
3822 let cursor = Cursor::new(pdf_data);
3823 let reader = PdfReader::new(cursor).unwrap();
3824
3825 let version = reader.version();
3826 assert_eq!(version.major, 2, "PDF 2.0 major version");
3827 assert_eq!(version.minor, 0, "PDF 2.0 minor version");
3828 }
3829
3830 #[test]
3835 fn test_pages_returns_pages_dict() {
3836 let pdf_data = create_minimal_pdf();
3837 let cursor = Cursor::new(pdf_data);
3838 let mut reader = PdfReader::new(cursor).unwrap();
3839
3840 let pages_dict = reader
3841 .pages()
3842 .expect("pages() must return Pages dictionary");
3843
3844 assert_eq!(
3845 pages_dict.get("Type"),
3846 Some(&PdfObject::Name(PdfName("Pages".to_string()))),
3847 "Pages dict must have /Type /Pages"
3848 );
3849 }
3850
3851 #[test]
3852 fn test_page_count_minimal_pdf() {
3853 let pdf_data = create_minimal_pdf();
3854 let cursor = Cursor::new(pdf_data);
3855 let mut reader = PdfReader::new(cursor).unwrap();
3856
3857 let count = reader.page_count().expect("page_count() must succeed");
3858 assert_eq!(count, 0, "Minimal PDF has 0 pages");
3859 }
3860
3861 #[test]
3862 fn test_page_count_with_info_pdf() {
3863 let pdf_data = create_pdf_with_info();
3864 let cursor = Cursor::new(pdf_data);
3865 let mut reader = PdfReader::new(cursor).unwrap();
3866
3867 let count = reader.page_count().expect("page_count() must succeed");
3868 assert_eq!(count, 0, "create_pdf_with_info() has Count 0 in Pages dict");
3869 }
3870
3871 #[test]
3876 fn test_metadata_minimal_pdf() {
3877 let pdf_data = create_minimal_pdf();
3878 let cursor = Cursor::new(pdf_data);
3879 let mut reader = PdfReader::new(cursor).unwrap();
3880
3881 let meta = reader.metadata().expect("metadata() must succeed");
3882
3883 assert!(meta.title.is_none(), "Minimal PDF has no title");
3885 assert!(meta.author.is_none(), "Minimal PDF has no author");
3886 }
3887
3888 #[test]
3889 fn test_metadata_with_info() {
3890 let pdf_data = create_pdf_with_info();
3891 let cursor = Cursor::new(pdf_data);
3892 let mut reader = PdfReader::new(cursor).unwrap();
3893
3894 let meta = reader.metadata().expect("metadata() must succeed");
3895
3896 assert!(meta.title.is_some(), "PDF with Info has title");
3897 assert_eq!(meta.title.unwrap(), "Test PDF", "Title must match");
3898 assert!(meta.author.is_some(), "PDF with Info has author");
3899 assert_eq!(meta.author.unwrap(), "Test Author", "Author must match");
3900 }
3901
3902 #[test]
3907 fn test_resolve_stream_length_direct_integer() {
3908 let pdf_data = create_minimal_pdf();
3909 let cursor = Cursor::new(pdf_data);
3910 let mut reader = PdfReader::new(cursor).unwrap();
3911
3912 let length_obj = PdfObject::Integer(100);
3914
3915 let length = reader
3916 .resolve_stream_length(&length_obj)
3917 .expect("resolve_stream_length must succeed");
3918 assert_eq!(length, Some(100), "Direct integer must be resolved");
3919 }
3920
3921 #[test]
3922 fn test_resolve_stream_length_negative_integer() {
3923 let pdf_data = create_minimal_pdf();
3924 let cursor = Cursor::new(pdf_data);
3925 let mut reader = PdfReader::new(cursor).unwrap();
3926
3927 let length_obj = PdfObject::Integer(-10);
3929
3930 let length = reader
3931 .resolve_stream_length(&length_obj)
3932 .expect("resolve_stream_length must succeed");
3933 assert_eq!(length, None, "Negative integer returns None");
3934 }
3935
3936 #[test]
3937 fn test_resolve_stream_length_non_integer() {
3938 let pdf_data = create_minimal_pdf();
3939 let cursor = Cursor::new(pdf_data);
3940 let mut reader = PdfReader::new(cursor).unwrap();
3941
3942 let name_obj = PdfObject::Name(PdfName("Test".to_string()));
3944
3945 let length = reader
3946 .resolve_stream_length(&name_obj)
3947 .expect("resolve_stream_length must succeed");
3948 assert_eq!(length, None, "Non-integer object returns None");
3949 }
3950
3951 #[test]
3956 fn test_get_all_pages_empty_pdf() {
3957 let pdf_data = create_minimal_pdf();
3958 let cursor = Cursor::new(pdf_data);
3959 let mut reader = PdfReader::new(cursor).unwrap();
3960
3961 let pages = reader
3962 .get_all_pages()
3963 .expect("get_all_pages() must succeed");
3964 assert_eq!(pages.len(), 0, "Minimal PDF has 0 pages");
3965 }
3966
3967 #[test]
3968 fn test_get_all_pages_with_info() {
3969 let pdf_data = create_pdf_with_info();
3970 let cursor = Cursor::new(pdf_data);
3971 let mut reader = PdfReader::new(cursor).unwrap();
3972
3973 let pages = reader
3974 .get_all_pages()
3975 .expect("get_all_pages() must succeed");
3976 assert_eq!(
3977 pages.len(),
3978 0,
3979 "create_pdf_with_info() has 0 pages (Count 0)"
3980 );
3981 }
3982
3983 #[test]
3988 fn test_into_document_consumes_reader() {
3989 let pdf_data = create_minimal_pdf();
3990 let cursor = Cursor::new(pdf_data);
3991 let reader = PdfReader::new(cursor).unwrap();
3992
3993 let document = reader.into_document();
3994
3995 let version = document.version().expect("Document must have version");
3997 assert!(
3998 version.starts_with("1."),
3999 "Document must have PDF 1.x version, got: {}",
4000 version
4001 );
4002
4003 let page_count = document
4005 .page_count()
4006 .expect("Document must allow page_count()");
4007 assert_eq!(
4008 page_count, 0,
4009 "Minimal PDF has 0 pages (Count 0 in test helper)"
4010 );
4011 }
4012
4013 #[test]
4018 fn test_clear_parse_context() {
4019 let pdf_data = create_minimal_pdf();
4020 let cursor = Cursor::new(pdf_data);
4021 let mut reader = PdfReader::new(cursor).unwrap();
4022
4023 reader.clear_parse_context();
4025
4026 let version = reader.version();
4028 assert_eq!(version.major, 1, "Reader must still work after clear");
4029 }
4030
4031 #[test]
4032 fn test_parse_context_mut_accessible() {
4033 let pdf_data = create_minimal_pdf();
4034 let cursor = Cursor::new(pdf_data);
4035 let mut reader = PdfReader::new(cursor).unwrap();
4036
4037 let context = reader.parse_context_mut();
4038
4039 let initial_depth = context.depth;
4041 assert_eq!(initial_depth, 0, "Parse context must start with depth 0");
4042
4043 assert!(
4045 context.max_depth > 0,
4046 "Parse context must have positive max_depth"
4047 );
4048 }
4049
4050 #[test]
4055 fn test_find_bytes_basic() {
4056 let haystack = b"Hello World";
4057 let needle = b"World";
4058 let pos = find_bytes(haystack, needle);
4059 assert_eq!(pos, Some(6), "Must find 'World' at position 6");
4060 }
4061
4062 #[test]
4063 fn test_find_bytes_not_found() {
4064 let haystack = b"Hello World";
4065 let needle = b"Rust";
4066 let pos = find_bytes(haystack, needle);
4067 assert_eq!(pos, None, "Must return None when not found");
4068 }
4069
4070 #[test]
4071 fn test_find_bytes_at_start() {
4072 let haystack = b"Hello World";
4073 let needle = b"Hello";
4074 let pos = find_bytes(haystack, needle);
4075 assert_eq!(pos, Some(0), "Must find at position 0");
4076 }
4077
4078 #[test]
4079 fn test_is_immediate_stream_start_with_stream() {
4080 let data = b"stream\ndata";
4081 assert!(
4082 is_immediate_stream_start(data),
4083 "Must detect 'stream' at start"
4084 );
4085 }
4086
4087 #[test]
4088 fn test_is_immediate_stream_start_with_whitespace() {
4089 let data = b" \n\tstream\ndata";
4090 assert!(
4091 is_immediate_stream_start(data),
4092 "Must detect 'stream' after whitespace"
4093 );
4094 }
4095
4096 #[test]
4097 fn test_is_immediate_stream_start_no_stream() {
4098 let data = b"endobj";
4099 assert!(
4100 !is_immediate_stream_start(data),
4101 "Must return false when 'stream' absent"
4102 );
4103 }
4104 }
4105}