1use super::encryption_handler::EncryptionHandler;
6use super::header::PdfHeader;
7use super::object_stream::ObjectStream;
8use super::objects::{PdfDictionary, PdfObject};
9use super::stack_safe::StackSafeContext;
10use super::trailer::PdfTrailer;
11use super::xref::XRefTable;
12use super::{ParseError, ParseResult};
13use std::collections::HashMap;
14use std::fs::File;
15use std::io::{BufReader, Read, Seek, SeekFrom};
16use std::path::Path;
17
/// Returns the byte offset of the first occurrence of `needle` in `haystack`.
///
/// An empty `needle` matches trivially at offset 0. (The previous
/// implementation panicked in that case, because `slice::windows` requires a
/// non-zero window size.) Returns `None` when `needle` is longer than
/// `haystack` or simply absent.
fn find_bytes(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    if needle.is_empty() {
        // `windows(0)` panics; treat the empty needle as present at the start.
        return Some(0);
    }
    haystack
        .windows(needle.len())
        .position(|window| window == needle)
}
24
/// Reports whether `data`, after any leading PDF whitespace (space, tab, CR,
/// LF), begins with the keyword `stream`.
fn is_immediate_stream_start(data: &[u8]) -> bool {
    // Index of the first byte that is not whitespace; if the slice is all
    // whitespace this is `data.len()` and the remainder is empty.
    let first_non_ws = data
        .iter()
        .position(|&b| !matches!(b, b' ' | b'\t' | b'\n' | b'\r'))
        .unwrap_or(data.len());
    data[first_non_ws..].starts_with(b"stream")
}
37
/// Pull-based reader over a seekable PDF byte source.
pub struct PdfReader<R: Read + Seek> {
    // Buffered access to the underlying file/stream.
    reader: BufReader<R>,
    // Parsed %PDF-x.y header.
    header: PdfHeader,
    // Cross-reference table mapping object numbers to byte offsets.
    xref: XRefTable,
    // Trailer dictionary (Root, Info, Encrypt, ID, ...).
    trailer: PdfTrailer,
    // Cache of already-parsed indirect objects, keyed by (number, generation).
    object_cache: HashMap<(u32, u16), PdfObject>,
    // Cache of decoded object streams, keyed by the stream's object number.
    object_stream_cache: HashMap<u32, ObjectStream>,
    // Lazily built page tree, populated on demand.
    page_tree: Option<super::page_tree::PageTree>,
    // Recursion guard used while parsing nested objects.
    parse_context: StackSafeContext,
    // Strict/lenient parsing configuration.
    options: super::ParseOptions,
    // Present when the document declares an /Encrypt dictionary.
    encryption_handler: Option<EncryptionHandler>,
    // Object numbers currently being loaded/reconstructed, used to detect
    // circular references. NOTE(review): a Mutex on an API that takes
    // &mut self looks redundant — presumably kept for future shared
    // access; confirm.
    objects_being_reconstructed: std::sync::Mutex<std::collections::HashSet<u32>>,
    // Upper bound on simultaneous load/reconstruction nesting.
    max_reconstruction_depth: u32,
}
61
62impl<R: Read + Seek> PdfReader<R> {
63 pub fn options(&self) -> &super::ParseOptions {
65 &self.options
66 }
67
68 pub fn is_encrypted(&self) -> bool {
70 self.encryption_handler.is_some()
71 }
72
73 pub fn is_unlocked(&self) -> bool {
75 match &self.encryption_handler {
76 Some(handler) => handler.is_unlocked(),
77 None => true, }
79 }
80
81 pub fn encryption_handler_mut(&mut self) -> Option<&mut EncryptionHandler> {
83 self.encryption_handler.as_mut()
84 }
85
86 pub fn encryption_handler(&self) -> Option<&EncryptionHandler> {
88 self.encryption_handler.as_ref()
89 }
90
91 pub fn unlock_with_password(&mut self, password: &str) -> ParseResult<bool> {
93 match &mut self.encryption_handler {
94 Some(handler) => {
95 if handler.unlock_with_user_password(password).unwrap_or(false) {
97 Ok(true)
98 } else {
99 Ok(handler
101 .unlock_with_owner_password(password)
102 .unwrap_or(false))
103 }
104 }
105 None => Ok(true), }
107 }
108
109 pub fn try_empty_password(&mut self) -> ParseResult<bool> {
111 match &mut self.encryption_handler {
112 Some(handler) => Ok(handler.try_empty_password().unwrap_or(false)),
113 None => Ok(true), }
115 }
116}
117
118impl PdfReader<File> {
119 pub fn open<P: AsRef<Path>>(path: P) -> ParseResult<Self> {
121 use std::io::Write;
122 let mut debug_file = std::fs::File::create("/tmp/pdf_open_debug.log").ok();
123 if let Some(ref mut f) = debug_file {
124 writeln!(f, "Opening file: {:?}", path.as_ref()).ok();
125 }
126 let file = File::open(path)?;
127 if let Some(ref mut f) = debug_file {
128 writeln!(f, "File opened successfully").ok();
129 }
130 let options = super::ParseOptions::lenient();
132 Self::new_with_options(file, options)
133 }
134
135 pub fn open_strict<P: AsRef<Path>>(path: P) -> ParseResult<Self> {
137 let file = File::open(path)?;
138 let options = super::ParseOptions::strict();
139 Self::new_with_options(file, options)
140 }
141
142 pub fn open_with_options<P: AsRef<Path>>(
144 path: P,
145 options: super::ParseOptions,
146 ) -> ParseResult<Self> {
147 let file = File::open(path)?;
148 Self::new_with_options(file, options)
149 }
150
151 pub fn open_document<P: AsRef<Path>>(
153 path: P,
154 ) -> ParseResult<super::document::PdfDocument<File>> {
155 let reader = Self::open(path)?;
156 Ok(reader.into_document())
157 }
158}
159
160impl<R: Read + Seek> PdfReader<R> {
    /// Create a reader over `reader` using the default parse options.
    pub fn new(reader: R) -> ParseResult<Self> {
        Self::new_with_options(reader, super::ParseOptions::default())
    }
165
166 pub fn new_with_options(reader: R, options: super::ParseOptions) -> ParseResult<Self> {
168 let mut buf_reader = BufReader::new(reader);
169
170 let start_pos = buf_reader.stream_position()?;
172 buf_reader.seek(SeekFrom::End(0))?;
173 let file_size = buf_reader.stream_position()?;
174 buf_reader.seek(SeekFrom::Start(start_pos))?;
175
176 if file_size == 0 {
177 return Err(ParseError::EmptyFile);
178 }
179
180 use std::io::Write;
182 let mut debug_file = std::fs::File::create("/tmp/pdf_debug.log").ok();
183 if let Some(ref mut f) = debug_file {
184 writeln!(f, "Parsing PDF header...").ok();
185 }
186 let header = PdfHeader::parse(&mut buf_reader)?;
187 if let Some(ref mut f) = debug_file {
188 writeln!(f, "Header parsed: version {}", header.version).ok();
189 }
190
191 if let Some(ref mut f) = debug_file {
193 writeln!(f, "Parsing XRef table...").ok();
194 }
195 let xref = XRefTable::parse_with_options(&mut buf_reader, &options)?;
196 if let Some(ref mut f) = debug_file {
197 writeln!(f, "XRef table parsed with {} entries", xref.len()).ok();
198 }
199
200 let trailer_dict = xref.trailer().ok_or(ParseError::InvalidTrailer)?.clone();
202
203 let xref_offset = xref.xref_offset();
204 let trailer = PdfTrailer::from_dict(trailer_dict, xref_offset)?;
205
206 trailer.validate()?;
208
209 let encryption_handler = if EncryptionHandler::detect_encryption(trailer.dict()) {
211 if let Ok(Some((encrypt_obj_num, encrypt_gen_num))) = trailer.encrypt() {
212 let mut temp_reader = Self {
214 reader: buf_reader,
215 header: header.clone(),
216 xref: xref.clone(),
217 trailer: trailer.clone(),
218 object_cache: HashMap::new(),
219 object_stream_cache: HashMap::new(),
220 page_tree: None,
221 parse_context: StackSafeContext::new(),
222 options: options.clone(),
223 encryption_handler: None,
224 objects_being_reconstructed: std::sync::Mutex::new(
225 std::collections::HashSet::new(),
226 ),
227 max_reconstruction_depth: 100,
228 };
229
230 let encrypt_obj = temp_reader.get_object(encrypt_obj_num, encrypt_gen_num)?;
232 if let Some(encrypt_dict) = encrypt_obj.as_dict() {
233 let file_id = trailer.id().and_then(|id_obj| {
235 if let PdfObject::Array(ref id_array) = id_obj {
236 if let Some(PdfObject::String(ref id_bytes)) = id_array.get(0) {
237 Some(id_bytes.as_bytes().to_vec())
238 } else {
239 None
240 }
241 } else {
242 None
243 }
244 });
245
246 match EncryptionHandler::new(encrypt_dict, file_id) {
247 Ok(handler) => {
248 buf_reader = temp_reader.reader;
250 Some(handler)
251 }
252 Err(_) => {
253 let _ = temp_reader.reader;
255 return Err(ParseError::EncryptionNotSupported);
256 }
257 }
258 } else {
259 let _ = temp_reader.reader;
260 return Err(ParseError::EncryptionNotSupported);
261 }
262 } else {
263 return Err(ParseError::EncryptionNotSupported);
264 }
265 } else {
266 None
267 };
268
269 Ok(Self {
270 reader: buf_reader,
271 header,
272 xref,
273 trailer,
274 object_cache: HashMap::new(),
275 object_stream_cache: HashMap::new(),
276 page_tree: None,
277 parse_context: StackSafeContext::new(),
278 options,
279 encryption_handler,
280 objects_being_reconstructed: std::sync::Mutex::new(std::collections::HashSet::new()),
281 max_reconstruction_depth: 100,
282 })
283 }
284
    /// PDF specification version declared in the file header.
    pub fn version(&self) -> &super::header::PdfVersion {
        &self.header.version
    }
289
    /// Return the document catalog (/Root) dictionary.
    ///
    /// Recovery strategy, in order:
    /// 1. Follow the trailer /Root reference; if it resolves to a dictionary
    ///    whose /Type is not Catalog, scan the file for the real catalog.
    /// 2. If the trailer has no Root, try the trailer's own fallback, then a
    ///    full scan via `find_catalog_object`.
    /// 3. If the chosen object cannot be parsed as a dictionary, attempt a
    ///    manual byte-level extraction and cache the reconstructed result.
    pub fn catalog(&mut self) -> ParseResult<&PdfDictionary> {
        let (obj_num, gen_num) = match self.trailer.root() {
            Ok(root) => {
                // Verify the referenced object actually looks like a catalog;
                // on any failure to inspect it, keep the reference as-is.
                if let Ok(obj) = self.get_object(root.0, root.1) {
                    if let Some(dict) = obj.as_dict() {
                        if let Some(type_obj) = dict.get("Type") {
                            if let Some(type_name) = type_obj.as_name() {
                                if type_name.0 != "Catalog" {
                                    eprintln!("Warning: Trailer /Root points to /Type/{} (not Catalog), scanning for real catalog", type_name.0);
                                    if let Ok(catalog_ref) = self.find_catalog_object() {
                                        catalog_ref
                                    } else {
                                        root
                                    }
                                } else {
                                    root
                                }
                            } else {
                                root
                            }
                        } else {
                            root
                        }
                    } else {
                        root
                    }
                } else {
                    root
                }
            }
            Err(_) => {
                #[cfg(debug_assertions)]
                eprintln!("Warning: Trailer missing Root entry, attempting recovery");

                // No Root in the trailer: try the trailer's own fallback
                // before scanning every object for a catalog.
                if let Some(root) = self.trailer.find_root_fallback() {
                    root
                } else {
                    if let Ok(catalog_ref) = self.find_catalog_object() {
                        catalog_ref
                    } else {
                        return Err(ParseError::MissingKey("Root".to_string()));
                    }
                }
            }
        };

        let key = (obj_num, gen_num);
        // Probe first in a separate scope so no cache borrow is held while
        // deciding whether manual reconstruction is required.
        let needs_reconstruction = {
            match self.get_object(obj_num, gen_num) {
                Ok(catalog) => {
                    if catalog.as_dict().is_some() {
                        false
                    } else {
                        true
                    }
                }
                Err(_) => {
                    true
                }
            }
        };

        if !needs_reconstruction {
            let catalog = self.get_object(obj_num, gen_num)?;
            return catalog.as_dict().ok_or_else(|| ParseError::SyntaxError {
                position: 0,
                message: format!("Catalog object {} {} is not a dictionary", obj_num, gen_num),
            });
        }

        // Last resort: extract the raw dictionary bytes and register the
        // result in both the object cache and the xref table.
        match self.extract_object_manually(obj_num) {
            Ok(dict) => {
                let obj = PdfObject::Dictionary(dict);
                self.object_cache.insert(key, obj);

                use crate::parser::xref::XRefEntry;
                // offset 0 is a placeholder; the object now lives only in
                // the cache, not at any real file position.
                let xref_entry = XRefEntry {
                    offset: 0, generation: gen_num,
                    in_use: true,
                };
                self.xref.add_entry(obj_num, xref_entry);

                if let Some(PdfObject::Dictionary(ref dict)) = self.object_cache.get(&key) {
                    return Ok(dict);
                }
            }
            Err(_e) => {}
        }

        Err(ParseError::SyntaxError {
            position: 0,
            message: format!(
                "Catalog object {} could not be parsed or reconstructed as a dictionary",
                obj_num
            ),
        })
    }
409
410 pub fn info(&mut self) -> ParseResult<Option<&PdfDictionary>> {
412 match self.trailer.info() {
413 Some((obj_num, gen_num)) => {
414 let info = self.get_object(obj_num, gen_num)?;
415 Ok(info.as_dict())
416 }
417 None => Ok(None),
418 }
419 }
420
    /// Fetch an indirect object by number/generation, loading and caching it
    /// on first access.
    ///
    /// Guards against circular references (an object already mid-load further
    /// up the call stack is cached as `PdfObject::Null`) and against
    /// unbounded nesting (errors past `max_reconstruction_depth`).
    pub fn get_object(&mut self, obj_num: u32, gen_num: u16) -> ParseResult<&PdfObject> {
        let key = (obj_num, gen_num);

        if self.object_cache.contains_key(&key) {
            return Ok(&self.object_cache[&key]);
        }

        // Circular-reference check: break the cycle by caching Null.
        {
            let being_loaded =
                self.objects_being_reconstructed
                    .lock()
                    .map_err(|_| ParseError::SyntaxError {
                        position: 0,
                        message: "Mutex poisoned during circular reference check".to_string(),
                    })?;
            if being_loaded.contains(&obj_num) {
                drop(being_loaded);
                // Leftover warning hook; intentionally empty.
                if self.options.collect_warnings {}
                self.object_cache.insert(key, PdfObject::Null);
                return Ok(&self.object_cache[&key]);
            }
        }

        // Depth limit: the size of the in-flight set approximates nesting depth.
        {
            let being_loaded =
                self.objects_being_reconstructed
                    .lock()
                    .map_err(|_| ParseError::SyntaxError {
                        position: 0,
                        message: "Mutex poisoned during depth limit check".to_string(),
                    })?;
            let depth = being_loaded.len() as u32;
            if depth >= self.max_reconstruction_depth {
                drop(being_loaded);
                if self.options.collect_warnings {}
                return Err(ParseError::SyntaxError {
                    position: 0,
                    message: format!(
                        "Maximum object loading depth ({}) exceeded",
                        self.max_reconstruction_depth
                    ),
                });
            }
        }

        // Mark the object as in-flight for the duration of the disk load.
        self.objects_being_reconstructed
            .lock()
            .map_err(|_| ParseError::SyntaxError {
                position: 0,
                message: "Mutex poisoned while marking object as being loaded".to_string(),
            })?
            .insert(obj_num);

        match self.load_object_from_disk(obj_num, gen_num) {
            Ok(_) => {
                self.objects_being_reconstructed
                    .lock()
                    .map_err(|_| ParseError::SyntaxError {
                        position: 0,
                        message: "Mutex poisoned while unmarking object after successful load"
                            .to_string(),
                    })?
                    .remove(&obj_num);
                Ok(&self.object_cache[&key])
            }
            Err(e) => {
                // Best-effort unmark: a poisoned mutex here is ignored so the
                // original load error is the one reported.
                if let Ok(mut guard) = self.objects_being_reconstructed.lock() {
                    guard.remove(&obj_num);
                }
                Err(e)
            }
        }
    }
504
    /// Parse an object directly from its xref offset (or delegate to the
    /// object-stream path for compressed entries) and cache the result.
    ///
    /// In lenient mode, structural mismatches (wrong object number, wrong
    /// generation, missing `obj`/`endobj` keywords) are downgraded to
    /// warnings; in strict mode they are errors.
    fn load_object_from_disk(&mut self, obj_num: u32, gen_num: u16) -> ParseResult<&PdfObject> {
        let key = (obj_num, gen_num);

        if self.object_cache.contains_key(&key) {
            return Ok(&self.object_cache[&key]);
        }

        // Compressed objects live inside object streams and take a
        // completely different load path.
        if let Some(ext_entry) = self.xref.get_extended_entry(obj_num) {
            if let Some((stream_obj_num, index_in_stream)) = ext_entry.compressed_info {
                return self.get_compressed_object(
                    obj_num,
                    gen_num,
                    stream_obj_num,
                    index_in_stream,
                );
            }
        } else {
        }

        // Locate the object's byte offset via the plain xref table.
        let (current_offset, _generation) = {
            let entry = self.xref.get_entry(obj_num);

            match entry {
                Some(entry) => {
                    // Free (deleted) objects resolve to Null.
                    if !entry.in_use {
                        self.object_cache.insert(key, PdfObject::Null);
                        return Ok(&self.object_cache[&key]);
                    }

                    if entry.generation != gen_num {
                        if self.options.lenient_syntax {
                            if self.options.collect_warnings {
                                eprintln!("Warning: Object {} generation mismatch - expected {}, found {}, using available",
                                    obj_num, gen_num, entry.generation);
                            }
                        } else {
                            return Err(ParseError::InvalidReference(obj_num, gen_num));
                        }
                    }

                    (entry.offset, entry.generation)
                }
                None => {
                    // Not in the xref at all: either reconstruct a known
                    // object manually, or (lenient mode) substitute Null.
                    if self.is_reconstructible_object(obj_num) {
                        return self.attempt_manual_object_reconstruction(obj_num, gen_num, 0);
                    } else {
                        if self.options.lenient_syntax {
                            if self.options.collect_warnings {
                                eprintln!("Warning: Object {} {} R not found in XRef, returning null object",
                                    obj_num, gen_num);
                            }
                            self.object_cache.insert(key, PdfObject::Null);
                            return Ok(&self.object_cache[&key]);
                        } else {
                            return Err(ParseError::InvalidReference(obj_num, gen_num));
                        }
                    }
                }
            }
        };

        self.reader.seek(std::io::SeekFrom::Start(current_offset))?;

        let mut lexer =
            super::lexer::Lexer::new_with_options(&mut self.reader, self.options.clone());

        // Validate the "N G obj" preamble that precedes every indirect object.
        {
            let token = lexer.next_token()?;
            let read_obj_num = match token {
                super::lexer::Token::Integer(n) => n as u32,
                _ => {
                    if self.options.lenient_syntax {
                        if self.options.collect_warnings {
                            eprintln!(
                                "Warning: Using expected object number {obj_num} instead of parsed token: {:?}",
                                token
                            );
                        }
                        obj_num
                    } else {
                        return Err(ParseError::SyntaxError {
                            position: current_offset as usize,
                            message: "Expected object number".to_string(),
                        });
                    }
                }
            };

            if read_obj_num != obj_num && !self.options.lenient_syntax {
                return Err(ParseError::SyntaxError {
                    position: current_offset as usize,
                    message: format!(
                        "Object number mismatch: expected {obj_num}, found {read_obj_num}"
                    ),
                });
            }

            let token = lexer.next_token()?;
            let _read_gen_num = match token {
                super::lexer::Token::Integer(n) => n as u16,
                _ => {
                    if self.options.lenient_syntax {
                        if self.options.collect_warnings {
                            eprintln!("Warning: Using generation 0 instead of parsed token for object {obj_num}");
                        }
                        0
                    } else {
                        return Err(ParseError::SyntaxError {
                            position: current_offset as usize,
                            message: "Expected generation number".to_string(),
                        });
                    }
                }
            };

            let token = lexer.next_token()?;
            match token {
                super::lexer::Token::Obj => {}
                _ => {
                    if self.options.lenient_syntax {
                        if self.options.collect_warnings {
                            eprintln!("Warning: Expected 'obj' keyword for object {obj_num} {gen_num}, continuing anyway");
                        }
                    } else {
                        return Err(ParseError::SyntaxError {
                            position: current_offset as usize,
                            message: "Expected 'obj' keyword".to_string(),
                        });
                    }
                }
            }
        }

        // Recursion guard around the actual object body parse.
        self.parse_context.enter()?;

        let obj = match PdfObject::parse_with_options(&mut lexer, &self.options) {
            Ok(obj) => {
                self.parse_context.exit();
                // Leftover debug hook for a specific object; intentionally empty.
                if obj_num == 102 && self.options.collect_warnings {}
                obj
            }
            Err(e) => {
                self.parse_context.exit();

                // A parse failure on a known object may still be recoverable
                // via manual byte-level reconstruction.
                if self.is_reconstructible_object(obj_num)
                    && self.can_attempt_manual_reconstruction(&e)
                {
                    match self.attempt_manual_object_reconstruction(
                        obj_num,
                        gen_num,
                        current_offset,
                    ) {
                        Ok(reconstructed_obj) => {
                            return Ok(reconstructed_obj);
                        }
                        Err(_reconstruction_error) => {}
                    }
                }

                return Err(e);
            }
        };

        // Expect the closing keyword; its absence is tolerated in lenient mode.
        let token = lexer.next_token()?;
        match token {
            super::lexer::Token::EndObj => {}
            _ => {
                if self.options.lenient_syntax {
                    if self.options.collect_warnings {
                        eprintln!("Warning: Expected 'endobj' keyword after object {obj_num} {gen_num}, continuing anyway");
                    }
                } else {
                    return Err(ParseError::SyntaxError {
                        position: current_offset as usize,
                        message: "Expected 'endobj' keyword".to_string(),
                    });
                }
            }
        };

        self.object_cache.insert(key, obj);

        Ok(&self.object_cache[&key])
    }
716
717 pub fn resolve<'a>(&'a mut self, obj: &'a PdfObject) -> ParseResult<&'a PdfObject> {
719 match obj {
720 PdfObject::Reference(obj_num, gen_num) => self.get_object(*obj_num, *gen_num),
721 _ => Ok(obj),
722 }
723 }
724
725 pub fn resolve_stream_length(&mut self, obj: &PdfObject) -> ParseResult<Option<usize>> {
728 match obj {
729 PdfObject::Integer(len) => {
730 if *len >= 0 {
731 Ok(Some(*len as usize))
732 } else {
733 Ok(None)
735 }
736 }
737 PdfObject::Reference(obj_num, gen_num) => {
738 let resolved = self.get_object(*obj_num, *gen_num)?;
739 match resolved {
740 PdfObject::Integer(len) => {
741 if *len >= 0 {
742 Ok(Some(*len as usize))
743 } else {
744 Ok(None)
745 }
746 }
747 _ => {
748 Ok(None)
750 }
751 }
752 }
753 _ => {
754 Ok(None)
756 }
757 }
758 }
759
    /// Load an object that lives inside an object stream.
    ///
    /// The containing stream (object `stream_obj_num`) is fetched, decoded
    /// and cached on first use; the target object is then looked up by
    /// number inside it and cloned into the main object cache.
    ///
    /// NOTE(review): `_index_in_stream` from the xref entry is unused — the
    /// lookup relies on the stream's own object-number index instead;
    /// confirm this is intentional.
    fn get_compressed_object(
        &mut self,
        obj_num: u32,
        gen_num: u16,
        stream_obj_num: u32,
        _index_in_stream: u32,
    ) -> ParseResult<&PdfObject> {
        let key = (obj_num, gen_num);

        // Decode and cache the containing object stream once.
        if !self.object_stream_cache.contains_key(&stream_obj_num) {
            let stream_obj = self.get_object(stream_obj_num, 0)?;

            if let Some(stream) = stream_obj.as_stream() {
                let obj_stream = ObjectStream::parse(stream.clone(), &self.options)?;
                self.object_stream_cache.insert(stream_obj_num, obj_stream);
            } else {
                return Err(ParseError::SyntaxError {
                    position: 0,
                    message: format!("Object {stream_obj_num} is not a stream"),
                });
            }
        }

        let obj_stream = &self.object_stream_cache[&stream_obj_num];
        let obj = obj_stream
            .get_object(obj_num)
            .ok_or_else(|| ParseError::SyntaxError {
                position: 0,
                message: format!("Object {obj_num} not found in object stream {stream_obj_num}"),
            })?;

        self.object_cache.insert(key, obj.clone());
        Ok(&self.object_cache[&key])
    }
800
    /// Return the root /Pages node of the page tree.
    ///
    /// Tolerates several classes of corruption: a missing /Pages entry in
    /// the catalog (synthesizes a Pages dictionary from individual page
    /// objects, or scans for a /Type /Pages object in lenient mode), a
    /// /Pages reference that needs double resolution, and a reference that
    /// does not resolve to a dictionary.
    pub fn pages(&mut self) -> ParseResult<&PdfDictionary> {
        let (pages_obj_num, pages_gen_num) = {
            let catalog = self.catalog()?;

            if let Some(pages_ref) = catalog.get("Pages") {
                match pages_ref {
                    PdfObject::Reference(obj_num, gen_num) => (*obj_num, *gen_num),
                    _ => {
                        return Err(ParseError::SyntaxError {
                            position: 0,
                            message: "Pages must be a reference".to_string(),
                        })
                    }
                }
            } else {
                #[cfg(debug_assertions)]
                eprintln!("Warning: Catalog missing Pages entry, attempting recovery");

                // Recovery path 1: collect individual page objects and
                // synthesize a Pages dictionary around them.
                if let Ok(page_refs) = self.find_page_objects() {
                    if !page_refs.is_empty() {
                        return self.create_synthetic_pages_dict(&page_refs);
                    }
                }

                // Recovery path 2 (lenient only): brute-force scan the xref
                // for any object whose /Type is Pages.
                if self.options.lenient_syntax {
                    if self.options.collect_warnings {
                        eprintln!("Warning: Missing Pages in catalog, searching for page tree");
                    }
                    let mut found_pages = None;
                    for i in 1..self.xref.len() as u32 {
                        if let Ok(obj) = self.get_object(i, 0) {
                            if let Some(dict) = obj.as_dict() {
                                if let Some(obj_type) = dict.get("Type").and_then(|t| t.as_name()) {
                                    if obj_type.0 == "Pages" {
                                        found_pages = Some((i, 0));
                                        break;
                                    }
                                }
                            }
                        }
                    }
                    if let Some((obj_num, gen_num)) = found_pages {
                        (obj_num, gen_num)
                    } else {
                        return Err(ParseError::MissingKey("Pages".to_string()));
                    }
                } else {
                    return Err(ParseError::MissingKey("Pages".to_string()));
                }
            }
        };

        // Some broken files chain references (ref -> ref -> dict); follow
        // one extra hop if the first resolution yields another reference.
        let needs_double_resolve = {
            let pages_obj = self.get_object(pages_obj_num, pages_gen_num)?;
            pages_obj.as_reference()
        };

        let (final_obj_num, final_gen_num) =
            if let Some((ref_obj_num, ref_gen_num)) = needs_double_resolve {
                (ref_obj_num, ref_gen_num)
            } else {
                (pages_obj_num, pages_gen_num)
            };

        let actual_pages_num = {
            // Check in a narrow scope so no borrow outlives the decision.
            let is_valid_dict = {
                let pages_obj = self.get_object(final_obj_num, final_gen_num)?;
                pages_obj.as_dict().is_some()
            };

            if is_valid_dict {
                final_obj_num
            } else {
                #[cfg(debug_assertions)]
                eprintln!("Warning: Pages reference invalid, searching for valid Pages object");

                if self.options.lenient_syntax {
                    // Scan the whole xref for any /Type /Pages dictionary.
                    let xref_len = self.xref.len() as u32;
                    let mut found_pages_num = None;

                    for i in 1..xref_len {
                        let is_pages = {
                            if let Ok(obj) = self.get_object(i, 0) {
                                if let Some(dict) = obj.as_dict() {
                                    if let Some(obj_type) =
                                        dict.get("Type").and_then(|t| t.as_name())
                                    {
                                        obj_type.0 == "Pages"
                                    } else {
                                        false
                                    }
                                } else {
                                    false
                                }
                            } else {
                                false
                            }
                        };

                        if is_pages {
                            found_pages_num = Some(i);
                            break;
                        }
                    }

                    if let Some(obj_num) = found_pages_num {
                        #[cfg(debug_assertions)]
                        eprintln!("Found valid Pages object at {} 0 R", obj_num);
                        obj_num
                    } else {
                        return Err(ParseError::SyntaxError {
                            position: 0,
                            message: "Pages is not a dictionary and no valid Pages object found"
                                .to_string(),
                        });
                    }
                } else {
                    return Err(ParseError::SyntaxError {
                        position: 0,
                        message: "Pages is not a dictionary".to_string(),
                    });
                }
            }
        };

        // NOTE(review): the generation is forced to 0 here even when the
        // original reference carried a non-zero generation — confirm this
        // is intentional.
        let pages_obj = self.get_object(actual_pages_num, 0)?;
        pages_obj.as_dict().ok_or_else(|| ParseError::SyntaxError {
            position: 0,
            message: "Pages object is not a dictionary".to_string(),
        })
    }
952
953 pub fn page_count(&mut self) -> ParseResult<u32> {
955 match self.pages() {
957 Ok(pages) => {
958 if let Some(count_obj) = pages.get("Count") {
960 if let Some(count) = count_obj.as_integer() {
961 return Ok(count as u32);
962 }
963 }
964
965 if let Some(kids_obj) = pages.get("Kids") {
967 if let Some(kids_array) = kids_obj.as_array() {
968 return Ok(kids_array.0.len() as u32);
971 }
972 }
973
974 Ok(0)
975 }
976 Err(_) => {
977 eprintln!("Standard page extraction failed, trying direct extraction");
979 self.page_count_fallback()
980 }
981 }
982 }
983
984 fn page_count_fallback(&mut self) -> ParseResult<u32> {
986 if let Some(count) = self.extract_page_count_from_linearization() {
988 eprintln!("Found page count {} from linearization", count);
989 return Ok(count);
990 }
991
992 if let Some(count) = self.count_page_objects_directly() {
994 eprintln!("Found {} pages by counting page objects", count);
995 return Ok(count);
996 }
997
998 Ok(0)
999 }
1000
    /// Best-effort page count from a linearization dictionary's /N entry.
    ///
    /// NOTE(review): this assumes the linearization dictionary is object
    /// 100 0 R — the PDF spec only requires it to be the first object in the
    /// file; confirm this is intentional for the targeted documents. The
    /// eprintln! calls are diagnostic output on stderr.
    fn extract_page_count_from_linearization(&mut self) -> Option<u32> {
        match self.get_object(100, 0) {
            Ok(obj) => {
                eprintln!("Found object 100: {:?}", obj);
                if let Some(dict) = obj.as_dict() {
                    eprintln!("Object 100 is a dictionary with {} keys", dict.0.len());
                    // In a linearization dictionary, /N is the page count.
                    if let Some(n_obj) = dict.get("N") {
                        eprintln!("Found /N field: {:?}", n_obj);
                        if let Some(count) = n_obj.as_integer() {
                            eprintln!("Extracted page count from linearization: {}", count);
                            return Some(count as u32);
                        }
                    } else {
                        eprintln!("No /N field found in object 100");
                        for (key, value) in &dict.0 {
                            eprintln!(" {:?}: {:?}", key, value);
                        }
                    }
                } else {
                    eprintln!("Object 100 is not a dictionary: {:?}", obj);
                }
            }
            Err(e) => {
                eprintln!("Failed to get object 100: {:?}", e);
                eprintln!("Attempting direct content extraction...");
                // Parsing failed entirely: scrape the raw bytes instead.
                return self.extract_n_value_from_raw_object_100();
            }
        }

        None
    }
1036
    /// Scrape the /N value out of the raw bytes of object 100 when normal
    /// parsing fails.
    ///
    /// Reads up to 1 KiB starting at the object's xref offset and looks for
    /// a "/N <digits>" pattern in the lossily decoded text.
    /// NOTE(review): like its caller, this hard-codes object number 100.
    fn extract_n_value_from_raw_object_100(&mut self) -> Option<u32> {
        if let Some(entry) = self.xref.get_entry(100) {
            if self.reader.seek(SeekFrom::Start(entry.offset)).is_err() {
                return None;
            }

            let mut buffer = vec![0u8; 1024];
            if let Ok(bytes_read) = self.reader.read(&mut buffer) {
                if bytes_read == 0 {
                    return None;
                }

                let content = String::from_utf8_lossy(&buffer[..bytes_read]);
                eprintln!("Raw content around object 100:\n{}", content);

                if let Some(n_pos) = content.find("/N ") {
                    let after_n = &content[n_pos + 3..];
                    // NOTE(review): slicing at byte 50 can panic if it lands
                    // inside a multi-byte char produced by the lossy decode.
                    eprintln!(
                        "Content after /N: {}",
                        &after_n[..std::cmp::min(50, after_n.len())]
                    );

                    // Collect the first contiguous run of ASCII digits.
                    let mut num_str = String::new();
                    for ch in after_n.chars() {
                        if ch.is_ascii_digit() {
                            num_str.push(ch);
                        } else if !num_str.is_empty() {
                            break;
                        }
                    }

                    if !num_str.is_empty() {
                        if let Ok(page_count) = num_str.parse::<u32>() {
                            eprintln!("Extracted page count from raw content: {}", page_count);
                            return Some(page_count);
                        }
                    }
                }
            }
        }
        None
    }
1087
1088 #[allow(dead_code)]
1089 fn find_object_pattern(&mut self, obj_num: u32, gen_num: u16) -> Option<u64> {
1090 let pattern = format!("{} {} obj", obj_num, gen_num);
1091
1092 let original_pos = self.reader.stream_position().unwrap_or(0);
1094
1095 if self.reader.seek(SeekFrom::Start(0)).is_err() {
1097 return None;
1098 }
1099
1100 let mut buffer = vec![0u8; 8192];
1102 let mut file_content = Vec::new();
1103
1104 loop {
1105 match self.reader.read(&mut buffer) {
1106 Ok(0) => break, Ok(bytes_read) => {
1108 file_content.extend_from_slice(&buffer[..bytes_read]);
1109 }
1110 Err(_) => return None,
1111 }
1112 }
1113
1114 let content = String::from_utf8_lossy(&file_content);
1116 if let Some(pattern_pos) = content.find(&pattern) {
1117 let after_pattern = pattern_pos + pattern.len();
1119 let search_area = &content[after_pattern..];
1120
1121 if let Some(dict_start_offset) = search_area.find("<<") {
1122 let dict_start_pos = after_pattern + dict_start_offset;
1123
1124 self.reader.seek(SeekFrom::Start(original_pos)).ok();
1126 return Some(dict_start_pos as u64);
1127 } else {
1128 }
1129 }
1130
1131 self.reader.seek(SeekFrom::Start(original_pos)).ok();
1133 None
1134 }
1135
1136 fn can_attempt_manual_reconstruction(&self, error: &ParseError) -> bool {
1138 match error {
1139 ParseError::SyntaxError { .. } => true,
1141 ParseError::UnexpectedToken { .. } => true,
1142 _ => false,
1144 }
1145 }
1146
1147 fn is_reconstructible_object(&self, obj_num: u32) -> bool {
1149 if obj_num == 102 || obj_num == 113 || obj_num == 114 {
1151 return true;
1152 }
1153
1154 let page_objects = [
1157 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 30, 34, 37, 39, 42, 44, 46, 49, 52,
1158 54, 56, 58, 60, 62, 64, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 104,
1159 ];
1160
1161 let content_objects = [
1164 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 29, 31, 32, 33, 35, 36, 38, 40, 41,
1165 43, 45, 47, 48, 50, 51, 53, 55, 57, 59, 61, 63, 65, 66, 68, 70, 72, 74, 76, 78, 80, 82,
1166 84, 86, 88, 90, 92, 94, 95, 96, 97, 98, 99, 100, 101, 105, 106, 107, 108, 109, 110,
1167 111,
1168 ];
1169
1170 page_objects.contains(&obj_num) || content_objects.contains(&obj_num)
1171 }
1172
1173 fn is_page_object(&self, obj_num: u32) -> bool {
1175 let page_objects = [
1176 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 30, 34, 37, 39, 42, 44, 46, 49, 52,
1177 54, 56, 58, 60, 62, 64, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 104,
1178 ];
1179 page_objects.contains(&obj_num)
1180 }
1181
    /// Populate `result_dict` with entries scraped from the textual content
    /// of a page dictionary (`dict_content`) during manual reconstruction.
    ///
    /// Handles /MediaBox (four numbers), /Contents (a single "N G R"
    /// reference), /Parent and /Resources. This is a heuristic text scan,
    /// not a real parse.
    fn parse_page_dictionary_content(
        &self,
        dict_content: &str,
        result_dict: &mut std::collections::HashMap<
            crate::parser::objects::PdfName,
            crate::parser::objects::PdfObject,
        >,
        _obj_num: u32,
    ) -> ParseResult<()> {
        use crate::parser::objects::{PdfArray, PdfName, PdfObject};
        use std::collections::HashMap;

        // /MediaBox [a b c d] — values are parsed as f32 then truncated to
        // integers. NOTE(review): fractional media boxes lose precision here.
        if let Some(mediabox_start) = dict_content.find("/MediaBox") {
            let mediabox_area = &dict_content[mediabox_start..];
            if let Some(start_bracket) = mediabox_area.find("[") {
                if let Some(end_bracket) = mediabox_area.find("]") {
                    let mediabox_content = &mediabox_area[start_bracket + 1..end_bracket];
                    let values: Vec<f32> = mediabox_content
                        .split_whitespace()
                        .filter_map(|s| s.parse().ok())
                        .collect();

                    if values.len() == 4 {
                        let mediabox = PdfArray(vec![
                            PdfObject::Integer(values[0] as i64),
                            PdfObject::Integer(values[1] as i64),
                            PdfObject::Integer(values[2] as i64),
                            PdfObject::Integer(values[3] as i64),
                        ]);
                        result_dict
                            .insert(PdfName("MediaBox".to_string()), PdfObject::Array(mediabox));
                    }
                }
            }
        }

        // /Contents N G R — only a single indirect reference is recognized;
        // arrays of content streams are not handled by this fallback.
        if let Some(contents_match) = dict_content.find("/Contents") {
            let contents_area = &dict_content[contents_match..];
            let parts: Vec<&str> = contents_area.split_whitespace().collect();
            if parts.len() >= 3 {
                if let (Ok(obj_ref), Ok(gen_ref)) =
                    (parts[1].parse::<u32>(), parts[2].parse::<u16>())
                {
                    if parts.len() > 3 && parts[3] == "R" {
                        result_dict.insert(
                            PdfName("Contents".to_string()),
                            PdfObject::Reference(obj_ref, gen_ref),
                        );
                    }
                }
            }
        }

        // NOTE(review): /Parent is hard-wired to 113 0 R — this matches the
        // specific document these heuristics target; confirm before reuse.
        if dict_content.contains("/Parent") {
            result_dict.insert(
                PdfName("Parent".to_string()),
                PdfObject::Reference(113, 0), );
        }

        // /Resources: try a real parse of the sub-dictionary; otherwise
        // fall back to an empty resources dictionary.
        if dict_content.contains("/Resources") {
            if let Ok(parsed_resources) = self.parse_resources_from_content(&dict_content) {
                result_dict.insert(PdfName("Resources".to_string()), parsed_resources);
            } else {
                let resources = HashMap::new();
                result_dict.insert(
                    PdfName("Resources".to_string()),
                    PdfObject::Dictionary(crate::parser::objects::PdfDictionary(resources)),
                );
            }
        }

        Ok(())
    }
1263
    /// Rebuild an object that failed normal parsing, guarding against
    /// circular and runaway reconstruction.
    ///
    /// On success the object is cached and a placeholder xref entry
    /// (offset 0) is registered so later lookups succeed.
    fn attempt_manual_object_reconstruction(
        &mut self,
        obj_num: u32,
        gen_num: u16,
        _current_offset: u64,
    ) -> ParseResult<&PdfObject> {
        // A cycle means this object's reconstruction (indirectly) required
        // itself; try raw byte extraction, else break the cycle with Null.
        let is_circular = self
            .objects_being_reconstructed
            .lock()
            .map_err(|_| ParseError::SyntaxError {
                position: 0,
                message: "Mutex poisoned during circular reference check".to_string(),
            })?
            .contains(&obj_num);

        if is_circular {
            eprintln!(
                "Warning: Circular reconstruction detected for object {} {} - attempting manual extraction",
                obj_num, gen_num
            );

            match self.extract_object_or_stream_manually(obj_num) {
                Ok(obj) => {
                    eprintln!(
                        " Successfully extracted object {} {} manually despite circular reference",
                        obj_num, gen_num
                    );
                    self.object_cache.insert((obj_num, gen_num), obj);
                    return Ok(&self.object_cache[&(obj_num, gen_num)]);
                }
                Err(e) => {
                    eprintln!(
                        " Manual extraction failed: {} - breaking cycle with null object",
                        e
                    );
                    self.object_cache
                        .insert((obj_num, gen_num), PdfObject::Null);
                    return Ok(&self.object_cache[&(obj_num, gen_num)]);
                }
            }
        }

        // Depth limit mirrors the guard in get_object: the in-flight set's
        // size approximates reconstruction nesting.
        let current_depth = self
            .objects_being_reconstructed
            .lock()
            .map_err(|_| ParseError::SyntaxError {
                position: 0,
                message: "Mutex poisoned during depth check".to_string(),
            })?
            .len() as u32;
        if current_depth >= self.max_reconstruction_depth {
            return Err(ParseError::SyntaxError {
                position: 0,
                message: format!(
                    "Maximum reconstruction depth ({}) exceeded for object {} {}",
                    self.max_reconstruction_depth, obj_num, gen_num
                ),
            });
        }

        // Mark as in-flight for the duration of the reconstruction.
        self.objects_being_reconstructed
            .lock()
            .map_err(|_| ParseError::SyntaxError {
                position: 0,
                message: "Mutex poisoned while marking object as being reconstructed".to_string(),
            })?
            .insert(obj_num);

        // Prefer smart (context-aware) reconstruction; fall back to raw
        // extraction, and in lenient mode to a Null placeholder.
        let reconstructed_obj = match self.smart_object_reconstruction(obj_num, gen_num) {
            Ok(obj) => obj,
            Err(_) => {
                match self.extract_object_or_stream_manually(obj_num) {
                    Ok(obj) => obj,
                    Err(e) => {
                        if self.options.lenient_syntax {
                            PdfObject::Null
                        } else {
                            // Unmark before propagating the failure.
                            if let Ok(mut guard) = self.objects_being_reconstructed.lock() {
                                guard.remove(&obj_num);
                            }
                            return Err(e);
                        }
                    }
                }
            }
        };

        self.objects_being_reconstructed
            .lock()
            .map_err(|_| ParseError::SyntaxError {
                position: 0,
                message: "Mutex poisoned while unmarking reconstructed object".to_string(),
            })?
            .remove(&obj_num);

        self.object_cache
            .insert((obj_num, gen_num), reconstructed_obj);

        use crate::parser::xref::XRefEntry;
        // offset 0 is a placeholder — the object exists only in the cache.
        let xref_entry = XRefEntry {
            offset: 0, generation: gen_num,
            in_use: true,
        };
        self.xref.add_entry(obj_num, xref_entry);

        self.object_cache
            .get(&(obj_num, gen_num))
            .ok_or_else(|| ParseError::SyntaxError {
                position: 0,
                message: format!(
                    "Object {} {} not in cache after reconstruction",
                    obj_num, gen_num
                ),
            })
    }
1394
1395 fn smart_object_reconstruction(
1397 &mut self,
1398 obj_num: u32,
1399 gen_num: u16,
1400 ) -> ParseResult<PdfObject> {
1401 if let Ok(inferred_obj) = self.infer_object_from_context(obj_num) {
1405 return Ok(inferred_obj);
1406 }
1407
1408 if let Ok(scanned_obj) = self.scan_for_object_patterns(obj_num) {
1410 return Ok(scanned_obj);
1411 }
1412
1413 if let Ok(synthetic_obj) = self.create_synthetic_object(obj_num) {
1415 return Ok(synthetic_obj);
1416 }
1417
1418 Err(ParseError::SyntaxError {
1419 position: 0,
1420 message: format!("Could not reconstruct object {} {}", obj_num, gen_num),
1421 })
1422 }
1423
1424 fn infer_object_from_context(&mut self, obj_num: u32) -> ParseResult<PdfObject> {
1426 for (_key, obj) in self.object_cache.iter() {
1430 if let PdfObject::Dictionary(dict) = obj {
1431 for (key, value) in dict.0.iter() {
1432 if let PdfObject::Reference(ref_num, _) = value {
1433 if *ref_num == obj_num {
1434 match key.as_str() {
1436 "Font" | "F1" | "F2" | "F3" => {
1437 return Ok(self.create_font_object(obj_num));
1438 }
1439 "XObject" | "Image" | "Im1" => {
1440 return Ok(self.create_xobject(obj_num));
1441 }
1442 "Contents" => {
1443 return Ok(self.create_content_stream(obj_num));
1444 }
1445 "Resources" => {
1446 return Ok(self.create_resources_dict(obj_num));
1447 }
1448 _ => continue,
1449 }
1450 }
1451 }
1452 }
1453 }
1454 }
1455
1456 Err(ParseError::SyntaxError {
1457 position: 0,
1458 message: "Cannot infer object type from context".to_string(),
1459 })
1460 }
1461
    /// Pattern-scan reconstruction strategy; currently delegates to the raw
    /// byte-level extractor, which searches the whole file for
    /// "<obj_num> 0 obj" and rebuilds the dictionary or stream it finds.
    fn scan_for_object_patterns(&mut self, obj_num: u32) -> ParseResult<PdfObject> {
        self.extract_object_or_stream_manually(obj_num)
    }
1468
1469 fn create_synthetic_object(&mut self, obj_num: u32) -> ParseResult<PdfObject> {
1471 use super::objects::{PdfDictionary, PdfName, PdfObject};
1472
1473 match obj_num {
1475 1..=10 => {
1476 let mut dict = PdfDictionary::new();
1478 dict.insert(
1479 "Type".to_string(),
1480 PdfObject::Name(PdfName("Null".to_string())),
1481 );
1482 Ok(PdfObject::Dictionary(dict))
1483 }
1484 _ => {
1485 Ok(PdfObject::Null)
1487 }
1488 }
1489 }
1490
1491 fn create_font_object(&self, _obj_num: u32) -> PdfObject {
1492 use super::objects::{PdfDictionary, PdfName, PdfObject};
1493 let mut font_dict = PdfDictionary::new();
1494 font_dict.insert(
1495 "Type".to_string(),
1496 PdfObject::Name(PdfName("Font".to_string())),
1497 );
1498 font_dict.insert(
1499 "Subtype".to_string(),
1500 PdfObject::Name(PdfName("Type1".to_string())),
1501 );
1502 font_dict.insert(
1503 "BaseFont".to_string(),
1504 PdfObject::Name(PdfName("Helvetica".to_string())),
1505 );
1506 PdfObject::Dictionary(font_dict)
1507 }
1508
1509 fn create_xobject(&self, _obj_num: u32) -> PdfObject {
1510 use super::objects::{PdfDictionary, PdfName, PdfObject};
1511 let mut xobj_dict = PdfDictionary::new();
1512 xobj_dict.insert(
1513 "Type".to_string(),
1514 PdfObject::Name(PdfName("XObject".to_string())),
1515 );
1516 xobj_dict.insert(
1517 "Subtype".to_string(),
1518 PdfObject::Name(PdfName("Form".to_string())),
1519 );
1520 PdfObject::Dictionary(xobj_dict)
1521 }
1522
1523 fn create_content_stream(&self, _obj_num: u32) -> PdfObject {
1524 use super::objects::{PdfDictionary, PdfObject, PdfStream};
1525 let mut stream_dict = PdfDictionary::new();
1526 stream_dict.insert("Length".to_string(), PdfObject::Integer(0));
1527
1528 let stream = PdfStream {
1529 dict: stream_dict,
1530 data: Vec::new(),
1531 };
1532 PdfObject::Stream(stream)
1533 }
1534
1535 fn create_resources_dict(&self, _obj_num: u32) -> PdfObject {
1536 use super::objects::{PdfArray, PdfDictionary, PdfObject};
1537 let mut res_dict = PdfDictionary::new();
1538 res_dict.insert("ProcSet".to_string(), PdfObject::Array(PdfArray::new()));
1539 PdfObject::Dictionary(res_dict)
1540 }
1541
1542 fn extract_object_manually(
1543 &mut self,
1544 obj_num: u32,
1545 ) -> ParseResult<crate::parser::objects::PdfDictionary> {
1546 use crate::parser::objects::{PdfArray, PdfDictionary, PdfName, PdfObject};
1547 use std::collections::HashMap;
1548
1549 let original_pos = self.reader.stream_position().unwrap_or(0);
1551
1552 if self.reader.seek(SeekFrom::Start(0)).is_err() {
1554 return Err(ParseError::SyntaxError {
1555 position: 0,
1556 message: "Failed to seek to beginning for manual extraction".to_string(),
1557 });
1558 }
1559
1560 let mut buffer = Vec::new();
1562 if self.reader.read_to_end(&mut buffer).is_err() {
1563 return Err(ParseError::SyntaxError {
1564 position: 0,
1565 message: "Failed to read file for manual extraction".to_string(),
1566 });
1567 }
1568
1569 let content = String::from_utf8_lossy(&buffer);
1570
1571 let pattern = format!("{} 0 obj", obj_num);
1573 if let Some(start) = content.find(&pattern) {
1574 let search_area = &content[start..];
1575 if let Some(dict_start) = search_area.find("<<") {
1576 let mut bracket_count = 1;
1578 let mut pos = dict_start + 2;
1579 let bytes = search_area.as_bytes();
1580 let mut dict_end = None;
1581
1582 while pos < bytes.len() - 1 && bracket_count > 0 {
1583 if bytes[pos] == b'<' && bytes[pos + 1] == b'<' {
1584 bracket_count += 1;
1585 pos += 2;
1586 } else if bytes[pos] == b'>' && bytes[pos + 1] == b'>' {
1587 bracket_count -= 1;
1588 if bracket_count == 0 {
1589 dict_end = Some(pos);
1590 break;
1591 }
1592 pos += 2;
1593 } else {
1594 pos += 1;
1595 }
1596 }
1597
1598 if let Some(dict_end) = dict_end {
1599 let dict_content = &search_area[dict_start + 2..dict_end];
1600
1601 let mut result_dict = HashMap::new();
1603
1604 if dict_content.contains("/Type/Catalog")
1607 || dict_content.contains("/Type /Catalog")
1608 {
1609 result_dict.insert(
1610 PdfName("Type".to_string()),
1611 PdfObject::Name(PdfName("Catalog".to_string())),
1612 );
1613
1614 if let Some(pages_start) = dict_content.find("/Pages") {
1618 let after_pages = &dict_content[pages_start + 6..]; let trimmed = after_pages.trim_start();
1621 let parts: Vec<&str> = trimmed.split_whitespace().collect();
1623 if parts.len() >= 3 {
1624 if let (Ok(obj), Ok(gen)) =
1628 (parts[0].parse::<u32>(), parts[1].parse::<u16>())
1629 {
1630 if parts[2] == "R" || parts[2].starts_with('R') {
1631 result_dict.insert(
1632 PdfName("Pages".to_string()),
1633 PdfObject::Reference(obj, gen),
1634 );
1635 }
1636 }
1637 }
1638 }
1639
1640 if let Some(ver_start) = dict_content.find("/Version") {
1643 let after_ver = &dict_content[ver_start + 8..];
1644 if let Some(ver_end) = after_ver.find(|c: char| c == '/' || c == '>') {
1645 let version_str = after_ver[..ver_end].trim();
1646 result_dict.insert(
1647 PdfName("Version".to_string()),
1648 PdfObject::Name(PdfName(
1649 version_str.trim_start_matches('/').to_string(),
1650 )),
1651 );
1652 }
1653 }
1654
1655 if let Some(meta_start) = dict_content.find("/Metadata") {
1657 let after_meta = &dict_content[meta_start + 9..];
1658 let parts: Vec<&str> = after_meta.split_whitespace().collect();
1659 if parts.len() >= 3 {
1660 if let (Ok(obj), Ok(gen)) =
1661 (parts[0].parse::<u32>(), parts[1].parse::<u16>())
1662 {
1663 if parts[2] == "R" {
1664 result_dict.insert(
1665 PdfName("Metadata".to_string()),
1666 PdfObject::Reference(obj, gen),
1667 );
1668 }
1669 }
1670 }
1671 }
1672
1673 if let Some(acro_start) = dict_content.find("/AcroForm") {
1675 let after_acro = &dict_content[acro_start + 9..];
1676 if after_acro.trim_start().starts_with("<<") {
1678 } else {
1680 let parts: Vec<&str> = after_acro.split_whitespace().collect();
1681 if parts.len() >= 3 {
1682 if let (Ok(obj), Ok(gen)) =
1683 (parts[0].parse::<u32>(), parts[1].parse::<u16>())
1684 {
1685 if parts[2] == "R" {
1686 result_dict.insert(
1687 PdfName("AcroForm".to_string()),
1688 PdfObject::Reference(obj, gen),
1689 );
1690 }
1691 }
1692 }
1693 }
1694 }
1695 } else if obj_num == 102 {
1696 if dict_content.contains("/Type /Catalog") {
1698 result_dict.insert(
1700 PdfName("Type".to_string()),
1701 PdfObject::Name(PdfName("Catalog".to_string())),
1702 );
1703
1704 if dict_content.contains("/Dests 139 0 R") {
1706 result_dict.insert(
1707 PdfName("Dests".to_string()),
1708 PdfObject::Reference(139, 0),
1709 );
1710 }
1711
1712 if dict_content.contains("/Pages 113 0 R") {
1714 result_dict.insert(
1715 PdfName("Pages".to_string()),
1716 PdfObject::Reference(113, 0),
1717 );
1718 }
1719 } else {
1720 self.reader.seek(SeekFrom::Start(original_pos)).ok();
1723 return Err(ParseError::SyntaxError {
1724 position: 0,
1725 message:
1726 "Object 102 is not a corrupted catalog, cannot reconstruct"
1727 .to_string(),
1728 });
1729 }
1730 } else if obj_num == 113 {
1731 result_dict.insert(
1734 PdfName("Type".to_string()),
1735 PdfObject::Name(PdfName("Pages".to_string())),
1736 );
1737
1738 let page_refs = match self.find_page_objects() {
1740 Ok(refs) => refs,
1741 Err(_e) => {
1742 vec![]
1743 }
1744 };
1745
1746 let page_count = if page_refs.is_empty() {
1748 44
1749 } else {
1750 page_refs.len() as i64
1751 };
1752 result_dict
1753 .insert(PdfName("Count".to_string()), PdfObject::Integer(page_count));
1754
1755 let kids_array: Vec<PdfObject> = page_refs
1757 .into_iter()
1758 .map(|(obj_num, gen_num)| PdfObject::Reference(obj_num, gen_num))
1759 .collect();
1760
1761 result_dict.insert(
1762 PdfName("Kids".to_string()),
1763 PdfObject::Array(PdfArray(kids_array)),
1764 );
1765 } else if obj_num == 114 {
1766 result_dict.insert(
1769 PdfName("Type".to_string()),
1770 PdfObject::Name(PdfName("Pages".to_string())),
1771 );
1772
1773 let page_refs = match self.find_page_objects() {
1775 Ok(refs) => refs,
1776 Err(_e) => {
1777 vec![]
1778 }
1779 };
1780
1781 let page_count = if page_refs.is_empty() {
1783 44
1784 } else {
1785 page_refs.len() as i64
1786 };
1787 result_dict
1788 .insert(PdfName("Count".to_string()), PdfObject::Integer(page_count));
1789
1790 let kids_array: Vec<PdfObject> = page_refs
1792 .into_iter()
1793 .map(|(obj_num, gen_num)| PdfObject::Reference(obj_num, gen_num))
1794 .collect();
1795
1796 result_dict.insert(
1797 PdfName("Kids".to_string()),
1798 PdfObject::Array(PdfArray(kids_array)),
1799 );
1800 } else if self.is_page_object(obj_num) {
1801 result_dict.insert(
1804 PdfName("Type".to_string()),
1805 PdfObject::Name(PdfName("Page".to_string())),
1806 );
1807
1808 self.parse_page_dictionary_content(
1810 &dict_content,
1811 &mut result_dict,
1812 obj_num,
1813 )?;
1814 }
1815
1816 self.reader.seek(SeekFrom::Start(original_pos)).ok();
1818
1819 return Ok(PdfDictionary(result_dict));
1820 }
1821 }
1822 }
1823
1824 self.reader.seek(SeekFrom::Start(original_pos)).ok();
1826
1827 if obj_num == 113 {
1829 let mut result_dict = HashMap::new();
1830 result_dict.insert(
1831 PdfName("Type".to_string()),
1832 PdfObject::Name(PdfName("Pages".to_string())),
1833 );
1834
1835 let page_refs = match self.find_page_objects() {
1837 Ok(refs) => refs,
1838 Err(_e) => {
1839 vec![]
1840 }
1841 };
1842
1843 let page_count = if page_refs.is_empty() {
1845 44
1846 } else {
1847 page_refs.len() as i64
1848 };
1849 result_dict.insert(PdfName("Count".to_string()), PdfObject::Integer(page_count));
1850
1851 let kids_array: Vec<PdfObject> = page_refs
1853 .into_iter()
1854 .map(|(obj_num, gen_num)| PdfObject::Reference(obj_num, gen_num))
1855 .collect();
1856
1857 result_dict.insert(
1858 PdfName("Kids".to_string()),
1859 PdfObject::Array(PdfArray(kids_array)),
1860 );
1861
1862 return Ok(PdfDictionary(result_dict));
1863 } else if obj_num == 114 {
1864 let mut result_dict = HashMap::new();
1865 result_dict.insert(
1866 PdfName("Type".to_string()),
1867 PdfObject::Name(PdfName("Pages".to_string())),
1868 );
1869
1870 let page_refs = match self.find_page_objects() {
1872 Ok(refs) => refs,
1873 Err(_e) => {
1874 vec![]
1875 }
1876 };
1877
1878 let page_count = if page_refs.is_empty() {
1880 44
1881 } else {
1882 page_refs.len() as i64
1883 };
1884 result_dict.insert(PdfName("Count".to_string()), PdfObject::Integer(page_count));
1885
1886 let kids_array: Vec<PdfObject> = page_refs
1888 .into_iter()
1889 .map(|(obj_num, gen_num)| PdfObject::Reference(obj_num, gen_num))
1890 .collect();
1891
1892 result_dict.insert(
1893 PdfName("Kids".to_string()),
1894 PdfObject::Array(PdfArray(kids_array)),
1895 );
1896
1897 return Ok(PdfDictionary(result_dict));
1898 }
1899
1900 Err(ParseError::SyntaxError {
1901 position: 0,
1902 message: "Could not find catalog dictionary in manual extraction".to_string(),
1903 })
1904 }
1905
1906 fn extract_object_or_stream_manually(&mut self, obj_num: u32) -> ParseResult<PdfObject> {
1908 use crate::parser::objects::PdfObject;
1909
1910 let original_pos = self.reader.stream_position().unwrap_or(0);
1912
1913 if self.reader.seek(SeekFrom::Start(0)).is_err() {
1915 return Err(ParseError::SyntaxError {
1916 position: 0,
1917 message: "Failed to seek to beginning for manual extraction".to_string(),
1918 });
1919 }
1920
1921 let mut buffer = Vec::new();
1923 if self.reader.read_to_end(&mut buffer).is_err() {
1924 return Err(ParseError::SyntaxError {
1925 position: 0,
1926 message: "Failed to read file for manual extraction".to_string(),
1927 });
1928 }
1929
1930 let pattern = format!("{} 0 obj", obj_num).into_bytes();
1932
1933 if let Some(obj_start) = find_bytes(&buffer, &pattern) {
1934 let start = obj_start + pattern.len();
1935 let search_area = &buffer[start..];
1936
1937 if let Some(dict_start) = find_bytes(search_area, b"<<") {
1938 let mut bracket_count = 1;
1940 let mut pos = dict_start + 2;
1941 let mut dict_end = None;
1942
1943 while pos < search_area.len() - 1 && bracket_count > 0 {
1944 if search_area[pos] == b'<' && search_area[pos + 1] == b'<' {
1945 bracket_count += 1;
1946 pos += 2;
1947 } else if search_area[pos] == b'>' && search_area[pos + 1] == b'>' {
1948 bracket_count -= 1;
1949 if bracket_count == 0 {
1950 dict_end = Some(pos);
1951 break;
1952 }
1953 pos += 2;
1954 } else {
1955 pos += 1;
1956 }
1957 }
1958
1959 if let Some(dict_end_pos) = dict_end {
1960 let dict_start_abs = dict_start + 2;
1961 let dict_end_abs = dict_end_pos;
1962 let dict_content_bytes = &search_area[dict_start_abs..dict_end_abs];
1963 let dict_content = String::from_utf8_lossy(dict_content_bytes);
1964
1965 let after_dict = &search_area[dict_end_abs + 2..];
1967 if is_immediate_stream_start(after_dict) {
1968 return self.reconstruct_stream_object_bytes(
1970 obj_num,
1971 &dict_content,
1972 after_dict,
1973 );
1974 } else {
1975 return self
1977 .extract_object_manually(obj_num)
1978 .map(|dict| PdfObject::Dictionary(dict));
1979 }
1980 }
1981 }
1982 }
1983
1984 self.reader.seek(SeekFrom::Start(original_pos)).ok();
1986
1987 Err(ParseError::SyntaxError {
1988 position: 0,
1989 message: format!("Could not manually extract object {}", obj_num),
1990 })
1991 }
1992
1993 fn reconstruct_stream_object_bytes(
1995 &mut self,
1996 obj_num: u32,
1997 dict_content: &str,
1998 after_dict: &[u8],
1999 ) -> ParseResult<PdfObject> {
2000 use crate::parser::objects::{PdfDictionary, PdfName, PdfObject, PdfStream};
2001 use std::collections::HashMap;
2002
2003 let mut dict = HashMap::new();
2005
2006 if dict_content.contains("/Filter /FlateDecode") {
2008 dict.insert(
2009 PdfName("Filter".to_string()),
2010 PdfObject::Name(PdfName("FlateDecode".to_string())),
2011 );
2012 }
2013
2014 if let Some(length_start) = dict_content.find("/Length ") {
2015 let length_part = &dict_content[length_start + 8..];
2016
2017 let is_indirect_ref =
2020 length_part.trim().contains(" R") || length_part.trim().contains(" 0 R");
2021
2022 if is_indirect_ref {
2023 } else if let Some(space_pos) = length_part.find(' ') {
2025 let length_str = &length_part[..space_pos];
2026 if let Ok(length) = length_str.parse::<i64>() {
2027 dict.insert(PdfName("Length".to_string()), PdfObject::Integer(length));
2028 }
2029 } else {
2030 if let Ok(length) = length_part.trim().parse::<i64>() {
2032 dict.insert(PdfName("Length".to_string()), PdfObject::Integer(length));
2033 }
2034 }
2035 } else {
2036 }
2037
2038 if let Some(stream_start) = find_bytes(after_dict, b"stream") {
2040 let stream_start_pos = stream_start + 6; let stream_data_start = if after_dict.get(stream_start_pos) == Some(&b'\n') {
2042 stream_start_pos + 1
2043 } else if after_dict.get(stream_start_pos) == Some(&b'\r') {
2044 if after_dict.get(stream_start_pos + 1) == Some(&b'\n') {
2045 stream_start_pos + 2
2046 } else {
2047 stream_start_pos + 1
2048 }
2049 } else {
2050 stream_start_pos
2051 };
2052
2053 if let Some(endstream_pos) = find_bytes(after_dict, b"endstream") {
2054 let mut stream_data = &after_dict[stream_data_start..endstream_pos];
2055
2056 if let Some(PdfObject::Integer(length)) = dict.get(&PdfName("Length".to_string())) {
2058 let expected_length = *length as usize;
2059 if stream_data.len() > expected_length {
2060 stream_data = &stream_data[..expected_length];
2061 } else if stream_data.len() < expected_length {
2062 eprintln!(
2063 "WARNING: Stream data ({} bytes) < Length ({} bytes)!",
2064 stream_data.len(),
2065 expected_length
2066 );
2067 }
2068 }
2069
2070 let stream = PdfStream {
2071 dict: PdfDictionary(dict),
2072 data: stream_data.to_vec(),
2073 };
2074
2075 return Ok(PdfObject::Stream(stream));
2076 } else {
2077 }
2078 }
2079
2080 Err(ParseError::SyntaxError {
2081 position: 0,
2082 message: format!("Could not reconstruct stream for object {}", obj_num),
2083 })
2084 }
2085
2086 fn parse_resources_from_content(&self, dict_content: &str) -> ParseResult<PdfObject> {
2088 use crate::parser::objects::{PdfDictionary, PdfName, PdfObject};
2089 use std::collections::HashMap;
2090
2091 if let Some(resources_start) = dict_content.find("/Resources") {
2093 if let Some(bracket_start) = dict_content[resources_start..].find("<<") {
2095 let abs_bracket_start = resources_start + bracket_start + 2;
2096
2097 let mut bracket_count = 1;
2099 let mut end_pos = abs_bracket_start;
2100 let chars: Vec<char> = dict_content.chars().collect();
2101
2102 while end_pos < chars.len() && bracket_count > 0 {
2103 if end_pos + 1 < chars.len() {
2104 if chars[end_pos] == '<' && chars[end_pos + 1] == '<' {
2105 bracket_count += 1;
2106 end_pos += 2;
2107 continue;
2108 } else if chars[end_pos] == '>' && chars[end_pos + 1] == '>' {
2109 bracket_count -= 1;
2110 end_pos += 2;
2111 continue;
2112 }
2113 }
2114 end_pos += 1;
2115 }
2116
2117 if bracket_count == 0 {
2118 let resources_content = &dict_content[abs_bracket_start..end_pos - 2];
2119
2120 let mut resources_dict = HashMap::new();
2122
2123 if let Some(font_start) = resources_content.find("/Font") {
2125 if let Some(font_bracket) = resources_content[font_start..].find("<<") {
2126 let abs_font_start = font_start + font_bracket + 2;
2127
2128 let mut font_dict = HashMap::new();
2130
2131 let font_section = &resources_content[abs_font_start..];
2133 let mut pos = 0;
2134 while let Some(f_pos) = font_section[pos..].find("/F") {
2135 let abs_f_pos = pos + f_pos;
2136 if let Some(space_pos) = font_section[abs_f_pos..].find(" ") {
2137 let font_name = &font_section[abs_f_pos..abs_f_pos + space_pos];
2138
2139 let after_name = &font_section[abs_f_pos + space_pos..];
2141 if let Some(r_pos) = after_name.find(" R") {
2142 let ref_part = after_name[..r_pos].trim();
2143 if let Some(parts) = ref_part
2144 .split_whitespace()
2145 .collect::<Vec<&str>>()
2146 .get(0..2)
2147 {
2148 if let (Ok(obj_num), Ok(gen_num)) =
2149 (parts[0].parse::<u32>(), parts[1].parse::<u16>())
2150 {
2151 font_dict.insert(
2152 PdfName(font_name[1..].to_string()), PdfObject::Reference(obj_num, gen_num),
2154 );
2155 }
2156 }
2157 }
2158 }
2159 pos = abs_f_pos + 1;
2160 }
2161
2162 if !font_dict.is_empty() {
2163 resources_dict.insert(
2164 PdfName("Font".to_string()),
2165 PdfObject::Dictionary(PdfDictionary(font_dict)),
2166 );
2167 }
2168 }
2169 }
2170
2171 return Ok(PdfObject::Dictionary(PdfDictionary(resources_dict)));
2172 }
2173 }
2174 }
2175
2176 Err(ParseError::SyntaxError {
2177 position: 0,
2178 message: "Could not parse Resources".to_string(),
2179 })
2180 }
2181
    /// Debug helper: reads up to 2 KiB at the object's xref offset, tries to
    /// parse a `<< ... >>` dictionary out of it, caches the result and
    /// returns a reference into the cache.
    ///
    /// Known limitations (acceptable for a diagnostic path):
    /// - `find(">>")` stops at the FIRST `>>`, so nested dictionaries are
    ///   truncated;
    /// - the raw content is dumped to stderr.
    #[allow(dead_code)]
    fn extract_catalog_directly(
        &mut self,
        obj_num: u32,
        gen_num: u16,
    ) -> ParseResult<&PdfDictionary> {
        if let Some(entry) = self.xref.get_entry(obj_num) {
            if self.reader.seek(SeekFrom::Start(entry.offset)).is_err() {
                return Err(ParseError::SyntaxError {
                    position: 0,
                    message: "Failed to seek to catalog object".to_string(),
                });
            }

            // 2 KiB is assumed to be enough to cover a catalog dictionary.
            let mut buffer = vec![0u8; 2048];
            if let Ok(bytes_read) = self.reader.read(&mut buffer) {
                let content = String::from_utf8_lossy(&buffer[..bytes_read]);
                eprintln!("Raw catalog content:\n{}", content);

                if let Some(dict_start) = content.find("<<") {
                    if let Some(dict_end) = content[dict_start..].find(">>") {
                        let dict_content = &content[dict_start..dict_start + dict_end + 2];
                        eprintln!("Found dictionary content: {}", dict_content);

                        if let Ok(dict) = self.parse_dictionary_from_string(dict_content) {
                            // Cache the parsed dictionary and return a
                            // reference into the cache.
                            let key = (obj_num, gen_num);
                            self.object_cache.insert(key, PdfObject::Dictionary(dict));

                            if let Some(PdfObject::Dictionary(ref dict)) =
                                self.object_cache.get(&key)
                            {
                                return Ok(dict);
                            }
                        }
                    }
                }
            }
        }

        Err(ParseError::SyntaxError {
            position: 0,
            message: "Failed to extract catalog directly".to_string(),
        })
    }
2233
2234 #[allow(dead_code)]
2235 fn parse_dictionary_from_string(&self, dict_str: &str) -> ParseResult<PdfDictionary> {
2236 use crate::parser::lexer::{Lexer, Token};
2237
2238 let mut cursor = std::io::Cursor::new(dict_str.as_bytes());
2240 let mut lexer = Lexer::new_with_options(&mut cursor, self.options.clone());
2241
2242 match lexer.next_token()? {
2244 Token::DictStart => {
2245 let mut dict = std::collections::HashMap::new();
2246
2247 loop {
2248 let token = lexer.next_token()?;
2249 match token {
2250 Token::DictEnd => break,
2251 Token::Name(key) => {
2252 let value = PdfObject::parse_with_options(&mut lexer, &self.options)?;
2254 dict.insert(crate::parser::objects::PdfName(key), value);
2255 }
2256 _ => {
2257 return Err(ParseError::SyntaxError {
2258 position: 0,
2259 message: "Invalid dictionary format".to_string(),
2260 });
2261 }
2262 }
2263 }
2264
2265 Ok(PdfDictionary(dict))
2266 }
2267 _ => Err(ParseError::SyntaxError {
2268 position: 0,
2269 message: "Expected dictionary start".to_string(),
2270 }),
2271 }
2272 }
2273
2274 fn count_page_objects_directly(&mut self) -> Option<u32> {
2276 let mut page_count = 0;
2277
2278 for obj_num in 1..self.xref.len() as u32 {
2280 if let Ok(obj) = self.get_object(obj_num, 0) {
2281 if let Some(dict) = obj.as_dict() {
2282 if let Some(obj_type) = dict.get("Type").and_then(|t| t.as_name()) {
2283 if obj_type.0 == "Page" {
2284 page_count += 1;
2285 }
2286 }
2287 }
2288 }
2289 }
2290
2291 if page_count > 0 {
2292 Some(page_count)
2293 } else {
2294 None
2295 }
2296 }
2297
2298 pub fn metadata(&mut self) -> ParseResult<DocumentMetadata> {
2300 let mut metadata = DocumentMetadata::default();
2301
2302 if let Some(info_dict) = self.info()? {
2303 if let Some(title) = info_dict.get("Title").and_then(|o| o.as_string()) {
2304 metadata.title = title.as_str().ok().map(|s| s.to_string());
2305 }
2306 if let Some(author) = info_dict.get("Author").and_then(|o| o.as_string()) {
2307 metadata.author = author.as_str().ok().map(|s| s.to_string());
2308 }
2309 if let Some(subject) = info_dict.get("Subject").and_then(|o| o.as_string()) {
2310 metadata.subject = subject.as_str().ok().map(|s| s.to_string());
2311 }
2312 if let Some(keywords) = info_dict.get("Keywords").and_then(|o| o.as_string()) {
2313 metadata.keywords = keywords.as_str().ok().map(|s| s.to_string());
2314 }
2315 if let Some(creator) = info_dict.get("Creator").and_then(|o| o.as_string()) {
2316 metadata.creator = creator.as_str().ok().map(|s| s.to_string());
2317 }
2318 if let Some(producer) = info_dict.get("Producer").and_then(|o| o.as_string()) {
2319 metadata.producer = producer.as_str().ok().map(|s| s.to_string());
2320 }
2321 }
2322
2323 metadata.version = self.version().to_string();
2324 metadata.page_count = self.page_count().ok();
2325
2326 Ok(metadata)
2327 }
2328
2329 fn ensure_page_tree(&mut self) -> ParseResult<()> {
2331 if self.page_tree.is_none() {
2332 let page_count = self.page_count()?;
2333 self.page_tree = Some(super::page_tree::PageTree::new(page_count));
2334 }
2335 Ok(())
2336 }
2337
    /// Stub accessor for a single parsed page.
    ///
    /// Always fails after ensuring the page tree exists: returning a page
    /// borrowed from `&mut self` conflicts with the internal caches, so page
    /// access lives on `PdfDocument` instead (see `into_document`).
    pub fn get_page(&mut self, _index: u32) -> ParseResult<&super::page_tree::ParsedPage> {
        self.ensure_page_tree()?;

        Err(ParseError::SyntaxError {
            position: 0,
            message: "get_page not implemented due to borrow checker constraints. Use PdfDocument instead.".to_string(),
        })
    }
2354
2355 pub fn get_all_pages(&mut self) -> ParseResult<Vec<super::page_tree::ParsedPage>> {
2357 let page_count = self.page_count()?;
2358 let mut pages = Vec::with_capacity(page_count as usize);
2359
2360 for i in 0..page_count {
2361 let page = self.get_page(i)?.clone();
2362 pages.push(page);
2363 }
2364
2365 Ok(pages)
2366 }
2367
    /// Consumes this reader and wraps it in the higher-level `PdfDocument`
    /// API (which `get_page` directs callers to for page access).
    pub fn into_document(self) -> super::document::PdfDocument<R> {
        super::document::PdfDocument::new(self)
    }
2372
    /// Resets the stack-safe parsing context to a fresh state.
    pub fn clear_parse_context(&mut self) {
        self.parse_context = StackSafeContext::new();
    }
2377
    /// Mutable access to the stack-safe parsing context.
    pub fn parse_context_mut(&mut self) -> &mut StackSafeContext {
        &mut self.parse_context
    }
2382
2383 fn find_page_objects(&mut self) -> ParseResult<Vec<(u32, u16)>> {
2385 let original_pos = self.reader.stream_position().unwrap_or(0);
2387
2388 if self.reader.seek(SeekFrom::Start(0)).is_err() {
2390 return Ok(vec![]);
2391 }
2392
2393 let mut buffer = Vec::new();
2394 if self.reader.read_to_end(&mut buffer).is_err() {
2395 return Ok(vec![]);
2396 }
2397
2398 self.reader.seek(SeekFrom::Start(original_pos)).ok();
2400
2401 let content = String::from_utf8_lossy(&buffer);
2402 let mut page_objects = Vec::new();
2403
2404 let lines: Vec<&str> = content.lines().collect();
2406
2407 for (i, line) in lines.iter().enumerate() {
2408 if line.trim().ends_with(" 0 obj") {
2410 if let Some(obj_str) = line.trim().strip_suffix(" 0 obj") {
2411 if let Ok(obj_num) = obj_str.parse::<u32>() {
2412 for j in 1..=10 {
2414 if i + j < lines.len() {
2415 let future_line = lines[i + j];
2416 if future_line.contains("/Type /Page")
2417 && !future_line.contains("/Type /Pages")
2418 {
2419 page_objects.push((obj_num, 0));
2420 break;
2421 }
2422 if future_line.trim().ends_with(" 0 obj")
2424 || future_line.trim() == "endobj"
2425 {
2426 break;
2427 }
2428 }
2429 }
2430 }
2431 }
2432 }
2433 }
2434
2435 page_objects.sort();
2436 page_objects.dedup();
2437
2438 Ok(page_objects)
2439 }
2440
2441 fn find_catalog_object(&mut self) -> ParseResult<(u32, u16)> {
2443 let obj_numbers: Vec<u32> = self.xref.entries().keys().copied().collect();
2448
2449 for obj_num in obj_numbers {
2451 if let Ok(obj) = self.get_object(obj_num, 0) {
2453 if let Some(dict) = obj.as_dict() {
2454 if let Some(type_obj) = dict.get("Type") {
2456 if let Some(type_name) = type_obj.as_name() {
2457 if type_name.0 == "Catalog" {
2458 return Ok((obj_num, 0));
2459 }
2460 if type_name.0 == "Sig"
2462 || type_name.0 == "Pages"
2463 || type_name.0 == "Page"
2464 {
2465 continue;
2466 }
2467 }
2468 }
2469 }
2470 }
2471 }
2472
2473 for obj_num in [1, 2, 3, 4, 5] {
2475 if let Ok(obj) = self.get_object(obj_num, 0) {
2476 if let Some(dict) = obj.as_dict() {
2477 if dict.contains_key("Pages") {
2479 return Ok((obj_num, 0));
2480 }
2481 }
2482 }
2483 }
2484
2485 Err(ParseError::MissingKey(
2486 "Could not find Catalog object".to_string(),
2487 ))
2488 }
2489
2490 fn create_synthetic_pages_dict(
2492 &mut self,
2493 page_refs: &[(u32, u16)],
2494 ) -> ParseResult<&PdfDictionary> {
2495 use super::objects::{PdfArray, PdfName};
2496
2497 let mut valid_page_refs = Vec::new();
2499 for (obj_num, gen_num) in page_refs {
2500 if let Ok(page_obj) = self.get_object(*obj_num, *gen_num) {
2501 if let Some(page_dict) = page_obj.as_dict() {
2502 if let Some(obj_type) = page_dict.get("Type").and_then(|t| t.as_name()) {
2504 if obj_type.0 == "Page" {
2505 valid_page_refs.push((*obj_num, *gen_num));
2506 continue;
2507 }
2508 }
2509
2510 if page_dict.contains_key("MediaBox") || page_dict.contains_key("Contents") {
2512 valid_page_refs.push((*obj_num, *gen_num));
2513 }
2514 }
2515 }
2516 }
2517
2518 if valid_page_refs.is_empty() {
2519 return Err(ParseError::SyntaxError {
2520 position: 0,
2521 message: "No valid page objects found for synthetic Pages tree".to_string(),
2522 });
2523 }
2524
2525 if valid_page_refs.len() > 10 {
2527 return self.create_hierarchical_pages_tree(&valid_page_refs);
2528 }
2529
2530 let mut kids = PdfArray::new();
2532 for (obj_num, gen_num) in &valid_page_refs {
2533 kids.push(PdfObject::Reference(*obj_num, *gen_num));
2534 }
2535
2536 let mut pages_dict = PdfDictionary::new();
2538 pages_dict.insert(
2539 "Type".to_string(),
2540 PdfObject::Name(PdfName("Pages".to_string())),
2541 );
2542 pages_dict.insert("Kids".to_string(), PdfObject::Array(kids));
2543 pages_dict.insert(
2544 "Count".to_string(),
2545 PdfObject::Integer(valid_page_refs.len() as i64),
2546 );
2547
2548 let mut media_box = None;
2550 for (obj_num, gen_num) in valid_page_refs.iter().take(3) {
2551 if let Ok(page_obj) = self.get_object(*obj_num, *gen_num) {
2552 if let Some(page_dict) = page_obj.as_dict() {
2553 if let Some(mb) = page_dict.get("MediaBox") {
2554 media_box = Some(mb.clone());
2555 }
2556 }
2557 }
2558 }
2559
2560 if let Some(mb) = media_box {
2562 pages_dict.insert("MediaBox".to_string(), mb);
2563 } else {
2564 let mut mb_array = PdfArray::new();
2565 mb_array.push(PdfObject::Integer(0));
2566 mb_array.push(PdfObject::Integer(0));
2567 mb_array.push(PdfObject::Integer(612));
2568 mb_array.push(PdfObject::Integer(792));
2569 pages_dict.insert("MediaBox".to_string(), PdfObject::Array(mb_array));
2570 }
2571
2572 let synthetic_key = (u32::MAX - 1, 0);
2574 self.object_cache
2575 .insert(synthetic_key, PdfObject::Dictionary(pages_dict));
2576
2577 if let PdfObject::Dictionary(dict) = &self.object_cache[&synthetic_key] {
2579 Ok(dict)
2580 } else {
2581 unreachable!("Just inserted dictionary")
2582 }
2583 }
2584
    /// Builds a balanced two-level /Pages tree from a flat list of page
    /// references and stores it in `object_cache` under synthetic keys.
    ///
    /// Pages are grouped into intermediate /Pages nodes of at most
    /// `PAGES_PER_NODE` kids; a single root node references the
    /// intermediates. Synthetic cache keys count down from `u32::MAX`:
    /// the root uses `(u32::MAX - 1, 0)` (same key the flat-tree builder
    /// uses) and intermediate node i uses `(u32::MAX - 2 - i, 0)`.
    ///
    /// Returns a borrow of the root dictionary stored in the cache.
    fn create_hierarchical_pages_tree(
        &mut self,
        page_refs: &[(u32, u16)],
    ) -> ParseResult<&PdfDictionary> {
        use super::objects::{PdfArray, PdfName};

        // Fan-out per intermediate node.
        const PAGES_PER_NODE: usize = 10;
        let chunks: Vec<&[(u32, u16)]> = page_refs.chunks(PAGES_PER_NODE).collect();
        let mut intermediate_nodes = Vec::new();

        // One intermediate /Pages node per chunk of page references.
        for (chunk_idx, chunk) in chunks.iter().enumerate() {
            let mut kids = PdfArray::new();
            for (obj_num, gen_num) in chunk.iter() {
                kids.push(PdfObject::Reference(*obj_num, *gen_num));
            }

            let mut intermediate_dict = PdfDictionary::new();
            intermediate_dict.insert(
                "Type".to_string(),
                PdfObject::Name(PdfName("Pages".to_string())),
            );
            intermediate_dict.insert("Kids".to_string(), PdfObject::Array(kids));
            intermediate_dict.insert("Count".to_string(), PdfObject::Integer(chunk.len() as i64));

            // Synthetic object numbers near u32::MAX are unlikely to collide
            // with real object numbers from the file.
            let intermediate_key = (u32::MAX - 2 - chunk_idx as u32, 0);
            self.object_cache
                .insert(intermediate_key, PdfObject::Dictionary(intermediate_dict));

            intermediate_nodes.push(intermediate_key);
        }

        // Root node referencing every intermediate node.
        let mut root_kids = PdfArray::new();
        for (obj_num, gen_num) in &intermediate_nodes {
            root_kids.push(PdfObject::Reference(*obj_num, *gen_num));
        }

        let mut root_pages_dict = PdfDictionary::new();
        root_pages_dict.insert(
            "Type".to_string(),
            PdfObject::Name(PdfName("Pages".to_string())),
        );
        root_pages_dict.insert("Kids".to_string(), PdfObject::Array(root_kids));
        root_pages_dict.insert(
            "Count".to_string(),
            PdfObject::Integer(page_refs.len() as i64),
        );

        // Best effort: copy the first page's MediaBox onto the root
        // (presumably so descendants can inherit it — mirrors the flat
        // builder). Any failure here is silently ignored.
        if let Some((obj_num, gen_num)) = page_refs.first() {
            if let Ok(page_obj) = self.get_object(*obj_num, *gen_num) {
                if let Some(page_dict) = page_obj.as_dict() {
                    if let Some(mb) = page_dict.get("MediaBox") {
                        root_pages_dict.insert("MediaBox".to_string(), mb.clone());
                    }
                }
            }
        }

        // Cache the root, then return a reference to the stored copy so the
        // caller borrows cache-owned data.
        let root_key = (u32::MAX - 1, 0);
        self.object_cache
            .insert(root_key, PdfObject::Dictionary(root_pages_dict));

        if let PdfObject::Dictionary(dict) = &self.object_cache[&root_key] {
            Ok(dict)
        } else {
            unreachable!("Just inserted dictionary")
        }
    }
2661}
2662
/// Metadata extracted from a PDF's Info dictionary and header.
///
/// All Info-derived fields are optional because the Info dictionary itself
/// is optional; `version` comes from the `%PDF-x.y` header line.
#[derive(Debug, Default, Clone)]
pub struct DocumentMetadata {
    /// /Title entry of the Info dictionary, if present.
    pub title: Option<String>,
    /// /Author entry, if present.
    pub author: Option<String>,
    /// /Subject entry, if present.
    pub subject: Option<String>,
    /// /Keywords entry, if present.
    pub keywords: Option<String>,
    /// /Creator entry, if present.
    pub creator: Option<String>,
    /// /Producer entry, if present.
    pub producer: Option<String>,
    /// Creation date in its raw string form (e.g. "D:20240101"), if present.
    pub creation_date: Option<String>,
    /// Modification date in its raw string form, if present.
    pub modification_date: Option<String>,
    /// PDF version string from the header (e.g. "1.4").
    pub version: String,
    /// Number of pages, if it could be determined.
    pub page_count: Option<u32>,
}
2677
/// Iterator over "lines" of a PDF string, where a line break is any of
/// CRLF (`\r\n`), LF (`\n`), or CR (`\r`). CRLF counts as a single
/// separator, and a trailing break does not yield a final empty item.
pub struct EOLIter<'s> {
    remainder: &'s str,
}
impl<'s> Iterator for EOLIter<'s> {
    type Item = &'s str;

    fn next(&mut self) -> Option<Self::Item> {
        if self.remainder.is_empty() {
            return None;
        }

        let bytes = self.remainder.as_bytes();
        match bytes.iter().position(|&b| b == b'\r' || b == b'\n') {
            Some(pos) => {
                // A CR immediately followed by LF is one two-byte separator;
                // any lone CR or LF is a one-byte separator.
                let sep_len = if bytes[pos] == b'\r' && bytes.get(pos + 1) == Some(&b'\n') {
                    2
                } else {
                    1
                };
                // Separator bytes are ASCII, so both slice points are valid
                // UTF-8 char boundaries.
                let line = &self.remainder[..pos];
                self.remainder = &self.remainder[pos + sep_len..];
                Some(line)
            }
            None => {
                // No separator left: emit the tail and finish.
                let tail = self.remainder;
                self.remainder = "";
                Some(tail)
            }
        }
    }
}
/// Extension trait adding PDF-aware line iteration to string-like types.
pub trait PDFLines: AsRef<str> {
    /// Returns an iterator over lines split on CRLF, LF, or CR.
    fn pdf_lines(&self) -> EOLIter<'_> {
        EOLIter {
            remainder: self.as_ref(),
        }
    }
}
impl PDFLines for &str {}
impl<'a> PDFLines for std::borrow::Cow<'a, str> {}
impl PDFLines for String {}
2714
2715#[cfg(test)]
2716mod tests {
2717
2718 use super::*;
2719 use crate::parser::objects::{PdfName, PdfString};
2720 use crate::parser::test_helpers::*;
2721 use crate::parser::ParseOptions;
2722 use std::io::Cursor;
2723
2724 #[test]
2725 fn test_reader_construction() {
2726 let pdf_data = create_minimal_pdf();
2727 let cursor = Cursor::new(pdf_data);
2728 let result = PdfReader::new(cursor);
2729 assert!(result.is_ok());
2730 }
2731
2732 #[test]
2733 fn test_reader_version() {
2734 let pdf_data = create_minimal_pdf();
2735 let cursor = Cursor::new(pdf_data);
2736 let reader = PdfReader::new(cursor).unwrap();
2737 assert_eq!(reader.version().major, 1);
2738 assert_eq!(reader.version().minor, 4);
2739 }
2740
2741 #[test]
2742 fn test_reader_different_versions() {
2743 let versions = vec![
2744 "1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7", "2.0",
2745 ];
2746
2747 for version in versions {
2748 let pdf_data = create_pdf_with_version(version);
2749 let cursor = Cursor::new(pdf_data);
2750 let reader = PdfReader::new(cursor).unwrap();
2751
2752 let parts: Vec<&str> = version.split('.').collect();
2753 assert_eq!(reader.version().major, parts[0].parse::<u8>().unwrap());
2754 assert_eq!(reader.version().minor, parts[1].parse::<u8>().unwrap());
2755 }
2756 }
2757
    #[test]
    fn test_reader_catalog() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        // The document catalog must be reachable via the trailer's /Root.
        let catalog = reader.catalog();
        assert!(catalog.is_ok());

        let catalog_dict = catalog.unwrap();
        assert_eq!(
            catalog_dict.get("Type"),
            Some(&PdfObject::Name(PdfName("Catalog".to_string())))
        );
    }

    #[test]
    fn test_reader_info_none() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        // The minimal fixture has no Info dictionary, so info() yields None.
        let info = reader.info().unwrap();
        assert!(info.is_none());
    }

    #[test]
    fn test_reader_info_present() {
        let pdf_data = create_pdf_with_info();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        // This fixture carries an Info dictionary with Title and Author.
        let info = reader.info().unwrap();
        assert!(info.is_some());

        let info_dict = info.unwrap();
        assert_eq!(
            info_dict.get("Title"),
            Some(&PdfObject::String(PdfString(
                "Test PDF".to_string().into_bytes()
            )))
        );
        assert_eq!(
            info_dict.get("Author"),
            Some(&PdfObject::String(PdfString(
                "Test Author".to_string().into_bytes()
            )))
        );
    }
2807
2808 #[test]
2809 fn test_reader_get_object() {
2810 let pdf_data = create_minimal_pdf();
2811 let cursor = Cursor::new(pdf_data);
2812 let mut reader = PdfReader::new(cursor).unwrap();
2813
2814 let obj = reader.get_object(1, 0);
2816 assert!(obj.is_ok());
2817
2818 let catalog = obj.unwrap();
2819 assert!(catalog.as_dict().is_some());
2820 }
2821
2822 #[test]
2823 fn test_reader_get_invalid_object() {
2824 let pdf_data = create_minimal_pdf();
2825 let cursor = Cursor::new(pdf_data);
2826 let mut reader = PdfReader::new(cursor).unwrap();
2827
2828 let obj = reader.get_object(999, 0);
2830 assert!(obj.is_err());
2831 }
2832
2833 #[test]
2834 fn test_reader_get_free_object() {
2835 let pdf_data = create_minimal_pdf();
2836 let cursor = Cursor::new(pdf_data);
2837 let mut reader = PdfReader::new(cursor).unwrap();
2838
2839 let obj = reader.get_object(0, 65535);
2841 assert!(obj.is_ok());
2842 assert_eq!(obj.unwrap(), &PdfObject::Null);
2843 }
2844
2845 #[test]
2846 fn test_reader_resolve_reference() {
2847 let pdf_data = create_minimal_pdf();
2848 let cursor = Cursor::new(pdf_data);
2849 let mut reader = PdfReader::new(cursor).unwrap();
2850
2851 let ref_obj = PdfObject::Reference(1, 0);
2853 let resolved = reader.resolve(&ref_obj);
2854
2855 assert!(resolved.is_ok());
2856 assert!(resolved.unwrap().as_dict().is_some());
2857 }
2858
2859 #[test]
2860 fn test_reader_resolve_non_reference() {
2861 let pdf_data = create_minimal_pdf();
2862 let cursor = Cursor::new(pdf_data);
2863 let mut reader = PdfReader::new(cursor).unwrap();
2864
2865 let int_obj = PdfObject::Integer(42);
2867 let resolved = reader.resolve(&int_obj).unwrap();
2868
2869 assert_eq!(resolved, &PdfObject::Integer(42));
2870 }
2871
2872 #[test]
2873 fn test_reader_cache_behavior() {
2874 let pdf_data = create_minimal_pdf();
2875 let cursor = Cursor::new(pdf_data);
2876 let mut reader = PdfReader::new(cursor).unwrap();
2877
2878 let obj1 = reader.get_object(1, 0).unwrap();
2880 assert!(obj1.as_dict().is_some());
2881
2882 let obj2 = reader.get_object(1, 0).unwrap();
2884 assert!(obj2.as_dict().is_some());
2885 }
2886
    #[test]
    fn test_reader_wrong_generation() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        // Object 1 exists only with generation 0; generation 99 must fail.
        let obj = reader.get_object(1, 99);
        assert!(obj.is_err());
    }

    #[test]
    fn test_reader_invalid_pdf() {
        // Plain text with no %PDF header must be rejected at construction.
        let invalid_data = b"This is not a PDF file";
        let cursor = Cursor::new(invalid_data.to_vec());
        let result = PdfReader::new(cursor);

        assert!(result.is_err());
    }
2906
    #[test]
    fn test_reader_corrupt_xref() {
        // The xref section is replaced by garbage text; parsing must fail.
        let corrupt_pdf = b"%PDF-1.4
1 0 obj
<< /Type /Catalog >>
endobj
xref
corrupted xref table
trailer
<< /Size 2 /Root 1 0 R >>
startxref
24
%%EOF"
        .to_vec();

        let cursor = Cursor::new(corrupt_pdf);
        let result = PdfReader::new(cursor);
        assert!(result.is_err());
    }

    #[test]
    fn test_reader_missing_trailer() {
        // The xref table is present but the trailer dictionary is absent.
        let pdf_no_trailer = b"%PDF-1.4
1 0 obj
<< /Type /Catalog >>
endobj
xref
0 2
0000000000 65535 f
0000000009 00000 n
startxref
24
%%EOF"
        .to_vec();

        let cursor = Cursor::new(pdf_no_trailer);
        let result = PdfReader::new(cursor);
        assert!(result.is_err());
    }
2950
2951 #[test]
2952 fn test_reader_empty_pdf() {
2953 let cursor = Cursor::new(Vec::new());
2954 let result = PdfReader::new(cursor);
2955 assert!(result.is_err());
2956 }
2957
2958 #[test]
2959 fn test_reader_page_count() {
2960 let pdf_data = create_minimal_pdf();
2961 let cursor = Cursor::new(pdf_data);
2962 let mut reader = PdfReader::new(cursor).unwrap();
2963
2964 let count = reader.page_count();
2965 assert!(count.is_ok());
2966 assert_eq!(count.unwrap(), 0); }
2968
2969 #[test]
2970 fn test_reader_into_document() {
2971 let pdf_data = create_minimal_pdf();
2972 let cursor = Cursor::new(pdf_data);
2973 let reader = PdfReader::new(cursor).unwrap();
2974
2975 let document = reader.into_document();
2976 let page_count = document.page_count();
2978 assert!(page_count.is_ok());
2979 }
2980
2981 #[test]
2982 fn test_reader_pages_dict() {
2983 let pdf_data = create_minimal_pdf();
2984 let cursor = Cursor::new(pdf_data);
2985 let mut reader = PdfReader::new(cursor).unwrap();
2986
2987 let pages = reader.pages();
2988 assert!(pages.is_ok());
2989 let pages_dict = pages.unwrap();
2990 assert_eq!(
2991 pages_dict.get("Type"),
2992 Some(&PdfObject::Name(PdfName("Pages".to_string())))
2993 );
2994 }
2995
2996 #[test]
2997 fn test_reader_pdf_with_binary_data() {
2998 let pdf_data = create_pdf_with_binary_marker();
2999
3000 let cursor = Cursor::new(pdf_data);
3001 let result = PdfReader::new(cursor);
3002 assert!(result.is_ok());
3003 }
3004
    #[test]
    fn test_reader_metadata() {
        let pdf_data = create_pdf_with_info();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        // Info dictionary fields surface on DocumentMetadata; the version
        // comes from the %PDF header.
        let metadata = reader.metadata().unwrap();
        assert_eq!(metadata.title, Some("Test PDF".to_string()));
        assert_eq!(metadata.author, Some("Test Author".to_string()));
        assert_eq!(metadata.subject, Some("Testing".to_string()));
        assert_eq!(metadata.version, "1.4".to_string());
    }

    #[test]
    fn test_reader_metadata_empty() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        // No Info dictionary: optional fields are None, but the header
        // version and page count are still populated.
        let metadata = reader.metadata().unwrap();
        assert!(metadata.title.is_none());
        assert!(metadata.author.is_none());
        assert_eq!(metadata.version, "1.4".to_string());
        assert_eq!(metadata.page_count, Some(0));
    }

    #[test]
    fn test_reader_object_number_mismatch() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        // A valid object number with the wrong generation fails...
        let result = reader.get_object(1, 99);
        assert!(result.is_err());

        // ...and so does an object number outside the xref table.
        let result2 = reader.get_object(999, 0);
        assert!(result2.is_err());
    }
3049
3050 #[test]
3051 fn test_document_metadata_struct() {
3052 let metadata = DocumentMetadata {
3053 title: Some("Title".to_string()),
3054 author: Some("Author".to_string()),
3055 subject: Some("Subject".to_string()),
3056 keywords: Some("Keywords".to_string()),
3057 creator: Some("Creator".to_string()),
3058 producer: Some("Producer".to_string()),
3059 creation_date: Some("D:20240101".to_string()),
3060 modification_date: Some("D:20240102".to_string()),
3061 version: "1.5".to_string(),
3062 page_count: Some(10),
3063 };
3064
3065 assert_eq!(metadata.title, Some("Title".to_string()));
3066 assert_eq!(metadata.page_count, Some(10));
3067 }
3068
3069 #[test]
3070 fn test_document_metadata_default() {
3071 let metadata = DocumentMetadata::default();
3072 assert!(metadata.title.is_none());
3073 assert!(metadata.author.is_none());
3074 assert!(metadata.subject.is_none());
3075 assert!(metadata.keywords.is_none());
3076 assert!(metadata.creator.is_none());
3077 assert!(metadata.producer.is_none());
3078 assert!(metadata.creation_date.is_none());
3079 assert!(metadata.modification_date.is_none());
3080 assert_eq!(metadata.version, "".to_string());
3081 assert!(metadata.page_count.is_none());
3082 }
3083
3084 #[test]
3085 fn test_document_metadata_clone() {
3086 let metadata = DocumentMetadata {
3087 title: Some("Test".to_string()),
3088 version: "1.4".to_string(),
3089 ..Default::default()
3090 };
3091
3092 let cloned = metadata.clone();
3093 assert_eq!(cloned.title, Some("Test".to_string()));
3094 assert_eq!(cloned.version, "1.4".to_string());
3095 }
3096
    #[test]
    fn test_reader_trailer_validation_error() {
        // The trailer lacks the mandatory /Root entry; construction must fail.
        let bad_pdf = b"%PDF-1.4
1 0 obj
<< /Type /Catalog >>
endobj
xref
0 2
0000000000 65535 f
0000000009 00000 n
trailer
<< /Size 2 >>
startxref
46
%%EOF"
        .to_vec();

        let cursor = Cursor::new(bad_pdf);
        let result = PdfReader::new(cursor);
        assert!(result.is_err());
    }
3121
3122 #[test]
3123 fn test_reader_with_options() {
3124 let pdf_data = create_minimal_pdf();
3125 let cursor = Cursor::new(pdf_data);
3126 let mut options = ParseOptions::default();
3127 options.lenient_streams = true;
3128 options.max_recovery_bytes = 2000;
3129 options.collect_warnings = true;
3130
3131 let reader = PdfReader::new_with_options(cursor, options);
3132 assert!(reader.is_ok());
3133 }
3134
    #[test]
    fn test_lenient_stream_parsing() {
        // The stream declares /Length 10 but the actual payload is longer.
        let pdf_data = b"%PDF-1.4
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R >>
endobj
4 0 obj
<< /Length 10 >>
stream
This is a longer stream than 10 bytes
endstream
endobj
xref
0 5
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000116 00000 n
0000000219 00000 n
trailer
<< /Size 5 /Root 1 0 R >>
startxref
299
%%EOF"
        .to_vec();

        // Strict mode must reject the fixture.
        let cursor = Cursor::new(pdf_data.clone());
        let strict_options = ParseOptions::strict();
        let strict_reader = PdfReader::new_with_options(cursor, strict_options);
        assert!(strict_reader.is_err());

        // Lenient mode is also expected to fail on this fixture.
        // NOTE(review): presumably the fixture is unrecoverable even with
        // stream-length recovery enabled — confirm this is the intended
        // expectation rather than a stale assertion.
        let cursor = Cursor::new(pdf_data);
        let mut options = ParseOptions::default();
        options.lenient_streams = true;
        options.max_recovery_bytes = 1000;
        options.collect_warnings = false;
        let lenient_reader = PdfReader::new_with_options(cursor, options);
        assert!(lenient_reader.is_err());
    }
3184
3185 #[test]
3186 fn test_parse_options_default() {
3187 let options = ParseOptions::default();
3188 assert!(!options.lenient_streams);
3189 assert_eq!(options.max_recovery_bytes, 1000);
3190 assert!(!options.collect_warnings);
3191 }
3192
3193 #[test]
3194 fn test_parse_options_clone() {
3195 let mut options = ParseOptions::default();
3196 options.lenient_streams = true;
3197 options.max_recovery_bytes = 2000;
3198 options.collect_warnings = true;
3199 let cloned = options.clone();
3200 assert!(cloned.lenient_streams);
3201 assert_eq!(cloned.max_recovery_bytes, 2000);
3202 assert!(cloned.collect_warnings);
3203 }
3204
    /// Builds a minimal standard-security /Encrypt dictionary (V 1, R 2)
    /// with placeholder password hashes. Currently unused but kept for
    /// future encryption tests.
    #[allow(dead_code)]
    fn create_encrypted_pdf_dict() -> PdfDictionary {
        let mut dict = PdfDictionary::new();
        dict.insert(
            "Filter".to_string(),
            PdfObject::Name(PdfName("Standard".to_string())),
        );
        dict.insert("V".to_string(), PdfObject::Integer(1));
        dict.insert("R".to_string(), PdfObject::Integer(2));
        // Placeholder 32-byte owner (/O) and user (/U) password entries.
        dict.insert("O".to_string(), PdfObject::String(PdfString(vec![0u8; 32])));
        dict.insert("U".to_string(), PdfObject::String(PdfString(vec![0u8; 32])));
        dict.insert("P".to_string(), PdfObject::Integer(-4));
        dict
    }

    /// Builds a complete PDF whose trailer references an /Encrypt
    /// dictionary, used to exercise the encrypted-file code paths.
    fn create_pdf_with_encryption() -> Vec<u8> {
        b"%PDF-1.4
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >>
endobj
4 0 obj
<< /Filter /Standard /V 1 /R 2 /O (32 bytes of owner password hash data) /U (32 bytes of user password hash data) /P -4 >>
endobj
xref
0 5
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000116 00000 n
0000000201 00000 n
trailer
<< /Size 5 /Root 1 0 R /Encrypt 4 0 R /ID [(file id)] >>
startxref
295
%%EOF"
        .to_vec()
    }
3251
3252 #[test]
3253 fn test_reader_encryption_detection() {
3254 let unencrypted_pdf = create_minimal_pdf();
3256 let cursor = Cursor::new(unencrypted_pdf);
3257 let reader = PdfReader::new(cursor).unwrap();
3258 assert!(!reader.is_encrypted());
3259 assert!(reader.is_unlocked()); let encrypted_pdf = create_pdf_with_encryption();
3263 let cursor = Cursor::new(encrypted_pdf);
3264 let result = PdfReader::new(cursor);
3265 assert!(result.is_err());
3267 }
3268
3269 #[test]
3270 fn test_reader_encryption_methods_unencrypted() {
3271 let pdf_data = create_minimal_pdf();
3272 let cursor = Cursor::new(pdf_data);
3273 let mut reader = PdfReader::new(cursor).unwrap();
3274
3275 assert!(!reader.is_encrypted());
3277 assert!(reader.is_unlocked());
3278 assert!(reader.encryption_handler().is_none());
3279 assert!(reader.encryption_handler_mut().is_none());
3280
3281 assert!(reader.unlock_with_password("any_password").unwrap());
3283 assert!(reader.try_empty_password().unwrap());
3284 }
3285
3286 #[test]
3287 fn test_reader_encryption_handler_access() {
3288 let pdf_data = create_minimal_pdf();
3289 let cursor = Cursor::new(pdf_data);
3290 let mut reader = PdfReader::new(cursor).unwrap();
3291
3292 assert!(reader.encryption_handler().is_none());
3294 assert!(reader.encryption_handler_mut().is_none());
3295
3296 assert!(!reader.is_encrypted());
3298 assert!(reader.is_unlocked());
3299 }
3300
3301 #[test]
3302 fn test_reader_multiple_password_attempts() {
3303 let pdf_data = create_minimal_pdf();
3304 let cursor = Cursor::new(pdf_data);
3305 let mut reader = PdfReader::new(cursor).unwrap();
3306
3307 let passwords = vec!["test1", "test2", "admin", "", "password"];
3309 for password in passwords {
3310 assert!(reader.unlock_with_password(password).unwrap());
3311 }
3312
3313 for _ in 0..5 {
3315 assert!(reader.try_empty_password().unwrap());
3316 }
3317 }
3318
    #[test]
    fn test_reader_encryption_state_consistency() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        // Baseline: unencrypted, unlocked, no handler.
        assert!(!reader.is_encrypted());
        assert!(reader.is_unlocked());
        assert!(reader.encryption_handler().is_none());

        // Password attempts must not flip any state on an unencrypted file.
        let _ = reader.unlock_with_password("test");
        assert!(!reader.is_encrypted());
        assert!(reader.is_unlocked());
        assert!(reader.encryption_handler().is_none());

        let _ = reader.try_empty_password();
        assert!(!reader.is_encrypted());
        assert!(reader.is_unlocked());
        assert!(reader.encryption_handler().is_none());
    }

    #[test]
    fn test_reader_encryption_error_handling() {
        let encrypted_pdf = create_pdf_with_encryption();
        let cursor = Cursor::new(encrypted_pdf);

        // Any error is acceptable; success is not.
        let result = PdfReader::new(cursor);
        match result {
            Err(ParseError::EncryptionNotSupported) => {
                // Expected: the reader refuses files it cannot decrypt.
            }
            Err(_) => {
                // Other parse errors are also acceptable for this fixture.
            }
            Ok(_) => {
                panic!("Should not successfully create reader for encrypted PDF without password");
            }
        }
    }

    #[test]
    fn test_reader_encryption_with_options() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);

        // Strict parsing of an unencrypted file: no encryption state.
        let strict_options = ParseOptions::strict();
        let strict_reader = PdfReader::new_with_options(cursor, strict_options).unwrap();
        assert!(!strict_reader.is_encrypted());
        assert!(strict_reader.is_unlocked());

        // Lenient parsing reports the same encryption state.
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let lenient_options = ParseOptions::lenient();
        let lenient_reader = PdfReader::new_with_options(cursor, lenient_options).unwrap();
        assert!(!lenient_reader.is_encrypted());
        assert!(lenient_reader.is_unlocked());
    }

    #[test]
    fn test_reader_encryption_integration_edge_cases() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        // On an unencrypted file every password "succeeds": empty,
        // whitespace, long, unicode, punctuation, and control characters.
        assert!(reader.unlock_with_password("").unwrap());
        assert!(reader.unlock_with_password(" ").unwrap());
        assert!(reader
            .unlock_with_password("very_long_password_that_exceeds_normal_length")
            .unwrap());
        assert!(reader.unlock_with_password("unicode_test_ñáéíóú").unwrap());

        assert!(reader.unlock_with_password("pass@#$%^&*()").unwrap());
        assert!(reader.unlock_with_password("pass\nwith\nnewlines").unwrap());
        assert!(reader.unlock_with_password("pass\twith\ttabs").unwrap());
    }
3401
3402 mod rigorous {
3403 use super::*;
3404
3405 #[test]
3410 fn test_reader_invalid_pdf_header() {
3411 let invalid_data = b"This is not a PDF file";
3413 let cursor = Cursor::new(invalid_data.to_vec());
3414 let result = PdfReader::new(cursor);
3415
3416 assert!(result.is_err(), "Should fail on invalid PDF header");
3417 }
3418
3419 #[test]
3420 fn test_reader_truncated_header() {
3421 let truncated = b"%PDF";
3423 let cursor = Cursor::new(truncated.to_vec());
3424 let result = PdfReader::new(cursor);
3425
3426 assert!(result.is_err(), "Should fail on truncated header");
3427 }
3428
3429 #[test]
3430 fn test_reader_empty_file() {
3431 let empty = Vec::new();
3432 let cursor = Cursor::new(empty);
3433 let result = PdfReader::new(cursor);
3434
3435 assert!(result.is_err(), "Should fail on empty file");
3436 }
3437
3438 #[test]
3439 fn test_reader_malformed_version() {
3440 let malformed = b"%PDF-X.Y\n%%\xE2\xE3\xCF\xD3\n";
3442 let cursor = Cursor::new(malformed.to_vec());
3443 let result = PdfReader::new(cursor);
3444
3445 if let Ok(reader) = result {
3447 let _version = reader.version();
3449 }
3450 }
3451
3452 #[test]
3453 fn test_reader_get_nonexistent_object() {
3454 let pdf_data = create_minimal_pdf();
3455 let cursor = Cursor::new(pdf_data);
3456 let mut reader = PdfReader::new(cursor).unwrap();
3457
3458 let result = reader.get_object(999, 0);
3460
3461 assert!(result.is_err(), "Should fail when object doesn't exist");
3462 }
3463
3464 #[test]
3465 fn test_reader_get_object_wrong_generation() {
3466 let pdf_data = create_minimal_pdf();
3467 let cursor = Cursor::new(pdf_data);
3468 let mut reader = PdfReader::new(cursor).unwrap();
3469
3470 let result = reader.get_object(1, 99);
3472
3473 if let Err(e) = result {
3475 let _ = e;
3477 }
3478 }
3479
        #[test]
        fn test_resolve_direct_object() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);
            let mut reader = PdfReader::new(cursor).unwrap();

            // A non-reference object resolves to itself.
            let direct_obj = PdfObject::Integer(42);

            let resolved = reader.resolve(&direct_obj).unwrap();

            assert_eq!(resolved, &PdfObject::Integer(42));
        }

        #[test]
        fn test_resolve_reference() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);
            let mut reader = PdfReader::new(cursor).unwrap();

            // Extract the /Pages reference inside an inner scope so the
            // borrow of `reader` taken by catalog() ends before resolve()
            // needs `reader` mutably again.
            let pages_ref = {
                let catalog = reader.catalog().unwrap();
                if let Some(PdfObject::Reference(obj_num, gen_num)) = catalog.get("Pages") {
                    PdfObject::Reference(*obj_num, *gen_num)
                } else {
                    panic!("Catalog /Pages must be a Reference");
                }
            };

            // Following the reference must land on the Pages dictionary.
            let resolved = reader.resolve(&pages_ref).unwrap();

            if let PdfObject::Dictionary(dict) = resolved {
                assert_eq!(
                    dict.get("Type"),
                    Some(&PdfObject::Name(PdfName("Pages".to_string())))
                );
            } else {
                panic!("Expected dictionary, got: {:?}", resolved);
            }
        }
3528
3529 #[test]
3534 fn test_is_encrypted_on_unencrypted() {
3535 let pdf_data = create_minimal_pdf();
3536 let cursor = Cursor::new(pdf_data);
3537 let reader = PdfReader::new(cursor).unwrap();
3538
3539 assert!(
3540 !reader.is_encrypted(),
3541 "Minimal PDF should not be encrypted"
3542 );
3543 }
3544
3545 #[test]
3546 fn test_is_unlocked_on_unencrypted() {
3547 let pdf_data = create_minimal_pdf();
3548 let cursor = Cursor::new(pdf_data);
3549 let reader = PdfReader::new(cursor).unwrap();
3550
3551 assert!(reader.is_unlocked(), "Unencrypted PDF should be unlocked");
3553 }
3554
3555 #[test]
3556 fn test_try_empty_password_on_unencrypted() {
3557 let pdf_data = create_minimal_pdf();
3558 let cursor = Cursor::new(pdf_data);
3559 let mut reader = PdfReader::new(cursor).unwrap();
3560
3561 let result = reader.try_empty_password();
3563 assert!(result.is_ok());
3564 }
3565
        #[test]
        fn test_reader_with_strict_options() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);

            // A well-formed file must parse even with the strictest settings.
            let options = ParseOptions::strict();
            let result = PdfReader::new_with_options(cursor, options);

            assert!(result.is_ok(), "Minimal PDF should parse in strict mode");
        }

        #[test]
        fn test_reader_with_lenient_options() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);

            // Lenient settings must not reject a valid file either.
            let options = ParseOptions::lenient();
            let result = PdfReader::new_with_options(cursor, options);

            assert!(result.is_ok(), "Minimal PDF should parse in lenient mode");
        }

        #[test]
        fn test_reader_options_accessible() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);

            let options = ParseOptions::lenient();
            let reader = PdfReader::new_with_options(cursor, options.clone()).unwrap();

            // The reader must retain the options it was constructed with.
            let reader_options = reader.options();
            assert_eq!(reader_options.strict_mode, options.strict_mode);
        }
3604
3605 #[test]
3610 fn test_catalog_has_required_fields() {
3611 let pdf_data = create_minimal_pdf();
3612 let cursor = Cursor::new(pdf_data);
3613 let mut reader = PdfReader::new(cursor).unwrap();
3614
3615 let catalog = reader.catalog().unwrap();
3616
3617 assert_eq!(
3619 catalog.get("Type"),
3620 Some(&PdfObject::Name(PdfName("Catalog".to_string()))),
3621 "Catalog must have /Type /Catalog"
3622 );
3623
3624 assert!(
3626 catalog.contains_key("Pages"),
3627 "Catalog must have /Pages entry"
3628 );
3629 }
3630
3631 #[test]
3632 fn test_info_fields_when_present() {
3633 let pdf_data = create_pdf_with_info();
3634 let cursor = Cursor::new(pdf_data);
3635 let mut reader = PdfReader::new(cursor).unwrap();
3636
3637 let info = reader.info().unwrap();
3638 assert!(info.is_some(), "PDF should have Info dictionary");
3639
3640 let info_dict = info.unwrap();
3641
3642 assert!(info_dict.contains_key("Title"), "Info should have Title");
3644 assert!(info_dict.contains_key("Author"), "Info should have Author");
3645 }
3646
3647 #[test]
3648 fn test_info_none_when_absent() {
3649 let pdf_data = create_minimal_pdf();
3650 let cursor = Cursor::new(pdf_data);
3651 let mut reader = PdfReader::new(cursor).unwrap();
3652
3653 let info = reader.info().unwrap();
3654 assert!(info.is_none(), "Minimal PDF should not have Info");
3655 }
3656
        #[test]
        fn test_version_exact_values() {
            let pdf_data = create_pdf_with_version("1.7");
            let cursor = Cursor::new(pdf_data);
            let reader = PdfReader::new(cursor).unwrap();

            // Header "%PDF-1.7" must map to major 1 / minor 7 exactly.
            let version = reader.version();
            assert_eq!(version.major, 1, "Major version must be exact");
            assert_eq!(version.minor, 7, "Minor version must be exact");
        }

        #[test]
        fn test_version_pdf_20() {
            let pdf_data = create_pdf_with_version("2.0");
            let cursor = Cursor::new(pdf_data);
            let reader = PdfReader::new(cursor).unwrap();

            // PDF 2.0 headers are supported, not just 1.x.
            let version = reader.version();
            assert_eq!(version.major, 2, "PDF 2.0 major version");
            assert_eq!(version.minor, 0, "PDF 2.0 minor version");
        }

        #[test]
        fn test_pages_returns_pages_dict() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);
            let mut reader = PdfReader::new(cursor).unwrap();

            let pages_dict = reader
                .pages()
                .expect("pages() must return Pages dictionary");

            assert_eq!(
                pages_dict.get("Type"),
                Some(&PdfObject::Name(PdfName("Pages".to_string()))),
                "Pages dict must have /Type /Pages"
            );
        }

        #[test]
        fn test_page_count_minimal_pdf() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);
            let mut reader = PdfReader::new(cursor).unwrap();

            // The minimal fixture declares Count 0 in its Pages dictionary.
            let count = reader.page_count().expect("page_count() must succeed");
            assert_eq!(count, 0, "Minimal PDF has 0 pages");
        }

        #[test]
        fn test_page_count_with_info_pdf() {
            let pdf_data = create_pdf_with_info();
            let cursor = Cursor::new(pdf_data);
            let mut reader = PdfReader::new(cursor).unwrap();

            let count = reader.page_count().expect("page_count() must succeed");
            assert_eq!(count, 0, "create_pdf_with_info() has Count 0 in Pages dict");
        }
3723
3724 #[test]
3729 fn test_metadata_minimal_pdf() {
3730 let pdf_data = create_minimal_pdf();
3731 let cursor = Cursor::new(pdf_data);
3732 let mut reader = PdfReader::new(cursor).unwrap();
3733
3734 let meta = reader.metadata().expect("metadata() must succeed");
3735
3736 assert!(meta.title.is_none(), "Minimal PDF has no title");
3738 assert!(meta.author.is_none(), "Minimal PDF has no author");
3739 }
3740
3741 #[test]
3742 fn test_metadata_with_info() {
3743 let pdf_data = create_pdf_with_info();
3744 let cursor = Cursor::new(pdf_data);
3745 let mut reader = PdfReader::new(cursor).unwrap();
3746
3747 let meta = reader.metadata().expect("metadata() must succeed");
3748
3749 assert!(meta.title.is_some(), "PDF with Info has title");
3750 assert_eq!(meta.title.unwrap(), "Test PDF", "Title must match");
3751 assert!(meta.author.is_some(), "PDF with Info has author");
3752 assert_eq!(meta.author.unwrap(), "Test Author", "Author must match");
3753 }
3754
        #[test]
        fn test_resolve_stream_length_direct_integer() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);
            let mut reader = PdfReader::new(cursor).unwrap();

            // A direct non-negative integer /Length is passed through.
            let length_obj = PdfObject::Integer(100);

            let length = reader
                .resolve_stream_length(&length_obj)
                .expect("resolve_stream_length must succeed");
            assert_eq!(length, Some(100), "Direct integer must be resolved");
        }

        #[test]
        fn test_resolve_stream_length_negative_integer() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);
            let mut reader = PdfReader::new(cursor).unwrap();

            // Negative lengths are invalid: None is returned, not an error.
            let length_obj = PdfObject::Integer(-10);

            let length = reader
                .resolve_stream_length(&length_obj)
                .expect("resolve_stream_length must succeed");
            assert_eq!(length, None, "Negative integer returns None");
        }

        #[test]
        fn test_resolve_stream_length_non_integer() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);
            let mut reader = PdfReader::new(cursor).unwrap();

            // A /Length that is not an integer at all also yields None.
            let name_obj = PdfObject::Name(PdfName("Test".to_string()));

            let length = reader
                .resolve_stream_length(&name_obj)
                .expect("resolve_stream_length must succeed");
            assert_eq!(length, None, "Non-integer object returns None");
        }
3803
3804 #[test]
3809 fn test_get_all_pages_empty_pdf() {
3810 let pdf_data = create_minimal_pdf();
3811 let cursor = Cursor::new(pdf_data);
3812 let mut reader = PdfReader::new(cursor).unwrap();
3813
3814 let pages = reader
3815 .get_all_pages()
3816 .expect("get_all_pages() must succeed");
3817 assert_eq!(pages.len(), 0, "Minimal PDF has 0 pages");
3818 }
3819
3820 #[test]
3821 fn test_get_all_pages_with_info() {
3822 let pdf_data = create_pdf_with_info();
3823 let cursor = Cursor::new(pdf_data);
3824 let mut reader = PdfReader::new(cursor).unwrap();
3825
3826 let pages = reader
3827 .get_all_pages()
3828 .expect("get_all_pages() must succeed");
3829 assert_eq!(
3830 pages.len(),
3831 0,
3832 "create_pdf_with_info() has 0 pages (Count 0)"
3833 );
3834 }
3835
3836 #[test]
3841 fn test_into_document_consumes_reader() {
3842 let pdf_data = create_minimal_pdf();
3843 let cursor = Cursor::new(pdf_data);
3844 let reader = PdfReader::new(cursor).unwrap();
3845
3846 let document = reader.into_document();
3847
3848 let version = document.version().expect("Document must have version");
3850 assert!(
3851 version.starts_with("1."),
3852 "Document must have PDF 1.x version, got: {}",
3853 version
3854 );
3855
3856 let page_count = document
3858 .page_count()
3859 .expect("Document must allow page_count()");
3860 assert_eq!(
3861 page_count, 0,
3862 "Minimal PDF has 0 pages (Count 0 in test helper)"
3863 );
3864 }
3865
3866 #[test]
3871 fn test_clear_parse_context() {
3872 let pdf_data = create_minimal_pdf();
3873 let cursor = Cursor::new(pdf_data);
3874 let mut reader = PdfReader::new(cursor).unwrap();
3875
3876 reader.clear_parse_context();
3878
3879 let version = reader.version();
3881 assert_eq!(version.major, 1, "Reader must still work after clear");
3882 }
3883
3884 #[test]
3885 fn test_parse_context_mut_accessible() {
3886 let pdf_data = create_minimal_pdf();
3887 let cursor = Cursor::new(pdf_data);
3888 let mut reader = PdfReader::new(cursor).unwrap();
3889
3890 let context = reader.parse_context_mut();
3891
3892 let initial_depth = context.depth;
3894 assert_eq!(initial_depth, 0, "Parse context must start with depth 0");
3895
3896 assert!(
3898 context.max_depth > 0,
3899 "Parse context must have positive max_depth"
3900 );
3901 }
3902
3903 #[test]
3908 fn test_find_bytes_basic() {
3909 let haystack = b"Hello World";
3910 let needle = b"World";
3911 let pos = find_bytes(haystack, needle);
3912 assert_eq!(pos, Some(6), "Must find 'World' at position 6");
3913 }
3914
3915 #[test]
3916 fn test_find_bytes_not_found() {
3917 let haystack = b"Hello World";
3918 let needle = b"Rust";
3919 let pos = find_bytes(haystack, needle);
3920 assert_eq!(pos, None, "Must return None when not found");
3921 }
3922
3923 #[test]
3924 fn test_find_bytes_at_start() {
3925 let haystack = b"Hello World";
3926 let needle = b"Hello";
3927 let pos = find_bytes(haystack, needle);
3928 assert_eq!(pos, Some(0), "Must find at position 0");
3929 }
3930
3931 #[test]
3932 fn test_is_immediate_stream_start_with_stream() {
3933 let data = b"stream\ndata";
3934 assert!(
3935 is_immediate_stream_start(data),
3936 "Must detect 'stream' at start"
3937 );
3938 }
3939
3940 #[test]
3941 fn test_is_immediate_stream_start_with_whitespace() {
3942 let data = b" \n\tstream\ndata";
3943 assert!(
3944 is_immediate_stream_start(data),
3945 "Must detect 'stream' after whitespace"
3946 );
3947 }
3948
3949 #[test]
3950 fn test_is_immediate_stream_start_no_stream() {
3951 let data = b"endobj";
3952 assert!(
3953 !is_immediate_stream_start(data),
3954 "Must return false when 'stream' absent"
3955 );
3956 }
3957 }
3958}