1use super::encryption_handler::EncryptionHandler;
6use super::header::PdfHeader;
7use super::object_stream::ObjectStream;
8use super::objects::{PdfDictionary, PdfObject};
9use super::stack_safe::StackSafeContext;
10use super::trailer::PdfTrailer;
11use super::xref::XRefTable;
12use super::{ParseError, ParseResult};
13use std::collections::HashMap;
14use std::fs::File;
15use std::io::{BufReader, Read, Seek, SeekFrom};
16use std::path::Path;
17
/// Returns the byte offset of the first occurrence of `needle` in
/// `haystack`, or `None` if it does not occur.
///
/// An empty needle matches at offset 0 (mirroring `str::find("")`).
/// The original implementation panicked in that case, because
/// `slice::windows` requires a non-zero window size.
fn find_bytes(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    if needle.is_empty() {
        return Some(0);
    }
    haystack
        .windows(needle.len())
        .position(|window| window == needle)
}
24
/// Returns true when `data`, after skipping leading PDF whitespace
/// (space, tab, CR, LF), begins with the keyword `stream`.
fn is_immediate_stream_start(data: &[u8]) -> bool {
    // First index that is not whitespace; the slice length if all
    // bytes are whitespace (the empty tail then fails the check).
    let first_non_ws = data
        .iter()
        .position(|&b| !matches!(b, b' ' | b'\t' | b'\n' | b'\r'))
        .unwrap_or(data.len());
    data[first_non_ws..].starts_with(b"stream")
}
37
/// Low-level PDF reader: parses the file skeleton (header, xref table,
/// trailer) eagerly in the constructor and individual objects lazily on
/// demand, caching everything it parses.
pub struct PdfReader<R: Read + Seek> {
    // Buffered source the objects are parsed from.
    reader: BufReader<R>,
    // Parsed %PDF-x.y header.
    header: PdfHeader,
    // Cross-reference table mapping object numbers to byte offsets.
    xref: XRefTable,
    // Trailer dictionary (holds /Root, /Info, /Encrypt, /ID).
    trailer: PdfTrailer,
    // Objects already parsed, keyed by (object number, generation).
    object_cache: HashMap<(u32, u16), PdfObject>,
    // Decoded object streams (PDF 1.5 compressed objects), by stream object number.
    object_stream_cache: HashMap<u32, ObjectStream>,
    // Page tree, once built.
    page_tree: Option<super::page_tree::PageTree>,
    // Guards recursive object parsing (see parse_context.enter()/exit() usage).
    parse_context: StackSafeContext,
    // Parsing behavior flags (lenient vs. strict, warning collection).
    options: super::ParseOptions,
    // Present when the trailer declares an /Encrypt dictionary.
    encryption_handler: Option<EncryptionHandler>,
    // Object numbers currently being loaded/reconstructed; used to break
    // circular references and to bound loading depth.
    // NOTE(review): all access goes through &mut self methods, so a plain
    // HashSet (or RefCell) would suffice — confirm the Mutex is intentional.
    objects_being_reconstructed: std::sync::Mutex<std::collections::HashSet<u32>>,
    // Upper bound on simultaneous in-flight object loads (set to 100).
    max_reconstruction_depth: u32,
}
61
62impl<R: Read + Seek> PdfReader<R> {
63 pub fn options(&self) -> &super::ParseOptions {
65 &self.options
66 }
67
68 pub fn is_encrypted(&self) -> bool {
70 self.encryption_handler.is_some()
71 }
72
73 pub fn is_unlocked(&self) -> bool {
75 match &self.encryption_handler {
76 Some(handler) => handler.is_unlocked(),
77 None => true, }
79 }
80
81 pub fn encryption_handler_mut(&mut self) -> Option<&mut EncryptionHandler> {
83 self.encryption_handler.as_mut()
84 }
85
86 pub fn encryption_handler(&self) -> Option<&EncryptionHandler> {
88 self.encryption_handler.as_ref()
89 }
90
91 pub fn unlock_with_password(&mut self, password: &str) -> ParseResult<bool> {
93 match &mut self.encryption_handler {
94 Some(handler) => {
95 if handler.unlock_with_user_password(password).unwrap_or(false) {
97 Ok(true)
98 } else {
99 Ok(handler
101 .unlock_with_owner_password(password)
102 .unwrap_or(false))
103 }
104 }
105 None => Ok(true), }
107 }
108
109 pub fn try_empty_password(&mut self) -> ParseResult<bool> {
111 match &mut self.encryption_handler {
112 Some(handler) => Ok(handler.try_empty_password().unwrap_or(false)),
113 None => Ok(true), }
115 }
116}
117
118impl PdfReader<File> {
119 pub fn open<P: AsRef<Path>>(path: P) -> ParseResult<Self> {
121 use std::io::Write;
122 let mut debug_file = std::fs::File::create("/tmp/pdf_open_debug.log").ok();
123 if let Some(ref mut f) = debug_file {
124 writeln!(f, "Opening file: {:?}", path.as_ref()).ok();
125 }
126 let file = File::open(path)?;
127 if let Some(ref mut f) = debug_file {
128 writeln!(f, "File opened successfully").ok();
129 }
130 let options = super::ParseOptions::lenient();
132 Self::new_with_options(file, options)
133 }
134
135 pub fn open_strict<P: AsRef<Path>>(path: P) -> ParseResult<Self> {
137 let file = File::open(path)?;
138 let options = super::ParseOptions::strict();
139 Self::new_with_options(file, options)
140 }
141
142 pub fn open_with_options<P: AsRef<Path>>(
144 path: P,
145 options: super::ParseOptions,
146 ) -> ParseResult<Self> {
147 let file = File::open(path)?;
148 Self::new_with_options(file, options)
149 }
150
151 pub fn open_document<P: AsRef<Path>>(
153 path: P,
154 ) -> ParseResult<super::document::PdfDocument<File>> {
155 let reader = Self::open(path)?;
156 Ok(reader.into_document())
157 }
158}
159
160impl<R: Read + Seek> PdfReader<R> {
161 pub fn new(reader: R) -> ParseResult<Self> {
163 Self::new_with_options(reader, super::ParseOptions::default())
164 }
165
166 pub fn new_with_options(reader: R, options: super::ParseOptions) -> ParseResult<Self> {
168 let mut buf_reader = BufReader::new(reader);
169
170 let start_pos = buf_reader.stream_position()?;
172 buf_reader.seek(SeekFrom::End(0))?;
173 let file_size = buf_reader.stream_position()?;
174 buf_reader.seek(SeekFrom::Start(start_pos))?;
175
176 if file_size == 0 {
177 return Err(ParseError::EmptyFile);
178 }
179
180 use std::io::Write;
182 let mut debug_file = std::fs::File::create("/tmp/pdf_debug.log").ok();
183 if let Some(ref mut f) = debug_file {
184 writeln!(f, "Parsing PDF header...").ok();
185 }
186 let header = PdfHeader::parse(&mut buf_reader)?;
187 if let Some(ref mut f) = debug_file {
188 writeln!(f, "Header parsed: version {}", header.version).ok();
189 }
190
191 if let Some(ref mut f) = debug_file {
193 writeln!(f, "Parsing XRef table...").ok();
194 }
195 let xref = XRefTable::parse_with_options(&mut buf_reader, &options)?;
196 if let Some(ref mut f) = debug_file {
197 writeln!(f, "XRef table parsed with {} entries", xref.len()).ok();
198 }
199
200 let trailer_dict = xref.trailer().ok_or(ParseError::InvalidTrailer)?.clone();
202
203 let xref_offset = xref.xref_offset();
204 let trailer = PdfTrailer::from_dict(trailer_dict, xref_offset)?;
205
206 trailer.validate()?;
208
209 let encryption_handler = if EncryptionHandler::detect_encryption(trailer.dict()) {
211 if let Ok(Some((encrypt_obj_num, encrypt_gen_num))) = trailer.encrypt() {
212 let mut temp_reader = Self {
214 reader: buf_reader,
215 header: header.clone(),
216 xref: xref.clone(),
217 trailer: trailer.clone(),
218 object_cache: HashMap::new(),
219 object_stream_cache: HashMap::new(),
220 page_tree: None,
221 parse_context: StackSafeContext::new(),
222 options: options.clone(),
223 encryption_handler: None,
224 objects_being_reconstructed: std::sync::Mutex::new(
225 std::collections::HashSet::new(),
226 ),
227 max_reconstruction_depth: 100,
228 };
229
230 let encrypt_obj = temp_reader.get_object(encrypt_obj_num, encrypt_gen_num)?;
232 if let Some(encrypt_dict) = encrypt_obj.as_dict() {
233 let file_id = trailer.id().and_then(|id_obj| {
235 if let PdfObject::Array(ref id_array) = id_obj {
236 if let Some(PdfObject::String(ref id_bytes)) = id_array.get(0) {
237 Some(id_bytes.as_bytes().to_vec())
238 } else {
239 None
240 }
241 } else {
242 None
243 }
244 });
245
246 match EncryptionHandler::new(encrypt_dict, file_id) {
247 Ok(handler) => {
248 buf_reader = temp_reader.reader;
250 Some(handler)
251 }
252 Err(_) => {
253 let _ = temp_reader.reader;
255 return Err(ParseError::EncryptionNotSupported);
256 }
257 }
258 } else {
259 let _ = temp_reader.reader;
260 return Err(ParseError::EncryptionNotSupported);
261 }
262 } else {
263 return Err(ParseError::EncryptionNotSupported);
264 }
265 } else {
266 None
267 };
268
269 Ok(Self {
270 reader: buf_reader,
271 header,
272 xref,
273 trailer,
274 object_cache: HashMap::new(),
275 object_stream_cache: HashMap::new(),
276 page_tree: None,
277 parse_context: StackSafeContext::new(),
278 options,
279 encryption_handler,
280 objects_being_reconstructed: std::sync::Mutex::new(std::collections::HashSet::new()),
281 max_reconstruction_depth: 100,
282 })
283 }
284
285 pub fn version(&self) -> &super::header::PdfVersion {
287 &self.header.version
288 }
289
    /// Returns the document catalog (/Root) dictionary.
    ///
    /// Recovery strategy, in order:
    /// 1. Follow the trailer's /Root reference; if it resolves to a
    ///    dictionary whose /Type is not `Catalog`, scan for the real
    ///    catalog and fall back to /Root if the scan fails.
    /// 2. If the trailer has no /Root, try the trailer's fallback search,
    ///    then a full catalog scan.
    /// 3. If the chosen object is not parseable as a dictionary, attempt
    ///    manual byte-level reconstruction and register the result in the
    ///    object cache and xref table.
    pub fn catalog(&mut self) -> ParseResult<&PdfDictionary> {
        let (obj_num, gen_num) = match self.trailer.root() {
            Ok(root) => {
                // Sanity-check that /Root really points at a catalog; any
                // missing piece along the way just accepts /Root as-is.
                if let Ok(obj) = self.get_object(root.0, root.1) {
                    if let Some(dict) = obj.as_dict() {
                        if let Some(type_obj) = dict.get("Type") {
                            if let Some(type_name) = type_obj.as_name() {
                                if type_name.0 != "Catalog" {
                                    eprintln!("Warning: Trailer /Root points to /Type/{} (not Catalog), scanning for real catalog", type_name.0);
                                    if let Ok(catalog_ref) = self.find_catalog_object() {
                                        catalog_ref
                                    } else {
                                        root
                                    }
                                } else {
                                    root
                                }
                            } else {
                                root
                            }
                        } else {
                            root
                        }
                    } else {
                        root
                    }
                } else {
                    root
                }
            }
            Err(_) => {
                #[cfg(debug_assertions)]
                eprintln!("Warning: Trailer missing Root entry, attempting recovery");

                if let Some(root) = self.trailer.find_root_fallback() {
                    root
                } else {
                    if let Ok(catalog_ref) = self.find_catalog_object() {
                        catalog_ref
                    } else {
                        return Err(ParseError::MissingKey("Root".to_string()));
                    }
                }
            }
        };

        let key = (obj_num, gen_num);
        // Probe once to decide whether normal parsing suffices; the borrow
        // ends with this block so reconstruction below can take &mut self.
        let needs_reconstruction = {
            match self.get_object(obj_num, gen_num) {
                Ok(catalog) => {
                    if catalog.as_dict().is_some() {
                        false
                    } else {
                        true
                    }
                }
                Err(_) => {
                    true
                }
            }
        };

        if !needs_reconstruction {
            // Re-fetch (served from the cache) to get a borrow with the
            // lifetime of the return value.
            let catalog = self.get_object(obj_num, gen_num)?;
            return catalog.as_dict().ok_or_else(|| ParseError::SyntaxError {
                position: 0,
                message: format!("Catalog object {} {} is not a dictionary", obj_num, gen_num),
            });
        }

        eprintln!(
            "DEBUG: Catalog object {} needs reconstruction, attempting manual reconstruction",
            obj_num
        );

        match self.extract_object_manually(obj_num) {
            Ok(dict) => {
                eprintln!(
                    "DEBUG: Successfully reconstructed catalog {} manually",
                    obj_num
                );
                let obj = PdfObject::Dictionary(dict);
                self.object_cache.insert(key, obj);

                // Register a synthetic xref entry (placeholder offset 0) so
                // later lookups hit the cache instead of failing again.
                use crate::parser::xref::XRefEntry;
                let xref_entry = XRefEntry {
                    offset: 0,
                    generation: gen_num,
                    in_use: true,
                };
                self.xref.add_entry(obj_num, xref_entry);
                eprintln!("DEBUG: Added catalog object {} to XRef table", obj_num);

                if let Some(PdfObject::Dictionary(ref dict)) = self.object_cache.get(&key) {
                    return Ok(dict);
                }
            }
            Err(e) => {
                eprintln!("DEBUG: Manual catalog reconstruction failed: {:?}", e);
            }
        }

        Err(ParseError::SyntaxError {
            position: 0,
            message: format!(
                "Catalog object {} could not be parsed or reconstructed as a dictionary",
                obj_num
            ),
        })
    }
420
421 pub fn info(&mut self) -> ParseResult<Option<&PdfDictionary>> {
423 match self.trailer.info() {
424 Some((obj_num, gen_num)) => {
425 let info = self.get_object(obj_num, gen_num)?;
426 Ok(info.as_dict())
427 }
428 None => Ok(None),
429 }
430 }
431
    /// Fetches object `obj_num` generation `gen_num`, loading and caching
    /// it on first access. Circular reference chains are broken by
    /// caching a Null object, and a maximum loading depth is enforced.
    pub fn get_object(&mut self, obj_num: u32, gen_num: u16) -> ParseResult<&PdfObject> {
        let key = (obj_num, gen_num);

        // Fast path: already parsed.
        if self.object_cache.contains_key(&key) {
            return Ok(&self.object_cache[&key]);
        }

        // Cycle check: if this object is already being loaded further up
        // the call stack, break the cycle with a cached Null.
        {
            let being_loaded =
                self.objects_being_reconstructed
                    .lock()
                    .map_err(|_| ParseError::SyntaxError {
                        position: 0,
                        message: "Mutex poisoned during circular reference check".to_string(),
                    })?;
            if being_loaded.contains(&obj_num) {
                drop(being_loaded);
                if self.options.collect_warnings {
                    eprintln!(
                        "DEBUG: Circular reference detected while loading object {} {} - breaking cycle with null object",
                        obj_num, gen_num
                    );
                }
                self.object_cache.insert(key, PdfObject::Null);
                return Ok(&self.object_cache[&key]);
            }
        }

        // Depth limit: the size of the in-flight set doubles as a depth
        // counter; bail out rather than recurse unboundedly.
        {
            let being_loaded =
                self.objects_being_reconstructed
                    .lock()
                    .map_err(|_| ParseError::SyntaxError {
                        position: 0,
                        message: "Mutex poisoned during depth limit check".to_string(),
                    })?;
            let depth = being_loaded.len() as u32;
            if depth >= self.max_reconstruction_depth {
                drop(being_loaded);
                if self.options.collect_warnings {
                    eprintln!(
                        "DEBUG: Maximum object loading depth ({}) exceeded for object {} {}",
                        self.max_reconstruction_depth, obj_num, gen_num
                    );
                }
                return Err(ParseError::SyntaxError {
                    position: 0,
                    message: format!(
                        "Maximum object loading depth ({}) exceeded",
                        self.max_reconstruction_depth
                    ),
                });
            }
        }

        // Mark as in-flight before touching the disk.
        self.objects_being_reconstructed
            .lock()
            .map_err(|_| ParseError::SyntaxError {
                position: 0,
                message: "Mutex poisoned while marking object as being loaded".to_string(),
            })?
            .insert(obj_num);

        match self.load_object_from_disk(obj_num, gen_num) {
            Ok(_) => {
                // Unmark, then serve from the cache that
                // load_object_from_disk populated.
                self.objects_being_reconstructed
                    .lock()
                    .map_err(|_| ParseError::SyntaxError {
                        position: 0,
                        message: "Mutex poisoned while unmarking object after successful load"
                            .to_string(),
                    })?
                    .remove(&obj_num);
                Ok(&self.object_cache[&key])
            }
            Err(e) => {
                // Best-effort unmark; a poisoned lock is ignored here
                // because an error is already being propagated.
                if let Ok(mut guard) = self.objects_being_reconstructed.lock() {
                    guard.remove(&obj_num);
                }
                Err(e)
            }
        }
    }
525
    /// Parses object `obj_num` from its xref offset (or containing object
    /// stream) and caches the result. Assumes the caller (`get_object`)
    /// has already done the cycle/depth bookkeeping.
    fn load_object_from_disk(&mut self, obj_num: u32, gen_num: u16) -> ParseResult<&PdfObject> {
        let key = (obj_num, gen_num);

        if self.object_cache.contains_key(&key) {
            return Ok(&self.object_cache[&key]);
        }

        // PDF 1.5+: the object may live inside a compressed object stream.
        if let Some(ext_entry) = self.xref.get_extended_entry(obj_num) {
            if let Some((stream_obj_num, index_in_stream)) = ext_entry.compressed_info {
                eprintln!(
                    "DEBUG: Object {} found in Object Stream {} at index {}",
                    obj_num, stream_obj_num, index_in_stream
                );
                return self.get_compressed_object(
                    obj_num,
                    gen_num,
                    stream_obj_num,
                    index_in_stream,
                );
            }
        } else {
            eprintln!("DEBUG: Object {} not found in extended entries", obj_num);
        }

        // Classic xref entry: resolve the byte offset for this object.
        let (current_offset, _generation) = {
            let entry = self.xref.get_entry(obj_num);

            match entry {
                Some(entry) => {
                    // Free (deleted) objects resolve to Null.
                    if !entry.in_use {
                        self.object_cache.insert(key, PdfObject::Null);
                        return Ok(&self.object_cache[&key]);
                    }

                    // Generation mismatch: warn-and-continue in lenient
                    // mode, hard error in strict mode.
                    if entry.generation != gen_num {
                        if self.options.lenient_syntax {
                            if self.options.collect_warnings {
                                eprintln!("Warning: Object {} generation mismatch - expected {}, found {}, using available",
                                    obj_num, gen_num, entry.generation);
                            }
                        } else {
                            return Err(ParseError::InvalidReference(obj_num, gen_num));
                        }
                    }

                    (entry.offset, entry.generation)
                }
                None => {
                    // Not in the xref at all: reconstruct known objects
                    // manually, otherwise substitute Null (lenient) or fail.
                    if self.is_reconstructible_object(obj_num) {
                        eprintln!("DEBUG: Object {} not found in XRef table, attempting manual reconstruction", obj_num);
                        return self.attempt_manual_object_reconstruction(obj_num, gen_num, 0);
                    } else {
                        if self.options.lenient_syntax {
                            if self.options.collect_warnings {
                                eprintln!("Warning: Object {} {} R not found in XRef, returning null object",
                                    obj_num, gen_num);
                            }
                            self.object_cache.insert(key, PdfObject::Null);
                            return Ok(&self.object_cache[&key]);
                        } else {
                            return Err(ParseError::InvalidReference(obj_num, gen_num));
                        }
                    }
                }
            }
        };

        self.reader.seek(std::io::SeekFrom::Start(current_offset))?;

        let mut lexer =
            super::lexer::Lexer::new_with_options(&mut self.reader, self.options.clone());

        // Consume the "N G obj" introducer, tolerating malformed tokens
        // in lenient mode.
        {
            let token = lexer.next_token()?;
            let read_obj_num = match token {
                super::lexer::Token::Integer(n) => n as u32,
                _ => {
                    if self.options.lenient_syntax {
                        if self.options.collect_warnings {
                            eprintln!(
                                "Warning: Using expected object number {obj_num} instead of parsed token: {:?}",
                                token
                            );
                        }
                        obj_num
                    } else {
                        return Err(ParseError::SyntaxError {
                            position: current_offset as usize,
                            message: "Expected object number".to_string(),
                        });
                    }
                }
            };

            if read_obj_num != obj_num && !self.options.lenient_syntax {
                return Err(ParseError::SyntaxError {
                    position: current_offset as usize,
                    message: format!(
                        "Object number mismatch: expected {obj_num}, found {read_obj_num}"
                    ),
                });
            }

            // The parsed generation is read but intentionally not
            // compared here; the xref entry was checked above.
            let token = lexer.next_token()?;
            let _read_gen_num = match token {
                super::lexer::Token::Integer(n) => n as u16,
                _ => {
                    if self.options.lenient_syntax {
                        if self.options.collect_warnings {
                            eprintln!("Warning: Using generation 0 instead of parsed token for object {obj_num}");
                        }
                        0
                    } else {
                        return Err(ParseError::SyntaxError {
                            position: current_offset as usize,
                            message: "Expected generation number".to_string(),
                        });
                    }
                }
            };

            let token = lexer.next_token()?;
            match token {
                super::lexer::Token::Obj => {}
                _ => {
                    if self.options.lenient_syntax {
                        if self.options.collect_warnings {
                            eprintln!("Warning: Expected 'obj' keyword for object {obj_num} {gen_num}, continuing anyway");
                        }
                    } else {
                        return Err(ParseError::SyntaxError {
                            position: current_offset as usize,
                            message: "Expected 'obj' keyword".to_string(),
                        });
                    }
                }
            }
        }

        // Guard recursion depth while parsing the object body.
        self.parse_context.enter()?;

        let obj = match PdfObject::parse_with_options(&mut lexer, &self.options) {
            Ok(obj) => {
                self.parse_context.exit();
                // NOTE(review): object 102 is special-cased for debugging a
                // specific document — consider removing.
                if obj_num == 102 && self.options.collect_warnings {
                    eprintln!("DEBUG: Parsed object 102: {:?}", obj);
                    eprintln!(
                        "DEBUG: Object 102 is dictionary: {}",
                        obj.as_dict().is_some()
                    );
                }
                obj
            }
            Err(e) => {
                self.parse_context.exit();

                // Fall back to manual reconstruction for known-recoverable
                // objects and error kinds; otherwise propagate the error.
                if self.is_reconstructible_object(obj_num)
                    && self.can_attempt_manual_reconstruction(&e)
                {
                    eprintln!(
                        "DEBUG: Normal parsing failed for object {}: {:?}",
                        obj_num, e
                    );
                    eprintln!("DEBUG: Attempting manual reconstruction as fallback");

                    match self.attempt_manual_object_reconstruction(
                        obj_num,
                        gen_num,
                        current_offset,
                    ) {
                        Ok(reconstructed_obj) => {
                            eprintln!(
                                "DEBUG: Successfully reconstructed object {} manually",
                                obj_num
                            );
                            return Ok(reconstructed_obj);
                        }
                        Err(reconstruction_error) => {
                            eprintln!(
                                "DEBUG: Manual reconstruction also failed: {:?}",
                                reconstruction_error
                            );
                            eprintln!("DEBUG: Falling back to original error");
                        }
                    }
                }

                return Err(e);
            }
        };

        // Expect the closing "endobj"; tolerate its absence in lenient mode.
        let token = lexer.next_token()?;
        match token {
            super::lexer::Token::EndObj => {}
            _ => {
                if self.options.lenient_syntax {
                    if self.options.collect_warnings {
                        eprintln!("Warning: Expected 'endobj' keyword after object {obj_num} {gen_num}, continuing anyway");
                    }
                } else {
                    return Err(ParseError::SyntaxError {
                        position: current_offset as usize,
                        message: "Expected 'endobj' keyword".to_string(),
                    });
                }
            }
        };

        self.object_cache.insert(key, obj);

        Ok(&self.object_cache[&key])
    }
765
766 pub fn resolve<'a>(&'a mut self, obj: &'a PdfObject) -> ParseResult<&'a PdfObject> {
768 match obj {
769 PdfObject::Reference(obj_num, gen_num) => self.get_object(*obj_num, *gen_num),
770 _ => Ok(obj),
771 }
772 }
773
774 pub fn resolve_stream_length(&mut self, obj: &PdfObject) -> ParseResult<Option<usize>> {
777 match obj {
778 PdfObject::Integer(len) => {
779 if *len >= 0 {
780 Ok(Some(*len as usize))
781 } else {
782 Ok(None)
784 }
785 }
786 PdfObject::Reference(obj_num, gen_num) => {
787 let resolved = self.get_object(*obj_num, *gen_num)?;
788 match resolved {
789 PdfObject::Integer(len) => {
790 if *len >= 0 {
791 Ok(Some(*len as usize))
792 } else {
793 Ok(None)
794 }
795 }
796 _ => {
797 Ok(None)
799 }
800 }
801 }
802 _ => {
803 Ok(None)
805 }
806 }
807 }
808
    /// Loads an object stored inside a compressed object stream
    /// (PDF 1.5 /ObjStm), decoding and caching the stream on first use.
    fn get_compressed_object(
        &mut self,
        obj_num: u32,
        gen_num: u16,
        stream_obj_num: u32,
        _index_in_stream: u32,
    ) -> ParseResult<&PdfObject> {
        let key = (obj_num, gen_num);

        // Decode the containing object stream once and cache it; object
        // streams themselves are fetched with generation 0.
        if !self.object_stream_cache.contains_key(&stream_obj_num) {
            let stream_obj = self.get_object(stream_obj_num, 0)?;

            if let Some(stream) = stream_obj.as_stream() {
                let obj_stream = ObjectStream::parse(stream.clone(), &self.options)?;
                self.object_stream_cache.insert(stream_obj_num, obj_stream);
            } else {
                return Err(ParseError::SyntaxError {
                    position: 0,
                    message: format!("Object {stream_obj_num} is not a stream"),
                });
            }
        }

        // Look the object up by number inside the decoded stream; the
        // stream's own index is authoritative, so `_index_in_stream` from
        // the xref entry is unused.
        let obj_stream = &self.object_stream_cache[&stream_obj_num];
        let obj = obj_stream
            .get_object(obj_num)
            .ok_or_else(|| ParseError::SyntaxError {
                position: 0,
                message: format!("Object {obj_num} not found in object stream {stream_obj_num}"),
            })?;

        self.object_cache.insert(key, obj.clone());
        Ok(&self.object_cache[&key])
    }
849
    /// Returns the root /Pages dictionary of the page tree.
    ///
    /// Recovery when the catalog lacks a /Pages reference: first build a
    /// synthetic pages dictionary from discovered page objects; then, in
    /// lenient mode only, scan every object for /Type /Pages.
    pub fn pages(&mut self) -> ParseResult<&PdfDictionary> {
        let (pages_obj_num, pages_gen_num) = {
            let catalog = self.catalog()?;

            if let Some(pages_ref) = catalog.get("Pages") {
                match pages_ref {
                    PdfObject::Reference(obj_num, gen_num) => (*obj_num, *gen_num),
                    _ => {
                        // /Pages must be an indirect reference, not inline.
                        return Err(ParseError::SyntaxError {
                            position: 0,
                            message: "Pages must be a reference".to_string(),
                        })
                    }
                }
            } else {
                #[cfg(debug_assertions)]
                eprintln!("Warning: Catalog missing Pages entry, attempting recovery");

                // Recovery 1: synthesize /Pages from found page objects.
                if let Ok(page_refs) = self.find_page_objects() {
                    if !page_refs.is_empty() {
                        return self.create_synthetic_pages_dict(&page_refs);
                    }
                }

                // Recovery 2 (lenient only): linear scan for /Type /Pages.
                if self.options.lenient_syntax {
                    if self.options.collect_warnings {
                        eprintln!("Warning: Missing Pages in catalog, searching for page tree");
                    }
                    let mut found_pages = None;
                    for i in 1..self.xref.len() as u32 {
                        if let Ok(obj) = self.get_object(i, 0) {
                            if let Some(dict) = obj.as_dict() {
                                if let Some(obj_type) = dict.get("Type").and_then(|t| t.as_name()) {
                                    if obj_type.0 == "Pages" {
                                        found_pages = Some((i, 0));
                                        break;
                                    }
                                }
                            }
                        }
                    }
                    if let Some((obj_num, gen_num)) = found_pages {
                        (obj_num, gen_num)
                    } else {
                        return Err(ParseError::MissingKey("Pages".to_string()));
                    }
                } else {
                    return Err(ParseError::MissingKey("Pages".to_string()));
                }
            }
        };

        let pages_obj = self.get_object(pages_obj_num, pages_gen_num)?;
        pages_obj.as_dict().ok_or_else(|| ParseError::SyntaxError {
            position: 0,
            message: "Pages is not a dictionary".to_string(),
        })
    }
917
918 pub fn page_count(&mut self) -> ParseResult<u32> {
920 match self.pages() {
922 Ok(pages) => {
923 if let Some(count_obj) = pages.get("Count") {
925 if let Some(count) = count_obj.as_integer() {
926 return Ok(count as u32);
927 }
928 }
929
930 if let Some(kids_obj) = pages.get("Kids") {
932 if let Some(kids_array) = kids_obj.as_array() {
933 return Ok(kids_array.0.len() as u32);
936 }
937 }
938
939 Ok(0)
940 }
941 Err(_) => {
942 eprintln!("Standard page extraction failed, trying direct extraction");
944 self.page_count_fallback()
945 }
946 }
947 }
948
949 fn page_count_fallback(&mut self) -> ParseResult<u32> {
951 if let Some(count) = self.extract_page_count_from_linearization() {
953 eprintln!("Found page count {} from linearization", count);
954 return Ok(count);
955 }
956
957 if let Some(count) = self.count_page_objects_directly() {
959 eprintln!("Found {} pages by counting page objects", count);
960 return Ok(count);
961 }
962
963 Ok(0)
964 }
965
    /// Tries to read the page count from a linearization dictionary's /N
    /// entry.
    ///
    /// NOTE(review): assumes the linearization dictionary is object
    /// 100 0 — this looks tailored to one specific document rather than a
    /// general rule; confirm before relying on it for arbitrary PDFs.
    fn extract_page_count_from_linearization(&mut self) -> Option<u32> {
        match self.get_object(100, 0) {
            Ok(obj) => {
                eprintln!("Found object 100: {:?}", obj);
                if let Some(dict) = obj.as_dict() {
                    eprintln!("Object 100 is a dictionary with {} keys", dict.0.len());
                    if let Some(n_obj) = dict.get("N") {
                        eprintln!("Found /N field: {:?}", n_obj);
                        if let Some(count) = n_obj.as_integer() {
                            eprintln!("Extracted page count from linearization: {}", count);
                            return Some(count as u32);
                        }
                    } else {
                        // Dump the keys to aid diagnosis when /N is absent.
                        eprintln!("No /N field found in object 100");
                        for (key, value) in &dict.0 {
                            eprintln!(" {:?}: {:?}", key, value);
                        }
                    }
                } else {
                    eprintln!("Object 100 is not a dictionary: {:?}", obj);
                }
            }
            Err(e) => {
                eprintln!("Failed to get object 100: {:?}", e);
                eprintln!("Attempting direct content extraction...");
                // Structured parsing failed: scan the raw bytes instead.
                return self.extract_n_value_from_raw_object_100();
            }
        }

        None
    }
1001
    /// Last-resort page count recovery: seeks to object 100's recorded
    /// xref offset and scans up to 1 KiB of raw bytes for a
    /// "/N <digits>" entry.
    fn extract_n_value_from_raw_object_100(&mut self) -> Option<u32> {
        if let Some(entry) = self.xref.get_entry(100) {
            if self.reader.seek(SeekFrom::Start(entry.offset)).is_err() {
                return None;
            }

            let mut buffer = vec![0u8; 1024];
            if let Ok(bytes_read) = self.reader.read(&mut buffer) {
                if bytes_read == 0 {
                    return None;
                }

                let content = String::from_utf8_lossy(&buffer[..bytes_read]);
                eprintln!("Raw content around object 100:\n{}", content);

                if let Some(n_pos) = content.find("/N ") {
                    let after_n = &content[n_pos + 3..];
                    eprintln!(
                        "Content after /N: {}",
                        &after_n[..std::cmp::min(50, after_n.len())]
                    );

                    // Collect the first contiguous digit run after "/N ",
                    // skipping any non-digit prefix.
                    let mut num_str = String::new();
                    for ch in after_n.chars() {
                        if ch.is_ascii_digit() {
                            num_str.push(ch);
                        } else if !num_str.is_empty() {
                            break;
                        }
                    }

                    if !num_str.is_empty() {
                        if let Ok(page_count) = num_str.parse::<u32>() {
                            eprintln!("Extracted page count from raw content: {}", page_count);
                            return Some(page_count);
                        }
                    }
                }
            }
        }
        None
    }
1052
    /// Scans the whole file for the textual introducer "N G obj" and
    /// returns the position of the following "<<", restoring the reader's
    /// original position before returning.
    ///
    /// NOTE(review): reads the entire file into memory and searches the
    /// lossy UTF-8 decoding — the returned position is an index into that
    /// decoded string, which may not equal a byte offset when the file
    /// contains invalid UTF-8; confirm before seeking with the result.
    #[allow(dead_code)]
    fn find_object_pattern(&mut self, obj_num: u32, gen_num: u16) -> Option<u64> {
        let pattern = format!("{} {} obj", obj_num, gen_num);
        eprintln!("DEBUG: Searching for pattern: '{}'", pattern);

        // Remember the current position so the scan has no side effects.
        let original_pos = self.reader.stream_position().unwrap_or(0);

        if self.reader.seek(SeekFrom::Start(0)).is_err() {
            return None;
        }

        // Slurp the file in 8 KiB chunks.
        let mut buffer = vec![0u8; 8192];
        let mut file_content = Vec::new();

        loop {
            match self.reader.read(&mut buffer) {
                Ok(0) => break,
                Ok(bytes_read) => {
                    file_content.extend_from_slice(&buffer[..bytes_read]);
                }
                Err(_) => return None,
            }
        }

        let content = String::from_utf8_lossy(&file_content);
        if let Some(pattern_pos) = content.find(&pattern) {
            eprintln!(
                "DEBUG: Found pattern '{}' at position {}",
                pattern, pattern_pos
            );

            let after_pattern = pattern_pos + pattern.len();
            let search_area = &content[after_pattern..];

            // The object body is expected to open with a dictionary.
            if let Some(dict_start_offset) = search_area.find("<<") {
                let dict_start_pos = after_pattern + dict_start_offset;
                eprintln!(
                    "DEBUG: Found '<<' at position {} (offset {} from pattern)",
                    dict_start_pos, dict_start_offset
                );

                self.reader.seek(SeekFrom::Start(original_pos)).ok();
                return Some(dict_start_pos as u64);
            } else {
                eprintln!("DEBUG: Could not find '<<' after pattern");
            }
        }

        eprintln!("DEBUG: Pattern '{}' not found in file", pattern);
        self.reader.seek(SeekFrom::Start(original_pos)).ok();
        None
    }
1112
1113 fn can_attempt_manual_reconstruction(&self, error: &ParseError) -> bool {
1115 match error {
1116 ParseError::SyntaxError { .. } => true,
1118 ParseError::UnexpectedToken { .. } => true,
1119 _ => false,
1121 }
1122 }
1123
    /// Whether manual byte-level reconstruction should be attempted for
    /// this object number.
    ///
    /// NOTE(review): every object number here is hard-coded and appears
    /// tailored to one specific test document (113 is also hard-wired as
    /// the /Parent reference in `parse_page_dictionary_content`) — these
    /// lists are meaningless for arbitrary PDFs. TODO: generalize or gate
    /// behind a debug/recovery option.
    fn is_reconstructible_object(&self, obj_num: u32) -> bool {
        // Special-cased structural objects (102, 113, 114).
        if obj_num == 102 || obj_num == 113 || obj_num == 114 {
            return true;
        }

        // Hard-coded page object numbers (duplicated in `is_page_object`).
        let page_objects = [
            1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 30, 34, 37, 39, 42, 44, 46, 49, 52,
            54, 56, 58, 60, 62, 64, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 104,
        ];

        // Hard-coded content-stream object numbers.
        let content_objects = [
            2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 29, 31, 32, 33, 35, 36, 38, 40, 41,
            43, 45, 47, 48, 50, 51, 53, 55, 57, 59, 61, 63, 65, 66, 68, 70, 72, 74, 76, 78, 80, 82,
            84, 86, 88, 90, 92, 94, 95, 96, 97, 98, 99, 100, 101, 105, 106, 107, 108, 109, 110,
            111,
        ];

        page_objects.contains(&obj_num) || content_objects.contains(&obj_num)
    }
1149
    /// Whether `obj_num` is one of the hard-coded page object numbers.
    ///
    /// NOTE(review): duplicates the `page_objects` list inside
    /// `is_reconstructible_object` — keep the two in sync, or extract a
    /// shared constant. Document-specific; see the note there.
    fn is_page_object(&self, obj_num: u32) -> bool {
        let page_objects = [
            1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 30, 34, 37, 39, 42, 44, 46, 49, 52,
            54, 56, 58, 60, 62, 64, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 104,
        ];
        page_objects.contains(&obj_num)
    }
1158
    /// Populates `result_dict` with entries scraped from the raw text of
    /// a page object's dictionary (`dict_content`): /MediaBox, /Contents,
    /// /Parent and /Resources. Used during manual reconstruction when
    /// normal parsing fails; entries that cannot be scraped are simply
    /// omitted.
    fn parse_page_dictionary_content(
        &self,
        dict_content: &str,
        result_dict: &mut std::collections::HashMap<
            crate::parser::objects::PdfName,
            crate::parser::objects::PdfObject,
        >,
        obj_num: u32,
    ) -> ParseResult<()> {
        use crate::parser::objects::{PdfArray, PdfName, PdfObject};
        use std::collections::HashMap;

        // /MediaBox [a b c d] — values are truncated to integers.
        if let Some(mediabox_start) = dict_content.find("/MediaBox") {
            let mediabox_area = &dict_content[mediabox_start..];
            if let Some(start_bracket) = mediabox_area.find("[") {
                if let Some(end_bracket) = mediabox_area.find("]") {
                    let mediabox_content = &mediabox_area[start_bracket + 1..end_bracket];
                    let values: Vec<f32> = mediabox_content
                        .split_whitespace()
                        .filter_map(|s| s.parse().ok())
                        .collect();

                    // Only a well-formed 4-number box is accepted.
                    if values.len() == 4 {
                        let mediabox = PdfArray(vec![
                            PdfObject::Integer(values[0] as i64),
                            PdfObject::Integer(values[1] as i64),
                            PdfObject::Integer(values[2] as i64),
                            PdfObject::Integer(values[3] as i64),
                        ]);
                        result_dict
                            .insert(PdfName("MediaBox".to_string()), PdfObject::Array(mediabox));
                        eprintln!("DEBUG: Added MediaBox for object {}: {:?}", obj_num, values);
                    }
                }
            }
        }

        // /Contents N G R — only a single indirect reference is recognized.
        if let Some(contents_match) = dict_content.find("/Contents") {
            let contents_area = &dict_content[contents_match..];
            let parts: Vec<&str> = contents_area.split_whitespace().collect();
            if parts.len() >= 3 {
                if let (Ok(obj_ref), Ok(gen_ref)) =
                    (parts[1].parse::<u32>(), parts[2].parse::<u16>())
                {
                    if parts.len() > 3 && parts[3] == "R" {
                        result_dict.insert(
                            PdfName("Contents".to_string()),
                            PdfObject::Reference(obj_ref, gen_ref),
                        );
                        eprintln!(
                            "DEBUG: Added Contents reference for object {}: {} {} R",
                            obj_num, obj_ref, gen_ref
                        );
                    }
                }
            }
        }

        // /Parent — hard-wired to 113 0 R rather than parsed.
        // NOTE(review): document-specific assumption; confirm before reuse.
        if dict_content.contains("/Parent") {
            result_dict.insert(
                PdfName("Parent".to_string()),
                PdfObject::Reference(113, 0),
            );
            eprintln!(
                "DEBUG: Added Parent reference for object {}: 113 0 R",
                obj_num
            );
        }

        // /Resources — parse if possible, otherwise insert an empty
        // resources dictionary so the entry is at least present.
        if dict_content.contains("/Resources") {
            eprintln!(
                "DEBUG: Found Resources in object {}, content: {}",
                obj_num,
                dict_content.chars().take(200).collect::<String>()
            );

            if let Ok(parsed_resources) = self.parse_resources_from_content(&dict_content) {
                result_dict.insert(PdfName("Resources".to_string()), parsed_resources);
                eprintln!("DEBUG: Added parsed Resources for object {}", obj_num);
            } else {
                let resources = HashMap::new();
                result_dict.insert(
                    PdfName("Resources".to_string()),
                    PdfObject::Dictionary(crate::parser::objects::PdfDictionary(resources)),
                );
                eprintln!(
                    "DEBUG: Added empty Resources for object {} (parsing failed)",
                    obj_num
                );
            }
        }

        Ok(())
    }
1260
    /// Reconstructs an object the normal parser could not handle: tries
    /// pattern-based "smart" reconstruction first, then raw extraction,
    /// and (in lenient mode) finally substitutes Null. The result is
    /// cached and given a synthetic xref entry.
    fn attempt_manual_object_reconstruction(
        &mut self,
        obj_num: u32,
        gen_num: u16,
        _current_offset: u64,
    ) -> ParseResult<&PdfObject> {
        // Cycle guard: reconstruction may recursively load other objects.
        let is_circular = self
            .objects_being_reconstructed
            .lock()
            .map_err(|_| ParseError::SyntaxError {
                position: 0,
                message: "Mutex poisoned during circular reference check".to_string(),
            })?
            .contains(&obj_num);

        if is_circular {
            eprintln!(
                "DEBUG: Circular reconstruction detected for object {} {} - breaking cycle with null object",
                obj_num, gen_num
            );
            self.object_cache
                .insert((obj_num, gen_num), PdfObject::Null);
            return Ok(&self.object_cache[&(obj_num, gen_num)]);
        }

        // Depth guard: the in-flight set size doubles as recursion depth.
        let current_depth = self
            .objects_being_reconstructed
            .lock()
            .map_err(|_| ParseError::SyntaxError {
                position: 0,
                message: "Mutex poisoned during depth check".to_string(),
            })?
            .len() as u32;
        if current_depth >= self.max_reconstruction_depth {
            eprintln!(
                "DEBUG: Maximum reconstruction depth ({}) exceeded for object {} {}",
                self.max_reconstruction_depth, obj_num, gen_num
            );
            return Err(ParseError::SyntaxError {
                position: 0,
                message: format!(
                    "Maximum reconstruction depth ({}) exceeded for object {} {}",
                    self.max_reconstruction_depth, obj_num, gen_num
                ),
            });
        }

        eprintln!(
            "DEBUG: Attempting smart reconstruction for object {} {} (depth: {}/{})",
            obj_num, gen_num, current_depth, self.max_reconstruction_depth
        );

        // Mark in-flight for the duration of the reconstruction.
        self.objects_being_reconstructed
            .lock()
            .map_err(|_| ParseError::SyntaxError {
                position: 0,
                message: "Mutex poisoned while marking object as being reconstructed".to_string(),
            })?
            .insert(obj_num);

        let reconstructed_obj = match self.smart_object_reconstruction(obj_num, gen_num) {
            Ok(obj) => obj,
            Err(_) => {
                // Smart reconstruction failed; try raw byte extraction.
                match self.extract_object_or_stream_manually(obj_num) {
                    Ok(obj) => obj,
                    Err(e) => {
                        if self.options.lenient_syntax {
                            eprintln!(
                                "DEBUG: Creating null object for missing {} {}",
                                obj_num, gen_num
                            );
                            PdfObject::Null
                        } else {
                            // Unmark (best-effort) before propagating.
                            if let Ok(mut guard) = self.objects_being_reconstructed.lock() {
                                guard.remove(&obj_num);
                            }
                            return Err(e);
                        }
                    }
                }
            }
        };

        self.objects_being_reconstructed
            .lock()
            .map_err(|_| ParseError::SyntaxError {
                position: 0,
                message: "Mutex poisoned while unmarking reconstructed object".to_string(),
            })?
            .remove(&obj_num);

        self.object_cache
            .insert((obj_num, gen_num), reconstructed_obj);

        // Synthetic xref entry (placeholder offset 0) so later lookups
        // resolve from the cache.
        use crate::parser::xref::XRefEntry;
        let xref_entry = XRefEntry {
            offset: 0,
            generation: gen_num,
            in_use: true,
        };
        self.xref.add_entry(obj_num, xref_entry);
        eprintln!(
            "DEBUG: Successfully reconstructed and cached object {} {}",
            obj_num, gen_num
        );

        self.object_cache
            .get(&(obj_num, gen_num))
            .ok_or_else(|| ParseError::SyntaxError {
                position: 0,
                message: format!(
                    "Object {} {} not in cache after reconstruction",
                    obj_num, gen_num
                ),
            })
    }
1388
1389 fn smart_object_reconstruction(
1391 &mut self,
1392 obj_num: u32,
1393 gen_num: u16,
1394 ) -> ParseResult<PdfObject> {
1395 if let Ok(inferred_obj) = self.infer_object_from_context(obj_num) {
1399 return Ok(inferred_obj);
1400 }
1401
1402 if let Ok(scanned_obj) = self.scan_for_object_patterns(obj_num) {
1404 return Ok(scanned_obj);
1405 }
1406
1407 if let Ok(synthetic_obj) = self.create_synthetic_object(obj_num) {
1409 return Ok(synthetic_obj);
1410 }
1411
1412 Err(ParseError::SyntaxError {
1413 position: 0,
1414 message: format!("Could not reconstruct object {} {}", obj_num, gen_num),
1415 })
1416 }
1417
1418 fn infer_object_from_context(&mut self, obj_num: u32) -> ParseResult<PdfObject> {
1420 for (_key, obj) in self.object_cache.iter() {
1424 if let PdfObject::Dictionary(dict) = obj {
1425 for (key, value) in dict.0.iter() {
1426 if let PdfObject::Reference(ref_num, _) = value {
1427 if *ref_num == obj_num {
1428 match key.as_str() {
1430 "Font" | "F1" | "F2" | "F3" => {
1431 return Ok(self.create_font_object(obj_num));
1432 }
1433 "XObject" | "Image" | "Im1" => {
1434 return Ok(self.create_xobject(obj_num));
1435 }
1436 "Contents" => {
1437 return Ok(self.create_content_stream(obj_num));
1438 }
1439 "Resources" => {
1440 return Ok(self.create_resources_dict(obj_num));
1441 }
1442 _ => continue,
1443 }
1444 }
1445 }
1446 }
1447 }
1448 }
1449
1450 Err(ParseError::SyntaxError {
1451 position: 0,
1452 message: "Cannot infer object type from context".to_string(),
1453 })
1454 }
1455
1456 fn scan_for_object_patterns(&mut self, obj_num: u32) -> ParseResult<PdfObject> {
1458 self.extract_object_or_stream_manually(obj_num)
1461 }
1462
1463 fn create_synthetic_object(&mut self, obj_num: u32) -> ParseResult<PdfObject> {
1465 use super::objects::{PdfDictionary, PdfName, PdfObject};
1466
1467 match obj_num {
1469 1..=10 => {
1470 let mut dict = PdfDictionary::new();
1472 dict.insert(
1473 "Type".to_string(),
1474 PdfObject::Name(PdfName("Null".to_string())),
1475 );
1476 Ok(PdfObject::Dictionary(dict))
1477 }
1478 _ => {
1479 Ok(PdfObject::Null)
1481 }
1482 }
1483 }
1484
1485 fn create_font_object(&self, obj_num: u32) -> PdfObject {
1486 use super::objects::{PdfDictionary, PdfName, PdfObject};
1487 let mut font_dict = PdfDictionary::new();
1488 font_dict.insert(
1489 "Type".to_string(),
1490 PdfObject::Name(PdfName("Font".to_string())),
1491 );
1492 font_dict.insert(
1493 "Subtype".to_string(),
1494 PdfObject::Name(PdfName("Type1".to_string())),
1495 );
1496 font_dict.insert(
1497 "BaseFont".to_string(),
1498 PdfObject::Name(PdfName("Helvetica".to_string())),
1499 );
1500 eprintln!("DEBUG: Created synthetic Font object {}", obj_num);
1501 PdfObject::Dictionary(font_dict)
1502 }
1503
1504 fn create_xobject(&self, obj_num: u32) -> PdfObject {
1505 use super::objects::{PdfDictionary, PdfName, PdfObject};
1506 let mut xobj_dict = PdfDictionary::new();
1507 xobj_dict.insert(
1508 "Type".to_string(),
1509 PdfObject::Name(PdfName("XObject".to_string())),
1510 );
1511 xobj_dict.insert(
1512 "Subtype".to_string(),
1513 PdfObject::Name(PdfName("Form".to_string())),
1514 );
1515 eprintln!("DEBUG: Created synthetic XObject {}", obj_num);
1516 PdfObject::Dictionary(xobj_dict)
1517 }
1518
1519 fn create_content_stream(&self, obj_num: u32) -> PdfObject {
1520 use super::objects::{PdfDictionary, PdfObject, PdfStream};
1521 let mut stream_dict = PdfDictionary::new();
1522 stream_dict.insert("Length".to_string(), PdfObject::Integer(0));
1523
1524 let stream = PdfStream {
1525 dict: stream_dict,
1526 data: Vec::new(),
1527 };
1528 eprintln!("DEBUG: Created synthetic content stream {}", obj_num);
1529 PdfObject::Stream(stream)
1530 }
1531
1532 fn create_resources_dict(&self, obj_num: u32) -> PdfObject {
1533 use super::objects::{PdfArray, PdfDictionary, PdfObject};
1534 let mut res_dict = PdfDictionary::new();
1535 res_dict.insert("ProcSet".to_string(), PdfObject::Array(PdfArray::new()));
1536 eprintln!("DEBUG: Created synthetic Resources dict {}", obj_num);
1537 PdfObject::Dictionary(res_dict)
1538 }
1539
1540 fn extract_object_manually(
1541 &mut self,
1542 obj_num: u32,
1543 ) -> ParseResult<crate::parser::objects::PdfDictionary> {
1544 use crate::parser::objects::{PdfArray, PdfDictionary, PdfName, PdfObject};
1545 use std::collections::HashMap;
1546
1547 let original_pos = self.reader.stream_position().unwrap_or(0);
1549
1550 if self.reader.seek(SeekFrom::Start(0)).is_err() {
1552 return Err(ParseError::SyntaxError {
1553 position: 0,
1554 message: "Failed to seek to beginning for manual extraction".to_string(),
1555 });
1556 }
1557
1558 let mut buffer = Vec::new();
1560 if self.reader.read_to_end(&mut buffer).is_err() {
1561 return Err(ParseError::SyntaxError {
1562 position: 0,
1563 message: "Failed to read file for manual extraction".to_string(),
1564 });
1565 }
1566
1567 let content = String::from_utf8_lossy(&buffer);
1568
1569 let pattern = format!("{} 0 obj", obj_num);
1571 if let Some(start) = content.find(&pattern) {
1572 let search_area = &content[start..];
1573 if let Some(dict_start) = search_area.find("<<") {
1574 let mut bracket_count = 1;
1576 let mut pos = dict_start + 2;
1577 let bytes = search_area.as_bytes();
1578 let mut dict_end = None;
1579
1580 while pos < bytes.len() - 1 && bracket_count > 0 {
1581 if bytes[pos] == b'<' && bytes[pos + 1] == b'<' {
1582 bracket_count += 1;
1583 pos += 2;
1584 } else if bytes[pos] == b'>' && bytes[pos + 1] == b'>' {
1585 bracket_count -= 1;
1586 if bracket_count == 0 {
1587 dict_end = Some(pos);
1588 break;
1589 }
1590 pos += 2;
1591 } else {
1592 pos += 1;
1593 }
1594 }
1595
1596 if let Some(dict_end) = dict_end {
1597 let dict_content = &search_area[dict_start + 2..dict_end];
1598 eprintln!(
1599 "DEBUG: Found object {} dictionary content: '{}'",
1600 obj_num,
1601 dict_content.chars().take(500).collect::<String>()
1602 );
1603
1604 let mut result_dict = HashMap::new();
1606
1607 if dict_content.contains("/Type/Catalog")
1610 || dict_content.contains("/Type /Catalog")
1611 {
1612 eprintln!(
1613 "DEBUG: Detected /Type/Catalog in object {}, parsing as catalog",
1614 obj_num
1615 );
1616
1617 result_dict.insert(
1618 PdfName("Type".to_string()),
1619 PdfObject::Name(PdfName("Catalog".to_string())),
1620 );
1621
1622 if let Some(pages_start) = dict_content.find("/Pages") {
1626 let after_pages = &dict_content[pages_start + 6..]; let trimmed = after_pages.trim_start();
1629 let parts: Vec<&str> = trimmed.split_whitespace().collect();
1631 if parts.len() >= 3 {
1632 if let (Ok(obj), Ok(gen)) =
1636 (parts[0].parse::<u32>(), parts[1].parse::<u16>())
1637 {
1638 if parts[2] == "R" || parts[2].starts_with('R') {
1639 result_dict.insert(
1640 PdfName("Pages".to_string()),
1641 PdfObject::Reference(obj, gen),
1642 );
1643 eprintln!(
1644 "DEBUG: Parsed /Pages {} {} R from catalog",
1645 obj, gen
1646 );
1647 }
1648 }
1649 }
1650 }
1651
1652 if let Some(ver_start) = dict_content.find("/Version") {
1655 let after_ver = &dict_content[ver_start + 8..];
1656 if let Some(ver_end) = after_ver.find(|c: char| c == '/' || c == '>') {
1657 let version_str = after_ver[..ver_end].trim();
1658 result_dict.insert(
1659 PdfName("Version".to_string()),
1660 PdfObject::Name(PdfName(
1661 version_str.trim_start_matches('/').to_string(),
1662 )),
1663 );
1664 }
1665 }
1666
1667 if let Some(meta_start) = dict_content.find("/Metadata") {
1669 let after_meta = &dict_content[meta_start + 9..];
1670 let parts: Vec<&str> = after_meta.split_whitespace().collect();
1671 if parts.len() >= 3 {
1672 if let (Ok(obj), Ok(gen)) =
1673 (parts[0].parse::<u32>(), parts[1].parse::<u16>())
1674 {
1675 if parts[2] == "R" {
1676 result_dict.insert(
1677 PdfName("Metadata".to_string()),
1678 PdfObject::Reference(obj, gen),
1679 );
1680 }
1681 }
1682 }
1683 }
1684
1685 if let Some(acro_start) = dict_content.find("/AcroForm") {
1687 let after_acro = &dict_content[acro_start + 9..];
1688 if after_acro.trim_start().starts_with("<<") {
1690 eprintln!("DEBUG: /AcroForm is inline dictionary, skipping");
1692 } else {
1693 let parts: Vec<&str> = after_acro.split_whitespace().collect();
1694 if parts.len() >= 3 {
1695 if let (Ok(obj), Ok(gen)) =
1696 (parts[0].parse::<u32>(), parts[1].parse::<u16>())
1697 {
1698 if parts[2] == "R" {
1699 result_dict.insert(
1700 PdfName("AcroForm".to_string()),
1701 PdfObject::Reference(obj, gen),
1702 );
1703 }
1704 }
1705 }
1706 }
1707 }
1708
1709 eprintln!("DEBUG: Generic catalog parsing completed for object {} with {} entries", obj_num, result_dict.len());
1710 } else if obj_num == 102 {
1711 if dict_content.contains("/Type /Catalog") {
1713 result_dict.insert(
1715 PdfName("Type".to_string()),
1716 PdfObject::Name(PdfName("Catalog".to_string())),
1717 );
1718
1719 if dict_content.contains("/Dests 139 0 R") {
1721 result_dict.insert(
1722 PdfName("Dests".to_string()),
1723 PdfObject::Reference(139, 0),
1724 );
1725 }
1726
1727 if dict_content.contains("/Pages 113 0 R") {
1729 result_dict.insert(
1730 PdfName("Pages".to_string()),
1731 PdfObject::Reference(113, 0),
1732 );
1733 }
1734 } else {
1735 eprintln!("DEBUG: Object 102 is not a catalog (content: '{}'), skipping reconstruction", dict_content.trim());
1737 self.reader.seek(SeekFrom::Start(original_pos)).ok();
1739 return Err(ParseError::SyntaxError {
1740 position: 0,
1741 message:
1742 "Object 102 is not a corrupted catalog, cannot reconstruct"
1743 .to_string(),
1744 });
1745 }
1746 } else if obj_num == 113 {
1747 eprintln!("DEBUG: Creating object 113 as main Pages object with real page references");
1749
1750 result_dict.insert(
1751 PdfName("Type".to_string()),
1752 PdfObject::Name(PdfName("Pages".to_string())),
1753 );
1754
1755 let page_refs = match self.find_page_objects() {
1757 Ok(refs) => refs,
1758 Err(e) => {
1759 eprintln!(
1760 "DEBUG: Failed to find page objects: {:?}, using empty array",
1761 e
1762 );
1763 vec![]
1764 }
1765 };
1766
1767 eprintln!(
1768 "DEBUG: Found {} page objects for 113 Kids array: {:?}",
1769 page_refs.len(),
1770 page_refs
1771 );
1772
1773 let page_count = if page_refs.is_empty() {
1775 44
1776 } else {
1777 page_refs.len() as i64
1778 };
1779 result_dict
1780 .insert(PdfName("Count".to_string()), PdfObject::Integer(page_count));
1781
1782 let kids_array: Vec<PdfObject> = page_refs
1784 .into_iter()
1785 .map(|(obj_num, gen_num)| PdfObject::Reference(obj_num, gen_num))
1786 .collect();
1787
1788 result_dict.insert(
1789 PdfName("Kids".to_string()),
1790 PdfObject::Array(PdfArray(kids_array)),
1791 );
1792 } else if obj_num == 114 {
1793 eprintln!("DEBUG: Parsing object 114 as Pages node");
1795
1796 result_dict.insert(
1797 PdfName("Type".to_string()),
1798 PdfObject::Name(PdfName("Pages".to_string())),
1799 );
1800
1801 let page_refs = match self.find_page_objects() {
1803 Ok(refs) => refs,
1804 Err(e) => {
1805 eprintln!(
1806 "DEBUG: Failed to find page objects: {:?}, using empty array",
1807 e
1808 );
1809 vec![]
1810 }
1811 };
1812
1813 eprintln!(
1814 "DEBUG: Found {} page objects for Kids array: {:?}",
1815 page_refs.len(),
1816 page_refs
1817 );
1818
1819 let page_count = if page_refs.is_empty() {
1821 44
1822 } else {
1823 page_refs.len() as i64
1824 };
1825 result_dict
1826 .insert(PdfName("Count".to_string()), PdfObject::Integer(page_count));
1827
1828 let kids_array: Vec<PdfObject> = page_refs
1830 .into_iter()
1831 .map(|(obj_num, gen_num)| PdfObject::Reference(obj_num, gen_num))
1832 .collect();
1833
1834 result_dict.insert(
1835 PdfName("Kids".to_string()),
1836 PdfObject::Array(PdfArray(kids_array)),
1837 );
1838
1839 eprintln!(
1840 "DEBUG: Object 114 created as Pages node with {} Kids",
1841 page_count
1842 );
1843 } else if self.is_page_object(obj_num) {
1844 eprintln!("DEBUG: Manually reconstructing Page object {}", obj_num);
1846
1847 result_dict.insert(
1848 PdfName("Type".to_string()),
1849 PdfObject::Name(PdfName("Page".to_string())),
1850 );
1851
1852 self.parse_page_dictionary_content(
1854 &dict_content,
1855 &mut result_dict,
1856 obj_num,
1857 )?;
1858 }
1859
1860 self.reader.seek(SeekFrom::Start(original_pos)).ok();
1862
1863 eprintln!(
1864 "DEBUG: Manually created object {} with {} entries",
1865 obj_num,
1866 result_dict.len()
1867 );
1868 return Ok(PdfDictionary(result_dict));
1869 }
1870 }
1871 }
1872
1873 self.reader.seek(SeekFrom::Start(original_pos)).ok();
1875
1876 if obj_num == 113 {
1878 eprintln!("DEBUG: Object 113 not found in PDF content, creating fallback Pages object");
1879 let mut result_dict = HashMap::new();
1880 result_dict.insert(
1881 PdfName("Type".to_string()),
1882 PdfObject::Name(PdfName("Pages".to_string())),
1883 );
1884
1885 let page_refs = match self.find_page_objects() {
1887 Ok(refs) => refs,
1888 Err(e) => {
1889 eprintln!(
1890 "DEBUG: Failed to find page objects: {:?}, using empty array",
1891 e
1892 );
1893 vec![]
1894 }
1895 };
1896
1897 eprintln!(
1898 "DEBUG: Found {} page objects for fallback 113 Kids array: {:?}",
1899 page_refs.len(),
1900 page_refs
1901 );
1902
1903 let page_count = if page_refs.is_empty() {
1905 44
1906 } else {
1907 page_refs.len() as i64
1908 };
1909 result_dict.insert(PdfName("Count".to_string()), PdfObject::Integer(page_count));
1910
1911 let kids_array: Vec<PdfObject> = page_refs
1913 .into_iter()
1914 .map(|(obj_num, gen_num)| PdfObject::Reference(obj_num, gen_num))
1915 .collect();
1916
1917 result_dict.insert(
1918 PdfName("Kids".to_string()),
1919 PdfObject::Array(PdfArray(kids_array)),
1920 );
1921
1922 eprintln!(
1923 "DEBUG: Created fallback object 113 with {} entries and {} Kids",
1924 result_dict.len(),
1925 page_count
1926 );
1927 return Ok(PdfDictionary(result_dict));
1928 } else if obj_num == 114 {
1929 eprintln!("DEBUG: Object 114 not found in PDF content, creating fallback Pages object");
1930 let mut result_dict = HashMap::new();
1931 result_dict.insert(
1932 PdfName("Type".to_string()),
1933 PdfObject::Name(PdfName("Pages".to_string())),
1934 );
1935
1936 let page_refs = match self.find_page_objects() {
1938 Ok(refs) => refs,
1939 Err(e) => {
1940 eprintln!(
1941 "DEBUG: Failed to find page objects: {:?}, using empty array",
1942 e
1943 );
1944 vec![]
1945 }
1946 };
1947
1948 eprintln!(
1949 "DEBUG: Found {} page objects for fallback Kids array: {:?}",
1950 page_refs.len(),
1951 page_refs
1952 );
1953
1954 let page_count = if page_refs.is_empty() {
1956 44
1957 } else {
1958 page_refs.len() as i64
1959 };
1960 result_dict.insert(PdfName("Count".to_string()), PdfObject::Integer(page_count));
1961
1962 let kids_array: Vec<PdfObject> = page_refs
1964 .into_iter()
1965 .map(|(obj_num, gen_num)| PdfObject::Reference(obj_num, gen_num))
1966 .collect();
1967
1968 result_dict.insert(
1969 PdfName("Kids".to_string()),
1970 PdfObject::Array(PdfArray(kids_array)),
1971 );
1972
1973 eprintln!(
1974 "DEBUG: Created fallback object 114 with {} entries and {} Kids",
1975 result_dict.len(),
1976 page_count
1977 );
1978 return Ok(PdfDictionary(result_dict));
1979 }
1980
1981 Err(ParseError::SyntaxError {
1982 position: 0,
1983 message: "Could not find catalog dictionary in manual extraction".to_string(),
1984 })
1985 }
1986
1987 fn extract_object_or_stream_manually(&mut self, obj_num: u32) -> ParseResult<PdfObject> {
1989 use crate::parser::objects::PdfObject;
1990
1991 let original_pos = self.reader.stream_position().unwrap_or(0);
1993
1994 if self.reader.seek(SeekFrom::Start(0)).is_err() {
1996 return Err(ParseError::SyntaxError {
1997 position: 0,
1998 message: "Failed to seek to beginning for manual extraction".to_string(),
1999 });
2000 }
2001
2002 let mut buffer = Vec::new();
2004 if self.reader.read_to_end(&mut buffer).is_err() {
2005 return Err(ParseError::SyntaxError {
2006 position: 0,
2007 message: "Failed to read file for manual extraction".to_string(),
2008 });
2009 }
2010
2011 let pattern = format!("{} 0 obj", obj_num).into_bytes();
2013
2014 if let Some(obj_start) = find_bytes(&buffer, &pattern) {
2015 let start = obj_start + pattern.len();
2016 let search_area = &buffer[start..];
2017
2018 if let Some(dict_start) = find_bytes(search_area, b"<<") {
2019 let mut bracket_count = 1;
2021 let mut pos = dict_start + 2;
2022 let mut dict_end = None;
2023
2024 while pos < search_area.len() - 1 && bracket_count > 0 {
2025 if search_area[pos] == b'<' && search_area[pos + 1] == b'<' {
2026 bracket_count += 1;
2027 pos += 2;
2028 } else if search_area[pos] == b'>' && search_area[pos + 1] == b'>' {
2029 bracket_count -= 1;
2030 if bracket_count == 0 {
2031 dict_end = Some(pos);
2032 break;
2033 }
2034 pos += 2;
2035 } else {
2036 pos += 1;
2037 }
2038 }
2039
2040 if let Some(dict_end_pos) = dict_end {
2041 let dict_start_abs = dict_start + 2;
2042 let dict_end_abs = dict_end_pos;
2043 let dict_content_bytes = &search_area[dict_start_abs..dict_end_abs];
2044 let dict_content = String::from_utf8_lossy(dict_content_bytes);
2045
2046 eprintln!(
2047 "DEBUG: Found object {} dictionary content: '{}'",
2048 obj_num,
2049 dict_content.chars().take(200).collect::<String>()
2050 );
2051
2052 let after_dict = &search_area[dict_end_abs + 2..];
2054 if is_immediate_stream_start(after_dict) {
2055 return self.reconstruct_stream_object_bytes(
2057 obj_num,
2058 &dict_content,
2059 after_dict,
2060 );
2061 } else {
2062 return self
2064 .extract_object_manually(obj_num)
2065 .map(|dict| PdfObject::Dictionary(dict));
2066 }
2067 }
2068 }
2069 }
2070
2071 self.reader.seek(SeekFrom::Start(original_pos)).ok();
2073
2074 Err(ParseError::SyntaxError {
2075 position: 0,
2076 message: format!("Could not manually extract object {}", obj_num),
2077 })
2078 }
2079
2080 fn reconstruct_stream_object_bytes(
2082 &mut self,
2083 obj_num: u32,
2084 dict_content: &str,
2085 after_dict: &[u8],
2086 ) -> ParseResult<PdfObject> {
2087 use crate::parser::objects::{PdfDictionary, PdfName, PdfObject, PdfStream};
2088 use std::collections::HashMap;
2089
2090 let mut dict = HashMap::new();
2092
2093 if dict_content.contains("/Filter /FlateDecode") {
2095 dict.insert(
2096 PdfName("Filter".to_string()),
2097 PdfObject::Name(PdfName("FlateDecode".to_string())),
2098 );
2099 }
2100
2101 if let Some(length_start) = dict_content.find("/Length ") {
2102 let length_part = &dict_content[length_start + 8..];
2103 if let Some(space_pos) = length_part.find(' ') {
2104 let length_str = &length_part[..space_pos];
2105 if let Ok(length) = length_str.parse::<i64>() {
2106 dict.insert(PdfName("Length".to_string()), PdfObject::Integer(length));
2107 }
2108 } else {
2109 if let Ok(length) = length_part.trim().parse::<i64>() {
2111 dict.insert(PdfName("Length".to_string()), PdfObject::Integer(length));
2112 }
2113 }
2114 }
2115
2116 if let Some(stream_start) = find_bytes(after_dict, b"stream") {
2118 let stream_start_pos = stream_start + 6; let stream_data_start = if after_dict.get(stream_start_pos) == Some(&b'\n') {
2120 stream_start_pos + 1
2121 } else if after_dict.get(stream_start_pos) == Some(&b'\r') {
2122 if after_dict.get(stream_start_pos + 1) == Some(&b'\n') {
2123 stream_start_pos + 2
2124 } else {
2125 stream_start_pos + 1
2126 }
2127 } else {
2128 stream_start_pos
2129 };
2130
2131 if let Some(endstream_pos) = find_bytes(after_dict, b"endstream") {
2132 let mut stream_data = &after_dict[stream_data_start..endstream_pos];
2133
2134 if let Some(PdfObject::Integer(length)) = dict.get(&PdfName("Length".to_string())) {
2136 let expected_length = *length as usize;
2137 if stream_data.len() > expected_length {
2138 stream_data = &stream_data[..expected_length];
2139 eprintln!(
2140 "DEBUG: Trimmed stream data from {} to {} bytes based on Length field",
2141 after_dict[stream_data_start..endstream_pos].len(),
2142 expected_length
2143 );
2144 }
2145 }
2146
2147 eprintln!(
2148 "DEBUG: Reconstructed stream object {} with {} bytes of stream data",
2149 obj_num,
2150 stream_data.len()
2151 );
2152
2153 let stream = PdfStream {
2154 dict: PdfDictionary(dict),
2155 data: stream_data.to_vec(),
2156 };
2157
2158 return Ok(PdfObject::Stream(stream));
2159 }
2160 }
2161
2162 Err(ParseError::SyntaxError {
2163 position: 0,
2164 message: format!("Could not reconstruct stream for object {}", obj_num),
2165 })
2166 }
2167
2168 fn parse_resources_from_content(&self, dict_content: &str) -> ParseResult<PdfObject> {
2170 use crate::parser::objects::{PdfDictionary, PdfName, PdfObject};
2171 use std::collections::HashMap;
2172
2173 if let Some(resources_start) = dict_content.find("/Resources") {
2175 if let Some(bracket_start) = dict_content[resources_start..].find("<<") {
2177 let abs_bracket_start = resources_start + bracket_start + 2;
2178
2179 let mut bracket_count = 1;
2181 let mut end_pos = abs_bracket_start;
2182 let chars: Vec<char> = dict_content.chars().collect();
2183
2184 while end_pos < chars.len() && bracket_count > 0 {
2185 if end_pos + 1 < chars.len() {
2186 if chars[end_pos] == '<' && chars[end_pos + 1] == '<' {
2187 bracket_count += 1;
2188 end_pos += 2;
2189 continue;
2190 } else if chars[end_pos] == '>' && chars[end_pos + 1] == '>' {
2191 bracket_count -= 1;
2192 end_pos += 2;
2193 continue;
2194 }
2195 }
2196 end_pos += 1;
2197 }
2198
2199 if bracket_count == 0 {
2200 let resources_content = &dict_content[abs_bracket_start..end_pos - 2];
2201 eprintln!("DEBUG: Parsing Resources content: {}", resources_content);
2202
2203 let mut resources_dict = HashMap::new();
2205
2206 if let Some(font_start) = resources_content.find("/Font") {
2208 if let Some(font_bracket) = resources_content[font_start..].find("<<") {
2209 let abs_font_start = font_start + font_bracket + 2;
2210
2211 let mut font_dict = HashMap::new();
2213
2214 let font_section = &resources_content[abs_font_start..];
2216 let mut pos = 0;
2217 while let Some(f_pos) = font_section[pos..].find("/F") {
2218 let abs_f_pos = pos + f_pos;
2219 if let Some(space_pos) = font_section[abs_f_pos..].find(" ") {
2220 let font_name = &font_section[abs_f_pos..abs_f_pos + space_pos];
2221
2222 let after_name = &font_section[abs_f_pos + space_pos..];
2224 if let Some(r_pos) = after_name.find(" R") {
2225 let ref_part = after_name[..r_pos].trim();
2226 if let Some(parts) = ref_part
2227 .split_whitespace()
2228 .collect::<Vec<&str>>()
2229 .get(0..2)
2230 {
2231 if let (Ok(obj_num), Ok(gen_num)) =
2232 (parts[0].parse::<u32>(), parts[1].parse::<u16>())
2233 {
2234 font_dict.insert(
2235 PdfName(font_name[1..].to_string()), PdfObject::Reference(obj_num, gen_num),
2237 );
2238 eprintln!(
2239 "DEBUG: Found font {} -> {} {} R",
2240 font_name, obj_num, gen_num
2241 );
2242 }
2243 }
2244 }
2245 }
2246 pos = abs_f_pos + 1;
2247 }
2248
2249 if !font_dict.is_empty() {
2250 resources_dict.insert(
2251 PdfName("Font".to_string()),
2252 PdfObject::Dictionary(PdfDictionary(font_dict)),
2253 );
2254 }
2255 }
2256 }
2257
2258 return Ok(PdfObject::Dictionary(PdfDictionary(resources_dict)));
2259 }
2260 }
2261 }
2262
2263 Err(ParseError::SyntaxError {
2264 position: 0,
2265 message: "Could not parse Resources".to_string(),
2266 })
2267 }
2268
    #[allow(dead_code)]
    /// Debug helper: seeks straight to `obj_num`'s xref offset, reads a fixed
    /// 2 KB window, and parses the first `<< ... >>` it sees as the catalog,
    /// caching the result under `(obj_num, gen_num)`.
    ///
    /// NOTE(review): `find(">>")` stops at the FIRST `>>`, so a catalog that
    /// contains a nested dictionary would be truncated — acceptable for a
    /// diagnostic path, but do not promote this to the main parse.
    fn extract_catalog_directly(
        &mut self,
        obj_num: u32,
        gen_num: u16,
    ) -> ParseResult<&PdfDictionary> {
        if let Some(entry) = self.xref.get_entry(obj_num) {
            if self.reader.seek(SeekFrom::Start(entry.offset)).is_err() {
                return Err(ParseError::SyntaxError {
                    position: 0,
                    message: "Failed to seek to catalog object".to_string(),
                });
            }

            // Fixed-size window; assumed large enough to cover the catalog.
            let mut buffer = vec![0u8; 2048];
            if let Ok(bytes_read) = self.reader.read(&mut buffer) {
                let content = String::from_utf8_lossy(&buffer[..bytes_read]);
                eprintln!("Raw catalog content:\n{}", content);

                if let Some(dict_start) = content.find("<<") {
                    if let Some(dict_end) = content[dict_start..].find(">>") {
                        let dict_content = &content[dict_start..dict_start + dict_end + 2];
                        eprintln!("Found dictionary content: {}", dict_content);

                        if let Ok(dict) = self.parse_dictionary_from_string(dict_content) {
                            // Cache first, then re-borrow from the cache so the
                            // returned reference outlives this stack frame.
                            let key = (obj_num, gen_num);
                            self.object_cache.insert(key, PdfObject::Dictionary(dict));

                            if let Some(PdfObject::Dictionary(ref dict)) =
                                self.object_cache.get(&key)
                            {
                                return Ok(dict);
                            }
                        }
                    }
                }
            }
        }

        Err(ParseError::SyntaxError {
            position: 0,
            message: "Failed to extract catalog directly".to_string(),
        })
    }
2320
2321 #[allow(dead_code)]
2322 fn parse_dictionary_from_string(&self, dict_str: &str) -> ParseResult<PdfDictionary> {
2323 use crate::parser::lexer::{Lexer, Token};
2324
2325 let mut cursor = std::io::Cursor::new(dict_str.as_bytes());
2327 let mut lexer = Lexer::new_with_options(&mut cursor, self.options.clone());
2328
2329 match lexer.next_token()? {
2331 Token::DictStart => {
2332 let mut dict = std::collections::HashMap::new();
2333
2334 loop {
2335 let token = lexer.next_token()?;
2336 match token {
2337 Token::DictEnd => break,
2338 Token::Name(key) => {
2339 let value = PdfObject::parse_with_options(&mut lexer, &self.options)?;
2341 dict.insert(crate::parser::objects::PdfName(key), value);
2342 }
2343 _ => {
2344 return Err(ParseError::SyntaxError {
2345 position: 0,
2346 message: "Invalid dictionary format".to_string(),
2347 });
2348 }
2349 }
2350 }
2351
2352 Ok(PdfDictionary(dict))
2353 }
2354 _ => Err(ParseError::SyntaxError {
2355 position: 0,
2356 message: "Expected dictionary start".to_string(),
2357 }),
2358 }
2359 }
2360
2361 fn count_page_objects_directly(&mut self) -> Option<u32> {
2363 let mut page_count = 0;
2364
2365 for obj_num in 1..self.xref.len() as u32 {
2367 if let Ok(obj) = self.get_object(obj_num, 0) {
2368 if let Some(dict) = obj.as_dict() {
2369 if let Some(obj_type) = dict.get("Type").and_then(|t| t.as_name()) {
2370 if obj_type.0 == "Page" {
2371 page_count += 1;
2372 }
2373 }
2374 }
2375 }
2376 }
2377
2378 if page_count > 0 {
2379 Some(page_count)
2380 } else {
2381 None
2382 }
2383 }
2384
2385 pub fn metadata(&mut self) -> ParseResult<DocumentMetadata> {
2387 let mut metadata = DocumentMetadata::default();
2388
2389 if let Some(info_dict) = self.info()? {
2390 if let Some(title) = info_dict.get("Title").and_then(|o| o.as_string()) {
2391 metadata.title = title.as_str().ok().map(|s| s.to_string());
2392 }
2393 if let Some(author) = info_dict.get("Author").and_then(|o| o.as_string()) {
2394 metadata.author = author.as_str().ok().map(|s| s.to_string());
2395 }
2396 if let Some(subject) = info_dict.get("Subject").and_then(|o| o.as_string()) {
2397 metadata.subject = subject.as_str().ok().map(|s| s.to_string());
2398 }
2399 if let Some(keywords) = info_dict.get("Keywords").and_then(|o| o.as_string()) {
2400 metadata.keywords = keywords.as_str().ok().map(|s| s.to_string());
2401 }
2402 if let Some(creator) = info_dict.get("Creator").and_then(|o| o.as_string()) {
2403 metadata.creator = creator.as_str().ok().map(|s| s.to_string());
2404 }
2405 if let Some(producer) = info_dict.get("Producer").and_then(|o| o.as_string()) {
2406 metadata.producer = producer.as_str().ok().map(|s| s.to_string());
2407 }
2408 }
2409
2410 metadata.version = self.version().to_string();
2411 metadata.page_count = self.page_count().ok();
2412
2413 Ok(metadata)
2414 }
2415
2416 fn ensure_page_tree(&mut self) -> ParseResult<()> {
2418 if self.page_tree.is_none() {
2419 let page_count = self.page_count()?;
2420 self.page_tree = Some(super::page_tree::PageTree::new(page_count));
2421 }
2422 Ok(())
2423 }
2424
2425 pub fn get_page(&mut self, _index: u32) -> ParseResult<&super::page_tree::ParsedPage> {
2431 self.ensure_page_tree()?;
2432
2433 Err(ParseError::SyntaxError {
2437 position: 0,
2438 message: "get_page not implemented due to borrow checker constraints. Use PdfDocument instead.".to_string(),
2439 })
2440 }
2441
2442 pub fn get_all_pages(&mut self) -> ParseResult<Vec<super::page_tree::ParsedPage>> {
2444 let page_count = self.page_count()?;
2445 let mut pages = Vec::with_capacity(page_count as usize);
2446
2447 for i in 0..page_count {
2448 let page = self.get_page(i)?.clone();
2449 pages.push(page);
2450 }
2451
2452 Ok(pages)
2453 }
2454
2455 pub fn into_document(self) -> super::document::PdfDocument<R> {
2457 super::document::PdfDocument::new(self)
2458 }
2459
2460 pub fn clear_parse_context(&mut self) {
2462 self.parse_context = StackSafeContext::new();
2463 }
2464
2465 pub fn parse_context_mut(&mut self) -> &mut StackSafeContext {
2467 &mut self.parse_context
2468 }
2469
2470 fn find_page_objects(&mut self) -> ParseResult<Vec<(u32, u16)>> {
2472 eprintln!("DEBUG: Starting find_page_objects scan");
2473
2474 let original_pos = self.reader.stream_position().unwrap_or(0);
2476
2477 if self.reader.seek(SeekFrom::Start(0)).is_err() {
2479 eprintln!("DEBUG: Failed to seek to start");
2480 return Ok(vec![]);
2481 }
2482
2483 let mut buffer = Vec::new();
2484 if self.reader.read_to_end(&mut buffer).is_err() {
2485 eprintln!("DEBUG: Failed to read PDF content");
2486 return Ok(vec![]);
2487 }
2488
2489 self.reader.seek(SeekFrom::Start(original_pos)).ok();
2491
2492 let content = String::from_utf8_lossy(&buffer);
2493 let mut page_objects = Vec::new();
2494
2495 let lines: Vec<&str> = content.lines().collect();
2497 eprintln!("DEBUG: Scanning {} lines for Page objects", lines.len());
2498
2499 for (i, line) in lines.iter().enumerate() {
2500 if line.trim().ends_with(" 0 obj") {
2502 if let Some(obj_str) = line.trim().strip_suffix(" 0 obj") {
2503 if let Ok(obj_num) = obj_str.parse::<u32>() {
2504 for j in 1..=10 {
2506 if i + j < lines.len() {
2507 let future_line = lines[i + j];
2508 if future_line.contains("/Type /Page")
2509 && !future_line.contains("/Type /Pages")
2510 {
2511 eprintln!("DEBUG: Found Page object at object {}", obj_num);
2512 page_objects.push((obj_num, 0));
2513 break;
2514 }
2515 if future_line.trim().ends_with(" 0 obj")
2517 || future_line.trim() == "endobj"
2518 {
2519 break;
2520 }
2521 }
2522 }
2523 }
2524 }
2525 }
2526 }
2527
2528 page_objects.sort();
2529 page_objects.dedup();
2530
2531 eprintln!(
2532 "DEBUG: Found {} Page objects: {:?}",
2533 page_objects.len(),
2534 page_objects
2535 );
2536 Ok(page_objects)
2537 }
2538
2539 fn find_catalog_object(&mut self) -> ParseResult<(u32, u16)> {
2541 eprintln!("DEBUG: Scanning for catalog object...");
2545
2546 let obj_numbers: Vec<u32> = self.xref.entries().keys().copied().collect();
2548
2549 eprintln!("DEBUG: Found {} objects in xref table", obj_numbers.len());
2550
2551 for obj_num in obj_numbers {
2553 if let Ok(obj) = self.get_object(obj_num, 0) {
2555 if let Some(dict) = obj.as_dict() {
2556 if let Some(type_obj) = dict.get("Type") {
2558 if let Some(type_name) = type_obj.as_name() {
2559 if type_name.0 == "Catalog" {
2560 eprintln!("DEBUG: Found catalog at object {} 0 R", obj_num);
2561 return Ok((obj_num, 0));
2562 }
2563 if type_name.0 == "Sig"
2565 || type_name.0 == "Pages"
2566 || type_name.0 == "Page"
2567 {
2568 eprintln!(
2569 "DEBUG: Skipping object {} 0 R (Type: {})",
2570 obj_num, type_name.0
2571 );
2572 continue;
2573 }
2574 }
2575 }
2576 }
2577 }
2578 }
2579
2580 eprintln!("DEBUG: Catalog scan failed, trying common object numbers");
2582 for obj_num in [1, 2, 3, 4, 5] {
2583 if let Ok(obj) = self.get_object(obj_num, 0) {
2584 if let Some(dict) = obj.as_dict() {
2585 if dict.contains_key("Pages") {
2587 eprintln!(
2588 "DEBUG: Assuming object {} 0 R is catalog (has /Pages)",
2589 obj_num
2590 );
2591 return Ok((obj_num, 0));
2592 }
2593 }
2594 }
2595 }
2596
2597 Err(ParseError::MissingKey(
2598 "Could not find Catalog object".to_string(),
2599 ))
2600 }
2601
2602 fn create_synthetic_pages_dict(
2604 &mut self,
2605 page_refs: &[(u32, u16)],
2606 ) -> ParseResult<&PdfDictionary> {
2607 use super::objects::{PdfArray, PdfName};
2608
2609 eprintln!(
2610 "DEBUG: Creating synthetic Pages tree with {} pages",
2611 page_refs.len()
2612 );
2613
2614 let mut valid_page_refs = Vec::new();
2616 for (obj_num, gen_num) in page_refs {
2617 if let Ok(page_obj) = self.get_object(*obj_num, *gen_num) {
2618 if let Some(page_dict) = page_obj.as_dict() {
2619 if let Some(obj_type) = page_dict.get("Type").and_then(|t| t.as_name()) {
2621 if obj_type.0 == "Page" {
2622 valid_page_refs.push((*obj_num, *gen_num));
2623 continue;
2624 }
2625 }
2626
2627 if page_dict.contains_key("MediaBox") || page_dict.contains_key("Contents") {
2629 eprintln!(
2630 "DEBUG: Assuming {} {} R is a Page (missing Type)",
2631 obj_num, gen_num
2632 );
2633 valid_page_refs.push((*obj_num, *gen_num));
2634 }
2635 }
2636 }
2637 }
2638
2639 if valid_page_refs.is_empty() {
2640 return Err(ParseError::SyntaxError {
2641 position: 0,
2642 message: "No valid page objects found for synthetic Pages tree".to_string(),
2643 });
2644 }
2645
2646 eprintln!(
2647 "DEBUG: Found {} valid page objects out of {}",
2648 valid_page_refs.len(),
2649 page_refs.len()
2650 );
2651
2652 if valid_page_refs.len() > 10 {
2654 return self.create_hierarchical_pages_tree(&valid_page_refs);
2655 }
2656
2657 let mut kids = PdfArray::new();
2659 for (obj_num, gen_num) in &valid_page_refs {
2660 kids.push(PdfObject::Reference(*obj_num, *gen_num));
2661 }
2662
2663 let mut pages_dict = PdfDictionary::new();
2665 pages_dict.insert(
2666 "Type".to_string(),
2667 PdfObject::Name(PdfName("Pages".to_string())),
2668 );
2669 pages_dict.insert("Kids".to_string(), PdfObject::Array(kids));
2670 pages_dict.insert(
2671 "Count".to_string(),
2672 PdfObject::Integer(valid_page_refs.len() as i64),
2673 );
2674
2675 let mut media_box = None;
2677 for (obj_num, gen_num) in valid_page_refs.iter().take(3) {
2678 if let Ok(page_obj) = self.get_object(*obj_num, *gen_num) {
2679 if let Some(page_dict) = page_obj.as_dict() {
2680 if let Some(mb) = page_dict.get("MediaBox") {
2681 media_box = Some(mb.clone());
2682 }
2683 }
2684 }
2685 }
2686
2687 if let Some(mb) = media_box {
2689 pages_dict.insert("MediaBox".to_string(), mb);
2690 } else {
2691 let mut mb_array = PdfArray::new();
2692 mb_array.push(PdfObject::Integer(0));
2693 mb_array.push(PdfObject::Integer(0));
2694 mb_array.push(PdfObject::Integer(612));
2695 mb_array.push(PdfObject::Integer(792));
2696 pages_dict.insert("MediaBox".to_string(), PdfObject::Array(mb_array));
2697 }
2698
2699 let synthetic_key = (u32::MAX - 1, 0);
2701 self.object_cache
2702 .insert(synthetic_key, PdfObject::Dictionary(pages_dict));
2703
2704 if let PdfObject::Dictionary(dict) = &self.object_cache[&synthetic_key] {
2706 Ok(dict)
2707 } else {
2708 unreachable!("Just inserted dictionary")
2709 }
2710 }
2711
2712 fn create_hierarchical_pages_tree(
2714 &mut self,
2715 page_refs: &[(u32, u16)],
2716 ) -> ParseResult<&PdfDictionary> {
2717 use super::objects::{PdfArray, PdfName};
2718
2719 eprintln!(
2720 "DEBUG: Creating hierarchical Pages tree with {} pages",
2721 page_refs.len()
2722 );
2723
2724 const PAGES_PER_NODE: usize = 10; let chunks: Vec<&[(u32, u16)]> = page_refs.chunks(PAGES_PER_NODE).collect();
2728 let mut intermediate_nodes = Vec::new();
2729
2730 for (chunk_idx, chunk) in chunks.iter().enumerate() {
2732 let mut kids = PdfArray::new();
2733 for (obj_num, gen_num) in chunk.iter() {
2734 kids.push(PdfObject::Reference(*obj_num, *gen_num));
2735 }
2736
2737 let mut intermediate_dict = PdfDictionary::new();
2738 intermediate_dict.insert(
2739 "Type".to_string(),
2740 PdfObject::Name(PdfName("Pages".to_string())),
2741 );
2742 intermediate_dict.insert("Kids".to_string(), PdfObject::Array(kids));
2743 intermediate_dict.insert("Count".to_string(), PdfObject::Integer(chunk.len() as i64));
2744
2745 let intermediate_key = (u32::MAX - 2 - chunk_idx as u32, 0);
2747 self.object_cache
2748 .insert(intermediate_key, PdfObject::Dictionary(intermediate_dict));
2749
2750 intermediate_nodes.push(intermediate_key);
2751 }
2752
2753 let mut root_kids = PdfArray::new();
2755 for (obj_num, gen_num) in &intermediate_nodes {
2756 root_kids.push(PdfObject::Reference(*obj_num, *gen_num));
2757 }
2758
2759 let mut root_pages_dict = PdfDictionary::new();
2760 root_pages_dict.insert(
2761 "Type".to_string(),
2762 PdfObject::Name(PdfName("Pages".to_string())),
2763 );
2764 root_pages_dict.insert("Kids".to_string(), PdfObject::Array(root_kids));
2765 root_pages_dict.insert(
2766 "Count".to_string(),
2767 PdfObject::Integer(page_refs.len() as i64),
2768 );
2769
2770 if let Some((obj_num, gen_num)) = page_refs.first() {
2772 if let Ok(page_obj) = self.get_object(*obj_num, *gen_num) {
2773 if let Some(page_dict) = page_obj.as_dict() {
2774 if let Some(mb) = page_dict.get("MediaBox") {
2775 root_pages_dict.insert("MediaBox".to_string(), mb.clone());
2776 }
2777 }
2778 }
2779 }
2780
2781 let root_key = (u32::MAX - 1, 0);
2783 self.object_cache
2784 .insert(root_key, PdfObject::Dictionary(root_pages_dict));
2785
2786 eprintln!(
2787 "DEBUG: Created hierarchical tree with {} intermediate nodes",
2788 intermediate_nodes.len()
2789 );
2790
2791 if let PdfObject::Dictionary(dict) = &self.object_cache[&root_key] {
2793 Ok(dict)
2794 } else {
2795 unreachable!("Just inserted dictionary")
2796 }
2797 }
2798}
2799
/// Document-level metadata, typically populated from the PDF /Info
/// dictionary plus the file header. All fields are optional except
/// `version`, which reflects the header (e.g. "1.4").
#[derive(Debug, Default, Clone)]
pub struct DocumentMetadata {
    // Document title (/Title).
    pub title: Option<String>,
    // Document author (/Author).
    pub author: Option<String>,
    // Document subject (/Subject).
    pub subject: Option<String>,
    // Keyword list (/Keywords), as a single raw string.
    pub keywords: Option<String>,
    // Application that created the original document (/Creator).
    pub creator: Option<String>,
    // Application that produced this PDF (/Producer).
    pub producer: Option<String>,
    // Raw PDF date string, e.g. "D:20240101..." — not parsed here.
    pub creation_date: Option<String>,
    // Raw PDF date string of the last modification.
    pub modification_date: Option<String>,
    // PDF version from the header; empty string in Default.
    pub version: String,
    // Page count when the page tree could be read; None otherwise.
    pub page_count: Option<u32>,
}
2814
/// Iterator over the lines of a string honoring all three PDF end-of-line
/// conventions: CRLF ("\r\n"), LF ("\n") and bare CR ("\r").
///
/// Unlike `str::lines`, a lone carriage return also terminates a line,
/// and a CRLF pair counts as a single terminator.
pub struct EOLIter<'s> {
    remainder: &'s str,
}
impl<'s> Iterator for EOLIter<'s> {
    type Item = &'s str;

    fn next(&mut self) -> Option<Self::Item> {
        if self.remainder.is_empty() {
            return None;
        }

        // Locate the earliest terminator of either flavor.
        match self.remainder.find(|c| c == '\r' || c == '\n') {
            Some(pos) => {
                let line = &self.remainder[..pos];
                let bytes = self.remainder.as_bytes();
                // A CR immediately followed by LF is one CRLF terminator.
                let sep_len = if bytes[pos] == b'\r' && bytes.get(pos + 1) == Some(&b'\n') {
                    2
                } else {
                    1
                };
                self.remainder = &self.remainder[pos + sep_len..];
                Some(line)
            }
            None => {
                // Final line with no trailing terminator.
                let line = std::mem::take(&mut self.remainder);
                Some(line)
            }
        }
    }
}
/// Extension trait adding an EOL-aware line iterator to string-like types.
///
/// PDF allows CRLF, LF, or bare CR as line endings; `str::lines` does not
/// split on bare CR, so parsing code should use `pdf_lines` instead.
pub trait PDFLines: AsRef<str> {
    /// Returns an iterator over the lines of this string using PDF
    /// end-of-line conventions.
    fn pdf_lines(&self) -> EOLIter<'_> {
        EOLIter {
            remainder: self.as_ref(),
        }
    }
}
impl PDFLines for &str {}
impl<'a> PDFLines for std::borrow::Cow<'a, str> {}
impl PDFLines for String {}
2851
2852#[cfg(test)]
2853mod tests {
2854
2855 use super::*;
2856 use crate::parser::objects::{PdfName, PdfString};
2857 use crate::parser::test_helpers::*;
2858 use crate::parser::ParseOptions;
2859 use std::io::Cursor;
2860
    // --- Construction, version parsing and basic object access ---

    #[test]
    fn test_reader_construction() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let result = PdfReader::new(cursor);
        assert!(result.is_ok());
    }

    #[test]
    fn test_reader_version() {
        // The minimal fixture declares %PDF-1.4.
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let reader = PdfReader::new(cursor).unwrap();
        assert_eq!(reader.version().major, 1);
        assert_eq!(reader.version().minor, 4);
    }

    #[test]
    fn test_reader_different_versions() {
        let versions = vec![
            "1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7", "2.0",
        ];

        for version in versions {
            let pdf_data = create_pdf_with_version(version);
            let cursor = Cursor::new(pdf_data);
            let reader = PdfReader::new(cursor).unwrap();

            // Parsed version must round-trip the header string exactly.
            let parts: Vec<&str> = version.split('.').collect();
            assert_eq!(reader.version().major, parts[0].parse::<u8>().unwrap());
            assert_eq!(reader.version().minor, parts[1].parse::<u8>().unwrap());
        }
    }

    #[test]
    fn test_reader_catalog() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let catalog = reader.catalog();
        assert!(catalog.is_ok());

        let catalog_dict = catalog.unwrap();
        assert_eq!(
            catalog_dict.get("Type"),
            Some(&PdfObject::Name(PdfName("Catalog".to_string())))
        );
    }

    #[test]
    fn test_reader_info_none() {
        // The minimal fixture carries no /Info dictionary.
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let info = reader.info().unwrap();
        assert!(info.is_none());
    }

    #[test]
    fn test_reader_info_present() {
        let pdf_data = create_pdf_with_info();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let info = reader.info().unwrap();
        assert!(info.is_some());

        // PDF strings are stored as raw bytes.
        let info_dict = info.unwrap();
        assert_eq!(
            info_dict.get("Title"),
            Some(&PdfObject::String(PdfString(
                "Test PDF".to_string().into_bytes()
            )))
        );
        assert_eq!(
            info_dict.get("Author"),
            Some(&PdfObject::String(PdfString(
                "Test Author".to_string().into_bytes()
            )))
        );
    }

    #[test]
    fn test_reader_get_object() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        // Object 1 is the catalog in the minimal fixture.
        let obj = reader.get_object(1, 0);
        assert!(obj.is_ok());

        let catalog = obj.unwrap();
        assert!(catalog.as_dict().is_some());
    }

    #[test]
    fn test_reader_get_invalid_object() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        // Object 999 is not in the xref table.
        let obj = reader.get_object(999, 0);
        assert!(obj.is_err());
    }

    #[test]
    fn test_reader_get_free_object() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        // Object 0 heads the free list; free objects resolve to Null.
        let obj = reader.get_object(0, 65535);
        assert!(obj.is_ok());
        assert_eq!(obj.unwrap(), &PdfObject::Null);
    }
2981
    // --- Reference resolution, caching and malformed-input handling ---

    #[test]
    fn test_reader_resolve_reference() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let ref_obj = PdfObject::Reference(1, 0);
        let resolved = reader.resolve(&ref_obj);

        assert!(resolved.is_ok());
        assert!(resolved.unwrap().as_dict().is_some());
    }

    #[test]
    fn test_reader_resolve_non_reference() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        // Non-reference objects resolve to themselves.
        let int_obj = PdfObject::Integer(42);
        let resolved = reader.resolve(&int_obj).unwrap();

        assert_eq!(resolved, &PdfObject::Integer(42));
    }

    #[test]
    fn test_reader_cache_behavior() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        // Fetch the same object twice; the second hit should be served
        // from the object cache and still look identical.
        let obj1 = reader.get_object(1, 0).unwrap();
        assert!(obj1.as_dict().is_some());

        let obj2 = reader.get_object(1, 0).unwrap();
        assert!(obj2.as_dict().is_some());
    }

    #[test]
    fn test_reader_wrong_generation() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        // A generation mismatch must not silently return the object.
        let obj = reader.get_object(1, 99);
        assert!(obj.is_err());
    }

    #[test]
    fn test_reader_invalid_pdf() {
        let invalid_data = b"This is not a PDF file";
        let cursor = Cursor::new(invalid_data.to_vec());
        let result = PdfReader::new(cursor);

        assert!(result.is_err());
    }

    #[test]
    fn test_reader_corrupt_xref() {
        // startxref points into garbage instead of a valid xref table.
        let corrupt_pdf = b"%PDF-1.4
1 0 obj
<< /Type /Catalog >>
endobj
xref
corrupted xref table
trailer
<< /Size 2 /Root 1 0 R >>
startxref
24
%%EOF"
        .to_vec();

        let cursor = Cursor::new(corrupt_pdf);
        let result = PdfReader::new(cursor);
        assert!(result.is_err());
    }

    #[test]
    fn test_reader_missing_trailer() {
        // xref present, but no trailer dictionary at all.
        let pdf_no_trailer = b"%PDF-1.4
1 0 obj
<< /Type /Catalog >>
endobj
xref
0 2
0000000000 65535 f
0000000009 00000 n
startxref
24
%%EOF"
        .to_vec();

        let cursor = Cursor::new(pdf_no_trailer);
        let result = PdfReader::new(cursor);
        assert!(result.is_err());
    }

    #[test]
    fn test_reader_empty_pdf() {
        let cursor = Cursor::new(Vec::new());
        let result = PdfReader::new(cursor);
        assert!(result.is_err());
    }

    #[test]
    fn test_reader_page_count() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        // The minimal fixture has an empty page tree.
        let count = reader.page_count();
        assert!(count.is_ok());
        assert_eq!(count.unwrap(), 0);
    }
3105
    // --- Document conversion, page tree access and metadata ---

    #[test]
    fn test_reader_into_document() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let reader = PdfReader::new(cursor).unwrap();

        // Conversion consumes the reader and yields a document facade.
        let document = reader.into_document();
        let page_count = document.page_count();
        assert!(page_count.is_ok());
    }

    #[test]
    fn test_reader_pages_dict() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let pages = reader.pages();
        assert!(pages.is_ok());
        let pages_dict = pages.unwrap();
        assert_eq!(
            pages_dict.get("Type"),
            Some(&PdfObject::Name(PdfName("Pages".to_string())))
        );
    }

    #[test]
    fn test_reader_pdf_with_binary_data() {
        // Header followed by the conventional binary-marker comment line.
        let pdf_data = create_pdf_with_binary_marker();

        let cursor = Cursor::new(pdf_data);
        let result = PdfReader::new(cursor);
        assert!(result.is_ok());
    }

    #[test]
    fn test_reader_metadata() {
        let pdf_data = create_pdf_with_info();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        // Info dictionary entries surface as DocumentMetadata fields.
        let metadata = reader.metadata().unwrap();
        assert_eq!(metadata.title, Some("Test PDF".to_string()));
        assert_eq!(metadata.author, Some("Test Author".to_string()));
        assert_eq!(metadata.subject, Some("Testing".to_string()));
        assert_eq!(metadata.version, "1.4".to_string());
    }

    #[test]
    fn test_reader_metadata_empty() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        // No /Info: only header-derived fields are populated.
        let metadata = reader.metadata().unwrap();
        assert!(metadata.title.is_none());
        assert!(metadata.author.is_none());
        assert_eq!(metadata.version, "1.4".to_string());
        assert_eq!(metadata.page_count, Some(0));
    }

    #[test]
    fn test_reader_object_number_mismatch() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        // Wrong generation for an existing object.
        let result = reader.get_object(1, 99);
        assert!(result.is_err());

        // Object number absent from the xref table.
        let result2 = reader.get_object(999, 0);
        assert!(result2.is_err());
    }

    #[test]
    fn test_document_metadata_struct() {
        // Fully populated struct round-trips its fields.
        let metadata = DocumentMetadata {
            title: Some("Title".to_string()),
            author: Some("Author".to_string()),
            subject: Some("Subject".to_string()),
            keywords: Some("Keywords".to_string()),
            creator: Some("Creator".to_string()),
            producer: Some("Producer".to_string()),
            creation_date: Some("D:20240101".to_string()),
            modification_date: Some("D:20240102".to_string()),
            version: "1.5".to_string(),
            page_count: Some(10),
        };

        assert_eq!(metadata.title, Some("Title".to_string()));
        assert_eq!(metadata.page_count, Some(10));
    }

    #[test]
    fn test_document_metadata_default() {
        // Default is all-None with an empty version string.
        let metadata = DocumentMetadata::default();
        assert!(metadata.title.is_none());
        assert!(metadata.author.is_none());
        assert!(metadata.subject.is_none());
        assert!(metadata.keywords.is_none());
        assert!(metadata.creator.is_none());
        assert!(metadata.producer.is_none());
        assert!(metadata.creation_date.is_none());
        assert!(metadata.modification_date.is_none());
        assert_eq!(metadata.version, "".to_string());
        assert!(metadata.page_count.is_none());
    }

    #[test]
    fn test_document_metadata_clone() {
        let metadata = DocumentMetadata {
            title: Some("Test".to_string()),
            version: "1.4".to_string(),
            ..Default::default()
        };

        let cloned = metadata.clone();
        assert_eq!(cloned.title, Some("Test".to_string()));
        assert_eq!(cloned.version, "1.4".to_string());
    }
3233
    // --- Trailer validation and ParseOptions behavior ---

    #[test]
    fn test_reader_trailer_validation_error() {
        // Trailer lacks the mandatory /Root entry.
        let bad_pdf = b"%PDF-1.4
1 0 obj
<< /Type /Catalog >>
endobj
xref
0 2
0000000000 65535 f
0000000009 00000 n
trailer
<< /Size 2 >>
startxref
46
%%EOF"
        .to_vec();

        let cursor = Cursor::new(bad_pdf);
        let result = PdfReader::new(cursor);
        assert!(result.is_err());
    }

    #[test]
    fn test_reader_with_options() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut options = ParseOptions::default();
        options.lenient_streams = true;
        options.max_recovery_bytes = 2000;
        options.collect_warnings = true;

        let reader = PdfReader::new_with_options(cursor, options);
        assert!(reader.is_ok());
    }

    #[test]
    fn test_lenient_stream_parsing() {
        // Stream object declares /Length 10 but its data is longer.
        let pdf_data = b"%PDF-1.4
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R >>
endobj
4 0 obj
<< /Length 10 >>
stream
This is a longer stream than 10 bytes
endstream
endobj
xref
0 5
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000116 00000 n
0000000219 00000 n
trailer
<< /Size 5 /Root 1 0 R >>
startxref
299
%%EOF"
        .to_vec();

        // Strict mode must reject the length mismatch.
        let cursor = Cursor::new(pdf_data.clone());
        let strict_options = ParseOptions::strict();
        let strict_reader = PdfReader::new_with_options(cursor, strict_options);
        assert!(strict_reader.is_err());

        // Lenient mode still fails here (xref offsets are also wrong).
        let cursor = Cursor::new(pdf_data);
        let mut options = ParseOptions::default();
        options.lenient_streams = true;
        options.max_recovery_bytes = 1000;
        options.collect_warnings = false;
        let lenient_reader = PdfReader::new_with_options(cursor, options);
        assert!(lenient_reader.is_err());
    }

    #[test]
    fn test_parse_options_default() {
        let options = ParseOptions::default();
        assert!(!options.lenient_streams);
        assert_eq!(options.max_recovery_bytes, 1000);
        assert!(!options.collect_warnings);
    }

    #[test]
    fn test_parse_options_clone() {
        let mut options = ParseOptions::default();
        options.lenient_streams = true;
        options.max_recovery_bytes = 2000;
        options.collect_warnings = true;
        let cloned = options.clone();
        assert!(cloned.lenient_streams);
        assert_eq!(cloned.max_recovery_bytes, 2000);
        assert!(cloned.collect_warnings);
    }
3341
    // Builds a minimal standard-security-handler /Encrypt dictionary
    // (V1/R2, RC4-40) for tests that need one without a full file.
    #[allow(dead_code)]
    fn create_encrypted_pdf_dict() -> PdfDictionary {
        let mut dict = PdfDictionary::new();
        dict.insert(
            "Filter".to_string(),
            PdfObject::Name(PdfName("Standard".to_string())),
        );
        dict.insert("V".to_string(), PdfObject::Integer(1));
        dict.insert("R".to_string(), PdfObject::Integer(2));
        dict.insert("O".to_string(), PdfObject::String(PdfString(vec![0u8; 32])));
        dict.insert("U".to_string(), PdfObject::String(PdfString(vec![0u8; 32])));
        dict.insert("P".to_string(), PdfObject::Integer(-4));
        dict
    }

    // Builds a complete PDF whose trailer references an /Encrypt
    // dictionary, so construction exercises the encryption path.
    fn create_pdf_with_encryption() -> Vec<u8> {
        b"%PDF-1.4
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >>
endobj
4 0 obj
<< /Filter /Standard /V 1 /R 2 /O (32 bytes of owner password hash data) /U (32 bytes of user password hash data) /P -4 >>
endobj
xref
0 5
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000116 00000 n
0000000201 00000 n
trailer
<< /Size 5 /Root 1 0 R /Encrypt 4 0 R /ID [(file id)] >>
startxref
295
%%EOF"
        .to_vec()
    }
3388
    // --- Encryption detection and password-API behavior ---

    #[test]
    fn test_reader_encryption_detection() {
        // Unencrypted PDFs report unlocked with no handler attached.
        let unencrypted_pdf = create_minimal_pdf();
        let cursor = Cursor::new(unencrypted_pdf);
        let reader = PdfReader::new(cursor).unwrap();
        assert!(!reader.is_encrypted());
        assert!(reader.is_unlocked());

        // The encrypted fixture currently fails at construction time.
        let encrypted_pdf = create_pdf_with_encryption();
        let cursor = Cursor::new(encrypted_pdf);
        let result = PdfReader::new(cursor);
        assert!(result.is_err());
    }

    #[test]
    fn test_reader_encryption_methods_unencrypted() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        assert!(!reader.is_encrypted());
        assert!(reader.is_unlocked());
        assert!(reader.encryption_handler().is_none());
        assert!(reader.encryption_handler_mut().is_none());

        // Password APIs succeed trivially on unencrypted files.
        assert!(reader.unlock_with_password("any_password").unwrap());
        assert!(reader.try_empty_password().unwrap());
    }

    #[test]
    fn test_reader_encryption_handler_access() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        assert!(reader.encryption_handler().is_none());
        assert!(reader.encryption_handler_mut().is_none());

        assert!(!reader.is_encrypted());
        assert!(reader.is_unlocked());
    }

    #[test]
    fn test_reader_multiple_password_attempts() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        // Repeated attempts on an unencrypted file always succeed.
        let passwords = vec!["test1", "test2", "admin", "", "password"];
        for password in passwords {
            assert!(reader.unlock_with_password(password).unwrap());
        }

        for _ in 0..5 {
            assert!(reader.try_empty_password().unwrap());
        }
    }

    #[test]
    fn test_reader_encryption_state_consistency() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        assert!(!reader.is_encrypted());
        assert!(reader.is_unlocked());
        assert!(reader.encryption_handler().is_none());

        // Password calls must not flip the encryption state.
        let _ = reader.unlock_with_password("test");
        assert!(!reader.is_encrypted());
        assert!(reader.is_unlocked());
        assert!(reader.encryption_handler().is_none());

        let _ = reader.try_empty_password();
        assert!(!reader.is_encrypted());
        assert!(reader.is_unlocked());
        assert!(reader.encryption_handler().is_none());
    }

    #[test]
    fn test_reader_encryption_error_handling() {
        let encrypted_pdf = create_pdf_with_encryption();
        let cursor = Cursor::new(encrypted_pdf);

        let result = PdfReader::new(cursor);
        match result {
            Err(ParseError::EncryptionNotSupported) => {
                // Expected: encryption detected but not supported.
            }
            Err(_) => {
                // Other parse errors are tolerated for this fixture.
            }
            Ok(_) => {
                panic!("Should not successfully create reader for encrypted PDF without password");
            }
        }
    }

    #[test]
    fn test_reader_encryption_with_options() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);

        // Strict and lenient modes agree on unencrypted files.
        let strict_options = ParseOptions::strict();
        let strict_reader = PdfReader::new_with_options(cursor, strict_options).unwrap();
        assert!(!strict_reader.is_encrypted());
        assert!(strict_reader.is_unlocked());

        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let lenient_options = ParseOptions::lenient();
        let lenient_reader = PdfReader::new_with_options(cursor, lenient_options).unwrap();
        assert!(!lenient_reader.is_encrypted());
        assert!(lenient_reader.is_unlocked());
    }

    #[test]
    fn test_reader_encryption_integration_edge_cases() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        // Unusual password shapes must all succeed on unencrypted files.
        assert!(reader.unlock_with_password("").unwrap());
        assert!(reader.unlock_with_password(" ").unwrap());
        assert!(reader
            .unlock_with_password("very_long_password_that_exceeds_normal_length")
            .unwrap());
        assert!(reader.unlock_with_password("unicode_test_ñáéíóú").unwrap());

        assert!(reader.unlock_with_password("pass@#$%^&*()").unwrap());
        assert!(reader.unlock_with_password("pass\nwith\nnewlines").unwrap());
        assert!(reader.unlock_with_password("pass\twith\ttabs").unwrap());
    }
3538
3539 mod rigorous {
3540 use super::*;
3541
        // --- Error paths on malformed input ---

        #[test]
        fn test_reader_invalid_pdf_header() {
            let invalid_data = b"This is not a PDF file";
            let cursor = Cursor::new(invalid_data.to_vec());
            let result = PdfReader::new(cursor);

            assert!(result.is_err(), "Should fail on invalid PDF header");
        }

        #[test]
        fn test_reader_truncated_header() {
            // Header cut off before the version digits.
            let truncated = b"%PDF";
            let cursor = Cursor::new(truncated.to_vec());
            let result = PdfReader::new(cursor);

            assert!(result.is_err(), "Should fail on truncated header");
        }

        #[test]
        fn test_reader_empty_file() {
            let empty = Vec::new();
            let cursor = Cursor::new(empty);
            let result = PdfReader::new(cursor);

            assert!(result.is_err(), "Should fail on empty file");
        }

        #[test]
        fn test_reader_malformed_version() {
            // Non-numeric version digits; parsing may recover or fail —
            // the test only requires no panic.
            let malformed = b"%PDF-X.Y\n%%\xE2\xE3\xCF\xD3\n";
            let cursor = Cursor::new(malformed.to_vec());
            let result = PdfReader::new(cursor);

            if let Ok(reader) = result {
                let _version = reader.version();
            }
        }

        #[test]
        fn test_reader_get_nonexistent_object() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);
            let mut reader = PdfReader::new(cursor).unwrap();

            let result = reader.get_object(999, 0);

            assert!(result.is_err(), "Should fail when object doesn't exist");
        }

        #[test]
        fn test_reader_get_object_wrong_generation() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);
            let mut reader = PdfReader::new(cursor).unwrap();

            let result = reader.get_object(1, 99);

            // Error type is implementation-defined; just consume it.
            if let Err(e) = result {
                let _ = e;
            }
        }

        // --- Reference resolution ---

        #[test]
        fn test_resolve_direct_object() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);
            let mut reader = PdfReader::new(cursor).unwrap();

            // Direct (non-reference) objects resolve to themselves.
            let direct_obj = PdfObject::Integer(42);

            let resolved = reader.resolve(&direct_obj).unwrap();

            assert_eq!(resolved, &PdfObject::Integer(42));
        }

        #[test]
        fn test_resolve_reference() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);
            let mut reader = PdfReader::new(cursor).unwrap();

            // Pull the /Pages reference out of the catalog first; the
            // borrow of `reader` must end before resolve() is called.
            let pages_ref = {
                let catalog = reader.catalog().unwrap();
                if let Some(PdfObject::Reference(obj_num, gen_num)) = catalog.get("Pages") {
                    PdfObject::Reference(*obj_num, *gen_num)
                } else {
                    panic!("Catalog /Pages must be a Reference");
                }
            };

            let resolved = reader.resolve(&pages_ref).unwrap();

            if let PdfObject::Dictionary(dict) = resolved {
                assert_eq!(
                    dict.get("Type"),
                    Some(&PdfObject::Name(PdfName("Pages".to_string())))
                );
            } else {
                panic!("Expected dictionary, got: {:?}", resolved);
            }
        }

        // --- Encryption API on unencrypted input ---

        #[test]
        fn test_is_encrypted_on_unencrypted() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);
            let reader = PdfReader::new(cursor).unwrap();

            assert!(
                !reader.is_encrypted(),
                "Minimal PDF should not be encrypted"
            );
        }

        #[test]
        fn test_is_unlocked_on_unencrypted() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);
            let reader = PdfReader::new(cursor).unwrap();

            assert!(reader.is_unlocked(), "Unencrypted PDF should be unlocked");
        }

        #[test]
        fn test_try_empty_password_on_unencrypted() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);
            let mut reader = PdfReader::new(cursor).unwrap();

            let result = reader.try_empty_password();
            assert!(result.is_ok());
        }

        // --- ParseOptions variants ---

        #[test]
        fn test_reader_with_strict_options() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);

            let options = ParseOptions::strict();
            let result = PdfReader::new_with_options(cursor, options);

            assert!(result.is_ok(), "Minimal PDF should parse in strict mode");
        }

        #[test]
        fn test_reader_with_lenient_options() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);

            let options = ParseOptions::lenient();
            let result = PdfReader::new_with_options(cursor, options);

            assert!(result.is_ok(), "Minimal PDF should parse in lenient mode");
        }

        #[test]
        fn test_reader_options_accessible() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);

            let options = ParseOptions::lenient();
            let reader = PdfReader::new_with_options(cursor, options.clone()).unwrap();

            // options() must expose the options the reader was built with.
            let reader_options = reader.options();
            assert_eq!(reader_options.strict_mode, options.strict_mode);
        }

        // --- Catalog and Info contents ---

        #[test]
        fn test_catalog_has_required_fields() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);
            let mut reader = PdfReader::new(cursor).unwrap();

            let catalog = reader.catalog().unwrap();

            assert_eq!(
                catalog.get("Type"),
                Some(&PdfObject::Name(PdfName("Catalog".to_string()))),
                "Catalog must have /Type /Catalog"
            );

            assert!(
                catalog.contains_key("Pages"),
                "Catalog must have /Pages entry"
            );
        }

        #[test]
        fn test_info_fields_when_present() {
            let pdf_data = create_pdf_with_info();
            let cursor = Cursor::new(pdf_data);
            let mut reader = PdfReader::new(cursor).unwrap();

            let info = reader.info().unwrap();
            assert!(info.is_some(), "PDF should have Info dictionary");

            let info_dict = info.unwrap();

            assert!(info_dict.contains_key("Title"), "Info should have Title");
            assert!(info_dict.contains_key("Author"), "Info should have Author");
        }

        #[test]
        fn test_info_none_when_absent() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);
            let mut reader = PdfReader::new(cursor).unwrap();

            let info = reader.info().unwrap();
            assert!(info.is_none(), "Minimal PDF should not have Info");
        }

        // --- Version extraction ---

        #[test]
        fn test_version_exact_values() {
            let pdf_data = create_pdf_with_version("1.7");
            let cursor = Cursor::new(pdf_data);
            let reader = PdfReader::new(cursor).unwrap();

            let version = reader.version();
            assert_eq!(version.major, 1, "Major version must be exact");
            assert_eq!(version.minor, 7, "Minor version must be exact");
        }

        #[test]
        fn test_version_pdf_20() {
            let pdf_data = create_pdf_with_version("2.0");
            let cursor = Cursor::new(pdf_data);
            let reader = PdfReader::new(cursor).unwrap();

            let version = reader.version();
            assert_eq!(version.major, 2, "PDF 2.0 major version");
            assert_eq!(version.minor, 0, "PDF 2.0 minor version");
        }
3819
3820 #[test]
3825 fn test_pages_returns_pages_dict() {
3826 let pdf_data = create_minimal_pdf();
3827 let cursor = Cursor::new(pdf_data);
3828 let mut reader = PdfReader::new(cursor).unwrap();
3829
3830 let pages_dict = reader
3831 .pages()
3832 .expect("pages() must return Pages dictionary");
3833
3834 assert_eq!(
3835 pages_dict.get("Type"),
3836 Some(&PdfObject::Name(PdfName("Pages".to_string()))),
3837 "Pages dict must have /Type /Pages"
3838 );
3839 }
3840
3841 #[test]
3842 fn test_page_count_minimal_pdf() {
3843 let pdf_data = create_minimal_pdf();
3844 let cursor = Cursor::new(pdf_data);
3845 let mut reader = PdfReader::new(cursor).unwrap();
3846
3847 let count = reader.page_count().expect("page_count() must succeed");
3848 assert_eq!(count, 0, "Minimal PDF has 0 pages");
3849 }
3850
3851 #[test]
3852 fn test_page_count_with_info_pdf() {
3853 let pdf_data = create_pdf_with_info();
3854 let cursor = Cursor::new(pdf_data);
3855 let mut reader = PdfReader::new(cursor).unwrap();
3856
3857 let count = reader.page_count().expect("page_count() must succeed");
3858 assert_eq!(count, 0, "create_pdf_with_info() has Count 0 in Pages dict");
3859 }
3860
3861 #[test]
3866 fn test_metadata_minimal_pdf() {
3867 let pdf_data = create_minimal_pdf();
3868 let cursor = Cursor::new(pdf_data);
3869 let mut reader = PdfReader::new(cursor).unwrap();
3870
3871 let meta = reader.metadata().expect("metadata() must succeed");
3872
3873 assert!(meta.title.is_none(), "Minimal PDF has no title");
3875 assert!(meta.author.is_none(), "Minimal PDF has no author");
3876 }
3877
3878 #[test]
3879 fn test_metadata_with_info() {
3880 let pdf_data = create_pdf_with_info();
3881 let cursor = Cursor::new(pdf_data);
3882 let mut reader = PdfReader::new(cursor).unwrap();
3883
3884 let meta = reader.metadata().expect("metadata() must succeed");
3885
3886 assert!(meta.title.is_some(), "PDF with Info has title");
3887 assert_eq!(meta.title.unwrap(), "Test PDF", "Title must match");
3888 assert!(meta.author.is_some(), "PDF with Info has author");
3889 assert_eq!(meta.author.unwrap(), "Test Author", "Author must match");
3890 }
3891
3892 #[test]
3897 fn test_resolve_stream_length_direct_integer() {
3898 let pdf_data = create_minimal_pdf();
3899 let cursor = Cursor::new(pdf_data);
3900 let mut reader = PdfReader::new(cursor).unwrap();
3901
3902 let length_obj = PdfObject::Integer(100);
3904
3905 let length = reader
3906 .resolve_stream_length(&length_obj)
3907 .expect("resolve_stream_length must succeed");
3908 assert_eq!(length, Some(100), "Direct integer must be resolved");
3909 }
3910
3911 #[test]
3912 fn test_resolve_stream_length_negative_integer() {
3913 let pdf_data = create_minimal_pdf();
3914 let cursor = Cursor::new(pdf_data);
3915 let mut reader = PdfReader::new(cursor).unwrap();
3916
3917 let length_obj = PdfObject::Integer(-10);
3919
3920 let length = reader
3921 .resolve_stream_length(&length_obj)
3922 .expect("resolve_stream_length must succeed");
3923 assert_eq!(length, None, "Negative integer returns None");
3924 }
3925
3926 #[test]
3927 fn test_resolve_stream_length_non_integer() {
3928 let pdf_data = create_minimal_pdf();
3929 let cursor = Cursor::new(pdf_data);
3930 let mut reader = PdfReader::new(cursor).unwrap();
3931
3932 let name_obj = PdfObject::Name(PdfName("Test".to_string()));
3934
3935 let length = reader
3936 .resolve_stream_length(&name_obj)
3937 .expect("resolve_stream_length must succeed");
3938 assert_eq!(length, None, "Non-integer object returns None");
3939 }
3940
3941 #[test]
3946 fn test_get_all_pages_empty_pdf() {
3947 let pdf_data = create_minimal_pdf();
3948 let cursor = Cursor::new(pdf_data);
3949 let mut reader = PdfReader::new(cursor).unwrap();
3950
3951 let pages = reader
3952 .get_all_pages()
3953 .expect("get_all_pages() must succeed");
3954 assert_eq!(pages.len(), 0, "Minimal PDF has 0 pages");
3955 }
3956
3957 #[test]
3958 fn test_get_all_pages_with_info() {
3959 let pdf_data = create_pdf_with_info();
3960 let cursor = Cursor::new(pdf_data);
3961 let mut reader = PdfReader::new(cursor).unwrap();
3962
3963 let pages = reader
3964 .get_all_pages()
3965 .expect("get_all_pages() must succeed");
3966 assert_eq!(
3967 pages.len(),
3968 0,
3969 "create_pdf_with_info() has 0 pages (Count 0)"
3970 );
3971 }
3972
3973 #[test]
3978 fn test_into_document_consumes_reader() {
3979 let pdf_data = create_minimal_pdf();
3980 let cursor = Cursor::new(pdf_data);
3981 let reader = PdfReader::new(cursor).unwrap();
3982
3983 let document = reader.into_document();
3984
3985 let version = document.version().expect("Document must have version");
3987 assert!(
3988 version.starts_with("1."),
3989 "Document must have PDF 1.x version, got: {}",
3990 version
3991 );
3992
3993 let page_count = document
3995 .page_count()
3996 .expect("Document must allow page_count()");
3997 assert_eq!(
3998 page_count, 0,
3999 "Minimal PDF has 0 pages (Count 0 in test helper)"
4000 );
4001 }
4002
4003 #[test]
4008 fn test_clear_parse_context() {
4009 let pdf_data = create_minimal_pdf();
4010 let cursor = Cursor::new(pdf_data);
4011 let mut reader = PdfReader::new(cursor).unwrap();
4012
4013 reader.clear_parse_context();
4015
4016 let version = reader.version();
4018 assert_eq!(version.major, 1, "Reader must still work after clear");
4019 }
4020
4021 #[test]
4022 fn test_parse_context_mut_accessible() {
4023 let pdf_data = create_minimal_pdf();
4024 let cursor = Cursor::new(pdf_data);
4025 let mut reader = PdfReader::new(cursor).unwrap();
4026
4027 let context = reader.parse_context_mut();
4028
4029 let initial_depth = context.depth;
4031 assert_eq!(initial_depth, 0, "Parse context must start with depth 0");
4032
4033 assert!(
4035 context.max_depth > 0,
4036 "Parse context must have positive max_depth"
4037 );
4038 }
4039
4040 #[test]
4045 fn test_find_bytes_basic() {
4046 let haystack = b"Hello World";
4047 let needle = b"World";
4048 let pos = find_bytes(haystack, needle);
4049 assert_eq!(pos, Some(6), "Must find 'World' at position 6");
4050 }
4051
4052 #[test]
4053 fn test_find_bytes_not_found() {
4054 let haystack = b"Hello World";
4055 let needle = b"Rust";
4056 let pos = find_bytes(haystack, needle);
4057 assert_eq!(pos, None, "Must return None when not found");
4058 }
4059
4060 #[test]
4061 fn test_find_bytes_at_start() {
4062 let haystack = b"Hello World";
4063 let needle = b"Hello";
4064 let pos = find_bytes(haystack, needle);
4065 assert_eq!(pos, Some(0), "Must find at position 0");
4066 }
4067
4068 #[test]
4069 fn test_is_immediate_stream_start_with_stream() {
4070 let data = b"stream\ndata";
4071 assert!(
4072 is_immediate_stream_start(data),
4073 "Must detect 'stream' at start"
4074 );
4075 }
4076
4077 #[test]
4078 fn test_is_immediate_stream_start_with_whitespace() {
4079 let data = b" \n\tstream\ndata";
4080 assert!(
4081 is_immediate_stream_start(data),
4082 "Must detect 'stream' after whitespace"
4083 );
4084 }
4085
4086 #[test]
4087 fn test_is_immediate_stream_start_no_stream() {
4088 let data = b"endobj";
4089 assert!(
4090 !is_immediate_stream_start(data),
4091 "Must return false when 'stream' absent"
4092 );
4093 }
4094 }
4095}