1use super::encryption_handler::EncryptionHandler;
6use super::header::PdfHeader;
7use super::object_stream::ObjectStream;
8use super::objects::{PdfDictionary, PdfObject};
9use super::stack_safe::StackSafeContext;
10use super::trailer::PdfTrailer;
11use super::xref::XRefTable;
12use super::{ParseError, ParseResult};
13use std::collections::HashMap;
14use std::fs::File;
15use std::io::{BufReader, Read, Seek, SeekFrom};
16use std::path::Path;
17
/// Returns the byte offset of the first occurrence of `needle` within
/// `haystack`, or `None` if it does not occur.
///
/// An empty needle matches at offset 0. This case must be handled
/// explicitly because `slice::windows` panics on a zero-length window.
fn find_bytes(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    if needle.is_empty() {
        return Some(0);
    }
    haystack
        .windows(needle.len())
        .position(|window| window == needle)
}
24
/// Reports whether `data`, after skipping any leading PDF whitespace
/// (space, tab, LF, CR), begins with the keyword `stream`.
fn is_immediate_stream_start(data: &[u8]) -> bool {
    // Index of the first non-whitespace byte; if the slice is all
    // whitespace, the remainder is empty and cannot start with "stream".
    let first_non_ws = data
        .iter()
        .position(|&b| !matches!(b, b' ' | b'\t' | b'\n' | b'\r'))
        .unwrap_or(data.len());
    data[first_non_ws..].starts_with(b"stream")
}
37
/// Stateful reader over a seekable PDF byte source.
///
/// Holds the parsed header, cross-reference table, and trailer, plus
/// caches of already-decoded objects so repeated lookups avoid re-reading
/// and re-parsing the underlying bytes.
pub struct PdfReader<R: Read + Seek> {
    /// Buffered handle to the raw PDF bytes.
    reader: BufReader<R>,
    /// Parsed `%PDF-x.y` file header.
    header: PdfHeader,
    /// Cross-reference table mapping object numbers to byte offsets.
    xref: XRefTable,
    /// Trailer dictionary (Root / Info / Encrypt / ID entries).
    trailer: PdfTrailer,
    /// Decoded objects keyed by (object number, generation).
    object_cache: HashMap<(u32, u16), PdfObject>,
    /// Parsed object streams keyed by the stream's own object number.
    object_stream_cache: HashMap<u32, ObjectStream>,
    /// Cached page tree; `None` until populated elsewhere.
    page_tree: Option<super::page_tree::PageTree>,
    /// Guard used around nested object parsing (enter/exit pairs).
    parse_context: StackSafeContext,
    /// Strict vs. lenient parsing configuration.
    options: super::ParseOptions,
    /// Present when the trailer declares encryption; `None` for plain files.
    encryption_handler: Option<EncryptionHandler>,
}
57
58impl<R: Read + Seek> PdfReader<R> {
59 pub fn options(&self) -> &super::ParseOptions {
61 &self.options
62 }
63
64 pub fn is_encrypted(&self) -> bool {
66 self.encryption_handler.is_some()
67 }
68
69 pub fn is_unlocked(&self) -> bool {
71 match &self.encryption_handler {
72 Some(handler) => handler.is_unlocked(),
73 None => true, }
75 }
76
77 pub fn encryption_handler_mut(&mut self) -> Option<&mut EncryptionHandler> {
79 self.encryption_handler.as_mut()
80 }
81
82 pub fn encryption_handler(&self) -> Option<&EncryptionHandler> {
84 self.encryption_handler.as_ref()
85 }
86
87 pub fn unlock_with_password(&mut self, password: &str) -> ParseResult<bool> {
89 match &mut self.encryption_handler {
90 Some(handler) => {
91 if handler.unlock_with_user_password(password).unwrap_or(false) {
93 Ok(true)
94 } else {
95 Ok(handler
97 .unlock_with_owner_password(password)
98 .unwrap_or(false))
99 }
100 }
101 None => Ok(true), }
103 }
104
105 pub fn try_empty_password(&mut self) -> ParseResult<bool> {
107 match &mut self.encryption_handler {
108 Some(handler) => Ok(handler.try_empty_password().unwrap_or(false)),
109 None => Ok(true), }
111 }
112}
113
114impl PdfReader<File> {
115 pub fn open<P: AsRef<Path>>(path: P) -> ParseResult<Self> {
117 use std::io::Write;
118 let mut debug_file = std::fs::File::create("/tmp/pdf_open_debug.log").ok();
119 if let Some(ref mut f) = debug_file {
120 writeln!(f, "Opening file: {:?}", path.as_ref()).ok();
121 }
122 let file = File::open(path)?;
123 if let Some(ref mut f) = debug_file {
124 writeln!(f, "File opened successfully").ok();
125 }
126 let options = super::ParseOptions::lenient();
128 Self::new_with_options(file, options)
129 }
130
131 pub fn open_strict<P: AsRef<Path>>(path: P) -> ParseResult<Self> {
133 let file = File::open(path)?;
134 let options = super::ParseOptions::strict();
135 Self::new_with_options(file, options)
136 }
137
138 pub fn open_with_options<P: AsRef<Path>>(
140 path: P,
141 options: super::ParseOptions,
142 ) -> ParseResult<Self> {
143 let file = File::open(path)?;
144 Self::new_with_options(file, options)
145 }
146
147 pub fn open_document<P: AsRef<Path>>(
149 path: P,
150 ) -> ParseResult<super::document::PdfDocument<File>> {
151 let reader = Self::open(path)?;
152 Ok(reader.into_document())
153 }
154}
155
156impl<R: Read + Seek> PdfReader<R> {
157 pub fn new(reader: R) -> ParseResult<Self> {
159 Self::new_with_options(reader, super::ParseOptions::default())
160 }
161
162 pub fn new_with_options(reader: R, options: super::ParseOptions) -> ParseResult<Self> {
164 let mut buf_reader = BufReader::new(reader);
165
166 let start_pos = buf_reader.stream_position()?;
168 buf_reader.seek(SeekFrom::End(0))?;
169 let file_size = buf_reader.stream_position()?;
170 buf_reader.seek(SeekFrom::Start(start_pos))?;
171
172 if file_size == 0 {
173 return Err(ParseError::EmptyFile);
174 }
175
176 use std::io::Write;
178 let mut debug_file = std::fs::File::create("/tmp/pdf_debug.log").ok();
179 if let Some(ref mut f) = debug_file {
180 writeln!(f, "Parsing PDF header...").ok();
181 }
182 let header = PdfHeader::parse(&mut buf_reader)?;
183 if let Some(ref mut f) = debug_file {
184 writeln!(f, "Header parsed: version {}", header.version).ok();
185 }
186
187 if let Some(ref mut f) = debug_file {
189 writeln!(f, "Parsing XRef table...").ok();
190 }
191 let xref = XRefTable::parse_with_options(&mut buf_reader, &options)?;
192 if let Some(ref mut f) = debug_file {
193 writeln!(f, "XRef table parsed with {} entries", xref.len()).ok();
194 }
195
196 let trailer_dict = xref.trailer().ok_or(ParseError::InvalidTrailer)?.clone();
198
199 let xref_offset = xref.xref_offset();
200 let trailer = PdfTrailer::from_dict(trailer_dict, xref_offset)?;
201
202 trailer.validate()?;
204
205 let encryption_handler = if EncryptionHandler::detect_encryption(trailer.dict()) {
207 if let Ok(Some((encrypt_obj_num, encrypt_gen_num))) = trailer.encrypt() {
208 let mut temp_reader = Self {
210 reader: buf_reader,
211 header: header.clone(),
212 xref: xref.clone(),
213 trailer: trailer.clone(),
214 object_cache: HashMap::new(),
215 object_stream_cache: HashMap::new(),
216 page_tree: None,
217 parse_context: StackSafeContext::new(),
218 options: options.clone(),
219 encryption_handler: None,
220 };
221
222 let encrypt_obj = temp_reader.get_object(encrypt_obj_num, encrypt_gen_num)?;
224 if let Some(encrypt_dict) = encrypt_obj.as_dict() {
225 let file_id = trailer.id().and_then(|id_obj| {
227 if let PdfObject::Array(ref id_array) = id_obj {
228 if let Some(PdfObject::String(ref id_bytes)) = id_array.get(0) {
229 Some(id_bytes.as_bytes().to_vec())
230 } else {
231 None
232 }
233 } else {
234 None
235 }
236 });
237
238 match EncryptionHandler::new(encrypt_dict, file_id) {
239 Ok(handler) => {
240 buf_reader = temp_reader.reader;
242 Some(handler)
243 }
244 Err(_) => {
245 let _ = temp_reader.reader;
247 return Err(ParseError::EncryptionNotSupported);
248 }
249 }
250 } else {
251 let _ = temp_reader.reader;
252 return Err(ParseError::EncryptionNotSupported);
253 }
254 } else {
255 return Err(ParseError::EncryptionNotSupported);
256 }
257 } else {
258 None
259 };
260
261 Ok(Self {
262 reader: buf_reader,
263 header,
264 xref,
265 trailer,
266 object_cache: HashMap::new(),
267 object_stream_cache: HashMap::new(),
268 page_tree: None,
269 parse_context: StackSafeContext::new(),
270 options,
271 encryption_handler,
272 })
273 }
274
    /// The PDF version declared in the file header (e.g. 1.7).
    pub fn version(&self) -> &super::header::PdfVersion {
        &self.header.version
    }
279
    /// Returns the document catalog (the trailer's /Root dictionary).
    ///
    /// Recovery chain when /Root is missing or unparseable:
    /// 1. `find_root_fallback` on the trailer;
    /// 2. `find_catalog_object` scan;
    /// 3. manual byte-level reconstruction via `extract_object_manually`,
    ///    caching the result and registering a synthetic xref entry.
    ///
    /// # Errors
    /// `MissingKey("Root")` when no root can be found at all, or a
    /// `SyntaxError` when the catalog cannot be parsed or reconstructed
    /// as a dictionary.
    pub fn catalog(&mut self) -> ParseResult<&PdfDictionary> {
        let (obj_num, gen_num) = match self.trailer.root() {
            Ok(root) => root,
            Err(_) => {
                #[cfg(debug_assertions)]
                eprintln!("Warning: Trailer missing Root entry, attempting recovery");

                if let Some(root) = self.trailer.find_root_fallback() {
                    root
                } else {
                    if let Ok(catalog_ref) = self.find_catalog_object() {
                        catalog_ref
                    } else {
                        return Err(ParseError::MissingKey("Root".to_string()));
                    }
                }
            }
        };

        let key = (obj_num, gen_num);
        // Probe the object first and drop the borrow before deciding;
        // the second `get_object` below re-borrows to return a reference
        // (borrow-checker workaround — the object is cached, so the
        // second call is a cheap cache hit).
        let needs_reconstruction = {
            match self.get_object(obj_num, gen_num) {
                Ok(catalog) => {
                    if catalog.as_dict().is_some() {
                        false
                    } else {
                        true
                    }
                }
                Err(_) => {
                    true
                }
            }
        };

        if !needs_reconstruction {
            let catalog = self.get_object(obj_num, gen_num)?;
            // unwrap is safe: the probe above verified as_dict().is_some().
            return Ok(catalog.as_dict().unwrap());
        }

        eprintln!(
            "DEBUG: Catalog object {} needs reconstruction, attempting manual reconstruction",
            obj_num
        );

        match self.extract_object_manually(obj_num) {
            Ok(dict) => {
                eprintln!(
                    "DEBUG: Successfully reconstructed catalog {} manually",
                    obj_num
                );
                let obj = PdfObject::Dictionary(dict);
                self.object_cache.insert(key, obj);

                // Register a synthetic xref entry (offset 0 — the object
                // only exists in the cache) so later lookups succeed.
                use crate::parser::xref::XRefEntry;
                let xref_entry = XRefEntry {
                    offset: 0,
                    generation: gen_num,
                    in_use: true,
                };
                self.xref.add_entry(obj_num, xref_entry);
                eprintln!("DEBUG: Added catalog object {} to XRef table", obj_num);

                if let Some(PdfObject::Dictionary(ref dict)) = self.object_cache.get(&key) {
                    return Ok(dict);
                }
            }
            Err(e) => {
                eprintln!("DEBUG: Manual catalog reconstruction failed: {:?}", e);
            }
        }

        Err(ParseError::SyntaxError {
            position: 0,
            message: format!(
                "Catalog object {} could not be parsed or reconstructed as a dictionary",
                obj_num
            ),
        })
    }
376
377 pub fn info(&mut self) -> ParseResult<Option<&PdfDictionary>> {
379 match self.trailer.info() {
380 Some((obj_num, gen_num)) => {
381 let info = self.get_object(obj_num, gen_num)?;
382 Ok(info.as_dict())
383 }
384 None => Ok(None),
385 }
386 }
387
    /// Loads the object `obj_num gen_num R`, consulting the in-memory
    /// cache first (delegates to `load_object_from_disk`).
    pub fn get_object(&mut self, obj_num: u32, gen_num: u16) -> ParseResult<&PdfObject> {
        self.load_object_from_disk(obj_num, gen_num)
    }
392
    /// Core object loader: cache lookup, then object-stream lookup, then
    /// a direct read at the xref offset with lenient-mode fallbacks.
    ///
    /// Lenient mode (`options.lenient_syntax`) downgrades many structural
    /// errors to warnings: generation mismatches, bad object/generation
    /// tokens, and missing `obj`/`endobj` keywords are tolerated, and
    /// unknown objects become `PdfObject::Null`.
    fn load_object_from_disk(&mut self, obj_num: u32, gen_num: u16) -> ParseResult<&PdfObject> {
        let key = (obj_num, gen_num);

        // Fast path: already decoded.
        if self.object_cache.contains_key(&key) {
            return Ok(&self.object_cache[&key]);
        }

        // Compressed objects live inside an object stream, not at a file
        // offset; route those through the object-stream path.
        if let Some(ext_entry) = self.xref.get_extended_entry(obj_num) {
            if let Some((stream_obj_num, index_in_stream)) = ext_entry.compressed_info {
                eprintln!(
                    "DEBUG: Object {} found in Object Stream {} at index {}",
                    obj_num, stream_obj_num, index_in_stream
                );
                return self.get_compressed_object(
                    obj_num,
                    gen_num,
                    stream_obj_num,
                    index_in_stream,
                );
            }
        } else {
            eprintln!("DEBUG: Object {} not found in extended entries", obj_num);
        }

        // Resolve the byte offset from the xref table, handling free
        // entries, generation mismatches, and missing entries.
        let (current_offset, _generation) = {
            let entry = self.xref.get_entry(obj_num);

            match entry {
                Some(entry) => {
                    // Free (deleted) object: cache and return Null.
                    if !entry.in_use {
                        self.object_cache.insert(key, PdfObject::Null);
                        return Ok(&self.object_cache[&key]);
                    }

                    if entry.generation != gen_num {
                        if self.options.lenient_syntax {
                            if self.options.collect_warnings {
                                eprintln!("Warning: Object {} generation mismatch - expected {}, found {}, using available",
                                    obj_num, gen_num, entry.generation);
                            }
                        } else {
                            return Err(ParseError::InvalidReference(obj_num, gen_num));
                        }
                    }

                    (entry.offset, entry.generation)
                }
                None => {
                    // Not in the xref at all: try the hard-coded
                    // reconstruction heuristics before giving up.
                    if self.is_reconstructible_object(obj_num) {
                        eprintln!("DEBUG: Object {} not found in XRef table, attempting manual reconstruction", obj_num);
                        return self.attempt_manual_object_reconstruction(obj_num, gen_num, 0);
                    } else {
                        if self.options.lenient_syntax {
                            if self.options.collect_warnings {
                                eprintln!("Warning: Object {} {} R not found in XRef, returning null object",
                                    obj_num, gen_num);
                            }
                            self.object_cache.insert(key, PdfObject::Null);
                            return Ok(&self.object_cache[&key]);
                        } else {
                            return Err(ParseError::InvalidReference(obj_num, gen_num));
                        }
                    }
                }
            }
        };

        self.reader.seek(std::io::SeekFrom::Start(current_offset))?;

        let mut lexer =
            super::lexer::Lexer::new_with_options(&mut self.reader, self.options.clone());

        // Consume the "N G obj" prologue; each token degrades gracefully
        // in lenient mode.
        {
            let token = lexer.next_token()?;
            let read_obj_num = match token {
                super::lexer::Token::Integer(n) => n as u32,
                _ => {
                    if self.options.lenient_syntax {
                        if self.options.collect_warnings {
                            eprintln!(
                                "Warning: Using expected object number {obj_num} instead of parsed token: {:?}",
                                token
                            );
                        }
                        obj_num
                    } else {
                        return Err(ParseError::SyntaxError {
                            position: current_offset as usize,
                            message: "Expected object number".to_string(),
                        });
                    }
                }
            };

            if read_obj_num != obj_num && !self.options.lenient_syntax {
                return Err(ParseError::SyntaxError {
                    position: current_offset as usize,
                    message: format!(
                        "Object number mismatch: expected {obj_num}, found {read_obj_num}"
                    ),
                });
            }

            let token = lexer.next_token()?;
            // Parsed but unused: the xref's generation was already
            // validated above.
            let _read_gen_num = match token {
                super::lexer::Token::Integer(n) => n as u16,
                _ => {
                    if self.options.lenient_syntax {
                        if self.options.collect_warnings {
                            eprintln!("Warning: Using generation 0 instead of parsed token for object {obj_num}");
                        }
                        0
                    } else {
                        return Err(ParseError::SyntaxError {
                            position: current_offset as usize,
                            message: "Expected generation number".to_string(),
                        });
                    }
                }
            };

            let token = lexer.next_token()?;
            match token {
                super::lexer::Token::Obj => {}
                _ => {
                    if self.options.lenient_syntax {
                        if self.options.collect_warnings {
                            eprintln!("Warning: Expected 'obj' keyword for object {obj_num} {gen_num}, continuing anyway");
                        }
                    } else {
                        return Err(ParseError::SyntaxError {
                            position: current_offset as usize,
                            message: "Expected 'obj' keyword".to_string(),
                        });
                    }
                }
            }
        }

        // Recursion guard around the actual body parse; exit() is called
        // on both success and failure paths.
        self.parse_context.enter()?;

        let obj = match PdfObject::parse_with_options(&mut lexer, &self.options) {
            Ok(obj) => {
                self.parse_context.exit();
                // NOTE(review): object 102 is special-cased debug output —
                // presumably tied to one specific test document.
                if obj_num == 102 && self.options.collect_warnings {
                    eprintln!("DEBUG: Parsed object 102: {:?}", obj);
                    eprintln!(
                        "DEBUG: Object 102 is dictionary: {}",
                        obj.as_dict().is_some()
                    );
                }
                obj
            }
            Err(e) => {
                self.parse_context.exit();

                // For known-reconstructible objects with syntax-shaped
                // failures, try byte-level reconstruction before failing.
                if self.is_reconstructible_object(obj_num)
                    && self.can_attempt_manual_reconstruction(&e)
                {
                    eprintln!(
                        "DEBUG: Normal parsing failed for object {}: {:?}",
                        obj_num, e
                    );
                    eprintln!("DEBUG: Attempting manual reconstruction as fallback");

                    match self.attempt_manual_object_reconstruction(
                        obj_num,
                        gen_num,
                        current_offset,
                    ) {
                        Ok(reconstructed_obj) => {
                            eprintln!(
                                "DEBUG: Successfully reconstructed object {} manually",
                                obj_num
                            );
                            return Ok(reconstructed_obj);
                        }
                        Err(reconstruction_error) => {
                            eprintln!(
                                "DEBUG: Manual reconstruction also failed: {:?}",
                                reconstruction_error
                            );
                            eprintln!("DEBUG: Falling back to original error");
                        }
                    }
                }

                return Err(e);
            }
        };

        // Expect the closing "endobj"; tolerated when lenient.
        let token = lexer.next_token()?;
        match token {
            super::lexer::Token::EndObj => {}
            _ => {
                if self.options.lenient_syntax {
                    if self.options.collect_warnings {
                        eprintln!("Warning: Expected 'endobj' keyword after object {obj_num} {gen_num}, continuing anyway");
                    }
                } else {
                    return Err(ParseError::SyntaxError {
                        position: current_offset as usize,
                        message: "Expected 'endobj' keyword".to_string(),
                    });
                }
            }
        };

        self.object_cache.insert(key, obj);

        Ok(&self.object_cache[&key])
    }
632
633 pub fn resolve<'a>(&'a mut self, obj: &'a PdfObject) -> ParseResult<&'a PdfObject> {
635 match obj {
636 PdfObject::Reference(obj_num, gen_num) => self.get_object(*obj_num, *gen_num),
637 _ => Ok(obj),
638 }
639 }
640
641 pub fn resolve_stream_length(&mut self, obj: &PdfObject) -> ParseResult<Option<usize>> {
644 match obj {
645 PdfObject::Integer(len) => {
646 if *len >= 0 {
647 Ok(Some(*len as usize))
648 } else {
649 Ok(None)
651 }
652 }
653 PdfObject::Reference(obj_num, gen_num) => {
654 let resolved = self.get_object(*obj_num, *gen_num)?;
655 match resolved {
656 PdfObject::Integer(len) => {
657 if *len >= 0 {
658 Ok(Some(*len as usize))
659 } else {
660 Ok(None)
661 }
662 }
663 _ => {
664 Ok(None)
666 }
667 }
668 }
669 _ => {
670 Ok(None)
672 }
673 }
674 }
675
    /// Fetches an object stored inside an object stream (/Type /ObjStm).
    ///
    /// The containing stream is parsed once and cached in
    /// `object_stream_cache`; the extracted object is then cloned into
    /// the regular `object_cache` and a reference into that cache is
    /// returned.
    ///
    /// NOTE(review): `_index_in_stream` is unused — the lookup goes by
    /// object number via `ObjectStream::get_object`; presumably the
    /// stream indexes itself internally. Confirm against `ObjectStream`.
    fn get_compressed_object(
        &mut self,
        obj_num: u32,
        gen_num: u16,
        stream_obj_num: u32,
        _index_in_stream: u32,
    ) -> ParseResult<&PdfObject> {
        let key = (obj_num, gen_num);

        // Parse and cache the containing stream on first access.
        // Object streams always have generation 0.
        if !self.object_stream_cache.contains_key(&stream_obj_num) {
            let stream_obj = self.load_object_from_disk(stream_obj_num, 0)?;

            if let Some(stream) = stream_obj.as_stream() {
                let obj_stream = ObjectStream::parse(stream.clone(), &self.options)?;
                self.object_stream_cache.insert(stream_obj_num, obj_stream);
            } else {
                return Err(ParseError::SyntaxError {
                    position: 0,
                    message: format!("Object {stream_obj_num} is not a stream"),
                });
            }
        }

        let obj_stream = &self.object_stream_cache[&stream_obj_num];
        let obj = obj_stream
            .get_object(obj_num)
            .ok_or_else(|| ParseError::SyntaxError {
                position: 0,
                message: format!("Object {obj_num} not found in object stream {stream_obj_num}"),
            })?;

        self.object_cache.insert(key, obj.clone());
        Ok(&self.object_cache[&key])
    }
716
    /// Returns the root /Pages dictionary of the page tree.
    ///
    /// Recovery chain when the catalog lacks a /Pages reference:
    /// 1. `find_page_objects` + a synthetic pages dictionary;
    /// 2. in lenient mode, a brute-force scan of all xref entries for a
    ///    dictionary with /Type /Pages;
    /// 3. otherwise `MissingKey("Pages")`.
    pub fn pages(&mut self) -> ParseResult<&PdfDictionary> {
        // Resolve the (obj, gen) pair first, inside a scope, so the
        // catalog borrow ends before get_object re-borrows self.
        let (pages_obj_num, pages_gen_num) = {
            let catalog = self.catalog()?;

            if let Some(pages_ref) = catalog.get("Pages") {
                match pages_ref {
                    PdfObject::Reference(obj_num, gen_num) => (*obj_num, *gen_num),
                    _ => {
                        return Err(ParseError::SyntaxError {
                            position: 0,
                            message: "Pages must be a reference".to_string(),
                        })
                    }
                }
            } else {
                #[cfg(debug_assertions)]
                eprintln!("Warning: Catalog missing Pages entry, attempting recovery");

                if let Ok(page_refs) = self.find_page_objects() {
                    if !page_refs.is_empty() {
                        return self.create_synthetic_pages_dict(&page_refs);
                    }
                }

                if self.options.lenient_syntax {
                    if self.options.collect_warnings {
                        eprintln!("Warning: Missing Pages in catalog, searching for page tree");
                    }
                    // Linear scan: first object whose /Type name is
                    // "Pages" wins (generation assumed 0).
                    let mut found_pages = None;
                    for i in 1..self.xref.len() as u32 {
                        if let Ok(obj) = self.get_object(i, 0) {
                            if let Some(dict) = obj.as_dict() {
                                if let Some(obj_type) = dict.get("Type").and_then(|t| t.as_name()) {
                                    if obj_type.0 == "Pages" {
                                        found_pages = Some((i, 0));
                                        break;
                                    }
                                }
                            }
                        }
                    }
                    if let Some((obj_num, gen_num)) = found_pages {
                        (obj_num, gen_num)
                    } else {
                        return Err(ParseError::MissingKey("Pages".to_string()));
                    }
                } else {
                    return Err(ParseError::MissingKey("Pages".to_string()));
                }
            }
        };

        let pages_obj = self.get_object(pages_obj_num, pages_gen_num)?;
        pages_obj.as_dict().ok_or_else(|| ParseError::SyntaxError {
            position: 0,
            message: "Pages is not a dictionary".to_string(),
        })
    }
784
785 pub fn page_count(&mut self) -> ParseResult<u32> {
787 match self.pages() {
789 Ok(pages) => {
790 if let Some(count_obj) = pages.get("Count") {
792 if let Some(count) = count_obj.as_integer() {
793 return Ok(count as u32);
794 }
795 }
796
797 if let Some(kids_obj) = pages.get("Kids") {
799 if let Some(kids_array) = kids_obj.as_array() {
800 return Ok(kids_array.0.len() as u32);
803 }
804 }
805
806 Ok(0)
807 }
808 Err(_) => {
809 eprintln!("Standard page extraction failed, trying direct extraction");
811 self.page_count_fallback()
812 }
813 }
814 }
815
816 fn page_count_fallback(&mut self) -> ParseResult<u32> {
818 if let Some(count) = self.extract_page_count_from_linearization() {
820 eprintln!("Found page count {} from linearization", count);
821 return Ok(count);
822 }
823
824 if let Some(count) = self.count_page_objects_directly() {
826 eprintln!("Found {} pages by counting page objects", count);
827 return Ok(count);
828 }
829
830 Ok(0)
831 }
832
    /// Tries to read the page count from a linearization dictionary's /N
    /// entry.
    ///
    /// NOTE(review): this assumes the linearization dictionary is object
    /// 100 with generation 0 — the spec only requires it to be the first
    /// object in the file, so this is presumably tuned to one document;
    /// confirm before relying on it generally.
    fn extract_page_count_from_linearization(&mut self) -> Option<u32> {
        match self.get_object(100, 0) {
            Ok(obj) => {
                eprintln!("Found object 100: {:?}", obj);
                if let Some(dict) = obj.as_dict() {
                    eprintln!("Object 100 is a dictionary with {} keys", dict.0.len());
                    if let Some(n_obj) = dict.get("N") {
                        eprintln!("Found /N field: {:?}", n_obj);
                        if let Some(count) = n_obj.as_integer() {
                            eprintln!("Extracted page count from linearization: {}", count);
                            return Some(count as u32);
                        }
                    } else {
                        // Dump the keys to aid debugging when /N is absent.
                        eprintln!("No /N field found in object 100");
                        for (key, value) in &dict.0 {
                            eprintln!(" {:?}: {:?}", key, value);
                        }
                    }
                } else {
                    eprintln!("Object 100 is not a dictionary: {:?}", obj);
                }
            }
            Err(e) => {
                // Structured parse failed; fall back to raw byte scraping.
                eprintln!("Failed to get object 100: {:?}", e);
                eprintln!("Attempting direct content extraction...");
                return self.extract_n_value_from_raw_object_100();
            }
        }

        None
    }
868
    /// Last-resort page count: reads up to 1 KiB of raw bytes at object
    /// 100's xref offset and scrapes the digits following "/N ".
    ///
    /// NOTE(review): a single `read` may return fewer bytes than 1024, so
    /// the "/N <digits>" run could be truncated at the buffer boundary —
    /// acceptable for a debug fallback, but worth confirming.
    fn extract_n_value_from_raw_object_100(&mut self) -> Option<u32> {
        if let Some(entry) = self.xref.get_entry(100) {
            if self.reader.seek(SeekFrom::Start(entry.offset)).is_err() {
                return None;
            }

            let mut buffer = vec![0u8; 1024];
            if let Ok(bytes_read) = self.reader.read(&mut buffer) {
                if bytes_read == 0 {
                    return None;
                }

                let content = String::from_utf8_lossy(&buffer[..bytes_read]);
                eprintln!("Raw content around object 100:\n{}", content);

                if let Some(n_pos) = content.find("/N ") {
                    let after_n = &content[n_pos + 3..];
                    eprintln!(
                        "Content after /N: {}",
                        &after_n[..std::cmp::min(50, after_n.len())]
                    );

                    // Collect the first contiguous run of ASCII digits;
                    // leading non-digits are skipped, and the run ends at
                    // the first non-digit after it starts.
                    let mut num_str = String::new();
                    for ch in after_n.chars() {
                        if ch.is_ascii_digit() {
                            num_str.push(ch);
                        } else if !num_str.is_empty() {
                            break;
                        }
                    }

                    if !num_str.is_empty() {
                        if let Ok(page_count) = num_str.parse::<u32>() {
                            eprintln!("Extracted page count from raw content: {}", page_count);
                            return Some(page_count);
                        }
                    }
                }
            }
        }
        None
    }
919
920 #[allow(dead_code)]
921 fn find_object_pattern(&mut self, obj_num: u32, gen_num: u16) -> Option<u64> {
922 let pattern = format!("{} {} obj", obj_num, gen_num);
923 eprintln!("DEBUG: Searching for pattern: '{}'", pattern);
924
925 let original_pos = self.reader.stream_position().unwrap_or(0);
927
928 if self.reader.seek(SeekFrom::Start(0)).is_err() {
930 return None;
931 }
932
933 let mut buffer = vec![0u8; 8192];
935 let mut file_content = Vec::new();
936
937 loop {
938 match self.reader.read(&mut buffer) {
939 Ok(0) => break, Ok(bytes_read) => {
941 file_content.extend_from_slice(&buffer[..bytes_read]);
942 }
943 Err(_) => return None,
944 }
945 }
946
947 let content = String::from_utf8_lossy(&file_content);
949 if let Some(pattern_pos) = content.find(&pattern) {
950 eprintln!(
951 "DEBUG: Found pattern '{}' at position {}",
952 pattern, pattern_pos
953 );
954
955 let after_pattern = pattern_pos + pattern.len();
957 let search_area = &content[after_pattern..];
958
959 if let Some(dict_start_offset) = search_area.find("<<") {
960 let dict_start_pos = after_pattern + dict_start_offset;
961 eprintln!(
962 "DEBUG: Found '<<' at position {} (offset {} from pattern)",
963 dict_start_pos, dict_start_offset
964 );
965
966 self.reader.seek(SeekFrom::Start(original_pos)).ok();
968 return Some(dict_start_pos as u64);
969 } else {
970 eprintln!("DEBUG: Could not find '<<' after pattern");
971 }
972 }
973
974 eprintln!("DEBUG: Pattern '{}' not found in file", pattern);
975 self.reader.seek(SeekFrom::Start(original_pos)).ok();
977 None
978 }
979
980 fn can_attempt_manual_reconstruction(&self, error: &ParseError) -> bool {
982 match error {
983 ParseError::SyntaxError { .. } => true,
985 ParseError::UnexpectedToken { .. } => true,
986 _ => false,
988 }
989 }
990
    /// Whether the recovery path should attempt to reconstruct `obj_num`.
    ///
    /// NOTE(review): these tables are hard-coded object numbers —
    /// 102/113/114 plus fixed lists of "page" and "content" objects —
    /// presumably tuned to one specific corpus document. Confirm before
    /// relying on this for arbitrary PDFs.
    fn is_reconstructible_object(&self, obj_num: u32) -> bool {
        if obj_num == 102 || obj_num == 113 || obj_num == 114 {
            return true;
        }

        // Object numbers observed to hold page dictionaries.
        let page_objects = [
            1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 30, 34, 37, 39, 42, 44, 46, 49, 52,
            54, 56, 58, 60, 62, 64, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 104,
        ];

        // Object numbers observed to hold content streams / resources.
        let content_objects = [
            2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 29, 31, 32, 33, 35, 36, 38, 40, 41,
            43, 45, 47, 48, 50, 51, 53, 55, 57, 59, 61, 63, 65, 66, 68, 70, 72, 74, 76, 78, 80, 82,
            84, 86, 88, 90, 92, 94, 95, 96, 97, 98, 99, 100, 101, 105, 106, 107, 108, 109, 110,
            111,
        ];

        page_objects.contains(&obj_num) || content_objects.contains(&obj_num)
    }
1016
1017 fn is_page_object(&self, obj_num: u32) -> bool {
1019 let page_objects = [
1020 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 30, 34, 37, 39, 42, 44, 46, 49, 52,
1021 54, 56, 58, 60, 62, 64, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 104,
1022 ];
1023 page_objects.contains(&obj_num)
1024 }
1025
    /// Scrapes a page dictionary's textual content (`dict_content`) for
    /// /MediaBox, /Contents, /Parent, and /Resources, inserting whatever
    /// it can recover into `result_dict`.
    ///
    /// This is a best-effort string-matching reconstructor, not a real
    /// parser; it is only used on objects the normal parser rejected.
    ///
    /// NOTE(review): /Parent is hard-wired to `113 0 R` — presumably the
    /// page-tree root of one specific document; confirm before reuse.
    fn parse_page_dictionary_content(
        &self,
        dict_content: &str,
        result_dict: &mut std::collections::HashMap<
            crate::parser::objects::PdfName,
            crate::parser::objects::PdfObject,
        >,
        obj_num: u32,
    ) -> ParseResult<()> {
        use crate::parser::objects::{PdfArray, PdfName, PdfObject};
        use std::collections::HashMap;

        // /MediaBox [a b c d] — first '[' .. first ']' after the key;
        // values are parsed as f32 then truncated to integers.
        if let Some(mediabox_start) = dict_content.find("/MediaBox") {
            let mediabox_area = &dict_content[mediabox_start..];
            if let Some(start_bracket) = mediabox_area.find("[") {
                if let Some(end_bracket) = mediabox_area.find("]") {
                    let mediabox_content = &mediabox_area[start_bracket + 1..end_bracket];
                    let values: Vec<f32> = mediabox_content
                        .split_whitespace()
                        .filter_map(|s| s.parse().ok())
                        .collect();

                    if values.len() == 4 {
                        let mediabox = PdfArray(vec![
                            PdfObject::Integer(values[0] as i64),
                            PdfObject::Integer(values[1] as i64),
                            PdfObject::Integer(values[2] as i64),
                            PdfObject::Integer(values[3] as i64),
                        ]);
                        result_dict
                            .insert(PdfName("MediaBox".to_string()), PdfObject::Array(mediabox));
                        eprintln!("DEBUG: Added MediaBox for object {}: {:?}", obj_num, values);
                    }
                }
            }
        }

        // /Contents N G R — tokens [1]=obj, [2]=gen, [3] must be "R".
        if let Some(contents_match) = dict_content.find("/Contents") {
            let contents_area = &dict_content[contents_match..];
            let parts: Vec<&str> = contents_area.split_whitespace().collect();
            if parts.len() >= 3 {
                if let (Ok(obj_ref), Ok(gen_ref)) =
                    (parts[1].parse::<u32>(), parts[2].parse::<u16>())
                {
                    if parts.len() > 3 && parts[3] == "R" {
                        result_dict.insert(
                            PdfName("Contents".to_string()),
                            PdfObject::Reference(obj_ref, gen_ref),
                        );
                        eprintln!(
                            "DEBUG: Added Contents reference for object {}: {} {} R",
                            obj_num, obj_ref, gen_ref
                        );
                    }
                }
            }
        }

        // Hard-coded parent reference (see NOTE above).
        if dict_content.contains("/Parent") {
            result_dict.insert(
                PdfName("Parent".to_string()),
                PdfObject::Reference(113, 0),
            );
            eprintln!(
                "DEBUG: Added Parent reference for object {}: 113 0 R",
                obj_num
            );
        }

        // /Resources: parse if possible, otherwise insert an empty
        // dictionary so downstream lookups do not fail on a missing key.
        if dict_content.contains("/Resources") {
            eprintln!(
                "DEBUG: Found Resources in object {}, content: {}",
                obj_num,
                dict_content.chars().take(200).collect::<String>()
            );

            if let Ok(parsed_resources) = self.parse_resources_from_content(&dict_content) {
                result_dict.insert(PdfName("Resources".to_string()), parsed_resources);
                eprintln!("DEBUG: Added parsed Resources for object {}", obj_num);
            } else {
                let resources = HashMap::new();
                result_dict.insert(
                    PdfName("Resources".to_string()),
                    PdfObject::Dictionary(crate::parser::objects::PdfDictionary(resources)),
                );
                eprintln!(
                    "DEBUG: Added empty Resources for object {} (parsing failed)",
                    obj_num
                );
            }
        }

        Ok(())
    }
1127
    /// Reconstructs an object that the normal parser could not produce:
    /// smart heuristics first, then raw extraction, then (lenient mode
    /// only) a `Null` placeholder. The result is cached and a synthetic
    /// xref entry is registered so later lookups resolve.
    fn attempt_manual_object_reconstruction(
        &mut self,
        obj_num: u32,
        gen_num: u16,
        _current_offset: u64,
    ) -> ParseResult<&PdfObject> {
        eprintln!(
            "DEBUG: Attempting smart reconstruction for object {} {}",
            obj_num, gen_num
        );

        let reconstructed_obj = match self.smart_object_reconstruction(obj_num, gen_num) {
            Ok(obj) => obj,
            Err(_) => {
                match self.extract_object_or_stream_manually(obj_num) {
                    Ok(obj) => obj,
                    Err(e) => {
                        if self.options.lenient_syntax {
                            // Lenient mode: a placeholder beats failing.
                            eprintln!(
                                "DEBUG: Creating null object for missing {} {}",
                                obj_num, gen_num
                            );
                            PdfObject::Null
                        } else {
                            return Err(e);
                        }
                    }
                }
            }
        };

        self.object_cache
            .insert((obj_num, gen_num), reconstructed_obj);

        // offset 0: the object exists only in the cache, not on disk.
        use crate::parser::xref::XRefEntry;
        let xref_entry = XRefEntry {
            offset: 0,
            generation: gen_num,
            in_use: true,
        };
        self.xref.add_entry(obj_num, xref_entry);
        eprintln!(
            "DEBUG: Successfully reconstructed and cached object {} {}",
            obj_num, gen_num
        );

        // unwrap is safe: the key was inserted just above.
        Ok(self.object_cache.get(&(obj_num, gen_num)).unwrap())
    }
1181
1182 fn smart_object_reconstruction(
1184 &mut self,
1185 obj_num: u32,
1186 gen_num: u16,
1187 ) -> ParseResult<PdfObject> {
1188 if let Ok(inferred_obj) = self.infer_object_from_context(obj_num) {
1192 return Ok(inferred_obj);
1193 }
1194
1195 if let Ok(scanned_obj) = self.scan_for_object_patterns(obj_num) {
1197 return Ok(scanned_obj);
1198 }
1199
1200 if let Ok(synthetic_obj) = self.create_synthetic_object(obj_num) {
1202 return Ok(synthetic_obj);
1203 }
1204
1205 Err(ParseError::SyntaxError {
1206 position: 0,
1207 message: format!("Could not reconstruct object {} {}", obj_num, gen_num),
1208 })
1209 }
1210
    /// Infers a plausible stand-in for `obj_num` by looking at how cached
    /// dictionaries refer to it: the dictionary key naming the reference
    /// ("Font", "Contents", "Resources", …) decides which synthetic
    /// object to fabricate.
    ///
    /// NOTE(review): only the already-decoded cache is searched, so the
    /// inference depends on what happens to have been loaded first.
    fn infer_object_from_context(&mut self, obj_num: u32) -> ParseResult<PdfObject> {
        for (_key, obj) in self.object_cache.iter() {
            if let PdfObject::Dictionary(dict) = obj {
                for (key, value) in dict.0.iter() {
                    if let PdfObject::Reference(ref_num, _) = value {
                        if *ref_num == obj_num {
                            match key.as_str() {
                                "Font" | "F1" | "F2" | "F3" => {
                                    return Ok(self.create_font_object(obj_num));
                                }
                                "XObject" | "Image" | "Im1" => {
                                    return Ok(self.create_xobject(obj_num));
                                }
                                "Contents" => {
                                    return Ok(self.create_content_stream(obj_num));
                                }
                                "Resources" => {
                                    return Ok(self.create_resources_dict(obj_num));
                                }
                                // Reference found under an unrecognized
                                // key: keep scanning other entries.
                                _ => continue,
                            }
                        }
                    }
                }
            }
        }

        Err(ParseError::SyntaxError {
            position: 0,
            message: "Cannot infer object type from context".to_string(),
        })
    }
1248
    /// Second reconstruction strategy: delegate to the raw byte-level
    /// extraction helper that scans the file for the object's pattern.
    fn scan_for_object_patterns(&mut self, obj_num: u32) -> ParseResult<PdfObject> {
        self.extract_object_or_stream_manually(obj_num)
    }
1255
1256 fn create_synthetic_object(&mut self, obj_num: u32) -> ParseResult<PdfObject> {
1258 use super::objects::{PdfDictionary, PdfName, PdfObject};
1259
1260 match obj_num {
1262 1..=10 => {
1263 let mut dict = PdfDictionary::new();
1265 dict.insert(
1266 "Type".to_string(),
1267 PdfObject::Name(PdfName("Null".to_string())),
1268 );
1269 Ok(PdfObject::Dictionary(dict))
1270 }
1271 _ => {
1272 Ok(PdfObject::Null)
1274 }
1275 }
1276 }
1277
1278 fn create_font_object(&self, obj_num: u32) -> PdfObject {
1279 use super::objects::{PdfDictionary, PdfName, PdfObject};
1280 let mut font_dict = PdfDictionary::new();
1281 font_dict.insert(
1282 "Type".to_string(),
1283 PdfObject::Name(PdfName("Font".to_string())),
1284 );
1285 font_dict.insert(
1286 "Subtype".to_string(),
1287 PdfObject::Name(PdfName("Type1".to_string())),
1288 );
1289 font_dict.insert(
1290 "BaseFont".to_string(),
1291 PdfObject::Name(PdfName("Helvetica".to_string())),
1292 );
1293 eprintln!("DEBUG: Created synthetic Font object {}", obj_num);
1294 PdfObject::Dictionary(font_dict)
1295 }
1296
1297 fn create_xobject(&self, obj_num: u32) -> PdfObject {
1298 use super::objects::{PdfDictionary, PdfName, PdfObject};
1299 let mut xobj_dict = PdfDictionary::new();
1300 xobj_dict.insert(
1301 "Type".to_string(),
1302 PdfObject::Name(PdfName("XObject".to_string())),
1303 );
1304 xobj_dict.insert(
1305 "Subtype".to_string(),
1306 PdfObject::Name(PdfName("Form".to_string())),
1307 );
1308 eprintln!("DEBUG: Created synthetic XObject {}", obj_num);
1309 PdfObject::Dictionary(xobj_dict)
1310 }
1311
1312 fn create_content_stream(&self, obj_num: u32) -> PdfObject {
1313 use super::objects::{PdfDictionary, PdfObject, PdfStream};
1314 let mut stream_dict = PdfDictionary::new();
1315 stream_dict.insert("Length".to_string(), PdfObject::Integer(0));
1316
1317 let stream = PdfStream {
1318 dict: stream_dict,
1319 data: Vec::new(),
1320 };
1321 eprintln!("DEBUG: Created synthetic content stream {}", obj_num);
1322 PdfObject::Stream(stream)
1323 }
1324
1325 fn create_resources_dict(&self, obj_num: u32) -> PdfObject {
1326 use super::objects::{PdfArray, PdfDictionary, PdfObject};
1327 let mut res_dict = PdfDictionary::new();
1328 res_dict.insert("ProcSet".to_string(), PdfObject::Array(PdfArray::new()));
1329 eprintln!("DEBUG: Created synthetic Resources dict {}", obj_num);
1330 PdfObject::Dictionary(res_dict)
1331 }
1332
1333 fn extract_object_manually(
1334 &mut self,
1335 obj_num: u32,
1336 ) -> ParseResult<crate::parser::objects::PdfDictionary> {
1337 use crate::parser::objects::{PdfArray, PdfDictionary, PdfName, PdfObject};
1338 use std::collections::HashMap;
1339
1340 let original_pos = self.reader.stream_position().unwrap_or(0);
1342
1343 if self.reader.seek(SeekFrom::Start(0)).is_err() {
1345 return Err(ParseError::SyntaxError {
1346 position: 0,
1347 message: "Failed to seek to beginning for manual extraction".to_string(),
1348 });
1349 }
1350
1351 let mut buffer = Vec::new();
1353 if self.reader.read_to_end(&mut buffer).is_err() {
1354 return Err(ParseError::SyntaxError {
1355 position: 0,
1356 message: "Failed to read file for manual extraction".to_string(),
1357 });
1358 }
1359
1360 let content = String::from_utf8_lossy(&buffer);
1361
1362 let pattern = format!("{} 0 obj", obj_num);
1364 if let Some(start) = content.find(&pattern) {
1365 let search_area = &content[start..];
1366 if let Some(dict_start) = search_area.find("<<") {
1367 let mut bracket_count = 1;
1369 let mut pos = dict_start + 2;
1370 let bytes = search_area.as_bytes();
1371 let mut dict_end = None;
1372
1373 while pos < bytes.len() - 1 && bracket_count > 0 {
1374 if bytes[pos] == b'<' && bytes[pos + 1] == b'<' {
1375 bracket_count += 1;
1376 pos += 2;
1377 } else if bytes[pos] == b'>' && bytes[pos + 1] == b'>' {
1378 bracket_count -= 1;
1379 if bracket_count == 0 {
1380 dict_end = Some(pos);
1381 break;
1382 }
1383 pos += 2;
1384 } else {
1385 pos += 1;
1386 }
1387 }
1388
1389 if let Some(dict_end) = dict_end {
1390 let dict_content = &search_area[dict_start + 2..dict_end];
1391 eprintln!(
1392 "DEBUG: Found object {} dictionary content: '{}'",
1393 obj_num,
1394 dict_content.chars().take(500).collect::<String>()
1395 );
1396
1397 let mut result_dict = HashMap::new();
1399
1400 if obj_num == 102 {
1401 if dict_content.contains("/Type /Catalog") {
1403 result_dict.insert(
1405 PdfName("Type".to_string()),
1406 PdfObject::Name(PdfName("Catalog".to_string())),
1407 );
1408
1409 if dict_content.contains("/Dests 139 0 R") {
1411 result_dict.insert(
1412 PdfName("Dests".to_string()),
1413 PdfObject::Reference(139, 0),
1414 );
1415 }
1416
1417 if dict_content.contains("/Pages 113 0 R") {
1419 result_dict.insert(
1420 PdfName("Pages".to_string()),
1421 PdfObject::Reference(113, 0),
1422 );
1423 }
1424 } else {
1425 eprintln!("DEBUG: Object 102 is not a catalog (content: '{}'), skipping reconstruction", dict_content.trim());
1427 self.reader.seek(SeekFrom::Start(original_pos)).ok();
1429 return Err(ParseError::SyntaxError {
1430 position: 0,
1431 message:
1432 "Object 102 is not a corrupted catalog, cannot reconstruct"
1433 .to_string(),
1434 });
1435 }
1436 } else if obj_num == 113 {
1437 eprintln!("DEBUG: Creating object 113 as main Pages object with real page references");
1439
1440 result_dict.insert(
1441 PdfName("Type".to_string()),
1442 PdfObject::Name(PdfName("Pages".to_string())),
1443 );
1444
1445 let page_refs = match self.find_page_objects() {
1447 Ok(refs) => refs,
1448 Err(e) => {
1449 eprintln!(
1450 "DEBUG: Failed to find page objects: {:?}, using empty array",
1451 e
1452 );
1453 vec![]
1454 }
1455 };
1456
1457 eprintln!(
1458 "DEBUG: Found {} page objects for 113 Kids array: {:?}",
1459 page_refs.len(),
1460 page_refs
1461 );
1462
1463 let page_count = if page_refs.is_empty() {
1465 44
1466 } else {
1467 page_refs.len() as i64
1468 };
1469 result_dict
1470 .insert(PdfName("Count".to_string()), PdfObject::Integer(page_count));
1471
1472 let kids_array: Vec<PdfObject> = page_refs
1474 .into_iter()
1475 .map(|(obj_num, gen_num)| PdfObject::Reference(obj_num, gen_num))
1476 .collect();
1477
1478 result_dict.insert(
1479 PdfName("Kids".to_string()),
1480 PdfObject::Array(PdfArray(kids_array)),
1481 );
1482 } else if obj_num == 114 {
1483 eprintln!("DEBUG: Parsing object 114 as Pages node");
1485
1486 result_dict.insert(
1487 PdfName("Type".to_string()),
1488 PdfObject::Name(PdfName("Pages".to_string())),
1489 );
1490
1491 let page_refs = match self.find_page_objects() {
1493 Ok(refs) => refs,
1494 Err(e) => {
1495 eprintln!(
1496 "DEBUG: Failed to find page objects: {:?}, using empty array",
1497 e
1498 );
1499 vec![]
1500 }
1501 };
1502
1503 eprintln!(
1504 "DEBUG: Found {} page objects for Kids array: {:?}",
1505 page_refs.len(),
1506 page_refs
1507 );
1508
1509 let page_count = if page_refs.is_empty() {
1511 44
1512 } else {
1513 page_refs.len() as i64
1514 };
1515 result_dict
1516 .insert(PdfName("Count".to_string()), PdfObject::Integer(page_count));
1517
1518 let kids_array: Vec<PdfObject> = page_refs
1520 .into_iter()
1521 .map(|(obj_num, gen_num)| PdfObject::Reference(obj_num, gen_num))
1522 .collect();
1523
1524 result_dict.insert(
1525 PdfName("Kids".to_string()),
1526 PdfObject::Array(PdfArray(kids_array)),
1527 );
1528
1529 eprintln!(
1530 "DEBUG: Object 114 created as Pages node with {} Kids",
1531 page_count
1532 );
1533 } else if self.is_page_object(obj_num) {
1534 eprintln!("DEBUG: Manually reconstructing Page object {}", obj_num);
1536
1537 result_dict.insert(
1538 PdfName("Type".to_string()),
1539 PdfObject::Name(PdfName("Page".to_string())),
1540 );
1541
1542 self.parse_page_dictionary_content(
1544 &dict_content,
1545 &mut result_dict,
1546 obj_num,
1547 )?;
1548 }
1549
1550 self.reader.seek(SeekFrom::Start(original_pos)).ok();
1552
1553 eprintln!(
1554 "DEBUG: Manually created object {} with {} entries",
1555 obj_num,
1556 result_dict.len()
1557 );
1558 return Ok(PdfDictionary(result_dict));
1559 }
1560 }
1561 }
1562
1563 self.reader.seek(SeekFrom::Start(original_pos)).ok();
1565
1566 if obj_num == 113 {
1568 eprintln!("DEBUG: Object 113 not found in PDF content, creating fallback Pages object");
1569 let mut result_dict = HashMap::new();
1570 result_dict.insert(
1571 PdfName("Type".to_string()),
1572 PdfObject::Name(PdfName("Pages".to_string())),
1573 );
1574
1575 let page_refs = match self.find_page_objects() {
1577 Ok(refs) => refs,
1578 Err(e) => {
1579 eprintln!(
1580 "DEBUG: Failed to find page objects: {:?}, using empty array",
1581 e
1582 );
1583 vec![]
1584 }
1585 };
1586
1587 eprintln!(
1588 "DEBUG: Found {} page objects for fallback 113 Kids array: {:?}",
1589 page_refs.len(),
1590 page_refs
1591 );
1592
1593 let page_count = if page_refs.is_empty() {
1595 44
1596 } else {
1597 page_refs.len() as i64
1598 };
1599 result_dict.insert(PdfName("Count".to_string()), PdfObject::Integer(page_count));
1600
1601 let kids_array: Vec<PdfObject> = page_refs
1603 .into_iter()
1604 .map(|(obj_num, gen_num)| PdfObject::Reference(obj_num, gen_num))
1605 .collect();
1606
1607 result_dict.insert(
1608 PdfName("Kids".to_string()),
1609 PdfObject::Array(PdfArray(kids_array)),
1610 );
1611
1612 eprintln!(
1613 "DEBUG: Created fallback object 113 with {} entries and {} Kids",
1614 result_dict.len(),
1615 page_count
1616 );
1617 return Ok(PdfDictionary(result_dict));
1618 } else if obj_num == 114 {
1619 eprintln!("DEBUG: Object 114 not found in PDF content, creating fallback Pages object");
1620 let mut result_dict = HashMap::new();
1621 result_dict.insert(
1622 PdfName("Type".to_string()),
1623 PdfObject::Name(PdfName("Pages".to_string())),
1624 );
1625
1626 let page_refs = match self.find_page_objects() {
1628 Ok(refs) => refs,
1629 Err(e) => {
1630 eprintln!(
1631 "DEBUG: Failed to find page objects: {:?}, using empty array",
1632 e
1633 );
1634 vec![]
1635 }
1636 };
1637
1638 eprintln!(
1639 "DEBUG: Found {} page objects for fallback Kids array: {:?}",
1640 page_refs.len(),
1641 page_refs
1642 );
1643
1644 let page_count = if page_refs.is_empty() {
1646 44
1647 } else {
1648 page_refs.len() as i64
1649 };
1650 result_dict.insert(PdfName("Count".to_string()), PdfObject::Integer(page_count));
1651
1652 let kids_array: Vec<PdfObject> = page_refs
1654 .into_iter()
1655 .map(|(obj_num, gen_num)| PdfObject::Reference(obj_num, gen_num))
1656 .collect();
1657
1658 result_dict.insert(
1659 PdfName("Kids".to_string()),
1660 PdfObject::Array(PdfArray(kids_array)),
1661 );
1662
1663 eprintln!(
1664 "DEBUG: Created fallback object 114 with {} entries and {} Kids",
1665 result_dict.len(),
1666 page_count
1667 );
1668 return Ok(PdfDictionary(result_dict));
1669 }
1670
1671 Err(ParseError::SyntaxError {
1672 position: 0,
1673 message: "Could not find catalog dictionary in manual extraction".to_string(),
1674 })
1675 }
1676
1677 fn extract_object_or_stream_manually(&mut self, obj_num: u32) -> ParseResult<PdfObject> {
1679 use crate::parser::objects::PdfObject;
1680
1681 let original_pos = self.reader.stream_position().unwrap_or(0);
1683
1684 if self.reader.seek(SeekFrom::Start(0)).is_err() {
1686 return Err(ParseError::SyntaxError {
1687 position: 0,
1688 message: "Failed to seek to beginning for manual extraction".to_string(),
1689 });
1690 }
1691
1692 let mut buffer = Vec::new();
1694 if self.reader.read_to_end(&mut buffer).is_err() {
1695 return Err(ParseError::SyntaxError {
1696 position: 0,
1697 message: "Failed to read file for manual extraction".to_string(),
1698 });
1699 }
1700
1701 let pattern = format!("{} 0 obj", obj_num).into_bytes();
1703
1704 if let Some(obj_start) = find_bytes(&buffer, &pattern) {
1705 let start = obj_start + pattern.len();
1706 let search_area = &buffer[start..];
1707
1708 if let Some(dict_start) = find_bytes(search_area, b"<<") {
1709 let mut bracket_count = 1;
1711 let mut pos = dict_start + 2;
1712 let mut dict_end = None;
1713
1714 while pos < search_area.len() - 1 && bracket_count > 0 {
1715 if search_area[pos] == b'<' && search_area[pos + 1] == b'<' {
1716 bracket_count += 1;
1717 pos += 2;
1718 } else if search_area[pos] == b'>' && search_area[pos + 1] == b'>' {
1719 bracket_count -= 1;
1720 if bracket_count == 0 {
1721 dict_end = Some(pos);
1722 break;
1723 }
1724 pos += 2;
1725 } else {
1726 pos += 1;
1727 }
1728 }
1729
1730 if let Some(dict_end_pos) = dict_end {
1731 let dict_start_abs = dict_start + 2;
1732 let dict_end_abs = dict_end_pos;
1733 let dict_content_bytes = &search_area[dict_start_abs..dict_end_abs];
1734 let dict_content = String::from_utf8_lossy(dict_content_bytes);
1735
1736 eprintln!(
1737 "DEBUG: Found object {} dictionary content: '{}'",
1738 obj_num,
1739 dict_content.chars().take(200).collect::<String>()
1740 );
1741
1742 let after_dict = &search_area[dict_end_abs + 2..];
1744 if is_immediate_stream_start(after_dict) {
1745 return self.reconstruct_stream_object_bytes(
1747 obj_num,
1748 &dict_content,
1749 after_dict,
1750 );
1751 } else {
1752 return self
1754 .extract_object_manually(obj_num)
1755 .map(|dict| PdfObject::Dictionary(dict));
1756 }
1757 }
1758 }
1759 }
1760
1761 self.reader.seek(SeekFrom::Start(original_pos)).ok();
1763
1764 Err(ParseError::SyntaxError {
1765 position: 0,
1766 message: format!("Could not manually extract object {}", obj_num),
1767 })
1768 }
1769
1770 fn reconstruct_stream_object_bytes(
1772 &mut self,
1773 obj_num: u32,
1774 dict_content: &str,
1775 after_dict: &[u8],
1776 ) -> ParseResult<PdfObject> {
1777 use crate::parser::objects::{PdfDictionary, PdfName, PdfObject, PdfStream};
1778 use std::collections::HashMap;
1779
1780 let mut dict = HashMap::new();
1782
1783 if dict_content.contains("/Filter /FlateDecode") {
1785 dict.insert(
1786 PdfName("Filter".to_string()),
1787 PdfObject::Name(PdfName("FlateDecode".to_string())),
1788 );
1789 }
1790
1791 if let Some(length_start) = dict_content.find("/Length ") {
1792 let length_part = &dict_content[length_start + 8..];
1793 if let Some(space_pos) = length_part.find(' ') {
1794 let length_str = &length_part[..space_pos];
1795 if let Ok(length) = length_str.parse::<i64>() {
1796 dict.insert(PdfName("Length".to_string()), PdfObject::Integer(length));
1797 }
1798 } else {
1799 if let Ok(length) = length_part.trim().parse::<i64>() {
1801 dict.insert(PdfName("Length".to_string()), PdfObject::Integer(length));
1802 }
1803 }
1804 }
1805
1806 if let Some(stream_start) = find_bytes(after_dict, b"stream") {
1808 let stream_start_pos = stream_start + 6; let stream_data_start = if after_dict.get(stream_start_pos) == Some(&b'\n') {
1810 stream_start_pos + 1
1811 } else if after_dict.get(stream_start_pos) == Some(&b'\r') {
1812 if after_dict.get(stream_start_pos + 1) == Some(&b'\n') {
1813 stream_start_pos + 2
1814 } else {
1815 stream_start_pos + 1
1816 }
1817 } else {
1818 stream_start_pos
1819 };
1820
1821 if let Some(endstream_pos) = find_bytes(after_dict, b"endstream") {
1822 let mut stream_data = &after_dict[stream_data_start..endstream_pos];
1823
1824 if let Some(PdfObject::Integer(length)) = dict.get(&PdfName("Length".to_string())) {
1826 let expected_length = *length as usize;
1827 if stream_data.len() > expected_length {
1828 stream_data = &stream_data[..expected_length];
1829 eprintln!(
1830 "DEBUG: Trimmed stream data from {} to {} bytes based on Length field",
1831 after_dict[stream_data_start..endstream_pos].len(),
1832 expected_length
1833 );
1834 }
1835 }
1836
1837 eprintln!(
1838 "DEBUG: Reconstructed stream object {} with {} bytes of stream data",
1839 obj_num,
1840 stream_data.len()
1841 );
1842
1843 let stream = PdfStream {
1844 dict: PdfDictionary(dict),
1845 data: stream_data.to_vec(),
1846 };
1847
1848 return Ok(PdfObject::Stream(stream));
1849 }
1850 }
1851
1852 Err(ParseError::SyntaxError {
1853 position: 0,
1854 message: format!("Could not reconstruct stream for object {}", obj_num),
1855 })
1856 }
1857
1858 fn parse_resources_from_content(&self, dict_content: &str) -> ParseResult<PdfObject> {
1860 use crate::parser::objects::{PdfDictionary, PdfName, PdfObject};
1861 use std::collections::HashMap;
1862
1863 if let Some(resources_start) = dict_content.find("/Resources") {
1865 if let Some(bracket_start) = dict_content[resources_start..].find("<<") {
1867 let abs_bracket_start = resources_start + bracket_start + 2;
1868
1869 let mut bracket_count = 1;
1871 let mut end_pos = abs_bracket_start;
1872 let chars: Vec<char> = dict_content.chars().collect();
1873
1874 while end_pos < chars.len() && bracket_count > 0 {
1875 if end_pos + 1 < chars.len() {
1876 if chars[end_pos] == '<' && chars[end_pos + 1] == '<' {
1877 bracket_count += 1;
1878 end_pos += 2;
1879 continue;
1880 } else if chars[end_pos] == '>' && chars[end_pos + 1] == '>' {
1881 bracket_count -= 1;
1882 end_pos += 2;
1883 continue;
1884 }
1885 }
1886 end_pos += 1;
1887 }
1888
1889 if bracket_count == 0 {
1890 let resources_content = &dict_content[abs_bracket_start..end_pos - 2];
1891 eprintln!("DEBUG: Parsing Resources content: {}", resources_content);
1892
1893 let mut resources_dict = HashMap::new();
1895
1896 if let Some(font_start) = resources_content.find("/Font") {
1898 if let Some(font_bracket) = resources_content[font_start..].find("<<") {
1899 let abs_font_start = font_start + font_bracket + 2;
1900
1901 let mut font_dict = HashMap::new();
1903
1904 let font_section = &resources_content[abs_font_start..];
1906 let mut pos = 0;
1907 while let Some(f_pos) = font_section[pos..].find("/F") {
1908 let abs_f_pos = pos + f_pos;
1909 if let Some(space_pos) = font_section[abs_f_pos..].find(" ") {
1910 let font_name = &font_section[abs_f_pos..abs_f_pos + space_pos];
1911
1912 let after_name = &font_section[abs_f_pos + space_pos..];
1914 if let Some(r_pos) = after_name.find(" R") {
1915 let ref_part = after_name[..r_pos].trim();
1916 if let Some(parts) = ref_part
1917 .split_whitespace()
1918 .collect::<Vec<&str>>()
1919 .get(0..2)
1920 {
1921 if let (Ok(obj_num), Ok(gen_num)) =
1922 (parts[0].parse::<u32>(), parts[1].parse::<u16>())
1923 {
1924 font_dict.insert(
1925 PdfName(font_name[1..].to_string()), PdfObject::Reference(obj_num, gen_num),
1927 );
1928 eprintln!(
1929 "DEBUG: Found font {} -> {} {} R",
1930 font_name, obj_num, gen_num
1931 );
1932 }
1933 }
1934 }
1935 }
1936 pos = abs_f_pos + 1;
1937 }
1938
1939 if !font_dict.is_empty() {
1940 resources_dict.insert(
1941 PdfName("Font".to_string()),
1942 PdfObject::Dictionary(PdfDictionary(font_dict)),
1943 );
1944 }
1945 }
1946 }
1947
1948 return Ok(PdfObject::Dictionary(PdfDictionary(resources_dict)));
1949 }
1950 }
1951 }
1952
1953 Err(ParseError::SyntaxError {
1954 position: 0,
1955 message: "Could not parse Resources".to_string(),
1956 })
1957 }
1958
1959 #[allow(dead_code)]
1960 fn extract_catalog_directly(
1961 &mut self,
1962 obj_num: u32,
1963 gen_num: u16,
1964 ) -> ParseResult<&PdfDictionary> {
1965 if let Some(entry) = self.xref.get_entry(obj_num) {
1967 if self.reader.seek(SeekFrom::Start(entry.offset)).is_err() {
1969 return Err(ParseError::SyntaxError {
1970 position: 0,
1971 message: "Failed to seek to catalog object".to_string(),
1972 });
1973 }
1974
1975 let mut buffer = vec![0u8; 2048];
1977 if let Ok(bytes_read) = self.reader.read(&mut buffer) {
1978 let content = String::from_utf8_lossy(&buffer[..bytes_read]);
1979 eprintln!("Raw catalog content:\n{}", content);
1980
1981 if let Some(dict_start) = content.find("<<") {
1983 if let Some(dict_end) = content[dict_start..].find(">>") {
1984 let dict_content = &content[dict_start..dict_start + dict_end + 2];
1985 eprintln!("Found dictionary content: {}", dict_content);
1986
1987 if let Ok(dict) = self.parse_dictionary_from_string(dict_content) {
1989 let key = (obj_num, gen_num);
1991 self.object_cache.insert(key, PdfObject::Dictionary(dict));
1992
1993 if let Some(PdfObject::Dictionary(ref dict)) =
1995 self.object_cache.get(&key)
1996 {
1997 return Ok(dict);
1998 }
1999 }
2000 }
2001 }
2002 }
2003 }
2004
2005 Err(ParseError::SyntaxError {
2006 position: 0,
2007 message: "Failed to extract catalog directly".to_string(),
2008 })
2009 }
2010
2011 #[allow(dead_code)]
2012 fn parse_dictionary_from_string(&self, dict_str: &str) -> ParseResult<PdfDictionary> {
2013 use crate::parser::lexer::{Lexer, Token};
2014
2015 let mut cursor = std::io::Cursor::new(dict_str.as_bytes());
2017 let mut lexer = Lexer::new_with_options(&mut cursor, self.options.clone());
2018
2019 match lexer.next_token()? {
2021 Token::DictStart => {
2022 let mut dict = std::collections::HashMap::new();
2023
2024 loop {
2025 let token = lexer.next_token()?;
2026 match token {
2027 Token::DictEnd => break,
2028 Token::Name(key) => {
2029 let value = PdfObject::parse_with_options(&mut lexer, &self.options)?;
2031 dict.insert(crate::parser::objects::PdfName(key), value);
2032 }
2033 _ => {
2034 return Err(ParseError::SyntaxError {
2035 position: 0,
2036 message: "Invalid dictionary format".to_string(),
2037 });
2038 }
2039 }
2040 }
2041
2042 Ok(PdfDictionary(dict))
2043 }
2044 _ => Err(ParseError::SyntaxError {
2045 position: 0,
2046 message: "Expected dictionary start".to_string(),
2047 }),
2048 }
2049 }
2050
2051 fn count_page_objects_directly(&mut self) -> Option<u32> {
2053 let mut page_count = 0;
2054
2055 for obj_num in 1..self.xref.len() as u32 {
2057 if let Ok(obj) = self.get_object(obj_num, 0) {
2058 if let Some(dict) = obj.as_dict() {
2059 if let Some(obj_type) = dict.get("Type").and_then(|t| t.as_name()) {
2060 if obj_type.0 == "Page" {
2061 page_count += 1;
2062 }
2063 }
2064 }
2065 }
2066 }
2067
2068 if page_count > 0 {
2069 Some(page_count)
2070 } else {
2071 None
2072 }
2073 }
2074
2075 pub fn metadata(&mut self) -> ParseResult<DocumentMetadata> {
2077 let mut metadata = DocumentMetadata::default();
2078
2079 if let Some(info_dict) = self.info()? {
2080 if let Some(title) = info_dict.get("Title").and_then(|o| o.as_string()) {
2081 metadata.title = title.as_str().ok().map(|s| s.to_string());
2082 }
2083 if let Some(author) = info_dict.get("Author").and_then(|o| o.as_string()) {
2084 metadata.author = author.as_str().ok().map(|s| s.to_string());
2085 }
2086 if let Some(subject) = info_dict.get("Subject").and_then(|o| o.as_string()) {
2087 metadata.subject = subject.as_str().ok().map(|s| s.to_string());
2088 }
2089 if let Some(keywords) = info_dict.get("Keywords").and_then(|o| o.as_string()) {
2090 metadata.keywords = keywords.as_str().ok().map(|s| s.to_string());
2091 }
2092 if let Some(creator) = info_dict.get("Creator").and_then(|o| o.as_string()) {
2093 metadata.creator = creator.as_str().ok().map(|s| s.to_string());
2094 }
2095 if let Some(producer) = info_dict.get("Producer").and_then(|o| o.as_string()) {
2096 metadata.producer = producer.as_str().ok().map(|s| s.to_string());
2097 }
2098 }
2099
2100 metadata.version = self.version().to_string();
2101 metadata.page_count = self.page_count().ok();
2102
2103 Ok(metadata)
2104 }
2105
2106 fn ensure_page_tree(&mut self) -> ParseResult<()> {
2108 if self.page_tree.is_none() {
2109 let page_count = self.page_count()?;
2110 self.page_tree = Some(super::page_tree::PageTree::new(page_count));
2111 }
2112 Ok(())
2113 }
2114
    /// Fetch a parsed page by zero-based index.
    ///
    /// NOTE(review): currently always fails after priming the page tree —
    /// returning a `&ParsedPage` tied to `&mut self` here fights the borrow
    /// checker, so callers are directed to `PdfDocument` instead.
    pub fn get_page(&mut self, _index: u32) -> ParseResult<&super::page_tree::ParsedPage> {
        self.ensure_page_tree()?;

        Err(ParseError::SyntaxError {
            position: 0,
            message: "get_page not implemented due to borrow checker constraints. Use PdfDocument instead.".to_string(),
        })
    }
2131
2132 pub fn get_all_pages(&mut self) -> ParseResult<Vec<super::page_tree::ParsedPage>> {
2134 let page_count = self.page_count()?;
2135 let mut pages = Vec::with_capacity(page_count as usize);
2136
2137 for i in 0..page_count {
2138 let page = self.get_page(i)?.clone();
2139 pages.push(page);
2140 }
2141
2142 Ok(pages)
2143 }
2144
    /// Consume this reader and wrap it in the higher-level `PdfDocument` API.
    pub fn into_document(self) -> super::document::PdfDocument<R> {
        super::document::PdfDocument::new(self)
    }
2149
    /// Reset the stack-safety bookkeeping used during recursive parsing.
    pub fn clear_parse_context(&mut self) {
        self.parse_context = StackSafeContext::new();
    }
2154
    /// Mutable access to the stack-safety parse context.
    pub fn parse_context_mut(&mut self) -> &mut StackSafeContext {
        &mut self.parse_context
    }
2159
2160 fn find_page_objects(&mut self) -> ParseResult<Vec<(u32, u16)>> {
2162 eprintln!("DEBUG: Starting find_page_objects scan");
2163
2164 let original_pos = self.reader.stream_position().unwrap_or(0);
2166
2167 if self.reader.seek(SeekFrom::Start(0)).is_err() {
2169 eprintln!("DEBUG: Failed to seek to start");
2170 return Ok(vec![]);
2171 }
2172
2173 let mut buffer = Vec::new();
2174 if self.reader.read_to_end(&mut buffer).is_err() {
2175 eprintln!("DEBUG: Failed to read PDF content");
2176 return Ok(vec![]);
2177 }
2178
2179 self.reader.seek(SeekFrom::Start(original_pos)).ok();
2181
2182 let content = String::from_utf8_lossy(&buffer);
2183 let mut page_objects = Vec::new();
2184
2185 let lines: Vec<&str> = content.lines().collect();
2187 eprintln!("DEBUG: Scanning {} lines for Page objects", lines.len());
2188
2189 for (i, line) in lines.iter().enumerate() {
2190 if line.trim().ends_with(" 0 obj") {
2192 if let Some(obj_str) = line.trim().strip_suffix(" 0 obj") {
2193 if let Ok(obj_num) = obj_str.parse::<u32>() {
2194 for j in 1..=10 {
2196 if i + j < lines.len() {
2197 let future_line = lines[i + j];
2198 if future_line.contains("/Type /Page")
2199 && !future_line.contains("/Type /Pages")
2200 {
2201 eprintln!("DEBUG: Found Page object at object {}", obj_num);
2202 page_objects.push((obj_num, 0));
2203 break;
2204 }
2205 if future_line.trim().ends_with(" 0 obj")
2207 || future_line.trim() == "endobj"
2208 {
2209 break;
2210 }
2211 }
2212 }
2213 }
2214 }
2215 }
2216 }
2217
2218 page_objects.sort();
2219 page_objects.dedup();
2220
2221 eprintln!(
2222 "DEBUG: Found {} Page objects: {:?}",
2223 page_objects.len(),
2224 page_objects
2225 );
2226 Ok(page_objects)
2227 }
2228
    /// Locate the document catalog's object reference.
    ///
    /// NOTE(review): hard-coded to (1, 0) — this assumes the catalog is object
    /// 1 generation 0, which is not generally true for arbitrary PDFs; verify
    /// against the trailer's /Root entry where possible.
    fn find_catalog_object(&mut self) -> ParseResult<(u32, u16)> {
        Ok((1, 0))
    }
2238
    /// Build a synthetic /Pages dictionary from a list of candidate page
    /// references, cache it under a reserved object id, and return a
    /// reference into the cache.
    ///
    /// A candidate is kept when it resolves to a dictionary whose /Type is
    /// "Page", or — when /Type is missing — when it carries /MediaBox or
    /// /Contents. More than 10 valid pages delegates to
    /// `create_hierarchical_pages_tree`.
    ///
    /// # Errors
    /// Fails when none of the candidates turn out to be usable page objects.
    fn create_synthetic_pages_dict(
        &mut self,
        page_refs: &[(u32, u16)],
    ) -> ParseResult<&PdfDictionary> {
        use super::objects::{PdfArray, PdfName};

        eprintln!(
            "DEBUG: Creating synthetic Pages tree with {} pages",
            page_refs.len()
        );

        // Keep only references that resolve to something page-shaped.
        let mut valid_page_refs = Vec::new();
        for (obj_num, gen_num) in page_refs {
            if let Ok(page_obj) = self.get_object(*obj_num, *gen_num) {
                if let Some(page_dict) = page_obj.as_dict() {
                    if let Some(obj_type) = page_dict.get("Type").and_then(|t| t.as_name()) {
                        if obj_type.0 == "Page" {
                            valid_page_refs.push((*obj_num, *gen_num));
                            continue;
                        }
                    }

                    // No usable /Type: accept dictionaries that look like pages.
                    if page_dict.contains_key("MediaBox") || page_dict.contains_key("Contents") {
                        eprintln!(
                            "DEBUG: Assuming {} {} R is a Page (missing Type)",
                            obj_num, gen_num
                        );
                        valid_page_refs.push((*obj_num, *gen_num));
                    }
                }
            }
        }

        if valid_page_refs.is_empty() {
            return Err(ParseError::SyntaxError {
                position: 0,
                message: "No valid page objects found for synthetic Pages tree".to_string(),
            });
        }

        eprintln!(
            "DEBUG: Found {} valid page objects out of {}",
            valid_page_refs.len(),
            page_refs.len()
        );

        // Larger documents get a two-level tree instead of one flat Kids array.
        if valid_page_refs.len() > 10 {
            return self.create_hierarchical_pages_tree(&valid_page_refs);
        }

        let mut kids = PdfArray::new();
        for (obj_num, gen_num) in &valid_page_refs {
            kids.push(PdfObject::Reference(*obj_num, *gen_num));
        }

        let mut pages_dict = PdfDictionary::new();
        pages_dict.insert(
            "Type".to_string(),
            PdfObject::Name(PdfName("Pages".to_string())),
        );
        pages_dict.insert("Kids".to_string(), PdfObject::Array(kids));
        pages_dict.insert(
            "Count".to_string(),
            PdfObject::Integer(valid_page_refs.len() as i64),
        );

        // Inherit a MediaBox from one of the first few pages so children that
        // omit it still get one. (Note: this keeps the LAST MediaBox found
        // among the first three pages, not the first.)
        let mut media_box = None;
        for (obj_num, gen_num) in valid_page_refs.iter().take(3) {
            if let Ok(page_obj) = self.get_object(*obj_num, *gen_num) {
                if let Some(page_dict) = page_obj.as_dict() {
                    if let Some(mb) = page_dict.get("MediaBox") {
                        media_box = Some(mb.clone());
                    }
                }
            }
        }

        if let Some(mb) = media_box {
            pages_dict.insert("MediaBox".to_string(), mb);
        } else {
            // Default to US Letter (612x792 points) when no page declares one.
            let mut mb_array = PdfArray::new();
            mb_array.push(PdfObject::Integer(0));
            mb_array.push(PdfObject::Integer(0));
            mb_array.push(PdfObject::Integer(612));
            mb_array.push(PdfObject::Integer(792));
            pages_dict.insert("MediaBox".to_string(), PdfObject::Array(mb_array));
        }

        // Cache under a reserved high object id so a reference with the
        // cache's lifetime can be handed back.
        let synthetic_key = (u32::MAX - 1, 0);
        self.object_cache
            .insert(synthetic_key, PdfObject::Dictionary(pages_dict));

        if let PdfObject::Dictionary(dict) = &self.object_cache[&synthetic_key] {
            Ok(dict)
        } else {
            unreachable!("Just inserted dictionary")
        }
    }
2348
    /// Build a two-level synthetic Pages tree: the pages are grouped into
    /// chunks of 10 under intermediate /Pages nodes, which hang off a single
    /// root /Pages node. All nodes are cached under reserved high object ids
    /// (root at u32::MAX - 1, intermediates counting down from u32::MAX - 2).
    fn create_hierarchical_pages_tree(
        &mut self,
        page_refs: &[(u32, u16)],
    ) -> ParseResult<&PdfDictionary> {
        use super::objects::{PdfArray, PdfName};

        eprintln!(
            "DEBUG: Creating hierarchical Pages tree with {} pages",
            page_refs.len()
        );

        // Group pages into fixed-size chunks, one intermediate node per chunk.
        const PAGES_PER_NODE: usize = 10;
        let chunks: Vec<&[(u32, u16)]> = page_refs.chunks(PAGES_PER_NODE).collect();
        let mut intermediate_nodes = Vec::new();

        for (chunk_idx, chunk) in chunks.iter().enumerate() {
            let mut kids = PdfArray::new();
            for (obj_num, gen_num) in chunk.iter() {
                kids.push(PdfObject::Reference(*obj_num, *gen_num));
            }

            let mut intermediate_dict = PdfDictionary::new();
            intermediate_dict.insert(
                "Type".to_string(),
                PdfObject::Name(PdfName("Pages".to_string())),
            );
            intermediate_dict.insert("Kids".to_string(), PdfObject::Array(kids));
            intermediate_dict.insert("Count".to_string(), PdfObject::Integer(chunk.len() as i64));

            // Reserved ids count down from u32::MAX - 2, one per chunk.
            let intermediate_key = (u32::MAX - 2 - chunk_idx as u32, 0);
            self.object_cache
                .insert(intermediate_key, PdfObject::Dictionary(intermediate_dict));

            intermediate_nodes.push(intermediate_key);
        }

        // Root node referencing every intermediate node.
        let mut root_kids = PdfArray::new();
        for (obj_num, gen_num) in &intermediate_nodes {
            root_kids.push(PdfObject::Reference(*obj_num, *gen_num));
        }

        let mut root_pages_dict = PdfDictionary::new();
        root_pages_dict.insert(
            "Type".to_string(),
            PdfObject::Name(PdfName("Pages".to_string())),
        );
        root_pages_dict.insert("Kids".to_string(), PdfObject::Array(root_kids));
        root_pages_dict.insert(
            "Count".to_string(),
            PdfObject::Integer(page_refs.len() as i64),
        );

        // Inherit the first page's MediaBox, when it has one.
        if let Some((obj_num, gen_num)) = page_refs.first() {
            if let Ok(page_obj) = self.get_object(*obj_num, *gen_num) {
                if let Some(page_dict) = page_obj.as_dict() {
                    if let Some(mb) = page_dict.get("MediaBox") {
                        root_pages_dict.insert("MediaBox".to_string(), mb.clone());
                    }
                }
            }
        }

        // Cache the root so a long-lived reference can be returned.
        let root_key = (u32::MAX - 1, 0);
        self.object_cache
            .insert(root_key, PdfObject::Dictionary(root_pages_dict));

        eprintln!(
            "DEBUG: Created hierarchical tree with {} intermediate nodes",
            intermediate_nodes.len()
        );

        if let PdfObject::Dictionary(dict) = &self.object_cache[&root_key] {
            Ok(dict)
        } else {
            unreachable!("Just inserted dictionary")
        }
    }
2435}
2436
/// Document-level metadata gathered from the PDF header and /Info dictionary.
#[derive(Debug, Default, Clone)]
pub struct DocumentMetadata {
    /// /Title entry of the Info dictionary, when present and decodable.
    pub title: Option<String>,
    /// /Author entry of the Info dictionary.
    pub author: Option<String>,
    /// /Subject entry of the Info dictionary.
    pub subject: Option<String>,
    /// /Keywords entry of the Info dictionary.
    pub keywords: Option<String>,
    /// /Creator entry (application that created the original document).
    pub creator: Option<String>,
    /// /Producer entry (application that produced the PDF).
    pub producer: Option<String>,
    /// Raw /CreationDate string, when extracted from the Info dictionary.
    pub creation_date: Option<String>,
    /// Raw /ModDate string, when extracted from the Info dictionary.
    pub modification_date: Option<String>,
    /// PDF header version, e.g. "1.4".
    pub version: String,
    /// Total page count, when it could be determined.
    pub page_count: Option<u32>,
}
2451
/// Iterator over the lines of a string, splitting on any PDF end-of-line
/// sequence: "\r\n", "\n", or a lone "\r" (which `str::lines` does not treat
/// as a terminator). A trailing fragment without an EOL is still yielded.
pub struct EOLIter<'s> {
    remainder: &'s str,
}
impl<'s> Iterator for EOLIter<'s> {
    type Item = &'s str;

    fn next(&mut self) -> Option<Self::Item> {
        if self.remainder.is_empty() {
            return None;
        }

        // Find the first CR or LF; a "\r\n" pair counts as one separator.
        match self.remainder.find(|c| c == '\r' || c == '\n') {
            Some(idx) => {
                let line = &self.remainder[..idx];
                let rest = &self.remainder[idx..];
                let sep_len = if rest.starts_with("\r\n") { 2 } else { 1 };
                self.remainder = &rest[sep_len..];
                Some(line)
            }
            None => {
                // No EOL left: emit the final fragment and finish.
                let last = self.remainder;
                self.remainder = "";
                Some(last)
            }
        }
    }
}
/// Extension trait: iterate lines using PDF end-of-line rules ("\r\n", "\n",
/// or a lone "\r"), unlike `str::lines`, which ignores a bare carriage return.
pub trait PDFLines: AsRef<str> {
    /// Returns an iterator over the PDF-style lines of `self`.
    fn pdf_lines(&self) -> EOLIter<'_> {
        EOLIter {
            remainder: self.as_ref(),
        }
    }
}
impl PDFLines for &str {}
impl<'a> PDFLines for std::borrow::Cow<'a, str> {}
impl PDFLines for String {}
2488
#[cfg(test)]
mod tests {

    use super::*;
    use crate::parser::objects::{PdfName, PdfString};
    use crate::parser::test_helpers::*;
    use crate::parser::ParseOptions;
    use std::io::Cursor;

    // Constructing a reader over a well-formed minimal PDF succeeds.
    #[test]
    fn test_reader_construction() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let result = PdfReader::new(cursor);
        assert!(result.is_ok());
    }

    // The minimal fixture declares %PDF-1.4; the header parse must expose it.
    #[test]
    fn test_reader_version() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let reader = PdfReader::new(cursor).unwrap();
        assert_eq!(reader.version().major, 1);
        assert_eq!(reader.version().minor, 4);
    }

    // Every header version from 1.0 through 2.0 round-trips through the parser.
    #[test]
    fn test_reader_different_versions() {
        let versions = vec![
            "1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7", "2.0",
        ];

        for version in versions {
            let pdf_data = create_pdf_with_version(version);
            let cursor = Cursor::new(pdf_data);
            let reader = PdfReader::new(cursor).unwrap();

            // Compare against the major/minor parsed from the fixture string.
            let parts: Vec<&str> = version.split('.').collect();
            assert_eq!(reader.version().major, parts[0].parse::<u8>().unwrap());
            assert_eq!(reader.version().minor, parts[1].parse::<u8>().unwrap());
        }
    }

    // The document catalog resolves and carries /Type /Catalog.
    #[test]
    fn test_reader_catalog() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let catalog = reader.catalog();
        assert!(catalog.is_ok());

        let catalog_dict = catalog.unwrap();
        assert_eq!(
            catalog_dict.get("Type"),
            Some(&PdfObject::Name(PdfName("Catalog".to_string())))
        );
    }

    // A PDF without an /Info entry yields Ok(None), not an error.
    #[test]
    fn test_reader_info_none() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let info = reader.info().unwrap();
        assert!(info.is_none());
    }

    // /Info dictionary entries (Title, Author) are surfaced as PdfStrings.
    #[test]
    fn test_reader_info_present() {
        let pdf_data = create_pdf_with_info();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let info = reader.info().unwrap();
        assert!(info.is_some());

        let info_dict = info.unwrap();
        assert_eq!(
            info_dict.get("Title"),
            Some(&PdfObject::String(PdfString(
                "Test PDF".to_string().into_bytes()
            )))
        );
        assert_eq!(
            info_dict.get("Author"),
            Some(&PdfObject::String(PdfString(
                "Test Author".to_string().into_bytes()
            )))
        );
    }

    // Object 1 0 (the catalog in the minimal fixture) loads as a dictionary.
    #[test]
    fn test_reader_get_object() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let obj = reader.get_object(1, 0);
        assert!(obj.is_ok());

        let catalog = obj.unwrap();
        assert!(catalog.as_dict().is_some());
    }

    // An object number not present in the xref table is an error.
    #[test]
    fn test_reader_get_invalid_object() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let obj = reader.get_object(999, 0);
        assert!(obj.is_err());
    }

    // Object 0 gen 65535 is the conventional head of the free list;
    // the reader maps it to Null rather than failing.
    #[test]
    fn test_reader_get_free_object() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let obj = reader.get_object(0, 65535);
        assert!(obj.is_ok());
        assert_eq!(obj.unwrap(), &PdfObject::Null);
    }

    // An indirect Reference(1, 0) resolves to the underlying dictionary.
    #[test]
    fn test_reader_resolve_reference() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let ref_obj = PdfObject::Reference(1, 0);
        let resolved = reader.resolve(&ref_obj);

        assert!(resolved.is_ok());
        assert!(resolved.unwrap().as_dict().is_some());
    }

    // resolve() on a non-reference object is a pass-through.
    #[test]
    fn test_reader_resolve_non_reference() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let int_obj = PdfObject::Integer(42);
        let resolved = reader.resolve(&int_obj).unwrap();

        assert_eq!(resolved, &PdfObject::Integer(42));
    }

    // Fetching the same object twice (second hit served from the cache)
    // returns equivalent data both times.
    #[test]
    fn test_reader_cache_behavior() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let obj1 = reader.get_object(1, 0).unwrap();
        assert!(obj1.as_dict().is_some());

        let obj2 = reader.get_object(1, 0).unwrap();
        assert!(obj2.as_dict().is_some());
    }

    // A generation number that does not match the xref entry is rejected.
    #[test]
    fn test_reader_wrong_generation() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let obj = reader.get_object(1, 99);
        assert!(obj.is_err());
    }

    // Arbitrary non-PDF bytes must fail at construction (no %PDF header).
    #[test]
    fn test_reader_invalid_pdf() {
        let invalid_data = b"This is not a PDF file";
        let cursor = Cursor::new(invalid_data.to_vec());
        let result = PdfReader::new(cursor);

        assert!(result.is_err());
    }

    // A structurally present but unparseable xref section fails construction.
    // NOTE: the fixture's startxref offset (24) and body bytes are load-bearing.
    #[test]
    fn test_reader_corrupt_xref() {
        let corrupt_pdf = b"%PDF-1.4
1 0 obj
<< /Type /Catalog >>
endobj
xref
corrupted xref table
trailer
<< /Size 2 /Root 1 0 R >>
startxref
24
%%EOF"
        .to_vec();

        let cursor = Cursor::new(corrupt_pdf);
        let result = PdfReader::new(cursor);
        assert!(result.is_err());
    }

    // An xref table without a following trailer dictionary is an error.
    #[test]
    fn test_reader_missing_trailer() {
        let pdf_no_trailer = b"%PDF-1.4
1 0 obj
<< /Type /Catalog >>
endobj
xref
0 2
0000000000 65535 f
0000000009 00000 n
startxref
24
%%EOF"
        .to_vec();

        let cursor = Cursor::new(pdf_no_trailer);
        let result = PdfReader::new(cursor);
        assert!(result.is_err());
    }

    // A zero-byte input must fail cleanly.
    #[test]
    fn test_reader_empty_pdf() {
        let cursor = Cursor::new(Vec::new());
        let result = PdfReader::new(cursor);
        assert!(result.is_err());
    }

    // The minimal fixture has an empty page tree: page_count() is Ok(0).
    #[test]
    fn test_reader_page_count() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let count = reader.page_count();
        assert!(count.is_ok());
        assert_eq!(count.unwrap(), 0);
    }

    // Converting the reader into a PdfDocument keeps page counting working.
    #[test]
    fn test_reader_into_document() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let reader = PdfReader::new(cursor).unwrap();

        let document = reader.into_document();
        let page_count = document.page_count();
        assert!(page_count.is_ok());
    }

    // The root /Pages node resolves and carries /Type /Pages.
    #[test]
    fn test_reader_pages_dict() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let pages = reader.pages();
        assert!(pages.is_ok());
        let pages_dict = pages.unwrap();
        assert_eq!(
            pages_dict.get("Type"),
            Some(&PdfObject::Name(PdfName("Pages".to_string())))
        );
    }

    // A binary-marker comment line (high-bit bytes after the header) is accepted.
    #[test]
    fn test_reader_pdf_with_binary_data() {
        let pdf_data = create_pdf_with_binary_marker();

        let cursor = Cursor::new(pdf_data);
        let result = PdfReader::new(cursor);
        assert!(result.is_ok());
    }

    // metadata() aggregates /Info fields plus the header version.
    #[test]
    fn test_reader_metadata() {
        let pdf_data = create_pdf_with_info();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let metadata = reader.metadata().unwrap();
        assert_eq!(metadata.title, Some("Test PDF".to_string()));
        assert_eq!(metadata.author, Some("Test Author".to_string()));
        assert_eq!(metadata.subject, Some("Testing".to_string()));
        assert_eq!(metadata.version, "1.4".to_string());
    }

    // Without /Info, metadata() still reports version and page count.
    #[test]
    fn test_reader_metadata_empty() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let metadata = reader.metadata().unwrap();
        assert!(metadata.title.is_none());
        assert!(metadata.author.is_none());
        assert_eq!(metadata.version, "1.4".to_string());
        assert_eq!(metadata.page_count, Some(0));
    }

    // Both mismatch directions error: wrong generation and unknown object number.
    #[test]
    fn test_reader_object_number_mismatch() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let result = reader.get_object(1, 99);
        assert!(result.is_err());

        let result2 = reader.get_object(999, 0);
        assert!(result2.is_err());
    }

    // DocumentMetadata is a plain data struct; field construction round-trips.
    #[test]
    fn test_document_metadata_struct() {
        let metadata = DocumentMetadata {
            title: Some("Title".to_string()),
            author: Some("Author".to_string()),
            subject: Some("Subject".to_string()),
            keywords: Some("Keywords".to_string()),
            creator: Some("Creator".to_string()),
            producer: Some("Producer".to_string()),
            creation_date: Some("D:20240101".to_string()),
            modification_date: Some("D:20240102".to_string()),
            version: "1.5".to_string(),
            page_count: Some(10),
        };

        assert_eq!(metadata.title, Some("Title".to_string()));
        assert_eq!(metadata.page_count, Some(10));
    }

    // Default is all-None optionals and an empty version string.
    #[test]
    fn test_document_metadata_default() {
        let metadata = DocumentMetadata::default();
        assert!(metadata.title.is_none());
        assert!(metadata.author.is_none());
        assert!(metadata.subject.is_none());
        assert!(metadata.keywords.is_none());
        assert!(metadata.creator.is_none());
        assert!(metadata.producer.is_none());
        assert!(metadata.creation_date.is_none());
        assert!(metadata.modification_date.is_none());
        assert_eq!(metadata.version, "".to_string());
        assert!(metadata.page_count.is_none());
    }

    // Clone produces an equal, independent copy.
    #[test]
    fn test_document_metadata_clone() {
        let metadata = DocumentMetadata {
            title: Some("Test".to_string()),
            version: "1.4".to_string(),
            ..Default::default()
        };

        let cloned = metadata.clone();
        assert_eq!(cloned.title, Some("Test".to_string()));
        assert_eq!(cloned.version, "1.4".to_string());
    }

    // A trailer missing the required /Root entry fails validation.
    #[test]
    fn test_reader_trailer_validation_error() {
        let bad_pdf = b"%PDF-1.4
1 0 obj
<< /Type /Catalog >>
endobj
xref
0 2
0000000000 65535 f
0000000009 00000 n
trailer
<< /Size 2 >>
startxref
46
%%EOF"
        .to_vec();

        let cursor = Cursor::new(bad_pdf);
        let result = PdfReader::new(cursor);
        assert!(result.is_err());
    }

    // Custom ParseOptions are accepted by new_with_options.
    #[test]
    fn test_reader_with_options() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut options = ParseOptions::default();
        options.lenient_streams = true;
        options.max_recovery_bytes = 2000;
        options.collect_warnings = true;

        let reader = PdfReader::new_with_options(cursor, options);
        assert!(reader.is_ok());
    }

    // Fixture with a /Length (10) shorter than the actual stream content.
    // Both strict and lenient parsing currently reject this file — the
    // lenient path is asserted as an error too (the xref offsets in the
    // fixture do not match the object positions).
    #[test]
    fn test_lenient_stream_parsing() {
        let pdf_data = b"%PDF-1.4
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R >>
endobj
4 0 obj
<< /Length 10 >>
stream
This is a longer stream than 10 bytes
endstream
endobj
xref
0 5
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000116 00000 n
0000000219 00000 n
trailer
<< /Size 5 /Root 1 0 R >>
startxref
299
%%EOF"
        .to_vec();

        let cursor = Cursor::new(pdf_data.clone());
        let strict_options = ParseOptions::strict();
        let strict_reader = PdfReader::new_with_options(cursor, strict_options);
        assert!(strict_reader.is_err());

        let cursor = Cursor::new(pdf_data);
        let mut options = ParseOptions::default();
        options.lenient_streams = true;
        options.max_recovery_bytes = 1000;
        options.collect_warnings = false;
        let lenient_reader = PdfReader::new_with_options(cursor, options);
        assert!(lenient_reader.is_err());
    }

    // Defaults: strict streams, 1000 recovery bytes, no warning collection.
    #[test]
    fn test_parse_options_default() {
        let options = ParseOptions::default();
        assert!(!options.lenient_streams);
        assert_eq!(options.max_recovery_bytes, 1000);
        assert!(!options.collect_warnings);
    }

    // Clone preserves every option field.
    #[test]
    fn test_parse_options_clone() {
        let mut options = ParseOptions::default();
        options.lenient_streams = true;
        options.max_recovery_bytes = 2000;
        options.collect_warnings = true;
        let cloned = options.clone();
        assert!(cloned.lenient_streams);
        assert_eq!(cloned.max_recovery_bytes, 2000);
        assert!(cloned.collect_warnings);
    }

    // Helper: a standard-security-handler /Encrypt dictionary (V1/R2).
    // Currently unused by the tests below, kept for future encryption tests.
    #[allow(dead_code)]
    fn create_encrypted_pdf_dict() -> PdfDictionary {
        let mut dict = PdfDictionary::new();
        dict.insert(
            "Filter".to_string(),
            PdfObject::Name(PdfName("Standard".to_string())),
        );
        dict.insert("V".to_string(), PdfObject::Integer(1));
        dict.insert("R".to_string(), PdfObject::Integer(2));
        dict.insert("O".to_string(), PdfObject::String(PdfString(vec![0u8; 32])));
        dict.insert("U".to_string(), PdfObject::String(PdfString(vec![0u8; 32])));
        dict.insert("P".to_string(), PdfObject::Integer(-4));
        dict
    }

    // Helper: a full PDF whose trailer references an /Encrypt dictionary.
    // The O/U entries are placeholder text, not real password hashes.
    fn create_pdf_with_encryption() -> Vec<u8> {
        b"%PDF-1.4
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >>
endobj
4 0 obj
<< /Filter /Standard /V 1 /R 2 /O (32 bytes of owner password hash data) /U (32 bytes of user password hash data) /P -4 >>
endobj
xref
0 5
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000116 00000 n
0000000201 00000 n
trailer
<< /Size 5 /Root 1 0 R /Encrypt 4 0 R /ID [(file id)] >>
startxref
295
%%EOF"
        .to_vec()
    }

    // Unencrypted files report unlocked; opening the encrypted fixture
    // without a password fails construction outright.
    #[test]
    fn test_reader_encryption_detection() {
        let unencrypted_pdf = create_minimal_pdf();
        let cursor = Cursor::new(unencrypted_pdf);
        let reader = PdfReader::new(cursor).unwrap();
        assert!(!reader.is_encrypted());
        assert!(reader.is_unlocked());

        let encrypted_pdf = create_pdf_with_encryption();
        let cursor = Cursor::new(encrypted_pdf);
        let result = PdfReader::new(cursor);
        assert!(result.is_err());
    }

    // On an unencrypted file every unlock attempt trivially succeeds.
    #[test]
    fn test_reader_encryption_methods_unencrypted() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        assert!(!reader.is_encrypted());
        assert!(reader.is_unlocked());
        assert!(reader.encryption_handler().is_none());
        assert!(reader.encryption_handler_mut().is_none());

        assert!(reader.unlock_with_password("any_password").unwrap());
        assert!(reader.try_empty_password().unwrap());
    }

    // Handler accessors return None when there is no /Encrypt dictionary.
    #[test]
    fn test_reader_encryption_handler_access() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        assert!(reader.encryption_handler().is_none());
        assert!(reader.encryption_handler_mut().is_none());

        assert!(!reader.is_encrypted());
        assert!(reader.is_unlocked());
    }

    // Repeated unlock attempts on an unencrypted file all succeed and
    // never change reader state.
    #[test]
    fn test_reader_multiple_password_attempts() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let passwords = vec!["test1", "test2", "admin", "", "password"];
        for password in passwords {
            assert!(reader.unlock_with_password(password).unwrap());
        }

        for _ in 0..5 {
            assert!(reader.try_empty_password().unwrap());
        }
    }

    // Encryption state stays consistent across unlock calls.
    #[test]
    fn test_reader_encryption_state_consistency() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        assert!(!reader.is_encrypted());
        assert!(reader.is_unlocked());
        assert!(reader.encryption_handler().is_none());

        let _ = reader.unlock_with_password("test");
        assert!(!reader.is_encrypted());
        assert!(reader.is_unlocked());
        assert!(reader.encryption_handler().is_none());

        let _ = reader.try_empty_password();
        assert!(!reader.is_encrypted());
        assert!(reader.is_unlocked());
        assert!(reader.encryption_handler().is_none());
    }

    // Opening the encrypted fixture must not succeed; the exact error
    // variant is allowed to vary (EncryptionNotSupported or otherwise).
    #[test]
    fn test_reader_encryption_error_handling() {
        let encrypted_pdf = create_pdf_with_encryption();
        let cursor = Cursor::new(encrypted_pdf);

        let result = PdfReader::new(cursor);
        match result {
            Err(ParseError::EncryptionNotSupported) => {
                // Expected: encryption explicitly unsupported.
            }
            Err(_) => {
                // Any other parse error is also acceptable here.
            }
            Ok(_) => {
                panic!("Should not successfully create reader for encrypted PDF without password");
            }
        }
    }

    // Strict and lenient option presets agree on unencrypted files.
    #[test]
    fn test_reader_encryption_with_options() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);

        let strict_options = ParseOptions::strict();
        let strict_reader = PdfReader::new_with_options(cursor, strict_options).unwrap();
        assert!(!strict_reader.is_encrypted());
        assert!(strict_reader.is_unlocked());

        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let lenient_options = ParseOptions::lenient();
        let lenient_reader = PdfReader::new_with_options(cursor, lenient_options).unwrap();
        assert!(!lenient_reader.is_encrypted());
        assert!(lenient_reader.is_unlocked());
    }

    // Edge-case password inputs (empty, whitespace, long, unicode,
    // punctuation, embedded control characters) are all accepted on an
    // unencrypted file.
    #[test]
    fn test_reader_encryption_integration_edge_cases() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        assert!(reader.unlock_with_password("").unwrap());
        assert!(reader.unlock_with_password(" ").unwrap());
        assert!(reader
            .unlock_with_password("very_long_password_that_exceeds_normal_length")
            .unwrap());
        assert!(reader.unlock_with_password("unicode_test_ñáéíóú").unwrap());

        assert!(reader.unlock_with_password("pass@#$%^&*()").unwrap());
        assert!(reader.unlock_with_password("pass\nwith\nnewlines").unwrap());
        assert!(reader.unlock_with_password("pass\twith\ttabs").unwrap());
    }
}