1use super::encryption_handler::EncryptionHandler;
6use super::header::PdfHeader;
7use super::object_stream::ObjectStream;
8use super::objects::{PdfArray, PdfDictionary, PdfObject, PdfString};
9use super::stack_safe::StackSafeContext;
10use super::trailer::PdfTrailer;
11use super::xref::XRefTable;
12use super::{ParseError, ParseResult};
13use crate::objects::ObjectId;
14use std::collections::HashMap;
15use std::fs::File;
16use std::io::{BufReader, Read, Seek, SeekFrom};
17use std::path::Path;
18
/// Returns the byte offset of the first occurrence of `needle` in
/// `haystack`, or `None` if it does not occur.
///
/// An empty needle matches at offset 0 (the conventional result); the
/// previous implementation panicked in that case because
/// `slice::windows(0)` panics.
fn find_bytes(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    if needle.is_empty() {
        return Some(0);
    }
    haystack
        .windows(needle.len())
        .position(|window| window == needle)
}
25
/// Returns true when `data`, after any leading PDF whitespace
/// (space, tab, CR, LF), begins with the keyword `stream`.
fn is_immediate_stream_start(data: &[u8]) -> bool {
    // Index of the first non-whitespace byte (or the end of the slice).
    let first_significant = data
        .iter()
        .position(|&b| !matches!(b, b' ' | b'\t' | b'\n' | b'\r'))
        .unwrap_or(data.len());

    data[first_significant..].starts_with(b"stream")
}
38
/// PDF parser over any `Read + Seek` source.
///
/// Holds the parsed header, cross-reference table and trailer, plus caches
/// for decoded objects and object streams. The `objects_being_reconstructed`
/// set is used to break circular references during recursive object loading.
pub struct PdfReader<R: Read + Seek> {
    // Buffered access to the underlying PDF bytes.
    reader: BufReader<R>,
    // Parsed file header (carries the PDF version).
    header: PdfHeader,
    // Cross-reference table mapping object numbers to byte offsets.
    xref: XRefTable,
    // Parsed trailer (Root/Info/Encrypt/ID references).
    trailer: PdfTrailer,
    // Cache of fully parsed (and decrypted) objects, keyed by (num, gen).
    object_cache: HashMap<(u32, u16), PdfObject>,
    // Cache of parsed object streams, keyed by the stream's object number.
    object_stream_cache: HashMap<u32, ObjectStream>,
    // Page tree, populated elsewhere; `None` until built.
    page_tree: Option<super::page_tree::PageTree>,
    // Recursion guard for nested object parsing (enter/exit pairs).
    parse_context: StackSafeContext,
    // Parsing behavior flags (lenient vs strict, warning collection, ...).
    options: super::ParseOptions,
    // Present when the trailer references an /Encrypt dictionary.
    encryption_handler: Option<EncryptionHandler>,
    // Object numbers currently being loaded/reconstructed; used to detect
    // circular references and to bound loading depth.
    objects_being_reconstructed: std::sync::Mutex<std::collections::HashSet<u32>>,
    // Upper bound on the size of `objects_being_reconstructed` before a
    // load is refused.
    max_reconstruction_depth: u32,
}
62
63impl<R: Read + Seek> PdfReader<R> {
64 pub fn options(&self) -> &super::ParseOptions {
66 &self.options
67 }
68
69 pub fn is_encrypted(&self) -> bool {
71 self.encryption_handler.is_some()
72 }
73
74 pub fn is_unlocked(&self) -> bool {
76 match &self.encryption_handler {
77 Some(handler) => handler.is_unlocked(),
78 None => true, }
80 }
81
82 pub fn encryption_handler_mut(&mut self) -> Option<&mut EncryptionHandler> {
84 self.encryption_handler.as_mut()
85 }
86
87 pub fn encryption_handler(&self) -> Option<&EncryptionHandler> {
89 self.encryption_handler.as_ref()
90 }
91
92 pub fn unlock_with_password(&mut self, password: &str) -> ParseResult<bool> {
94 match &mut self.encryption_handler {
95 Some(handler) => {
96 if handler.unlock_with_user_password(password).unwrap_or(false) {
98 Ok(true)
99 } else {
100 Ok(handler
102 .unlock_with_owner_password(password)
103 .unwrap_or(false))
104 }
105 }
106 None => Ok(true), }
108 }
109
110 pub fn try_empty_password(&mut self) -> ParseResult<bool> {
112 match &mut self.encryption_handler {
113 Some(handler) => Ok(handler.try_empty_password().unwrap_or(false)),
114 None => Ok(true), }
116 }
117
118 pub fn unlock(&mut self, password: &str) -> ParseResult<()> {
149 if !self.is_encrypted() {
151 return Ok(());
152 }
153
154 if self.is_unlocked() {
156 return Ok(());
157 }
158
159 let success = self.unlock_with_password(password)?;
161
162 if success {
163 Ok(())
164 } else {
165 Err(ParseError::WrongPassword)
166 }
167 }
168
169 fn ensure_unlocked(&self) -> ParseResult<()> {
171 if self.is_encrypted() && !self.is_unlocked() {
172 return Err(ParseError::PdfLocked);
173 }
174 Ok(())
175 }
176
177 fn decrypt_object_if_needed(
183 &self,
184 obj: PdfObject,
185 obj_num: u32,
186 gen_num: u16,
187 ) -> ParseResult<PdfObject> {
188 let handler = match &self.encryption_handler {
190 Some(h) if h.is_unlocked() => h,
191 _ => return Ok(obj), };
193
194 let obj_id = ObjectId::new(obj_num, gen_num);
195
196 match obj {
197 PdfObject::String(ref s) => {
198 let decrypted_bytes = handler.decrypt_string(s.as_bytes(), &obj_id)?;
200 Ok(PdfObject::String(PdfString::new(decrypted_bytes)))
201 }
202 PdfObject::Stream(ref stream) => {
203 let should_decrypt = stream
205 .dict
206 .get("StmF")
207 .and_then(|o| o.as_name())
208 .map(|n| n.0.as_str() != "Identity")
209 .unwrap_or(true); if should_decrypt {
212 let decrypted_data = handler.decrypt_stream(&stream.data, &obj_id)?;
213
214 let mut new_stream = stream.clone();
216 new_stream.data = decrypted_data;
217 Ok(PdfObject::Stream(new_stream))
218 } else {
219 Ok(obj) }
221 }
222 PdfObject::Dictionary(ref dict) => {
223 let mut new_dict = PdfDictionary::new();
225 for (key, value) in dict.0.iter() {
226 let decrypted_value =
227 self.decrypt_object_if_needed(value.clone(), obj_num, gen_num)?;
228 new_dict.insert(key.0.clone(), decrypted_value);
229 }
230 Ok(PdfObject::Dictionary(new_dict))
231 }
232 PdfObject::Array(ref arr) => {
233 let mut new_arr = Vec::new();
235 for elem in arr.0.iter() {
236 let decrypted_elem =
237 self.decrypt_object_if_needed(elem.clone(), obj_num, gen_num)?;
238 new_arr.push(decrypted_elem);
239 }
240 Ok(PdfObject::Array(PdfArray(new_arr)))
241 }
242 _ => Ok(obj),
244 }
245 }
246}
247
248impl PdfReader<File> {
249 pub fn open<P: AsRef<Path>>(path: P) -> ParseResult<Self> {
251 use std::io::Write;
252 let mut debug_file = std::fs::File::create("/tmp/pdf_open_debug.log").ok();
253 if let Some(ref mut f) = debug_file {
254 writeln!(f, "Opening file: {:?}", path.as_ref()).ok();
255 }
256 let file = File::open(path)?;
257 if let Some(ref mut f) = debug_file {
258 writeln!(f, "File opened successfully").ok();
259 }
260 let options = super::ParseOptions::lenient();
262 Self::new_with_options(file, options)
263 }
264
265 pub fn open_strict<P: AsRef<Path>>(path: P) -> ParseResult<Self> {
267 let file = File::open(path)?;
268 let options = super::ParseOptions::strict();
269 Self::new_with_options(file, options)
270 }
271
272 pub fn open_with_options<P: AsRef<Path>>(
274 path: P,
275 options: super::ParseOptions,
276 ) -> ParseResult<Self> {
277 let file = File::open(path)?;
278 Self::new_with_options(file, options)
279 }
280
281 pub fn open_document<P: AsRef<Path>>(
283 path: P,
284 ) -> ParseResult<super::document::PdfDocument<File>> {
285 let reader = Self::open(path)?;
286 Ok(reader.into_document())
287 }
288}
289
290impl<R: Read + Seek> PdfReader<R> {
291 pub fn new(reader: R) -> ParseResult<Self> {
298 let mut options = super::ParseOptions::default();
301 options.lenient_streams = true;
302 Self::new_with_options(reader, options)
303 }
304
305 pub fn new_with_options(reader: R, options: super::ParseOptions) -> ParseResult<Self> {
307 let mut buf_reader = BufReader::new(reader);
308
309 let start_pos = buf_reader.stream_position()?;
311 buf_reader.seek(SeekFrom::End(0))?;
312 let file_size = buf_reader.stream_position()?;
313 buf_reader.seek(SeekFrom::Start(start_pos))?;
314
315 if file_size == 0 {
316 return Err(ParseError::EmptyFile);
317 }
318
319 use std::io::Write;
321 let mut debug_file = std::fs::File::create("/tmp/pdf_debug.log").ok();
322 if let Some(ref mut f) = debug_file {
323 writeln!(f, "Parsing PDF header...").ok();
324 }
325 let header = PdfHeader::parse(&mut buf_reader)?;
326 if let Some(ref mut f) = debug_file {
327 writeln!(f, "Header parsed: version {}", header.version).ok();
328 }
329
330 if let Some(ref mut f) = debug_file {
332 writeln!(f, "Parsing XRef table...").ok();
333 }
334 let xref = XRefTable::parse_with_options(&mut buf_reader, &options)?;
335 if let Some(ref mut f) = debug_file {
336 writeln!(f, "XRef table parsed with {} entries", xref.len()).ok();
337 }
338
339 let trailer_dict = xref.trailer().ok_or(ParseError::InvalidTrailer)?.clone();
341
342 let xref_offset = xref.xref_offset();
343 let trailer = PdfTrailer::from_dict(trailer_dict, xref_offset)?;
344
345 trailer.validate()?;
347
348 let encryption_handler = if EncryptionHandler::detect_encryption(trailer.dict()) {
350 if let Ok(Some((encrypt_obj_num, encrypt_gen_num))) = trailer.encrypt() {
351 let mut temp_reader = Self {
353 reader: buf_reader,
354 header: header.clone(),
355 xref: xref.clone(),
356 trailer: trailer.clone(),
357 object_cache: HashMap::new(),
358 object_stream_cache: HashMap::new(),
359 page_tree: None,
360 parse_context: StackSafeContext::new(),
361 options: options.clone(),
362 encryption_handler: None,
363 objects_being_reconstructed: std::sync::Mutex::new(
364 std::collections::HashSet::new(),
365 ),
366 max_reconstruction_depth: 100,
367 };
368
369 let encrypt_obj = temp_reader.get_object(encrypt_obj_num, encrypt_gen_num)?;
371 if let Some(encrypt_dict) = encrypt_obj.as_dict() {
372 let file_id = trailer.id().and_then(|id_obj| {
374 if let PdfObject::Array(ref id_array) = id_obj {
375 if let Some(PdfObject::String(ref id_bytes)) = id_array.get(0) {
376 Some(id_bytes.as_bytes().to_vec())
377 } else {
378 None
379 }
380 } else {
381 None
382 }
383 });
384
385 match EncryptionHandler::new(encrypt_dict, file_id) {
386 Ok(handler) => {
387 buf_reader = temp_reader.reader;
389 Some(handler)
390 }
391 Err(_) => {
392 let _ = temp_reader.reader;
394 return Err(ParseError::EncryptionNotSupported);
395 }
396 }
397 } else {
398 let _ = temp_reader.reader;
399 return Err(ParseError::EncryptionNotSupported);
400 }
401 } else {
402 return Err(ParseError::EncryptionNotSupported);
403 }
404 } else {
405 None
406 };
407
408 Ok(Self {
409 reader: buf_reader,
410 header,
411 xref,
412 trailer,
413 object_cache: HashMap::new(),
414 object_stream_cache: HashMap::new(),
415 page_tree: None,
416 parse_context: StackSafeContext::new(),
417 options,
418 encryption_handler,
419 objects_being_reconstructed: std::sync::Mutex::new(std::collections::HashSet::new()),
420 max_reconstruction_depth: 100,
421 })
422 }
423
    /// PDF version declared in the file header (e.g. 1.7).
    pub fn version(&self) -> &super::header::PdfVersion {
        &self.header.version
    }
428
429 pub fn catalog(&mut self) -> ParseResult<&PdfDictionary> {
431 let (obj_num, gen_num) = match self.trailer.root() {
433 Ok(root) => {
434 if let Ok(obj) = self.get_object(root.0, root.1) {
437 if let Some(dict) = obj.as_dict() {
438 if let Some(type_obj) = dict.get("Type") {
440 if let Some(type_name) = type_obj.as_name() {
441 if type_name.0 != "Catalog" {
442 tracing::warn!("Trailer /Root points to /Type/{} (not Catalog), scanning for real catalog", type_name.0);
443 if let Ok(catalog_ref) = self.find_catalog_object() {
445 catalog_ref
446 } else {
447 root }
449 } else {
450 root }
452 } else {
453 root }
455 } else {
456 root }
458 } else {
459 root }
461 } else {
462 root }
464 }
465 Err(_) => {
466 #[cfg(debug_assertions)]
468 tracing::warn!("Trailer missing Root entry, attempting recovery");
469
470 if let Some(root) = self.trailer.find_root_fallback() {
472 root
473 } else {
474 if let Ok(catalog_ref) = self.find_catalog_object() {
476 catalog_ref
477 } else {
478 return Err(ParseError::MissingKey("Root".to_string()));
479 }
480 }
481 }
482 };
483
484 let key = (obj_num, gen_num);
486 let needs_reconstruction = {
487 match self.get_object(obj_num, gen_num) {
488 Ok(catalog) => {
489 if catalog.as_dict().is_some() {
491 false
493 } else {
494 true
496 }
497 }
498 Err(_) => {
499 true
501 }
502 }
503 };
504
505 if !needs_reconstruction {
506 let catalog = self.get_object(obj_num, gen_num)?;
508 return catalog.as_dict().ok_or_else(|| ParseError::SyntaxError {
509 position: 0,
510 message: format!("Catalog object {} {} is not a dictionary", obj_num, gen_num),
511 });
512 }
513
514 match self.extract_object_manually(obj_num) {
517 Ok(dict) => {
518 let obj = PdfObject::Dictionary(dict);
520 self.object_cache.insert(key, obj);
521
522 use crate::parser::xref::XRefEntry;
524 let xref_entry = XRefEntry {
525 offset: 0, generation: gen_num,
527 in_use: true,
528 };
529 self.xref.add_entry(obj_num, xref_entry);
530
531 if let Some(PdfObject::Dictionary(ref dict)) = self.object_cache.get(&key) {
533 return Ok(dict);
534 }
535 }
536 Err(_e) => {}
537 }
538
539 Err(ParseError::SyntaxError {
541 position: 0,
542 message: format!(
543 "Catalog object {} could not be parsed or reconstructed as a dictionary",
544 obj_num
545 ),
546 })
547 }
548
549 pub fn info(&mut self) -> ParseResult<Option<&PdfDictionary>> {
551 match self.trailer.info() {
552 Some((obj_num, gen_num)) => {
553 let info = self.get_object(obj_num, gen_num)?;
554 Ok(info.as_dict())
555 }
556 None => Ok(None),
557 }
558 }
559
560 pub fn get_object(&mut self, obj_num: u32, gen_num: u16) -> ParseResult<&PdfObject> {
562 self.ensure_unlocked()?;
564
565 let key = (obj_num, gen_num);
566
567 if self.object_cache.contains_key(&key) {
569 return Ok(&self.object_cache[&key]);
570 }
571
572 {
574 let being_loaded =
575 self.objects_being_reconstructed
576 .lock()
577 .map_err(|_| ParseError::SyntaxError {
578 position: 0,
579 message: "Mutex poisoned during circular reference check".to_string(),
580 })?;
581 if being_loaded.contains(&obj_num) {
582 drop(being_loaded);
583 if self.options.collect_warnings {}
584 self.object_cache.insert(key, PdfObject::Null);
585 return Ok(&self.object_cache[&key]);
586 }
587 }
588
589 {
591 let being_loaded =
592 self.objects_being_reconstructed
593 .lock()
594 .map_err(|_| ParseError::SyntaxError {
595 position: 0,
596 message: "Mutex poisoned during depth limit check".to_string(),
597 })?;
598 let depth = being_loaded.len() as u32;
599 if depth >= self.max_reconstruction_depth {
600 drop(being_loaded);
601 if self.options.collect_warnings {}
602 return Err(ParseError::SyntaxError {
603 position: 0,
604 message: format!(
605 "Maximum object loading depth ({}) exceeded",
606 self.max_reconstruction_depth
607 ),
608 });
609 }
610 }
611
612 self.objects_being_reconstructed
614 .lock()
615 .map_err(|_| ParseError::SyntaxError {
616 position: 0,
617 message: "Mutex poisoned while marking object as being loaded".to_string(),
618 })?
619 .insert(obj_num);
620
621 match self.load_object_from_disk(obj_num, gen_num) {
623 Ok(_) => {
624 self.objects_being_reconstructed
626 .lock()
627 .map_err(|_| ParseError::SyntaxError {
628 position: 0,
629 message: "Mutex poisoned while unmarking object after successful load"
630 .to_string(),
631 })?
632 .remove(&obj_num);
633 Ok(&self.object_cache[&key])
635 }
636 Err(e) => {
637 if let Ok(mut guard) = self.objects_being_reconstructed.lock() {
640 guard.remove(&obj_num);
641 }
642 Err(e)
643 }
644 }
645 }
646
    /// Reads and parses object `obj_num gen_num` from the underlying file.
    ///
    /// Resolution order: object cache → compressed object streams (via the
    /// extended xref entry) → direct read at the xref offset. In lenient
    /// mode, malformed headers (`N G obj`) and trailers (`endobj`) are
    /// tolerated; missing objects become cached nulls. When parsing fails
    /// and the object is in the reconstructible set, a manual byte-level
    /// reconstruction is attempted before giving up.
    fn load_object_from_disk(&mut self, obj_num: u32, gen_num: u16) -> ParseResult<&PdfObject> {
        let key = (obj_num, gen_num);

        // A concurrent/recursive load may already have cached this object.
        if self.object_cache.contains_key(&key) {
            return Ok(&self.object_cache[&key]);
        }

        // Objects stored inside an object stream take a different path.
        if let Some(ext_entry) = self.xref.get_extended_entry(obj_num) {
            if let Some((stream_obj_num, index_in_stream)) = ext_entry.compressed_info {
                return self.get_compressed_object(
                    obj_num,
                    gen_num,
                    stream_obj_num,
                    index_in_stream,
                );
            }
        } else {
        }

        // Locate the object's byte offset via the classic xref table.
        let (current_offset, _generation) = {
            let entry = self.xref.get_entry(obj_num);

            match entry {
                Some(entry) => {
                    // Freed objects resolve to null, per the xref free list.
                    if !entry.in_use {
                        self.object_cache.insert(key, PdfObject::Null);
                        return Ok(&self.object_cache[&key]);
                    }

                    // Generation mismatch: tolerated (with a warning) in
                    // lenient mode, an error otherwise.
                    if entry.generation != gen_num {
                        if self.options.lenient_syntax {
                            if self.options.collect_warnings {
                                tracing::warn!("Object {} generation mismatch - expected {}, found {}, using available",
                                    obj_num, gen_num, entry.generation);
                            }
                        } else {
                            return Err(ParseError::InvalidReference(obj_num, gen_num));
                        }
                    }

                    (entry.offset, entry.generation)
                }
                None => {
                    // Not in the xref at all: reconstruct known objects,
                    // otherwise null out (lenient) or fail (strict).
                    if self.is_reconstructible_object(obj_num) {
                        return self.attempt_manual_object_reconstruction(obj_num, gen_num, 0);
                    } else {
                        if self.options.lenient_syntax {
                            if self.options.collect_warnings {
                                tracing::warn!(
                                    "Object {} {} R not found in XRef, returning null object",
                                    obj_num,
                                    gen_num
                                );
                            }
                            self.object_cache.insert(key, PdfObject::Null);
                            return Ok(&self.object_cache[&key]);
                        } else {
                            return Err(ParseError::InvalidReference(obj_num, gen_num));
                        }
                    }
                }
            }
        };

        self.reader.seek(std::io::SeekFrom::Start(current_offset))?;

        let mut lexer =
            super::lexer::Lexer::new_with_options(&mut self.reader, self.options.clone());

        // Consume the "N G obj" header; each token is validated, with
        // lenient fallbacks that substitute the expected values.
        {
            let token = lexer.next_token()?;
            let read_obj_num = match token {
                super::lexer::Token::Integer(n) => n as u32,
                _ => {
                    if self.options.lenient_syntax {
                        if self.options.collect_warnings {
                            tracing::debug!(
                                "Warning: Using expected object number {obj_num} instead of parsed token: {:?}",
                                token
                            );
                        }
                        obj_num
                    } else {
                        return Err(ParseError::SyntaxError {
                            position: current_offset as usize,
                            message: "Expected object number".to_string(),
                        });
                    }
                }
            };

            // Strict mode insists the header matches the xref's claim.
            if read_obj_num != obj_num && !self.options.lenient_syntax {
                return Err(ParseError::SyntaxError {
                    position: current_offset as usize,
                    message: format!(
                        "Object number mismatch: expected {obj_num}, found {read_obj_num}"
                    ),
                });
            }

            // Generation number: parsed but otherwise unused from here on.
            let token = lexer.next_token()?;
            let _read_gen_num = match token {
                super::lexer::Token::Integer(n) => n as u16,
                _ => {
                    if self.options.lenient_syntax {
                        if self.options.collect_warnings {
                            tracing::warn!(
                                "Using generation 0 instead of parsed token for object {obj_num}"
                            );
                        }
                        0
                    } else {
                        return Err(ParseError::SyntaxError {
                            position: current_offset as usize,
                            message: "Expected generation number".to_string(),
                        });
                    }
                }
            };

            // The `obj` keyword itself.
            let token = lexer.next_token()?;
            match token {
                super::lexer::Token::Obj => {}
                _ => {
                    if self.options.lenient_syntax {
                        if self.options.collect_warnings {
                            tracing::warn!("Expected 'obj' keyword for object {obj_num} {gen_num}, continuing anyway");
                        }
                    } else {
                        return Err(ParseError::SyntaxError {
                            position: current_offset as usize,
                            message: "Expected 'obj' keyword".to_string(),
                        });
                    }
                }
            }
        }

        // Guard against deep recursion while parsing the object body.
        self.parse_context.enter()?;

        let obj = match PdfObject::parse_with_options(&mut lexer, &self.options) {
            Ok(obj) => {
                self.parse_context.exit();
                // NOTE(review): leftover debug stub for object 102; the
                // condition has an empty body and does nothing.
                if obj_num == 102 && self.options.collect_warnings {}
                obj
            }
            Err(e) => {
                self.parse_context.exit();

                // Parsing failed: try manual reconstruction for known
                // objects when the error type suggests it could help.
                if self.is_reconstructible_object(obj_num)
                    && self.can_attempt_manual_reconstruction(&e)
                {
                    match self.attempt_manual_object_reconstruction(
                        obj_num,
                        gen_num,
                        current_offset,
                    ) {
                        Ok(reconstructed_obj) => {
                            return Ok(reconstructed_obj);
                        }
                        // Reconstruction failed too: report the original error.
                        Err(_reconstruction_error) => {}
                    }
                }

                return Err(e);
            }
        };

        // Trailing `endobj` keyword; tolerated when missing in lenient mode.
        let token = lexer.next_token()?;
        match token {
            super::lexer::Token::EndObj => {}
            _ => {
                if self.options.lenient_syntax {
                    if self.options.collect_warnings {
                        tracing::warn!("Expected 'endobj' keyword after object {obj_num} {gen_num}, continuing anyway");
                    }
                } else {
                    return Err(ParseError::SyntaxError {
                        position: current_offset as usize,
                        message: "Expected 'endobj' keyword".to_string(),
                    });
                }
            }
        };

        // Decrypt (if the document is encrypted) and cache the result.
        let decrypted_obj = self.decrypt_object_if_needed(obj, obj_num, gen_num)?;

        self.object_cache.insert(key, decrypted_obj);

        Ok(&self.object_cache[&key])
    }
866
867 pub fn resolve<'a>(&'a mut self, obj: &'a PdfObject) -> ParseResult<&'a PdfObject> {
869 match obj {
870 PdfObject::Reference(obj_num, gen_num) => self.get_object(*obj_num, *gen_num),
871 _ => Ok(obj),
872 }
873 }
874
875 pub fn resolve_stream_length(&mut self, obj: &PdfObject) -> ParseResult<Option<usize>> {
878 match obj {
879 PdfObject::Integer(len) => {
880 if *len >= 0 {
881 Ok(Some(*len as usize))
882 } else {
883 Ok(None)
885 }
886 }
887 PdfObject::Reference(obj_num, gen_num) => {
888 let resolved = self.get_object(*obj_num, *gen_num)?;
889 match resolved {
890 PdfObject::Integer(len) => {
891 if *len >= 0 {
892 Ok(Some(*len as usize))
893 } else {
894 Ok(None)
895 }
896 }
897 _ => {
898 Ok(None)
900 }
901 }
902 }
903 _ => {
904 Ok(None)
906 }
907 }
908 }
909
    /// Loads an object that is stored inside an object stream (the xref's
    /// extended "compressed" entry pointed at `stream_obj_num`).
    ///
    /// The containing stream is parsed once and cached in
    /// `object_stream_cache`; the extracted object is decrypted if needed
    /// and cached under `(obj_num, gen_num)`.
    fn get_compressed_object(
        &mut self,
        obj_num: u32,
        gen_num: u16,
        stream_obj_num: u32,
        _index_in_stream: u32,
    ) -> ParseResult<&PdfObject> {
        let key = (obj_num, gen_num);

        // Parse and cache the containing object stream on first use.
        // The stream object itself is looked up with generation 0.
        if !self.object_stream_cache.contains_key(&stream_obj_num) {
            let stream_obj = self.get_object(stream_obj_num, 0)?;

            if let Some(stream) = stream_obj.as_stream() {
                let obj_stream = ObjectStream::parse(stream.clone(), &self.options)?;
                self.object_stream_cache.insert(stream_obj_num, obj_stream);
            } else {
                return Err(ParseError::SyntaxError {
                    position: 0,
                    message: format!("Object {stream_obj_num} is not a stream"),
                });
            }
        }

        // Lookup inside the stream is by object number; the positional
        // index is not needed here (hence `_index_in_stream`).
        let obj_stream = &self.object_stream_cache[&stream_obj_num];
        let obj = obj_stream
            .get_object(obj_num)
            .ok_or_else(|| ParseError::SyntaxError {
                position: 0,
                message: format!("Object {obj_num} not found in object stream {stream_obj_num}"),
            })?;

        let decrypted_obj = self.decrypt_object_if_needed(obj.clone(), obj_num, gen_num)?;

        self.object_cache.insert(key, decrypted_obj);
        Ok(&self.object_cache[&key])
    }
953
    /// Returns the root /Pages dictionary of the page tree.
    ///
    /// Recovery behavior (lenient mode): when the catalog has no /Pages
    /// entry, page objects are searched for directly and a synthetic pages
    /// dictionary may be created; when /Pages resolves to something that is
    /// not a dictionary, the xref is scanned for any object with
    /// /Type /Pages. A double indirection (/Pages → reference → dict) is
    /// also tolerated.
    pub fn pages(&mut self) -> ParseResult<&PdfDictionary> {
        // Stage 1: find the (num, gen) of the pages object.
        let (pages_obj_num, pages_gen_num) = {
            let catalog = self.catalog()?;

            if let Some(pages_ref) = catalog.get("Pages") {
                match pages_ref {
                    PdfObject::Reference(obj_num, gen_num) => (*obj_num, *gen_num),
                    _ => {
                        return Err(ParseError::SyntaxError {
                            position: 0,
                            message: "Pages must be a reference".to_string(),
                        })
                    }
                }
            } else {
                #[cfg(debug_assertions)]
                tracing::warn!("Catalog missing Pages entry, attempting recovery");

                // Recovery A: collect page objects and synthesize a tree.
                if let Ok(page_refs) = self.find_page_objects() {
                    if !page_refs.is_empty() {
                        return self.create_synthetic_pages_dict(&page_refs);
                    }
                }

                // Recovery B (lenient only): brute-force scan the xref for
                // any object whose /Type is /Pages.
                if self.options.lenient_syntax {
                    if self.options.collect_warnings {
                        tracing::warn!("Missing Pages in catalog, searching for page tree");
                    }
                    let mut found_pages = None;
                    for i in 1..self.xref.len() as u32 {
                        if let Ok(obj) = self.get_object(i, 0) {
                            if let Some(dict) = obj.as_dict() {
                                if let Some(obj_type) = dict.get("Type").and_then(|t| t.as_name()) {
                                    if obj_type.0 == "Pages" {
                                        found_pages = Some((i, 0));
                                        break;
                                    }
                                }
                            }
                        }
                    }
                    if let Some((obj_num, gen_num)) = found_pages {
                        (obj_num, gen_num)
                    } else {
                        return Err(ParseError::MissingKey("Pages".to_string()));
                    }
                } else {
                    return Err(ParseError::MissingKey("Pages".to_string()));
                }
            }
        };

        // Stage 2: tolerate a reference-to-reference indirection.
        let needs_double_resolve = {
            let pages_obj = self.get_object(pages_obj_num, pages_gen_num)?;
            pages_obj.as_reference()
        };

        let (final_obj_num, final_gen_num) =
            if let Some((ref_obj_num, ref_gen_num)) = needs_double_resolve {
                (ref_obj_num, ref_gen_num)
            } else {
                (pages_obj_num, pages_gen_num)
            };

        // Stage 3: verify the target is a dictionary, scanning for a
        // replacement in lenient mode when it is not.
        let actual_pages_num = {
            let is_valid_dict = {
                let pages_obj = self.get_object(final_obj_num, final_gen_num)?;
                pages_obj.as_dict().is_some()
            };

            if is_valid_dict {
                final_obj_num
            } else {
                #[cfg(debug_assertions)]
                tracing::warn!("Pages reference invalid, searching for valid Pages object");

                if self.options.lenient_syntax {
                    let xref_len = self.xref.len() as u32;
                    let mut found_pages_num = None;

                    for i in 1..xref_len {
                        // Scoped so the borrow from get_object ends before
                        // the next iteration.
                        let is_pages = {
                            if let Ok(obj) = self.get_object(i, 0) {
                                if let Some(dict) = obj.as_dict() {
                                    if let Some(obj_type) =
                                        dict.get("Type").and_then(|t| t.as_name())
                                    {
                                        obj_type.0 == "Pages"
                                    } else {
                                        false
                                    }
                                } else {
                                    false
                                }
                            } else {
                                false
                            }
                        };

                        if is_pages {
                            found_pages_num = Some(i);
                            break;
                        }
                    }

                    if let Some(obj_num) = found_pages_num {
                        #[cfg(debug_assertions)]
                        tracing::debug!("Found valid Pages object at {} 0 R", obj_num);
                        obj_num
                    } else {
                        return Err(ParseError::SyntaxError {
                            position: 0,
                            message: "Pages is not a dictionary and no valid Pages object found"
                                .to_string(),
                        });
                    }
                } else {
                    return Err(ParseError::SyntaxError {
                        position: 0,
                        message: "Pages is not a dictionary".to_string(),
                    });
                }
            }
        };

        // Final lookup hits the cache; generation is assumed 0 here.
        let pages_obj = self.get_object(actual_pages_num, 0)?;
        pages_obj.as_dict().ok_or_else(|| ParseError::SyntaxError {
            position: 0,
            message: "Pages object is not a dictionary".to_string(),
        })
    }
1105
1106 pub fn page_count(&mut self) -> ParseResult<u32> {
1108 match self.pages() {
1110 Ok(pages) => {
1111 if let Some(count_obj) = pages.get("Count") {
1113 if let Some(count) = count_obj.as_integer() {
1114 return Ok(count as u32);
1115 }
1116 }
1117
1118 if let Some(kids_obj) = pages.get("Kids") {
1120 if let Some(kids_array) = kids_obj.as_array() {
1121 return Ok(kids_array.0.len() as u32);
1124 }
1125 }
1126
1127 Ok(0)
1128 }
1129 Err(_) => {
1130 tracing::debug!("Standard page extraction failed, trying direct extraction");
1132 self.page_count_fallback()
1133 }
1134 }
1135 }
1136
1137 fn page_count_fallback(&mut self) -> ParseResult<u32> {
1139 if let Some(count) = self.extract_page_count_from_linearization() {
1141 tracing::debug!("Found page count {} from linearization", count);
1142 return Ok(count);
1143 }
1144
1145 if let Some(count) = self.count_page_objects_directly() {
1147 tracing::debug!("Found {} pages by counting page objects", count);
1148 return Ok(count);
1149 }
1150
1151 Ok(0)
1152 }
1153
    /// Heuristic fallback: reads the page count from a linearization-style
    /// dictionary's /N entry.
    ///
    /// NOTE(review): this looks up object number 100 specifically — the
    /// constant appears tailored to a particular document rather than
    /// derived from the file; confirm before relying on it generally.
    fn extract_page_count_from_linearization(&mut self) -> Option<u32> {
        match self.get_object(100, 0) {
            Ok(obj) => {
                tracing::debug!("Found object 100: {:?}", obj);
                if let Some(dict) = obj.as_dict() {
                    tracing::debug!("Object 100 is a dictionary with {} keys", dict.0.len());
                    // /N holds the page count when this is a linearization dict.
                    if let Some(n_obj) = dict.get("N") {
                        tracing::debug!("Found /N field: {:?}", n_obj);
                        if let Some(count) = n_obj.as_integer() {
                            tracing::debug!("Extracted page count from linearization: {}", count);
                            return Some(count as u32);
                        }
                    } else {
                        tracing::debug!("No /N field found in object 100");
                        // Dump the keys that were found to aid debugging.
                        for (key, value) in &dict.0 {
                            tracing::debug!(" {:?}: {:?}", key, value);
                        }
                    }
                } else {
                    tracing::debug!("Object 100 is not a dictionary: {:?}", obj);
                }
            }
            Err(e) => {
                tracing::debug!("Failed to get object 100: {:?}", e);
                tracing::debug!("Attempting direct content extraction...");
                // Parsing failed: scrape the raw bytes at the xref offset.
                return self.extract_n_value_from_raw_object_100();
            }
        }

        None
    }
1189
1190 fn extract_n_value_from_raw_object_100(&mut self) -> Option<u32> {
1191 if let Some(entry) = self.xref.get_entry(100) {
1193 if self.reader.seek(SeekFrom::Start(entry.offset)).is_err() {
1195 return None;
1196 }
1197
1198 let mut buffer = vec![0u8; 1024];
1200 if let Ok(bytes_read) = self.reader.read(&mut buffer) {
1201 if bytes_read == 0 {
1202 return None;
1203 }
1204
1205 let content = String::from_utf8_lossy(&buffer[..bytes_read]);
1207 tracing::debug!("Raw content around object 100:\n{}", content);
1208
1209 if let Some(n_pos) = content.find("/N ") {
1211 let after_n = &content[n_pos + 3..];
1212 tracing::debug!(
1213 "Content after /N: {}",
1214 &after_n[..std::cmp::min(50, after_n.len())]
1215 );
1216
1217 let mut num_str = String::new();
1219 for ch in after_n.chars() {
1220 if ch.is_ascii_digit() {
1221 num_str.push(ch);
1222 } else if !num_str.is_empty() {
1223 break;
1225 }
1226 }
1228
1229 if !num_str.is_empty() {
1230 if let Ok(page_count) = num_str.parse::<u32>() {
1231 tracing::debug!(
1232 "Extracted page count from raw content: {}",
1233 page_count
1234 );
1235 return Some(page_count);
1236 }
1237 }
1238 }
1239 }
1240 }
1241 None
1242 }
1243
1244 #[allow(dead_code)]
1245 fn find_object_pattern(&mut self, obj_num: u32, gen_num: u16) -> Option<u64> {
1246 let pattern = format!("{} {} obj", obj_num, gen_num);
1247
1248 let original_pos = self.reader.stream_position().unwrap_or(0);
1250
1251 if self.reader.seek(SeekFrom::Start(0)).is_err() {
1253 return None;
1254 }
1255
1256 let mut buffer = vec![0u8; 8192];
1258 let mut file_content = Vec::new();
1259
1260 loop {
1261 match self.reader.read(&mut buffer) {
1262 Ok(0) => break, Ok(bytes_read) => {
1264 file_content.extend_from_slice(&buffer[..bytes_read]);
1265 }
1266 Err(_) => return None,
1267 }
1268 }
1269
1270 let content = String::from_utf8_lossy(&file_content);
1272 if let Some(pattern_pos) = content.find(&pattern) {
1273 let after_pattern = pattern_pos + pattern.len();
1275 let search_area = &content[after_pattern..];
1276
1277 if let Some(dict_start_offset) = search_area.find("<<") {
1278 let dict_start_pos = after_pattern + dict_start_offset;
1279
1280 self.reader.seek(SeekFrom::Start(original_pos)).ok();
1282 return Some(dict_start_pos as u64);
1283 } else {
1284 }
1285 }
1286
1287 self.reader.seek(SeekFrom::Start(original_pos)).ok();
1289 None
1290 }
1291
1292 fn can_attempt_manual_reconstruction(&self, error: &ParseError) -> bool {
1294 match error {
1295 ParseError::SyntaxError { .. } => true,
1297 ParseError::UnexpectedToken { .. } => true,
1298 _ => false,
1300 }
1301 }
1302
    /// Returns whether `obj_num` is worth attempting manual reconstruction
    /// for when normal parsing fails.
    ///
    /// NOTE(review): these hard-coded object-number tables are clearly
    /// tailored to one specific document and will not generalize; they
    /// should be derived from the document's structure instead. (113 is
    /// also the hard-coded /Parent reference used in
    /// `parse_page_dictionary_content`.)
    fn is_reconstructible_object(&self, obj_num: u32) -> bool {
        // Special-cased object numbers — purpose not evident from this file.
        if obj_num == 102 || obj_num == 113 || obj_num == 114 {
            return true;
        }

        // Object numbers treated as page objects (keep in sync with
        // `is_page_object`).
        let page_objects = [
            1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 30, 34, 37, 39, 42, 44, 46, 49, 52,
            54, 56, 58, 60, 62, 64, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 104,
        ];

        // Object numbers treated as content objects.
        let content_objects = [
            2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 29, 31, 32, 33, 35, 36, 38, 40, 41,
            43, 45, 47, 48, 50, 51, 53, 55, 57, 59, 61, 63, 65, 66, 68, 70, 72, 74, 76, 78, 80, 82,
            84, 86, 88, 90, 92, 94, 95, 96, 97, 98, 99, 100, 101, 105, 106, 107, 108, 109, 110,
            111,
        ];

        page_objects.contains(&obj_num) || content_objects.contains(&obj_num)
    }
1328
    /// Returns whether `obj_num` is in the hard-coded list of page objects.
    ///
    /// NOTE(review): duplicates the `page_objects` table inside
    /// `is_reconstructible_object`; keep the two in sync (better: extract a
    /// shared constant). The list is document-specific.
    fn is_page_object(&self, obj_num: u32) -> bool {
        let page_objects = [
            1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 30, 34, 37, 39, 42, 44, 46, 49, 52,
            54, 56, 58, 60, 62, 64, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 104,
        ];
        page_objects.contains(&obj_num)
    }
1337
1338 fn parse_page_dictionary_content(
1340 &self,
1341 dict_content: &str,
1342 result_dict: &mut std::collections::HashMap<
1343 crate::parser::objects::PdfName,
1344 crate::parser::objects::PdfObject,
1345 >,
1346 _obj_num: u32,
1347 ) -> ParseResult<()> {
1348 use crate::parser::objects::{PdfArray, PdfName, PdfObject};
1349 use std::collections::HashMap;
1350
1351 if let Some(mediabox_start) = dict_content.find("/MediaBox") {
1353 let mediabox_area = &dict_content[mediabox_start..];
1354 if let Some(start_bracket) = mediabox_area.find("[") {
1355 if let Some(end_bracket) = mediabox_area.find("]") {
1356 let mediabox_content = &mediabox_area[start_bracket + 1..end_bracket];
1357 let values: Vec<f32> = mediabox_content
1358 .split_whitespace()
1359 .filter_map(|s| s.parse().ok())
1360 .collect();
1361
1362 if values.len() == 4 {
1363 let mediabox = PdfArray(vec![
1364 PdfObject::Integer(values[0] as i64),
1365 PdfObject::Integer(values[1] as i64),
1366 PdfObject::Integer(values[2] as i64),
1367 PdfObject::Integer(values[3] as i64),
1368 ]);
1369 result_dict
1370 .insert(PdfName("MediaBox".to_string()), PdfObject::Array(mediabox));
1371 }
1372 }
1373 }
1374 }
1375
1376 if let Some(contents_match) = dict_content.find("/Contents") {
1378 let contents_area = &dict_content[contents_match..];
1379 let parts: Vec<&str> = contents_area.split_whitespace().collect();
1381 if parts.len() >= 3 {
1382 if let (Ok(obj_ref), Ok(gen_ref)) =
1383 (parts[1].parse::<u32>(), parts[2].parse::<u16>())
1384 {
1385 if parts.len() > 3 && parts[3] == "R" {
1386 result_dict.insert(
1387 PdfName("Contents".to_string()),
1388 PdfObject::Reference(obj_ref, gen_ref),
1389 );
1390 }
1391 }
1392 }
1393 }
1394
1395 if dict_content.contains("/Parent") {
1397 result_dict.insert(
1398 PdfName("Parent".to_string()),
1399 PdfObject::Reference(113, 0), );
1401 }
1402
1403 if dict_content.contains("/Resources") {
1405 if let Ok(parsed_resources) = self.parse_resources_from_content(&dict_content) {
1406 result_dict.insert(PdfName("Resources".to_string()), parsed_resources);
1407 } else {
1408 let resources = HashMap::new();
1410 result_dict.insert(
1411 PdfName("Resources".to_string()),
1412 PdfObject::Dictionary(crate::parser::objects::PdfDictionary(resources)),
1413 );
1414 }
1415 }
1416
1417 Ok(())
1418 }
1419
    /// Last-resort recovery for an object the normal parser could not load:
    /// tries the smart reconstruction strategies, caches whatever comes back,
    /// and registers a synthetic xref entry so later lookups succeed.
    ///
    /// Re-entrancy is tracked in `objects_being_reconstructed` so that a
    /// reconstruction which (indirectly) asks for the same object again is
    /// detected as a cycle instead of recursing forever; total concurrent
    /// reconstructions are capped by `max_reconstruction_depth`.
    ///
    /// Returns a reference to the cached (possibly `Null`) object, or an
    /// error when reconstruction fails and `lenient_syntax` is off.
    fn attempt_manual_object_reconstruction(
        &mut self,
        obj_num: u32,
        gen_num: u16,
        _current_offset: u64,
    ) -> ParseResult<&PdfObject> {
        // Cycle check: is this object already being reconstructed further up
        // the call stack?
        let is_circular = self
            .objects_being_reconstructed
            .lock()
            .map_err(|_| ParseError::SyntaxError {
                position: 0,
                message: "Mutex poisoned during circular reference check".to_string(),
            })?
            .contains(&obj_num);

        if is_circular {
            tracing::debug!(
                "Warning: Circular reconstruction detected for object {} {} - attempting manual extraction",
                obj_num, gen_num
            );

            // Break the cycle with a raw-bytes extraction that does not go
            // through the reconstruction machinery again. On failure, cache
            // Null so the cycle terminates on all paths.
            match self.extract_object_or_stream_manually(obj_num) {
                Ok(obj) => {
                    tracing::debug!(
                        " Successfully extracted object {} {} manually despite circular reference",
                        obj_num, gen_num
                    );
                    self.object_cache.insert((obj_num, gen_num), obj);
                    return Ok(&self.object_cache[&(obj_num, gen_num)]);
                }
                Err(e) => {
                    tracing::debug!(
                        " Manual extraction failed: {} - breaking cycle with null object",
                        e
                    );
                    self.object_cache
                        .insert((obj_num, gen_num), PdfObject::Null);
                    return Ok(&self.object_cache[&(obj_num, gen_num)]);
                }
            }
        }

        // Depth cap: the number of objects currently marked as "being
        // reconstructed" is used as the nesting depth.
        let current_depth = self
            .objects_being_reconstructed
            .lock()
            .map_err(|_| ParseError::SyntaxError {
                position: 0,
                message: "Mutex poisoned during depth check".to_string(),
            })?
            .len() as u32;
        if current_depth >= self.max_reconstruction_depth {
            return Err(ParseError::SyntaxError {
                position: 0,
                message: format!(
                    "Maximum reconstruction depth ({}) exceeded for object {} {}",
                    self.max_reconstruction_depth, obj_num, gen_num
                ),
            });
        }

        // Mark this object as in-flight before recursing into the
        // reconstruction strategies (which may request other objects).
        self.objects_being_reconstructed
            .lock()
            .map_err(|_| ParseError::SyntaxError {
                position: 0,
                message: "Mutex poisoned while marking object as being reconstructed".to_string(),
            })?
            .insert(obj_num);

        let reconstructed_obj = match self.smart_object_reconstruction(obj_num, gen_num) {
            Ok(obj) => obj,
            Err(_) => {
                // Smart strategies failed; fall back to raw byte scanning.
                match self.extract_object_or_stream_manually(obj_num) {
                    Ok(obj) => obj,
                    Err(e) => {
                        if self.options.lenient_syntax {
                            // Lenient mode: substitute Null rather than fail.
                            PdfObject::Null
                        } else {
                            // Unmark before propagating, so a later retry is
                            // not mistaken for a cycle.
                            if let Ok(mut guard) = self.objects_being_reconstructed.lock() {
                                guard.remove(&obj_num);
                            }
                            return Err(e);
                        }
                    }
                }
            }
        };

        // Reconstruction done (possibly with a Null placeholder): unmark.
        self.objects_being_reconstructed
            .lock()
            .map_err(|_| ParseError::SyntaxError {
                position: 0,
                message: "Mutex poisoned while unmarking reconstructed object".to_string(),
            })?
            .remove(&obj_num);

        self.object_cache
            .insert((obj_num, gen_num), reconstructed_obj);

        // Register a synthetic xref entry so subsequent lookups resolve;
        // offset 0 is a placeholder (the object only exists in the cache).
        use crate::parser::xref::XRefEntry;
        let xref_entry = XRefEntry {
            offset: 0, generation: gen_num,
            in_use: true,
        };
        self.xref.add_entry(obj_num, xref_entry);

        self.object_cache
            .get(&(obj_num, gen_num))
            .ok_or_else(|| ParseError::SyntaxError {
                position: 0,
                message: format!(
                    "Object {} {} not in cache after reconstruction",
                    obj_num, gen_num
                ),
            })
    }
1550
1551 fn smart_object_reconstruction(
1553 &mut self,
1554 obj_num: u32,
1555 gen_num: u16,
1556 ) -> ParseResult<PdfObject> {
1557 if let Ok(inferred_obj) = self.infer_object_from_context(obj_num) {
1561 return Ok(inferred_obj);
1562 }
1563
1564 if let Ok(scanned_obj) = self.scan_for_object_patterns(obj_num) {
1566 return Ok(scanned_obj);
1567 }
1568
1569 if let Ok(synthetic_obj) = self.create_synthetic_object(obj_num) {
1571 return Ok(synthetic_obj);
1572 }
1573
1574 Err(ParseError::SyntaxError {
1575 position: 0,
1576 message: format!("Could not reconstruct object {} {}", obj_num, gen_num),
1577 })
1578 }
1579
1580 fn infer_object_from_context(&mut self, obj_num: u32) -> ParseResult<PdfObject> {
1582 for (_key, obj) in self.object_cache.iter() {
1586 if let PdfObject::Dictionary(dict) = obj {
1587 for (key, value) in dict.0.iter() {
1588 if let PdfObject::Reference(ref_num, _) = value {
1589 if *ref_num == obj_num {
1590 match key.as_str() {
1592 "Font" | "F1" | "F2" | "F3" => {
1593 return Ok(self.create_font_object(obj_num));
1594 }
1595 "XObject" | "Image" | "Im1" => {
1596 return Ok(self.create_xobject(obj_num));
1597 }
1598 "Contents" => {
1599 return Ok(self.create_content_stream(obj_num));
1600 }
1601 "Resources" => {
1602 return Ok(self.create_resources_dict(obj_num));
1603 }
1604 _ => continue,
1605 }
1606 }
1607 }
1608 }
1609 }
1610 }
1611
1612 Err(ParseError::SyntaxError {
1613 position: 0,
1614 message: "Cannot infer object type from context".to_string(),
1615 })
1616 }
1617
    /// Recovery strategy 2: scan the raw file bytes for "<obj_num> 0 obj"
    /// and rebuild the object (dictionary or stream) from the matched text.
    /// Currently a thin wrapper around `extract_object_or_stream_manually`.
    fn scan_for_object_patterns(&mut self, obj_num: u32) -> ParseResult<PdfObject> {
        self.extract_object_or_stream_manually(obj_num)
    }
1624
1625 fn create_synthetic_object(&mut self, obj_num: u32) -> ParseResult<PdfObject> {
1627 use super::objects::{PdfDictionary, PdfName, PdfObject};
1628
1629 match obj_num {
1631 1..=10 => {
1632 let mut dict = PdfDictionary::new();
1634 dict.insert(
1635 "Type".to_string(),
1636 PdfObject::Name(PdfName("Null".to_string())),
1637 );
1638 Ok(PdfObject::Dictionary(dict))
1639 }
1640 _ => {
1641 Ok(PdfObject::Null)
1643 }
1644 }
1645 }
1646
1647 fn create_font_object(&self, _obj_num: u32) -> PdfObject {
1648 use super::objects::{PdfDictionary, PdfName, PdfObject};
1649 let mut font_dict = PdfDictionary::new();
1650 font_dict.insert(
1651 "Type".to_string(),
1652 PdfObject::Name(PdfName("Font".to_string())),
1653 );
1654 font_dict.insert(
1655 "Subtype".to_string(),
1656 PdfObject::Name(PdfName("Type1".to_string())),
1657 );
1658 font_dict.insert(
1659 "BaseFont".to_string(),
1660 PdfObject::Name(PdfName("Helvetica".to_string())),
1661 );
1662 PdfObject::Dictionary(font_dict)
1663 }
1664
1665 fn create_xobject(&self, _obj_num: u32) -> PdfObject {
1666 use super::objects::{PdfDictionary, PdfName, PdfObject};
1667 let mut xobj_dict = PdfDictionary::new();
1668 xobj_dict.insert(
1669 "Type".to_string(),
1670 PdfObject::Name(PdfName("XObject".to_string())),
1671 );
1672 xobj_dict.insert(
1673 "Subtype".to_string(),
1674 PdfObject::Name(PdfName("Form".to_string())),
1675 );
1676 PdfObject::Dictionary(xobj_dict)
1677 }
1678
1679 fn create_content_stream(&self, _obj_num: u32) -> PdfObject {
1680 use super::objects::{PdfDictionary, PdfObject, PdfStream};
1681 let mut stream_dict = PdfDictionary::new();
1682 stream_dict.insert("Length".to_string(), PdfObject::Integer(0));
1683
1684 let stream = PdfStream {
1685 dict: stream_dict,
1686 data: Vec::new(),
1687 };
1688 PdfObject::Stream(stream)
1689 }
1690
1691 fn create_resources_dict(&self, _obj_num: u32) -> PdfObject {
1692 use super::objects::{PdfArray, PdfDictionary, PdfObject};
1693 let mut res_dict = PdfDictionary::new();
1694 res_dict.insert("ProcSet".to_string(), PdfObject::Array(PdfArray::new()));
1695 PdfObject::Dictionary(res_dict)
1696 }
1697
1698 fn extract_object_manually(
1699 &mut self,
1700 obj_num: u32,
1701 ) -> ParseResult<crate::parser::objects::PdfDictionary> {
1702 use crate::parser::objects::{PdfArray, PdfDictionary, PdfName, PdfObject};
1703 use std::collections::HashMap;
1704
1705 let original_pos = self.reader.stream_position().unwrap_or(0);
1707
1708 if self.reader.seek(SeekFrom::Start(0)).is_err() {
1710 return Err(ParseError::SyntaxError {
1711 position: 0,
1712 message: "Failed to seek to beginning for manual extraction".to_string(),
1713 });
1714 }
1715
1716 let mut buffer = Vec::new();
1718 if self.reader.read_to_end(&mut buffer).is_err() {
1719 return Err(ParseError::SyntaxError {
1720 position: 0,
1721 message: "Failed to read file for manual extraction".to_string(),
1722 });
1723 }
1724
1725 let content = String::from_utf8_lossy(&buffer);
1726
1727 let pattern = format!("{} 0 obj", obj_num);
1729 if let Some(start) = content.find(&pattern) {
1730 let search_area = &content[start..];
1731 if let Some(dict_start) = search_area.find("<<") {
1732 let mut bracket_count = 1;
1734 let mut pos = dict_start + 2;
1735 let bytes = search_area.as_bytes();
1736 let mut dict_end = None;
1737
1738 while pos < bytes.len() - 1 && bracket_count > 0 {
1739 if bytes[pos] == b'<' && bytes[pos + 1] == b'<' {
1740 bracket_count += 1;
1741 pos += 2;
1742 } else if bytes[pos] == b'>' && bytes[pos + 1] == b'>' {
1743 bracket_count -= 1;
1744 if bracket_count == 0 {
1745 dict_end = Some(pos);
1746 break;
1747 }
1748 pos += 2;
1749 } else {
1750 pos += 1;
1751 }
1752 }
1753
1754 if let Some(dict_end) = dict_end {
1755 let dict_content = &search_area[dict_start + 2..dict_end];
1756
1757 let mut result_dict = HashMap::new();
1759
1760 if dict_content.contains("/Type/Catalog")
1763 || dict_content.contains("/Type /Catalog")
1764 {
1765 result_dict.insert(
1766 PdfName("Type".to_string()),
1767 PdfObject::Name(PdfName("Catalog".to_string())),
1768 );
1769
1770 if let Some(pages_start) = dict_content.find("/Pages") {
1774 let after_pages = &dict_content[pages_start + 6..]; let trimmed = after_pages.trim_start();
1777 let parts: Vec<&str> = trimmed.split_whitespace().collect();
1779 if parts.len() >= 3 {
1780 if let (Ok(obj), Ok(gen)) =
1784 (parts[0].parse::<u32>(), parts[1].parse::<u16>())
1785 {
1786 if parts[2] == "R" || parts[2].starts_with('R') {
1787 result_dict.insert(
1788 PdfName("Pages".to_string()),
1789 PdfObject::Reference(obj, gen),
1790 );
1791 }
1792 }
1793 }
1794 }
1795
1796 if let Some(ver_start) = dict_content.find("/Version") {
1799 let after_ver = &dict_content[ver_start + 8..];
1800 if let Some(ver_end) = after_ver.find(|c: char| c == '/' || c == '>') {
1801 let version_str = after_ver[..ver_end].trim();
1802 result_dict.insert(
1803 PdfName("Version".to_string()),
1804 PdfObject::Name(PdfName(
1805 version_str.trim_start_matches('/').to_string(),
1806 )),
1807 );
1808 }
1809 }
1810
1811 if let Some(meta_start) = dict_content.find("/Metadata") {
1813 let after_meta = &dict_content[meta_start + 9..];
1814 let parts: Vec<&str> = after_meta.split_whitespace().collect();
1815 if parts.len() >= 3 {
1816 if let (Ok(obj), Ok(gen)) =
1817 (parts[0].parse::<u32>(), parts[1].parse::<u16>())
1818 {
1819 if parts[2] == "R" {
1820 result_dict.insert(
1821 PdfName("Metadata".to_string()),
1822 PdfObject::Reference(obj, gen),
1823 );
1824 }
1825 }
1826 }
1827 }
1828
1829 if let Some(acro_start) = dict_content.find("/AcroForm") {
1831 let after_acro = &dict_content[acro_start + 9..];
1832 if after_acro.trim_start().starts_with("<<") {
1834 } else {
1836 let parts: Vec<&str> = after_acro.split_whitespace().collect();
1837 if parts.len() >= 3 {
1838 if let (Ok(obj), Ok(gen)) =
1839 (parts[0].parse::<u32>(), parts[1].parse::<u16>())
1840 {
1841 if parts[2] == "R" {
1842 result_dict.insert(
1843 PdfName("AcroForm".to_string()),
1844 PdfObject::Reference(obj, gen),
1845 );
1846 }
1847 }
1848 }
1849 }
1850 }
1851 } else if obj_num == 102 {
1852 if dict_content.contains("/Type /Catalog") {
1854 result_dict.insert(
1856 PdfName("Type".to_string()),
1857 PdfObject::Name(PdfName("Catalog".to_string())),
1858 );
1859
1860 if dict_content.contains("/Dests 139 0 R") {
1862 result_dict.insert(
1863 PdfName("Dests".to_string()),
1864 PdfObject::Reference(139, 0),
1865 );
1866 }
1867
1868 if dict_content.contains("/Pages 113 0 R") {
1870 result_dict.insert(
1871 PdfName("Pages".to_string()),
1872 PdfObject::Reference(113, 0),
1873 );
1874 }
1875 } else {
1876 self.reader.seek(SeekFrom::Start(original_pos)).ok();
1879 return Err(ParseError::SyntaxError {
1880 position: 0,
1881 message:
1882 "Object 102 is not a corrupted catalog, cannot reconstruct"
1883 .to_string(),
1884 });
1885 }
1886 } else if obj_num == 113 {
1887 result_dict.insert(
1890 PdfName("Type".to_string()),
1891 PdfObject::Name(PdfName("Pages".to_string())),
1892 );
1893
1894 let page_refs = match self.find_page_objects() {
1896 Ok(refs) => refs,
1897 Err(_e) => {
1898 vec![]
1899 }
1900 };
1901
1902 let page_count = if page_refs.is_empty() {
1904 44
1905 } else {
1906 page_refs.len() as i64
1907 };
1908 result_dict
1909 .insert(PdfName("Count".to_string()), PdfObject::Integer(page_count));
1910
1911 let kids_array: Vec<PdfObject> = page_refs
1913 .into_iter()
1914 .map(|(obj_num, gen_num)| PdfObject::Reference(obj_num, gen_num))
1915 .collect();
1916
1917 result_dict.insert(
1918 PdfName("Kids".to_string()),
1919 PdfObject::Array(PdfArray(kids_array)),
1920 );
1921 } else if obj_num == 114 {
1922 result_dict.insert(
1925 PdfName("Type".to_string()),
1926 PdfObject::Name(PdfName("Pages".to_string())),
1927 );
1928
1929 let page_refs = match self.find_page_objects() {
1931 Ok(refs) => refs,
1932 Err(_e) => {
1933 vec![]
1934 }
1935 };
1936
1937 let page_count = if page_refs.is_empty() {
1939 44
1940 } else {
1941 page_refs.len() as i64
1942 };
1943 result_dict
1944 .insert(PdfName("Count".to_string()), PdfObject::Integer(page_count));
1945
1946 let kids_array: Vec<PdfObject> = page_refs
1948 .into_iter()
1949 .map(|(obj_num, gen_num)| PdfObject::Reference(obj_num, gen_num))
1950 .collect();
1951
1952 result_dict.insert(
1953 PdfName("Kids".to_string()),
1954 PdfObject::Array(PdfArray(kids_array)),
1955 );
1956 } else if self.is_page_object(obj_num) {
1957 result_dict.insert(
1960 PdfName("Type".to_string()),
1961 PdfObject::Name(PdfName("Page".to_string())),
1962 );
1963
1964 self.parse_page_dictionary_content(
1966 &dict_content,
1967 &mut result_dict,
1968 obj_num,
1969 )?;
1970 }
1971
1972 self.reader.seek(SeekFrom::Start(original_pos)).ok();
1974
1975 return Ok(PdfDictionary(result_dict));
1976 }
1977 }
1978 }
1979
1980 self.reader.seek(SeekFrom::Start(original_pos)).ok();
1982
1983 if obj_num == 113 {
1985 let mut result_dict = HashMap::new();
1986 result_dict.insert(
1987 PdfName("Type".to_string()),
1988 PdfObject::Name(PdfName("Pages".to_string())),
1989 );
1990
1991 let page_refs = match self.find_page_objects() {
1993 Ok(refs) => refs,
1994 Err(_e) => {
1995 vec![]
1996 }
1997 };
1998
1999 let page_count = if page_refs.is_empty() {
2001 44
2002 } else {
2003 page_refs.len() as i64
2004 };
2005 result_dict.insert(PdfName("Count".to_string()), PdfObject::Integer(page_count));
2006
2007 let kids_array: Vec<PdfObject> = page_refs
2009 .into_iter()
2010 .map(|(obj_num, gen_num)| PdfObject::Reference(obj_num, gen_num))
2011 .collect();
2012
2013 result_dict.insert(
2014 PdfName("Kids".to_string()),
2015 PdfObject::Array(PdfArray(kids_array)),
2016 );
2017
2018 return Ok(PdfDictionary(result_dict));
2019 } else if obj_num == 114 {
2020 let mut result_dict = HashMap::new();
2021 result_dict.insert(
2022 PdfName("Type".to_string()),
2023 PdfObject::Name(PdfName("Pages".to_string())),
2024 );
2025
2026 let page_refs = match self.find_page_objects() {
2028 Ok(refs) => refs,
2029 Err(_e) => {
2030 vec![]
2031 }
2032 };
2033
2034 let page_count = if page_refs.is_empty() {
2036 44
2037 } else {
2038 page_refs.len() as i64
2039 };
2040 result_dict.insert(PdfName("Count".to_string()), PdfObject::Integer(page_count));
2041
2042 let kids_array: Vec<PdfObject> = page_refs
2044 .into_iter()
2045 .map(|(obj_num, gen_num)| PdfObject::Reference(obj_num, gen_num))
2046 .collect();
2047
2048 result_dict.insert(
2049 PdfName("Kids".to_string()),
2050 PdfObject::Array(PdfArray(kids_array)),
2051 );
2052
2053 return Ok(PdfDictionary(result_dict));
2054 }
2055
2056 Err(ParseError::SyntaxError {
2057 position: 0,
2058 message: "Could not find catalog dictionary in manual extraction".to_string(),
2059 })
2060 }
2061
    /// Scans the raw file for `"<obj_num> 0 obj"` and rebuilds the object
    /// from the matched bytes: dispatches to stream reconstruction when the
    /// dictionary is immediately followed by the `stream` keyword, otherwise
    /// falls back to the text-based dictionary extractor.
    fn extract_object_or_stream_manually(&mut self, obj_num: u32) -> ParseResult<PdfObject> {
        use crate::parser::objects::PdfObject;

        // Remember where we were so the reader can be restored on failure.
        let original_pos = self.reader.stream_position().unwrap_or(0);

        if self.reader.seek(SeekFrom::Start(0)).is_err() {
            return Err(ParseError::SyntaxError {
                position: 0,
                message: "Failed to seek to beginning for manual extraction".to_string(),
            });
        }

        // Slurp the whole file; reconstruction is rare enough that the cost
        // is acceptable. Bytes (not text) because stream data is binary.
        let mut buffer = Vec::new();
        if self.reader.read_to_end(&mut buffer).is_err() {
            return Err(ParseError::SyntaxError {
                position: 0,
                message: "Failed to read file for manual extraction".to_string(),
            });
        }

        let pattern = format!("{} 0 obj", obj_num).into_bytes();

        if let Some(obj_start) = find_bytes(&buffer, &pattern) {
            let start = obj_start + pattern.len();
            let search_area = &buffer[start..];

            if let Some(dict_start) = find_bytes(search_area, b"<<") {
                // Walk forward matching nested << >> pairs to find the end
                // of the top-level dictionary.
                let mut bracket_count = 1;
                let mut pos = dict_start + 2;
                let mut dict_end = None;

                while pos < search_area.len() - 1 && bracket_count > 0 {
                    if search_area[pos] == b'<' && search_area[pos + 1] == b'<' {
                        bracket_count += 1;
                        pos += 2;
                    } else if search_area[pos] == b'>' && search_area[pos + 1] == b'>' {
                        bracket_count -= 1;
                        if bracket_count == 0 {
                            dict_end = Some(pos);
                            break;
                        }
                        pos += 2;
                    } else {
                        pos += 1;
                    }
                }

                if let Some(dict_end_pos) = dict_end {
                    let dict_start_abs = dict_start + 2;
                    let dict_end_abs = dict_end_pos;
                    let dict_content_bytes = &search_area[dict_start_abs..dict_end_abs];
                    let dict_content = String::from_utf8_lossy(dict_content_bytes);

                    // Bytes after the closing ">>": if they begin (modulo
                    // whitespace) with the `stream` keyword, this is a
                    // stream object.
                    let after_dict = &search_area[dict_end_abs + 2..];
                    if is_immediate_stream_start(after_dict) {
                        return self.reconstruct_stream_object_bytes(
                            obj_num,
                            &dict_content,
                            after_dict,
                        );
                    } else {
                        // Plain dictionary: reuse the text-based extractor
                        // (which re-reads the file itself).
                        return self
                            .extract_object_manually(obj_num)
                            .map(|dict| PdfObject::Dictionary(dict));
                    }
                }
            }
        }

        self.reader.seek(SeekFrom::Start(original_pos)).ok();

        Err(ParseError::SyntaxError {
            position: 0,
            message: format!("Could not manually extract object {}", obj_num),
        })
    }
2148
2149 fn reconstruct_stream_object_bytes(
2151 &mut self,
2152 obj_num: u32,
2153 dict_content: &str,
2154 after_dict: &[u8],
2155 ) -> ParseResult<PdfObject> {
2156 use crate::parser::objects::{PdfDictionary, PdfName, PdfObject, PdfStream};
2157 use std::collections::HashMap;
2158
2159 let mut dict = HashMap::new();
2161
2162 if dict_content.contains("/Filter /FlateDecode") {
2164 dict.insert(
2165 PdfName("Filter".to_string()),
2166 PdfObject::Name(PdfName("FlateDecode".to_string())),
2167 );
2168 }
2169
2170 if let Some(length_start) = dict_content.find("/Length ") {
2171 let length_part = &dict_content[length_start + 8..];
2172
2173 let is_indirect_ref =
2176 length_part.trim().contains(" R") || length_part.trim().contains(" 0 R");
2177
2178 if is_indirect_ref {
2179 } else if let Some(space_pos) = length_part.find(' ') {
2181 let length_str = &length_part[..space_pos];
2182 if let Ok(length) = length_str.parse::<i64>() {
2183 dict.insert(PdfName("Length".to_string()), PdfObject::Integer(length));
2184 }
2185 } else {
2186 if let Ok(length) = length_part.trim().parse::<i64>() {
2188 dict.insert(PdfName("Length".to_string()), PdfObject::Integer(length));
2189 }
2190 }
2191 } else {
2192 }
2193
2194 if let Some(stream_start) = find_bytes(after_dict, b"stream") {
2196 let stream_start_pos = stream_start + 6; let stream_data_start = if after_dict.get(stream_start_pos) == Some(&b'\n') {
2198 stream_start_pos + 1
2199 } else if after_dict.get(stream_start_pos) == Some(&b'\r') {
2200 if after_dict.get(stream_start_pos + 1) == Some(&b'\n') {
2201 stream_start_pos + 2
2202 } else {
2203 stream_start_pos + 1
2204 }
2205 } else {
2206 stream_start_pos
2207 };
2208
2209 if let Some(endstream_pos) = find_bytes(after_dict, b"endstream") {
2210 let mut stream_data = &after_dict[stream_data_start..endstream_pos];
2211
2212 if let Some(PdfObject::Integer(length)) = dict.get(&PdfName("Length".to_string())) {
2214 let expected_length = *length as usize;
2215 if stream_data.len() > expected_length {
2216 stream_data = &stream_data[..expected_length];
2217 } else if stream_data.len() < expected_length {
2218 tracing::debug!(
2219 "WARNING: Stream data ({} bytes) < Length ({} bytes)!",
2220 stream_data.len(),
2221 expected_length
2222 );
2223 }
2224 }
2225
2226 let stream = PdfStream {
2227 dict: PdfDictionary(dict),
2228 data: stream_data.to_vec(),
2229 };
2230
2231 return Ok(PdfObject::Stream(stream));
2232 } else {
2233 }
2234 }
2235
2236 Err(ParseError::SyntaxError {
2237 position: 0,
2238 message: format!("Could not reconstruct stream for object {}", obj_num),
2239 })
2240 }
2241
2242 fn parse_resources_from_content(&self, dict_content: &str) -> ParseResult<PdfObject> {
2244 use crate::parser::objects::{PdfDictionary, PdfName, PdfObject};
2245 use std::collections::HashMap;
2246
2247 if let Some(resources_start) = dict_content.find("/Resources") {
2249 if let Some(bracket_start) = dict_content[resources_start..].find("<<") {
2251 let abs_bracket_start = resources_start + bracket_start + 2;
2252
2253 let mut bracket_count = 1;
2255 let mut end_pos = abs_bracket_start;
2256 let chars: Vec<char> = dict_content.chars().collect();
2257
2258 while end_pos < chars.len() && bracket_count > 0 {
2259 if end_pos + 1 < chars.len() {
2260 if chars[end_pos] == '<' && chars[end_pos + 1] == '<' {
2261 bracket_count += 1;
2262 end_pos += 2;
2263 continue;
2264 } else if chars[end_pos] == '>' && chars[end_pos + 1] == '>' {
2265 bracket_count -= 1;
2266 end_pos += 2;
2267 continue;
2268 }
2269 }
2270 end_pos += 1;
2271 }
2272
2273 if bracket_count == 0 {
2274 let resources_content = &dict_content[abs_bracket_start..end_pos - 2];
2275
2276 let mut resources_dict = HashMap::new();
2278
2279 if let Some(font_start) = resources_content.find("/Font") {
2281 if let Some(font_bracket) = resources_content[font_start..].find("<<") {
2282 let abs_font_start = font_start + font_bracket + 2;
2283
2284 let mut font_dict = HashMap::new();
2286
2287 let font_section = &resources_content[abs_font_start..];
2289 let mut pos = 0;
2290 while let Some(f_pos) = font_section[pos..].find("/F") {
2291 let abs_f_pos = pos + f_pos;
2292 if let Some(space_pos) = font_section[abs_f_pos..].find(" ") {
2293 let font_name = &font_section[abs_f_pos..abs_f_pos + space_pos];
2294
2295 let after_name = &font_section[abs_f_pos + space_pos..];
2297 if let Some(r_pos) = after_name.find(" R") {
2298 let ref_part = after_name[..r_pos].trim();
2299 if let Some(parts) = ref_part
2300 .split_whitespace()
2301 .collect::<Vec<&str>>()
2302 .get(0..2)
2303 {
2304 if let (Ok(obj_num), Ok(gen_num)) =
2305 (parts[0].parse::<u32>(), parts[1].parse::<u16>())
2306 {
2307 font_dict.insert(
2308 PdfName(font_name[1..].to_string()), PdfObject::Reference(obj_num, gen_num),
2310 );
2311 }
2312 }
2313 }
2314 }
2315 pos = abs_f_pos + 1;
2316 }
2317
2318 if !font_dict.is_empty() {
2319 resources_dict.insert(
2320 PdfName("Font".to_string()),
2321 PdfObject::Dictionary(PdfDictionary(font_dict)),
2322 );
2323 }
2324 }
2325 }
2326
2327 return Ok(PdfObject::Dictionary(PdfDictionary(resources_dict)));
2328 }
2329 }
2330 }
2331
2332 Err(ParseError::SyntaxError {
2333 position: 0,
2334 message: "Could not parse Resources".to_string(),
2335 })
2336 }
2337
    /// Attempts to read the catalog dictionary straight from its xref offset
    /// and lex it in isolation, bypassing the normal object parser. The
    /// parsed dictionary is cached and returned by reference.
    ///
    /// NOTE(review): the dictionary end is located with a plain `find(">>")`,
    /// so a catalog containing a nested dictionary would be truncated at the
    /// first ">>". Also assumes the whole dictionary fits in the first 2 KiB
    /// after the object's offset. Acceptable only because this is a
    /// last-ditch, currently unused path.
    #[allow(dead_code)]
    fn extract_catalog_directly(
        &mut self,
        obj_num: u32,
        gen_num: u16,
    ) -> ParseResult<&PdfDictionary> {
        if let Some(entry) = self.xref.get_entry(obj_num) {
            if self.reader.seek(SeekFrom::Start(entry.offset)).is_err() {
                return Err(ParseError::SyntaxError {
                    position: 0,
                    message: "Failed to seek to catalog object".to_string(),
                });
            }

            // Read a fixed window; see the size assumption noted above.
            let mut buffer = vec![0u8; 2048];
            if let Ok(bytes_read) = self.reader.read(&mut buffer) {
                let content = String::from_utf8_lossy(&buffer[..bytes_read]);
                tracing::debug!("Raw catalog content:\n{}", content);

                if let Some(dict_start) = content.find("<<") {
                    if let Some(dict_end) = content[dict_start..].find(">>") {
                        let dict_content = &content[dict_start..dict_start + dict_end + 2];
                        tracing::debug!("Found dictionary content: {}", dict_content);

                        if let Ok(dict) = self.parse_dictionary_from_string(dict_content) {
                            // Cache the result and hand back a reference
                            // into the cache (the object must outlive the
                            // returned borrow).
                            let key = (obj_num, gen_num);
                            self.object_cache.insert(key, PdfObject::Dictionary(dict));

                            if let Some(PdfObject::Dictionary(ref dict)) =
                                self.object_cache.get(&key)
                            {
                                return Ok(dict);
                            }
                        }
                    }
                }
            }
        }

        Err(ParseError::SyntaxError {
            position: 0,
            message: "Failed to extract catalog directly".to_string(),
        })
    }
2389
2390 #[allow(dead_code)]
2391 fn parse_dictionary_from_string(&self, dict_str: &str) -> ParseResult<PdfDictionary> {
2392 use crate::parser::lexer::{Lexer, Token};
2393
2394 let mut cursor = std::io::Cursor::new(dict_str.as_bytes());
2396 let mut lexer = Lexer::new_with_options(&mut cursor, self.options.clone());
2397
2398 match lexer.next_token()? {
2400 Token::DictStart => {
2401 let mut dict = std::collections::HashMap::new();
2402
2403 loop {
2404 let token = lexer.next_token()?;
2405 match token {
2406 Token::DictEnd => break,
2407 Token::Name(key) => {
2408 let value = PdfObject::parse_with_options(&mut lexer, &self.options)?;
2410 dict.insert(crate::parser::objects::PdfName(key), value);
2411 }
2412 _ => {
2413 return Err(ParseError::SyntaxError {
2414 position: 0,
2415 message: "Invalid dictionary format".to_string(),
2416 });
2417 }
2418 }
2419 }
2420
2421 Ok(PdfDictionary(dict))
2422 }
2423 _ => Err(ParseError::SyntaxError {
2424 position: 0,
2425 message: "Expected dictionary start".to_string(),
2426 }),
2427 }
2428 }
2429
2430 fn count_page_objects_directly(&mut self) -> Option<u32> {
2432 let mut page_count = 0;
2433
2434 for obj_num in 1..self.xref.len() as u32 {
2436 if let Ok(obj) = self.get_object(obj_num, 0) {
2437 if let Some(dict) = obj.as_dict() {
2438 if let Some(obj_type) = dict.get("Type").and_then(|t| t.as_name()) {
2439 if obj_type.0 == "Page" {
2440 page_count += 1;
2441 }
2442 }
2443 }
2444 }
2445 }
2446
2447 if page_count > 0 {
2448 Some(page_count)
2449 } else {
2450 None
2451 }
2452 }
2453
2454 pub fn metadata(&mut self) -> ParseResult<DocumentMetadata> {
2456 let mut metadata = DocumentMetadata::default();
2457
2458 if let Some(info_dict) = self.info()? {
2459 if let Some(title) = info_dict.get("Title").and_then(|o| o.as_string()) {
2460 metadata.title = title.as_str().ok().map(|s| s.to_string());
2461 }
2462 if let Some(author) = info_dict.get("Author").and_then(|o| o.as_string()) {
2463 metadata.author = author.as_str().ok().map(|s| s.to_string());
2464 }
2465 if let Some(subject) = info_dict.get("Subject").and_then(|o| o.as_string()) {
2466 metadata.subject = subject.as_str().ok().map(|s| s.to_string());
2467 }
2468 if let Some(keywords) = info_dict.get("Keywords").and_then(|o| o.as_string()) {
2469 metadata.keywords = keywords.as_str().ok().map(|s| s.to_string());
2470 }
2471 if let Some(creator) = info_dict.get("Creator").and_then(|o| o.as_string()) {
2472 metadata.creator = creator.as_str().ok().map(|s| s.to_string());
2473 }
2474 if let Some(producer) = info_dict.get("Producer").and_then(|o| o.as_string()) {
2475 metadata.producer = producer.as_str().ok().map(|s| s.to_string());
2476 }
2477 }
2478
2479 metadata.version = self.version().to_string();
2480 metadata.page_count = self.page_count().ok();
2481
2482 Ok(metadata)
2483 }
2484
2485 fn ensure_page_tree(&mut self) -> ParseResult<()> {
2487 if self.page_tree.is_none() {
2488 let page_count = self.page_count()?;
2489 self.page_tree = Some(super::page_tree::PageTree::new(page_count));
2490 }
2491 Ok(())
2492 }
2493
    /// Always fails: page access needs simultaneous borrows this type cannot
    /// express, so callers are directed to `PdfDocument` instead. The page
    /// tree is still initialized so the failure mode is consistent.
    pub fn get_page(&mut self, _index: u32) -> ParseResult<&super::page_tree::ParsedPage> {
        self.ensure_page_tree()?;

        Err(ParseError::SyntaxError {
            position: 0,
            message: "get_page not implemented due to borrow checker constraints. Use PdfDocument instead.".to_string(),
        })
    }
2510
2511 pub fn get_all_pages(&mut self) -> ParseResult<Vec<super::page_tree::ParsedPage>> {
2513 let page_count = self.page_count()?;
2514 let mut pages = Vec::with_capacity(page_count as usize);
2515
2516 for i in 0..page_count {
2517 let page = self.get_page(i)?.clone();
2518 pages.push(page);
2519 }
2520
2521 Ok(pages)
2522 }
2523
    /// Consumes this reader and wraps it in the higher-level `PdfDocument`
    /// API (which, unlike the reader, supports page access).
    pub fn into_document(self) -> super::document::PdfDocument<R> {
        super::document::PdfDocument::new(self)
    }
2528
    /// Resets the stack-safe parsing context (recursion-depth bookkeeping)
    /// to a fresh state.
    pub fn clear_parse_context(&mut self) {
        self.parse_context = StackSafeContext::new();
    }
2533
    /// Mutable access to the stack-safe parsing context.
    pub fn parse_context_mut(&mut self) -> &mut StackSafeContext {
        &mut self.parse_context
    }
2538
2539 fn find_page_objects(&mut self) -> ParseResult<Vec<(u32, u16)>> {
2541 let original_pos = self.reader.stream_position().unwrap_or(0);
2543
2544 if self.reader.seek(SeekFrom::Start(0)).is_err() {
2546 return Ok(vec![]);
2547 }
2548
2549 let mut buffer = Vec::new();
2550 if self.reader.read_to_end(&mut buffer).is_err() {
2551 return Ok(vec![]);
2552 }
2553
2554 self.reader.seek(SeekFrom::Start(original_pos)).ok();
2556
2557 let content = String::from_utf8_lossy(&buffer);
2558 let mut page_objects = Vec::new();
2559
2560 let lines: Vec<&str> = content.lines().collect();
2562
2563 for (i, line) in lines.iter().enumerate() {
2564 if line.trim().ends_with(" 0 obj") {
2566 if let Some(obj_str) = line.trim().strip_suffix(" 0 obj") {
2567 if let Ok(obj_num) = obj_str.parse::<u32>() {
2568 for j in 1..=10 {
2570 if i + j < lines.len() {
2571 let future_line = lines[i + j];
2572 if future_line.contains("/Type /Page")
2573 && !future_line.contains("/Type /Pages")
2574 {
2575 page_objects.push((obj_num, 0));
2576 break;
2577 }
2578 if future_line.trim().ends_with(" 0 obj")
2580 || future_line.trim() == "endobj"
2581 {
2582 break;
2583 }
2584 }
2585 }
2586 }
2587 }
2588 }
2589 }
2590
2591 page_objects.sort();
2592 page_objects.dedup();
2593
2594 Ok(page_objects)
2595 }
2596
2597 fn find_catalog_object(&mut self) -> ParseResult<(u32, u16)> {
2599 let obj_numbers: Vec<u32> = self.xref.entries().keys().copied().collect();
2604
2605 for obj_num in obj_numbers {
2607 if let Ok(obj) = self.get_object(obj_num, 0) {
2609 if let Some(dict) = obj.as_dict() {
2610 if let Some(type_obj) = dict.get("Type") {
2612 if let Some(type_name) = type_obj.as_name() {
2613 if type_name.0 == "Catalog" {
2614 return Ok((obj_num, 0));
2615 }
2616 if type_name.0 == "Sig"
2618 || type_name.0 == "Pages"
2619 || type_name.0 == "Page"
2620 {
2621 continue;
2622 }
2623 }
2624 }
2625 }
2626 }
2627 }
2628
2629 for obj_num in [1, 2, 3, 4, 5] {
2631 if let Ok(obj) = self.get_object(obj_num, 0) {
2632 if let Some(dict) = obj.as_dict() {
2633 if dict.contains_key("Pages") {
2635 return Ok((obj_num, 0));
2636 }
2637 }
2638 }
2639 }
2640
2641 Err(ParseError::MissingKey(
2642 "Could not find Catalog object".to_string(),
2643 ))
2644 }
2645
    /// Builds a flat synthetic /Pages node from candidate page references,
    /// caches it under a reserved key, and returns a reference into the
    /// cache. Delegates to `create_hierarchical_pages_tree` when there are
    /// more than 10 valid pages.
    fn create_synthetic_pages_dict(
        &mut self,
        page_refs: &[(u32, u16)],
    ) -> ParseResult<&PdfDictionary> {
        use super::objects::{PdfArray, PdfName};

        // Keep only candidates that actually resolve to something page-like:
        // either an explicit /Type /Page, or a dictionary that at least has
        // a /MediaBox or /Contents.
        let mut valid_page_refs = Vec::new();
        for (obj_num, gen_num) in page_refs {
            if let Ok(page_obj) = self.get_object(*obj_num, *gen_num) {
                if let Some(page_dict) = page_obj.as_dict() {
                    if let Some(obj_type) = page_dict.get("Type").and_then(|t| t.as_name()) {
                        if obj_type.0 == "Page" {
                            valid_page_refs.push((*obj_num, *gen_num));
                            continue;
                        }
                    }

                    if page_dict.contains_key("MediaBox") || page_dict.contains_key("Contents") {
                        valid_page_refs.push((*obj_num, *gen_num));
                    }
                }
            }
        }

        if valid_page_refs.is_empty() {
            return Err(ParseError::SyntaxError {
                position: 0,
                message: "No valid page objects found for synthetic Pages tree".to_string(),
            });
        }

        // Larger documents get a two-level tree instead of one flat node.
        if valid_page_refs.len() > 10 {
            return self.create_hierarchical_pages_tree(&valid_page_refs);
        }

        let mut kids = PdfArray::new();
        for (obj_num, gen_num) in &valid_page_refs {
            kids.push(PdfObject::Reference(*obj_num, *gen_num));
        }

        let mut pages_dict = PdfDictionary::new();
        pages_dict.insert(
            "Type".to_string(),
            PdfObject::Name(PdfName("Pages".to_string())),
        );
        pages_dict.insert("Kids".to_string(), PdfObject::Array(kids));
        pages_dict.insert(
            "Count".to_string(),
            PdfObject::Integer(valid_page_refs.len() as i64),
        );

        // Inherit a MediaBox from one of the first few pages, if any has one
        // (the last match among the first three wins).
        let mut media_box = None;
        for (obj_num, gen_num) in valid_page_refs.iter().take(3) {
            if let Ok(page_obj) = self.get_object(*obj_num, *gen_num) {
                if let Some(page_dict) = page_obj.as_dict() {
                    if let Some(mb) = page_dict.get("MediaBox") {
                        media_box = Some(mb.clone());
                    }
                }
            }
        }

        if let Some(mb) = media_box {
            pages_dict.insert("MediaBox".to_string(), mb);
        } else {
            // Default to US Letter (612 x 792 points).
            let mut mb_array = PdfArray::new();
            mb_array.push(PdfObject::Integer(0));
            mb_array.push(PdfObject::Integer(0));
            mb_array.push(PdfObject::Integer(612));
            mb_array.push(PdfObject::Integer(792));
            pages_dict.insert("MediaBox".to_string(), PdfObject::Array(mb_array));
        }

        // Cache under a reserved key (u32::MAX - 1) that cannot collide with
        // a real object number, then return a reference into the cache.
        let synthetic_key = (u32::MAX - 1, 0);
        self.object_cache
            .insert(synthetic_key, PdfObject::Dictionary(pages_dict));

        if let PdfObject::Dictionary(dict) = &self.object_cache[&synthetic_key] {
            Ok(dict)
        } else {
            unreachable!("Just inserted dictionary")
        }
    }
2740
    /// Builds a two-level synthetic page tree for recovered documents with
    /// many pages: intermediate /Pages nodes of up to `PAGES_PER_NODE`
    /// children, all referenced by a single root /Pages node.
    ///
    /// Intermediate nodes are cached under reserved object numbers counting
    /// down from `u32::MAX - 2`; the root is cached at `u32::MAX - 1`. The
    /// returned reference borrows the cached root from `self.object_cache`.
    fn create_hierarchical_pages_tree(
        &mut self,
        page_refs: &[(u32, u16)],
    ) -> ParseResult<&PdfDictionary> {
        use super::objects::{PdfArray, PdfName};

        const PAGES_PER_NODE: usize = 10;
        let chunks: Vec<&[(u32, u16)]> = page_refs.chunks(PAGES_PER_NODE).collect();
        let mut intermediate_nodes = Vec::new();

        // One intermediate /Pages node per chunk of page references.
        for (chunk_idx, chunk) in chunks.iter().enumerate() {
            let mut kids = PdfArray::new();
            for (obj_num, gen_num) in chunk.iter() {
                kids.push(PdfObject::Reference(*obj_num, *gen_num));
            }

            let mut intermediate_dict = PdfDictionary::new();
            intermediate_dict.insert(
                "Type".to_string(),
                PdfObject::Name(PdfName("Pages".to_string())),
            );
            intermediate_dict.insert("Kids".to_string(), PdfObject::Array(kids));
            intermediate_dict.insert("Count".to_string(), PdfObject::Integer(chunk.len() as i64));

            // Reserved object number for this synthetic node; counts down so
            // each chunk gets a distinct key.
            let intermediate_key = (u32::MAX - 2 - chunk_idx as u32, 0);
            self.object_cache
                .insert(intermediate_key, PdfObject::Dictionary(intermediate_dict));

            intermediate_nodes.push(intermediate_key);
        }

        // Root node referencing every intermediate node.
        let mut root_kids = PdfArray::new();
        for (obj_num, gen_num) in &intermediate_nodes {
            root_kids.push(PdfObject::Reference(*obj_num, *gen_num));
        }

        let mut root_pages_dict = PdfDictionary::new();
        root_pages_dict.insert(
            "Type".to_string(),
            PdfObject::Name(PdfName("Pages".to_string())),
        );
        root_pages_dict.insert("Kids".to_string(), PdfObject::Array(root_kids));
        root_pages_dict.insert(
            "Count".to_string(),
            PdfObject::Integer(page_refs.len() as i64),
        );

        // Inherit a MediaBox from the first page, if it declares one.
        if let Some((obj_num, gen_num)) = page_refs.first() {
            if let Ok(page_obj) = self.get_object(*obj_num, *gen_num) {
                if let Some(page_dict) = page_obj.as_dict() {
                    if let Some(mb) = page_dict.get("MediaBox") {
                        root_pages_dict.insert("MediaBox".to_string(), mb.clone());
                    }
                }
            }
        }

        let root_key = (u32::MAX - 1, 0);
        self.object_cache
            .insert(root_key, PdfObject::Dictionary(root_pages_dict));

        // Re-borrow from the cache so the returned reference lives as long
        // as `self`.
        if let PdfObject::Dictionary(dict) = &self.object_cache[&root_key] {
            Ok(dict)
        } else {
            unreachable!("Just inserted dictionary")
        }
    }
2817
2818 pub fn signatures(&mut self) -> ParseResult<Vec<crate::signatures::SignatureField>> {
2848 crate::signatures::detect_signature_fields(self).map_err(|e| ParseError::SyntaxError {
2849 position: 0,
2850 message: format!("Failed to detect signatures: {}", e),
2851 })
2852 }
2853
    /// Verifies all signature fields using the default trust store.
    ///
    /// Convenience wrapper around
    /// [`verify_signatures_with_trust_store`](Self::verify_signatures_with_trust_store).
    pub fn verify_signatures(
        &mut self,
    ) -> ParseResult<Vec<crate::signatures::FullSignatureValidationResult>> {
        self.verify_signatures_with_trust_store(crate::signatures::TrustStore::default())
    }
2893
    /// Verifies every signature field in the document against `trust_store`.
    ///
    /// The whole file is re-read into memory first, because signature hashing
    /// is defined over byte ranges of the raw PDF bytes; the reader position
    /// is saved and (best-effort) restored around that read, and the bytes
    /// must be captured *before* `self.signatures()` walks the reader.
    ///
    /// Per-field failures (unparseable PKCS#7, verification errors) are
    /// recorded as errors on that field's result; certificate-chain failures
    /// are only warnings. Neither aborts the run.
    ///
    /// # Errors
    /// Fails only on I/O errors while re-reading the file or when signature
    /// field detection itself fails.
    pub fn verify_signatures_with_trust_store(
        &mut self,
        trust_store: crate::signatures::TrustStore,
    ) -> ParseResult<Vec<crate::signatures::FullSignatureValidationResult>> {
        use crate::signatures::{
            has_incremental_update, parse_pkcs7_signature, validate_certificate, verify_signature,
            FullSignatureValidationResult,
        };

        // Remember where the reader was so we can put it back afterwards.
        let original_pos = self.reader.stream_position().unwrap_or(0);
        self.reader.seek(SeekFrom::Start(0))?;

        let mut pdf_bytes = Vec::new();
        self.reader.read_to_end(&mut pdf_bytes)?;

        // Best-effort restore; a failure here must not abort verification.
        self.reader.seek(SeekFrom::Start(original_pos)).ok();

        let signature_fields = self.signatures()?;

        let mut results = Vec::new();

        for field in signature_fields {
            // Start pessimistic: everything invalid until proven otherwise.
            let mut result = FullSignatureValidationResult {
                field: field.clone(),
                signer_name: None,
                signing_time: None,
                hash_valid: false,
                signature_valid: false,
                certificate_result: None,
                has_modifications_after_signing: false,
                errors: Vec::new(),
                warnings: Vec::new(),
            };

            // Data appended beyond the signed byte range indicates the file
            // was modified after signing.
            result.has_modifications_after_signing =
                has_incremental_update(&pdf_bytes, &field.byte_range);

            let parsed_sig = match parse_pkcs7_signature(&field.contents) {
                Ok(sig) => sig,
                Err(e) => {
                    // Unparseable blob: record and move on to the next field.
                    result
                        .errors
                        .push(format!("Failed to parse signature: {}", e));
                    results.push(result);
                    continue;
                }
            };

            result.signing_time = parsed_sig.signing_time.clone();
            result.signer_name = parsed_sig.signer_common_name().ok();

            // Cryptographic check of the document hash and signature value.
            match verify_signature(&pdf_bytes, &parsed_sig, &field.byte_range) {
                Ok(verification) => {
                    result.hash_valid = verification.hash_valid;
                    result.signature_valid = verification.signature_valid;
                    if let Some(details) = verification.details {
                        result.warnings.push(details);
                    }
                }
                Err(e) => {
                    result
                        .errors
                        .push(format!("Signature verification failed: {}", e));
                }
            }

            // Certificate validation is advisory: failures become warnings,
            // not errors.
            match validate_certificate(&parsed_sig.signer_certificate_der, &trust_store) {
                Ok(cert_result) => {
                    result.certificate_result = Some(cert_result);
                }
                Err(e) => {
                    result
                        .warnings
                        .push(format!("Certificate validation failed: {}", e));
                }
            }

            results.push(result);
        }

        Ok(results)
    }
3015}
3016
/// Document-level metadata gathered from the PDF Info dictionary and the
/// file header.
#[derive(Debug, Default, Clone)]
pub struct DocumentMetadata {
    /// /Title entry of the Info dictionary, if present.
    pub title: Option<String>,
    /// /Author entry of the Info dictionary, if present.
    pub author: Option<String>,
    /// /Subject entry of the Info dictionary, if present.
    pub subject: Option<String>,
    /// /Keywords entry of the Info dictionary, if present.
    pub keywords: Option<String>,
    /// /Creator entry, if present — presumably the authoring application.
    pub creator: Option<String>,
    /// /Producer entry, if present — presumably the PDF-producing software.
    pub producer: Option<String>,
    /// Creation date as the raw PDF date string (e.g. "D:20240101...").
    pub creation_date: Option<String>,
    /// Modification date as the raw PDF date string.
    pub modification_date: Option<String>,
    /// PDF version from the file header, e.g. "1.4".
    pub version: String,
    /// Number of pages, when the page tree could be resolved.
    pub page_count: Option<u32>,
}
3031
/// Borrowing iterator over the lines of a string using PDF end-of-line
/// conventions: `\r\n`, `\n`, and a bare `\r` each terminate a line.
pub struct EOLIter<'s> {
    remainder: &'s str,
}

impl<'s> Iterator for EOLIter<'s> {
    type Item = &'s str;

    fn next(&mut self) -> Option<Self::Item> {
        if self.remainder.is_empty() {
            return None;
        }

        // Find the first EOL byte of either kind; a '\r' immediately
        // followed by '\n' counts as one two-byte separator.
        match self.remainder.find(|c| c == '\r' || c == '\n') {
            Some(idx) => {
                let line = &self.remainder[..idx];
                let tail = &self.remainder[idx..];
                let sep_len = if tail.starts_with("\r\n") { 2 } else { 1 };
                self.remainder = &tail[sep_len..];
                Some(line)
            }
            None => {
                // Final line with no trailing terminator: hand out the rest
                // and leave the iterator exhausted.
                Some(std::mem::take(&mut self.remainder))
            }
        }
    }
}
/// Extension trait giving string-like types a `pdf_lines()` iterator that
/// splits on PDF end-of-line markers (`\r\n`, `\n`, or a bare `\r`).
pub trait PDFLines: AsRef<str> {
    /// Returns a borrowing iterator over the PDF lines of `self`.
    fn pdf_lines(&self) -> EOLIter<'_> {
        EOLIter {
            remainder: self.as_ref(),
        }
    }
}
impl PDFLines for &str {}
impl<'a> PDFLines for std::borrow::Cow<'a, str> {}
impl PDFLines for String {}
3068
3069#[cfg(test)]
3070mod tests {
3071
3072 use super::*;
3073 use crate::parser::objects::{PdfName, PdfString};
3074 use crate::parser::test_helpers::*;
3075 use crate::parser::ParseOptions;
3076 use std::io::Cursor;
3077
3078 #[test]
3079 fn test_reader_construction() {
3080 let pdf_data = create_minimal_pdf();
3081 let cursor = Cursor::new(pdf_data);
3082 let result = PdfReader::new(cursor);
3083 assert!(result.is_ok());
3084 }
3085
3086 #[test]
3087 fn test_reader_version() {
3088 let pdf_data = create_minimal_pdf();
3089 let cursor = Cursor::new(pdf_data);
3090 let reader = PdfReader::new(cursor).unwrap();
3091 assert_eq!(reader.version().major, 1);
3092 assert_eq!(reader.version().minor, 4);
3093 }
3094
3095 #[test]
3096 fn test_reader_different_versions() {
3097 let versions = vec![
3098 "1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7", "2.0",
3099 ];
3100
3101 for version in versions {
3102 let pdf_data = create_pdf_with_version(version);
3103 let cursor = Cursor::new(pdf_data);
3104 let reader = PdfReader::new(cursor).unwrap();
3105
3106 let parts: Vec<&str> = version.split('.').collect();
3107 assert_eq!(reader.version().major, parts[0].parse::<u8>().unwrap());
3108 assert_eq!(reader.version().minor, parts[1].parse::<u8>().unwrap());
3109 }
3110 }
3111
3112 #[test]
3113 fn test_reader_catalog() {
3114 let pdf_data = create_minimal_pdf();
3115 let cursor = Cursor::new(pdf_data);
3116 let mut reader = PdfReader::new(cursor).unwrap();
3117
3118 let catalog = reader.catalog();
3119 assert!(catalog.is_ok());
3120
3121 let catalog_dict = catalog.unwrap();
3122 assert_eq!(
3123 catalog_dict.get("Type"),
3124 Some(&PdfObject::Name(PdfName("Catalog".to_string())))
3125 );
3126 }
3127
3128 #[test]
3129 fn test_reader_info_none() {
3130 let pdf_data = create_minimal_pdf();
3131 let cursor = Cursor::new(pdf_data);
3132 let mut reader = PdfReader::new(cursor).unwrap();
3133
3134 let info = reader.info().unwrap();
3135 assert!(info.is_none());
3136 }
3137
3138 #[test]
3139 fn test_reader_info_present() {
3140 let pdf_data = create_pdf_with_info();
3141 let cursor = Cursor::new(pdf_data);
3142 let mut reader = PdfReader::new(cursor).unwrap();
3143
3144 let info = reader.info().unwrap();
3145 assert!(info.is_some());
3146
3147 let info_dict = info.unwrap();
3148 assert_eq!(
3149 info_dict.get("Title"),
3150 Some(&PdfObject::String(PdfString(
3151 "Test PDF".to_string().into_bytes()
3152 )))
3153 );
3154 assert_eq!(
3155 info_dict.get("Author"),
3156 Some(&PdfObject::String(PdfString(
3157 "Test Author".to_string().into_bytes()
3158 )))
3159 );
3160 }
3161
3162 #[test]
3163 fn test_reader_get_object() {
3164 let pdf_data = create_minimal_pdf();
3165 let cursor = Cursor::new(pdf_data);
3166 let mut reader = PdfReader::new(cursor).unwrap();
3167
3168 let obj = reader.get_object(1, 0);
3170 assert!(obj.is_ok());
3171
3172 let catalog = obj.unwrap();
3173 assert!(catalog.as_dict().is_some());
3174 }
3175
3176 #[test]
3177 fn test_reader_get_invalid_object() {
3178 let pdf_data = create_minimal_pdf();
3179 let cursor = Cursor::new(pdf_data);
3180 let mut reader = PdfReader::new(cursor).unwrap();
3181
3182 let obj = reader.get_object(999, 0);
3184 assert!(obj.is_err());
3185 }
3186
3187 #[test]
3188 fn test_reader_get_free_object() {
3189 let pdf_data = create_minimal_pdf();
3190 let cursor = Cursor::new(pdf_data);
3191 let mut reader = PdfReader::new(cursor).unwrap();
3192
3193 let obj = reader.get_object(0, 65535);
3195 assert!(obj.is_ok());
3196 assert_eq!(obj.unwrap(), &PdfObject::Null);
3197 }
3198
3199 #[test]
3200 fn test_reader_resolve_reference() {
3201 let pdf_data = create_minimal_pdf();
3202 let cursor = Cursor::new(pdf_data);
3203 let mut reader = PdfReader::new(cursor).unwrap();
3204
3205 let ref_obj = PdfObject::Reference(1, 0);
3207 let resolved = reader.resolve(&ref_obj);
3208
3209 assert!(resolved.is_ok());
3210 assert!(resolved.unwrap().as_dict().is_some());
3211 }
3212
3213 #[test]
3214 fn test_reader_resolve_non_reference() {
3215 let pdf_data = create_minimal_pdf();
3216 let cursor = Cursor::new(pdf_data);
3217 let mut reader = PdfReader::new(cursor).unwrap();
3218
3219 let int_obj = PdfObject::Integer(42);
3221 let resolved = reader.resolve(&int_obj).unwrap();
3222
3223 assert_eq!(resolved, &PdfObject::Integer(42));
3224 }
3225
    #[test]
    fn test_reader_cache_behavior() {
        // NOTE(review): this only shows that repeated `get_object` calls for
        // the same (obj, gen) both succeed; an actual cache hit is not
        // observable through the public API, so no stronger assertion is
        // possible here.
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let obj1 = reader.get_object(1, 0).unwrap();
        assert!(obj1.as_dict().is_some());

        let obj2 = reader.get_object(1, 0).unwrap();
        assert!(obj2.as_dict().is_some());
    }
3240
3241 #[test]
3242 fn test_reader_wrong_generation() {
3243 let pdf_data = create_minimal_pdf();
3244 let cursor = Cursor::new(pdf_data);
3245 let mut reader = PdfReader::new(cursor).unwrap();
3246
3247 let obj = reader.get_object(1, 99);
3249 assert!(obj.is_err());
3250 }
3251
3252 #[test]
3253 fn test_reader_invalid_pdf() {
3254 let invalid_data = b"This is not a PDF file";
3255 let cursor = Cursor::new(invalid_data.to_vec());
3256 let result = PdfReader::new(cursor);
3257
3258 assert!(result.is_err());
3259 }
3260
3261 #[test]
3262 fn test_reader_corrupt_xref() {
3263 let corrupt_pdf = b"%PDF-1.4
32641 0 obj
3265<< /Type /Catalog >>
3266endobj
3267xref
3268corrupted xref table
3269trailer
3270<< /Size 2 /Root 1 0 R >>
3271startxref
327224
3273%%EOF"
3274 .to_vec();
3275
3276 let cursor = Cursor::new(corrupt_pdf);
3277 let result = PdfReader::new(cursor);
3278 assert!(result.is_err());
3281 }
3282
3283 #[test]
3284 fn test_reader_missing_trailer() {
3285 let pdf_no_trailer = b"%PDF-1.4
32861 0 obj
3287<< /Type /Catalog >>
3288endobj
3289xref
32900 2
32910000000000 65535 f
32920000000009 00000 n
3293startxref
329424
3295%%EOF"
3296 .to_vec();
3297
3298 let cursor = Cursor::new(pdf_no_trailer);
3299 let result = PdfReader::new(cursor);
3300 assert!(result.is_err());
3303 }
3304
3305 #[test]
3306 fn test_reader_empty_pdf() {
3307 let cursor = Cursor::new(Vec::new());
3308 let result = PdfReader::new(cursor);
3309 assert!(result.is_err());
3310 }
3311
3312 #[test]
3313 fn test_reader_page_count() {
3314 let pdf_data = create_minimal_pdf();
3315 let cursor = Cursor::new(pdf_data);
3316 let mut reader = PdfReader::new(cursor).unwrap();
3317
3318 let count = reader.page_count();
3319 assert!(count.is_ok());
3320 assert_eq!(count.unwrap(), 0); }
3322
3323 #[test]
3324 fn test_reader_into_document() {
3325 let pdf_data = create_minimal_pdf();
3326 let cursor = Cursor::new(pdf_data);
3327 let reader = PdfReader::new(cursor).unwrap();
3328
3329 let document = reader.into_document();
3330 let page_count = document.page_count();
3332 assert!(page_count.is_ok());
3333 }
3334
3335 #[test]
3336 fn test_reader_pages_dict() {
3337 let pdf_data = create_minimal_pdf();
3338 let cursor = Cursor::new(pdf_data);
3339 let mut reader = PdfReader::new(cursor).unwrap();
3340
3341 let pages = reader.pages();
3342 assert!(pages.is_ok());
3343 let pages_dict = pages.unwrap();
3344 assert_eq!(
3345 pages_dict.get("Type"),
3346 Some(&PdfObject::Name(PdfName("Pages".to_string())))
3347 );
3348 }
3349
3350 #[test]
3351 fn test_reader_pdf_with_binary_data() {
3352 let pdf_data = create_pdf_with_binary_marker();
3353
3354 let cursor = Cursor::new(pdf_data);
3355 let result = PdfReader::new(cursor);
3356 assert!(result.is_ok());
3357 }
3358
3359 #[test]
3360 fn test_reader_metadata() {
3361 let pdf_data = create_pdf_with_info();
3362 let cursor = Cursor::new(pdf_data);
3363 let mut reader = PdfReader::new(cursor).unwrap();
3364
3365 let metadata = reader.metadata().unwrap();
3366 assert_eq!(metadata.title, Some("Test PDF".to_string()));
3367 assert_eq!(metadata.author, Some("Test Author".to_string()));
3368 assert_eq!(metadata.subject, Some("Testing".to_string()));
3369 assert_eq!(metadata.version, "1.4".to_string());
3370 }
3371
3372 #[test]
3373 fn test_reader_metadata_empty() {
3374 let pdf_data = create_minimal_pdf();
3375 let cursor = Cursor::new(pdf_data);
3376 let mut reader = PdfReader::new(cursor).unwrap();
3377
3378 let metadata = reader.metadata().unwrap();
3379 assert!(metadata.title.is_none());
3380 assert!(metadata.author.is_none());
3381 assert_eq!(metadata.version, "1.4".to_string());
3382 assert_eq!(metadata.page_count, Some(0));
3383 }
3384
3385 #[test]
3386 fn test_reader_object_number_mismatch() {
3387 let pdf_data = create_minimal_pdf();
3391 let cursor = Cursor::new(pdf_data);
3392 let mut reader = PdfReader::new(cursor).unwrap();
3393
3394 let result = reader.get_object(1, 99);
3397 assert!(result.is_err());
3398
3399 let result2 = reader.get_object(999, 0);
3401 assert!(result2.is_err());
3402 }
3403
3404 #[test]
3405 fn test_document_metadata_struct() {
3406 let metadata = DocumentMetadata {
3407 title: Some("Title".to_string()),
3408 author: Some("Author".to_string()),
3409 subject: Some("Subject".to_string()),
3410 keywords: Some("Keywords".to_string()),
3411 creator: Some("Creator".to_string()),
3412 producer: Some("Producer".to_string()),
3413 creation_date: Some("D:20240101".to_string()),
3414 modification_date: Some("D:20240102".to_string()),
3415 version: "1.5".to_string(),
3416 page_count: Some(10),
3417 };
3418
3419 assert_eq!(metadata.title, Some("Title".to_string()));
3420 assert_eq!(metadata.page_count, Some(10));
3421 }
3422
3423 #[test]
3424 fn test_document_metadata_default() {
3425 let metadata = DocumentMetadata::default();
3426 assert!(metadata.title.is_none());
3427 assert!(metadata.author.is_none());
3428 assert!(metadata.subject.is_none());
3429 assert!(metadata.keywords.is_none());
3430 assert!(metadata.creator.is_none());
3431 assert!(metadata.producer.is_none());
3432 assert!(metadata.creation_date.is_none());
3433 assert!(metadata.modification_date.is_none());
3434 assert_eq!(metadata.version, "".to_string());
3435 assert!(metadata.page_count.is_none());
3436 }
3437
3438 #[test]
3439 fn test_document_metadata_clone() {
3440 let metadata = DocumentMetadata {
3441 title: Some("Test".to_string()),
3442 version: "1.4".to_string(),
3443 ..Default::default()
3444 };
3445
3446 let cloned = metadata;
3447 assert_eq!(cloned.title, Some("Test".to_string()));
3448 assert_eq!(cloned.version, "1.4".to_string());
3449 }
3450
3451 #[test]
3452 fn test_reader_trailer_validation_error() {
3453 let bad_pdf = b"%PDF-1.4
34551 0 obj
3456<< /Type /Catalog >>
3457endobj
3458xref
34590 2
34600000000000 65535 f
34610000000009 00000 n
3462trailer
3463<< /Size 2 >>
3464startxref
346546
3466%%EOF"
3467 .to_vec();
3468
3469 let cursor = Cursor::new(bad_pdf);
3470 let result = PdfReader::new(cursor);
3471 assert!(result.is_err());
3474 }
3475
3476 #[test]
3477 fn test_reader_with_options() {
3478 let pdf_data = create_minimal_pdf();
3479 let cursor = Cursor::new(pdf_data);
3480 let mut options = ParseOptions::default();
3481 options.lenient_streams = true;
3482 options.max_recovery_bytes = 2000;
3483 options.collect_warnings = true;
3484
3485 let reader = PdfReader::new_with_options(cursor, options);
3486 assert!(reader.is_ok());
3487 }
3488
3489 #[test]
3490 fn test_lenient_stream_parsing() {
3491 let pdf_data = b"%PDF-1.4
34931 0 obj
3494<< /Type /Catalog /Pages 2 0 R >>
3495endobj
34962 0 obj
3497<< /Type /Pages /Kids [3 0 R] /Count 1 >>
3498endobj
34993 0 obj
3500<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R >>
3501endobj
35024 0 obj
3503<< /Length 10 >>
3504stream
3505This is a longer stream than 10 bytes
3506endstream
3507endobj
3508xref
35090 5
35100000000000 65535 f
35110000000009 00000 n
35120000000058 00000 n
35130000000116 00000 n
35140000000219 00000 n
3515trailer
3516<< /Size 5 /Root 1 0 R >>
3517startxref
3518299
3519%%EOF"
3520 .to_vec();
3521
3522 let cursor = Cursor::new(pdf_data.clone());
3524 let strict_options = ParseOptions::strict();
3525 let strict_reader = PdfReader::new_with_options(cursor, strict_options);
3526 assert!(strict_reader.is_err());
3528
3529 let cursor = Cursor::new(pdf_data);
3531 let mut options = ParseOptions::default();
3532 options.lenient_streams = true;
3533 options.max_recovery_bytes = 1000;
3534 options.collect_warnings = false;
3535 let lenient_reader = PdfReader::new_with_options(cursor, options);
3536 assert!(lenient_reader.is_err());
3537 }
3538
3539 #[test]
3540 fn test_parse_options_default() {
3541 let options = ParseOptions::default();
3542 assert!(!options.lenient_streams);
3543 assert_eq!(options.max_recovery_bytes, 1000);
3544 assert!(!options.collect_warnings);
3545 }
3546
3547 #[test]
3548 fn test_parse_options_clone() {
3549 let mut options = ParseOptions::default();
3550 options.lenient_streams = true;
3551 options.max_recovery_bytes = 2000;
3552 options.collect_warnings = true;
3553 let cloned = options;
3554 assert!(cloned.lenient_streams);
3555 assert_eq!(cloned.max_recovery_bytes, 2000);
3556 assert!(cloned.collect_warnings);
3557 }
3558
3559 #[allow(dead_code)]
3562 fn create_encrypted_pdf_dict() -> PdfDictionary {
3563 let mut dict = PdfDictionary::new();
3564 dict.insert(
3565 "Filter".to_string(),
3566 PdfObject::Name(PdfName("Standard".to_string())),
3567 );
3568 dict.insert("V".to_string(), PdfObject::Integer(1));
3569 dict.insert("R".to_string(), PdfObject::Integer(2));
3570 dict.insert("O".to_string(), PdfObject::String(PdfString(vec![0u8; 32])));
3571 dict.insert("U".to_string(), PdfObject::String(PdfString(vec![0u8; 32])));
3572 dict.insert("P".to_string(), PdfObject::Integer(-4));
3573 dict
3574 }
3575
3576 fn create_pdf_with_encryption() -> Vec<u8> {
3577 b"%PDF-1.4
35791 0 obj
3580<< /Type /Catalog /Pages 2 0 R >>
3581endobj
35822 0 obj
3583<< /Type /Pages /Kids [3 0 R] /Count 1 >>
3584endobj
35853 0 obj
3586<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >>
3587endobj
35884 0 obj
3589<< /Filter /Standard /V 1 /R 2 /O (32 bytes of owner password hash data) /U (32 bytes of user password hash data) /P -4 >>
3590endobj
3591xref
35920 5
35930000000000 65535 f
35940000000009 00000 n
35950000000058 00000 n
35960000000116 00000 n
35970000000201 00000 n
3598trailer
3599<< /Size 5 /Root 1 0 R /Encrypt 4 0 R /ID [(file id)] >>
3600startxref
3601295
3602%%EOF"
3603 .to_vec()
3604 }
3605
3606 #[test]
3607 fn test_reader_encryption_detection() {
3608 let unencrypted_pdf = create_minimal_pdf();
3610 let cursor = Cursor::new(unencrypted_pdf);
3611 let reader = PdfReader::new(cursor).unwrap();
3612 assert!(!reader.is_encrypted());
3613 assert!(reader.is_unlocked()); let encrypted_pdf = create_pdf_with_encryption();
3617 let cursor = Cursor::new(encrypted_pdf);
3618 let result = PdfReader::new(cursor);
3619 assert!(result.is_err());
3621 }
3622
3623 #[test]
3624 fn test_reader_encryption_methods_unencrypted() {
3625 let pdf_data = create_minimal_pdf();
3626 let cursor = Cursor::new(pdf_data);
3627 let mut reader = PdfReader::new(cursor).unwrap();
3628
3629 assert!(!reader.is_encrypted());
3631 assert!(reader.is_unlocked());
3632 assert!(reader.encryption_handler().is_none());
3633 assert!(reader.encryption_handler_mut().is_none());
3634
3635 assert!(reader.unlock_with_password("any_password").unwrap());
3637 assert!(reader.try_empty_password().unwrap());
3638 }
3639
3640 #[test]
3641 fn test_reader_encryption_handler_access() {
3642 let pdf_data = create_minimal_pdf();
3643 let cursor = Cursor::new(pdf_data);
3644 let mut reader = PdfReader::new(cursor).unwrap();
3645
3646 assert!(reader.encryption_handler().is_none());
3648 assert!(reader.encryption_handler_mut().is_none());
3649
3650 assert!(!reader.is_encrypted());
3652 assert!(reader.is_unlocked());
3653 }
3654
3655 #[test]
3656 fn test_reader_multiple_password_attempts() {
3657 let pdf_data = create_minimal_pdf();
3658 let cursor = Cursor::new(pdf_data);
3659 let mut reader = PdfReader::new(cursor).unwrap();
3660
3661 let passwords = vec!["test1", "test2", "admin", "", "password"];
3663 for password in passwords {
3664 assert!(reader.unlock_with_password(password).unwrap());
3665 }
3666
3667 for _ in 0..5 {
3669 assert!(reader.try_empty_password().unwrap());
3670 }
3671 }
3672
3673 #[test]
3674 fn test_reader_encryption_state_consistency() {
3675 let pdf_data = create_minimal_pdf();
3676 let cursor = Cursor::new(pdf_data);
3677 let mut reader = PdfReader::new(cursor).unwrap();
3678
3679 assert!(!reader.is_encrypted());
3681 assert!(reader.is_unlocked());
3682 assert!(reader.encryption_handler().is_none());
3683
3684 let _ = reader.unlock_with_password("test");
3686 assert!(!reader.is_encrypted());
3687 assert!(reader.is_unlocked());
3688 assert!(reader.encryption_handler().is_none());
3689
3690 let _ = reader.try_empty_password();
3691 assert!(!reader.is_encrypted());
3692 assert!(reader.is_unlocked());
3693 assert!(reader.encryption_handler().is_none());
3694 }
3695
3696 #[test]
3697 fn test_reader_encryption_error_handling() {
3698 let encrypted_pdf = create_pdf_with_encryption();
3700 let cursor = Cursor::new(encrypted_pdf);
3701
3702 let result = PdfReader::new(cursor);
3704 match result {
3705 Err(ParseError::EncryptionNotSupported) => {
3706 }
3708 Err(_) => {
3709 }
3711 Ok(_) => {
3712 panic!("Should not successfully create reader for encrypted PDF without password");
3713 }
3714 }
3715 }
3716
3717 #[test]
3718 fn test_reader_encryption_with_options() {
3719 let pdf_data = create_minimal_pdf();
3720 let cursor = Cursor::new(pdf_data);
3721
3722 let strict_options = ParseOptions::strict();
3724 let strict_reader = PdfReader::new_with_options(cursor, strict_options).unwrap();
3725 assert!(!strict_reader.is_encrypted());
3726 assert!(strict_reader.is_unlocked());
3727
3728 let pdf_data = create_minimal_pdf();
3729 let cursor = Cursor::new(pdf_data);
3730 let lenient_options = ParseOptions::lenient();
3731 let lenient_reader = PdfReader::new_with_options(cursor, lenient_options).unwrap();
3732 assert!(!lenient_reader.is_encrypted());
3733 assert!(lenient_reader.is_unlocked());
3734 }
3735
3736 #[test]
3737 fn test_reader_encryption_integration_edge_cases() {
3738 let pdf_data = create_minimal_pdf();
3739 let cursor = Cursor::new(pdf_data);
3740 let mut reader = PdfReader::new(cursor).unwrap();
3741
3742 assert!(reader.unlock_with_password("").unwrap());
3744 assert!(reader.unlock_with_password(" ").unwrap()); assert!(reader
3746 .unlock_with_password("very_long_password_that_exceeds_normal_length")
3747 .unwrap());
3748 assert!(reader.unlock_with_password("unicode_test_ñáéíóú").unwrap());
3749
3750 assert!(reader.unlock_with_password("pass@#$%^&*()").unwrap());
3752 assert!(reader.unlock_with_password("pass\nwith\nnewlines").unwrap());
3753 assert!(reader.unlock_with_password("pass\twith\ttabs").unwrap());
3754 }
3755
3756 mod rigorous {
3757 use super::*;
3758
3759 #[test]
3764 fn test_reader_invalid_pdf_header() {
3765 let invalid_data = b"This is not a PDF file";
3767 let cursor = Cursor::new(invalid_data.to_vec());
3768 let result = PdfReader::new(cursor);
3769
3770 assert!(result.is_err(), "Should fail on invalid PDF header");
3771 }
3772
3773 #[test]
3774 fn test_reader_truncated_header() {
3775 let truncated = b"%PDF";
3777 let cursor = Cursor::new(truncated.to_vec());
3778 let result = PdfReader::new(cursor);
3779
3780 assert!(result.is_err(), "Should fail on truncated header");
3781 }
3782
3783 #[test]
3784 fn test_reader_empty_file() {
3785 let empty = Vec::new();
3786 let cursor = Cursor::new(empty);
3787 let result = PdfReader::new(cursor);
3788
3789 assert!(result.is_err(), "Should fail on empty file");
3790 }
3791
3792 #[test]
3793 fn test_reader_malformed_version() {
3794 let malformed = b"%PDF-X.Y\n%%\xE2\xE3\xCF\xD3\n";
3796 let cursor = Cursor::new(malformed.to_vec());
3797 let result = PdfReader::new(cursor);
3798
3799 if let Ok(reader) = result {
3801 let _version = reader.version();
3803 }
3804 }
3805
3806 #[test]
3807 fn test_reader_get_nonexistent_object() {
3808 let pdf_data = create_minimal_pdf();
3809 let cursor = Cursor::new(pdf_data);
3810 let mut reader = PdfReader::new(cursor).unwrap();
3811
3812 let result = reader.get_object(999, 0);
3814
3815 assert!(result.is_err(), "Should fail when object doesn't exist");
3816 }
3817
3818 #[test]
3819 fn test_reader_get_object_wrong_generation() {
3820 let pdf_data = create_minimal_pdf();
3821 let cursor = Cursor::new(pdf_data);
3822 let mut reader = PdfReader::new(cursor).unwrap();
3823
3824 let result = reader.get_object(1, 99);
3826
3827 if let Err(e) = result {
3829 let _ = e;
3831 }
3832 }
3833
3834 #[test]
3839 fn test_resolve_direct_object() {
3840 let pdf_data = create_minimal_pdf();
3841 let cursor = Cursor::new(pdf_data);
3842 let mut reader = PdfReader::new(cursor).unwrap();
3843
3844 let direct_obj = PdfObject::Integer(42);
3846
3847 let resolved = reader.resolve(&direct_obj).unwrap();
3848
3849 assert_eq!(resolved, &PdfObject::Integer(42));
3851 }
3852
3853 #[test]
3854 fn test_resolve_reference() {
3855 let pdf_data = create_minimal_pdf();
3856 let cursor = Cursor::new(pdf_data);
3857 let mut reader = PdfReader::new(cursor).unwrap();
3858
3859 let pages_ref = {
3861 let catalog = reader.catalog().unwrap();
3862 if let Some(PdfObject::Reference(obj_num, gen_num)) = catalog.get("Pages") {
3863 PdfObject::Reference(*obj_num, *gen_num)
3864 } else {
3865 panic!("Catalog /Pages must be a Reference");
3866 }
3867 };
3868
3869 let resolved = reader.resolve(&pages_ref).unwrap();
3871
3872 if let PdfObject::Dictionary(dict) = resolved {
3874 assert_eq!(
3875 dict.get("Type"),
3876 Some(&PdfObject::Name(PdfName("Pages".to_string())))
3877 );
3878 } else {
3879 panic!("Expected dictionary, got: {:?}", resolved);
3880 }
3881 }
3882
3883 #[test]
3888 fn test_is_encrypted_on_unencrypted() {
3889 let pdf_data = create_minimal_pdf();
3890 let cursor = Cursor::new(pdf_data);
3891 let reader = PdfReader::new(cursor).unwrap();
3892
3893 assert!(
3894 !reader.is_encrypted(),
3895 "Minimal PDF should not be encrypted"
3896 );
3897 }
3898
3899 #[test]
3900 fn test_is_unlocked_on_unencrypted() {
3901 let pdf_data = create_minimal_pdf();
3902 let cursor = Cursor::new(pdf_data);
3903 let reader = PdfReader::new(cursor).unwrap();
3904
3905 assert!(reader.is_unlocked(), "Unencrypted PDF should be unlocked");
3907 }
3908
3909 #[test]
3910 fn test_try_empty_password_on_unencrypted() {
3911 let pdf_data = create_minimal_pdf();
3912 let cursor = Cursor::new(pdf_data);
3913 let mut reader = PdfReader::new(cursor).unwrap();
3914
3915 let result = reader.try_empty_password();
3917 assert!(result.is_ok());
3918 }
3919
3920 #[test]
3925 fn test_reader_with_strict_options() {
3926 let pdf_data = create_minimal_pdf();
3927 let cursor = Cursor::new(pdf_data);
3928
3929 let options = ParseOptions::strict();
3930 let result = PdfReader::new_with_options(cursor, options);
3931
3932 assert!(result.is_ok(), "Minimal PDF should parse in strict mode");
3933 }
3934
3935 #[test]
3936 fn test_reader_with_lenient_options() {
3937 let pdf_data = create_minimal_pdf();
3938 let cursor = Cursor::new(pdf_data);
3939
3940 let options = ParseOptions::lenient();
3941 let result = PdfReader::new_with_options(cursor, options);
3942
3943 assert!(result.is_ok(), "Minimal PDF should parse in lenient mode");
3944 }
3945
3946 #[test]
3947 fn test_reader_options_accessible() {
3948 let pdf_data = create_minimal_pdf();
3949 let cursor = Cursor::new(pdf_data);
3950
3951 let options = ParseOptions::lenient();
3952 let reader = PdfReader::new_with_options(cursor, options.clone()).unwrap();
3953
3954 let reader_options = reader.options();
3956 assert_eq!(reader_options.strict_mode, options.strict_mode);
3957 }
3958
3959 #[test]
3964 fn test_catalog_has_required_fields() {
3965 let pdf_data = create_minimal_pdf();
3966 let cursor = Cursor::new(pdf_data);
3967 let mut reader = PdfReader::new(cursor).unwrap();
3968
3969 let catalog = reader.catalog().unwrap();
3970
3971 assert_eq!(
3973 catalog.get("Type"),
3974 Some(&PdfObject::Name(PdfName("Catalog".to_string()))),
3975 "Catalog must have /Type /Catalog"
3976 );
3977
3978 assert!(
3980 catalog.contains_key("Pages"),
3981 "Catalog must have /Pages entry"
3982 );
3983 }
3984
3985 #[test]
3986 fn test_info_fields_when_present() {
3987 let pdf_data = create_pdf_with_info();
3988 let cursor = Cursor::new(pdf_data);
3989 let mut reader = PdfReader::new(cursor).unwrap();
3990
3991 let info = reader.info().unwrap();
3992 assert!(info.is_some(), "PDF should have Info dictionary");
3993
3994 let info_dict = info.unwrap();
3995
3996 assert!(info_dict.contains_key("Title"), "Info should have Title");
3998 assert!(info_dict.contains_key("Author"), "Info should have Author");
3999 }
4000
4001 #[test]
4002 fn test_info_none_when_absent() {
4003 let pdf_data = create_minimal_pdf();
4004 let cursor = Cursor::new(pdf_data);
4005 let mut reader = PdfReader::new(cursor).unwrap();
4006
4007 let info = reader.info().unwrap();
4008 assert!(info.is_none(), "Minimal PDF should not have Info");
4009 }
4010
4011 #[test]
4016 fn test_version_exact_values() {
4017 let pdf_data = create_pdf_with_version("1.7");
4018 let cursor = Cursor::new(pdf_data);
4019 let reader = PdfReader::new(cursor).unwrap();
4020
4021 let version = reader.version();
4022 assert_eq!(version.major, 1, "Major version must be exact");
4023 assert_eq!(version.minor, 7, "Minor version must be exact");
4024 }
4025
4026 #[test]
4027 fn test_version_pdf_20() {
4028 let pdf_data = create_pdf_with_version("2.0");
4029 let cursor = Cursor::new(pdf_data);
4030 let reader = PdfReader::new(cursor).unwrap();
4031
4032 let version = reader.version();
4033 assert_eq!(version.major, 2, "PDF 2.0 major version");
4034 assert_eq!(version.minor, 0, "PDF 2.0 minor version");
4035 }
4036
4037 #[test]
4042 fn test_pages_returns_pages_dict() {
4043 let pdf_data = create_minimal_pdf();
4044 let cursor = Cursor::new(pdf_data);
4045 let mut reader = PdfReader::new(cursor).unwrap();
4046
4047 let pages_dict = reader
4048 .pages()
4049 .expect("pages() must return Pages dictionary");
4050
4051 assert_eq!(
4052 pages_dict.get("Type"),
4053 Some(&PdfObject::Name(PdfName("Pages".to_string()))),
4054 "Pages dict must have /Type /Pages"
4055 );
4056 }
4057
4058 #[test]
4059 fn test_page_count_minimal_pdf() {
4060 let pdf_data = create_minimal_pdf();
4061 let cursor = Cursor::new(pdf_data);
4062 let mut reader = PdfReader::new(cursor).unwrap();
4063
4064 let count = reader.page_count().expect("page_count() must succeed");
4065 assert_eq!(count, 0, "Minimal PDF has 0 pages");
4066 }
4067
4068 #[test]
4069 fn test_page_count_with_info_pdf() {
4070 let pdf_data = create_pdf_with_info();
4071 let cursor = Cursor::new(pdf_data);
4072 let mut reader = PdfReader::new(cursor).unwrap();
4073
4074 let count = reader.page_count().expect("page_count() must succeed");
4075 assert_eq!(count, 0, "create_pdf_with_info() has Count 0 in Pages dict");
4076 }
4077
4078 #[test]
4083 fn test_metadata_minimal_pdf() {
4084 let pdf_data = create_minimal_pdf();
4085 let cursor = Cursor::new(pdf_data);
4086 let mut reader = PdfReader::new(cursor).unwrap();
4087
4088 let meta = reader.metadata().expect("metadata() must succeed");
4089
4090 assert!(meta.title.is_none(), "Minimal PDF has no title");
4092 assert!(meta.author.is_none(), "Minimal PDF has no author");
4093 }
4094
4095 #[test]
4096 fn test_metadata_with_info() {
4097 let pdf_data = create_pdf_with_info();
4098 let cursor = Cursor::new(pdf_data);
4099 let mut reader = PdfReader::new(cursor).unwrap();
4100
4101 let meta = reader.metadata().expect("metadata() must succeed");
4102
4103 assert!(meta.title.is_some(), "PDF with Info has title");
4104 assert_eq!(meta.title.unwrap(), "Test PDF", "Title must match");
4105 assert!(meta.author.is_some(), "PDF with Info has author");
4106 assert_eq!(meta.author.unwrap(), "Test Author", "Author must match");
4107 }
4108
4109 #[test]
4114 fn test_resolve_stream_length_direct_integer() {
4115 let pdf_data = create_minimal_pdf();
4116 let cursor = Cursor::new(pdf_data);
4117 let mut reader = PdfReader::new(cursor).unwrap();
4118
4119 let length_obj = PdfObject::Integer(100);
4121
4122 let length = reader
4123 .resolve_stream_length(&length_obj)
4124 .expect("resolve_stream_length must succeed");
4125 assert_eq!(length, Some(100), "Direct integer must be resolved");
4126 }
4127
4128 #[test]
4129 fn test_resolve_stream_length_negative_integer() {
4130 let pdf_data = create_minimal_pdf();
4131 let cursor = Cursor::new(pdf_data);
4132 let mut reader = PdfReader::new(cursor).unwrap();
4133
4134 let length_obj = PdfObject::Integer(-10);
4136
4137 let length = reader
4138 .resolve_stream_length(&length_obj)
4139 .expect("resolve_stream_length must succeed");
4140 assert_eq!(length, None, "Negative integer returns None");
4141 }
4142
4143 #[test]
4144 fn test_resolve_stream_length_non_integer() {
4145 let pdf_data = create_minimal_pdf();
4146 let cursor = Cursor::new(pdf_data);
4147 let mut reader = PdfReader::new(cursor).unwrap();
4148
4149 let name_obj = PdfObject::Name(PdfName("Test".to_string()));
4151
4152 let length = reader
4153 .resolve_stream_length(&name_obj)
4154 .expect("resolve_stream_length must succeed");
4155 assert_eq!(length, None, "Non-integer object returns None");
4156 }
4157
4158 #[test]
4163 fn test_get_all_pages_empty_pdf() {
4164 let pdf_data = create_minimal_pdf();
4165 let cursor = Cursor::new(pdf_data);
4166 let mut reader = PdfReader::new(cursor).unwrap();
4167
4168 let pages = reader
4169 .get_all_pages()
4170 .expect("get_all_pages() must succeed");
4171 assert_eq!(pages.len(), 0, "Minimal PDF has 0 pages");
4172 }
4173
4174 #[test]
4175 fn test_get_all_pages_with_info() {
4176 let pdf_data = create_pdf_with_info();
4177 let cursor = Cursor::new(pdf_data);
4178 let mut reader = PdfReader::new(cursor).unwrap();
4179
4180 let pages = reader
4181 .get_all_pages()
4182 .expect("get_all_pages() must succeed");
4183 assert_eq!(
4184 pages.len(),
4185 0,
4186 "create_pdf_with_info() has 0 pages (Count 0)"
4187 );
4188 }
4189
4190 #[test]
4195 fn test_into_document_consumes_reader() {
4196 let pdf_data = create_minimal_pdf();
4197 let cursor = Cursor::new(pdf_data);
4198 let reader = PdfReader::new(cursor).unwrap();
4199
4200 let document = reader.into_document();
4201
4202 let version = document.version().expect("Document must have version");
4204 assert!(
4205 version.starts_with("1."),
4206 "Document must have PDF 1.x version, got: {}",
4207 version
4208 );
4209
4210 let page_count = document
4212 .page_count()
4213 .expect("Document must allow page_count()");
4214 assert_eq!(
4215 page_count, 0,
4216 "Minimal PDF has 0 pages (Count 0 in test helper)"
4217 );
4218 }
4219
4220 #[test]
4225 fn test_clear_parse_context() {
4226 let pdf_data = create_minimal_pdf();
4227 let cursor = Cursor::new(pdf_data);
4228 let mut reader = PdfReader::new(cursor).unwrap();
4229
4230 reader.clear_parse_context();
4232
4233 let version = reader.version();
4235 assert_eq!(version.major, 1, "Reader must still work after clear");
4236 }
4237
4238 #[test]
4239 fn test_parse_context_mut_accessible() {
4240 let pdf_data = create_minimal_pdf();
4241 let cursor = Cursor::new(pdf_data);
4242 let mut reader = PdfReader::new(cursor).unwrap();
4243
4244 let context = reader.parse_context_mut();
4245
4246 let initial_depth = context.depth;
4248 assert_eq!(initial_depth, 0, "Parse context must start with depth 0");
4249
4250 assert!(
4252 context.max_depth > 0,
4253 "Parse context must have positive max_depth"
4254 );
4255 }
4256
4257 #[test]
4262 fn test_find_bytes_basic() {
4263 let haystack = b"Hello World";
4264 let needle = b"World";
4265 let pos = find_bytes(haystack, needle);
4266 assert_eq!(pos, Some(6), "Must find 'World' at position 6");
4267 }
4268
4269 #[test]
4270 fn test_find_bytes_not_found() {
4271 let haystack = b"Hello World";
4272 let needle = b"Rust";
4273 let pos = find_bytes(haystack, needle);
4274 assert_eq!(pos, None, "Must return None when not found");
4275 }
4276
4277 #[test]
4278 fn test_find_bytes_at_start() {
4279 let haystack = b"Hello World";
4280 let needle = b"Hello";
4281 let pos = find_bytes(haystack, needle);
4282 assert_eq!(pos, Some(0), "Must find at position 0");
4283 }
4284
4285 #[test]
4286 fn test_is_immediate_stream_start_with_stream() {
4287 let data = b"stream\ndata";
4288 assert!(
4289 is_immediate_stream_start(data),
4290 "Must detect 'stream' at start"
4291 );
4292 }
4293
4294 #[test]
4295 fn test_is_immediate_stream_start_with_whitespace() {
4296 let data = b" \n\tstream\ndata";
4297 assert!(
4298 is_immediate_stream_start(data),
4299 "Must detect 'stream' after whitespace"
4300 );
4301 }
4302
4303 #[test]
4304 fn test_is_immediate_stream_start_no_stream() {
4305 let data = b"endobj";
4306 assert!(
4307 !is_immediate_stream_start(data),
4308 "Must return false when 'stream' absent"
4309 );
4310 }
4311 }
4312}