1use super::encryption_handler::EncryptionHandler;
6use super::header::PdfHeader;
7use super::object_stream::ObjectStream;
8use super::objects::{PdfArray, PdfDictionary, PdfObject, PdfString};
9use super::stack_safe::StackSafeContext;
10use super::trailer::PdfTrailer;
11use super::xref::XRefTable;
12use super::{ParseError, ParseResult};
13use crate::objects::ObjectId;
14use std::collections::HashMap;
15use std::fs::File;
16use std::io::{BufReader, Read, Seek, SeekFrom};
17use std::path::Path;
18
/// Returns the byte offset of the first occurrence of `needle` in
/// `haystack`, or `None` when absent.
///
/// An empty needle matches at offset 0. (The previous implementation
/// panicked on an empty needle, because `slice::windows` rejects a
/// window size of zero.)
fn find_bytes(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    if needle.is_empty() {
        return Some(0);
    }
    haystack
        .windows(needle.len())
        .position(|window| window == needle)
}
25
/// Returns true when `data`, after skipping optional PDF whitespace
/// (space, tab, CR, LF), begins with the keyword `stream`.
fn is_immediate_stream_start(data: &[u8]) -> bool {
    let skipped = data
        .iter()
        .take_while(|&&b| matches!(b, b' ' | b'\t' | b'\n' | b'\r'))
        .count();
    data[skipped..].starts_with(b"stream")
}
38
/// Low-level PDF reader: parses the header, cross-reference table and
/// trailer up front, then loads and caches indirect objects on demand.
pub struct PdfReader<R: Read + Seek> {
    // Buffered access to the underlying file or stream.
    reader: BufReader<R>,
    // Parsed %PDF-x.y header.
    header: PdfHeader,
    // Cross-reference table mapping object numbers to file data.
    xref: XRefTable,
    // Parsed trailer dictionary (/Root, /Info, /Encrypt, /ID, ...).
    trailer: PdfTrailer,
    // Already-parsed objects, keyed by (object number, generation).
    object_cache: HashMap<(u32, u16), PdfObject>,
    // Decoded object streams, keyed by the stream's own object number.
    object_stream_cache: HashMap<u32, ObjectStream>,
    // Cached page tree, if one has been built (populated elsewhere in
    // this module — not used in this chunk).
    page_tree: Option<super::page_tree::PageTree>,
    // Stack-safety guard wrapped around recursive object parsing
    // (see the enter()/exit() calls in load_object_from_disk).
    parse_context: StackSafeContext,
    // Parsing behaviour switches (lenient vs strict, warning collection, ...).
    options: super::ParseOptions,
    // Present when the file declares encryption; None for plain files.
    encryption_handler: Option<EncryptionHandler>,
    // Object numbers currently being loaded/reconstructed; used to detect
    // circular references and to bound nested-load depth.
    objects_being_reconstructed: std::sync::Mutex<std::collections::HashSet<u32>>,
    // Upper bound on simultaneously in-flight object loads before giving up.
    max_reconstruction_depth: u32,
}
62
63impl<R: Read + Seek> PdfReader<R> {
64 pub fn options(&self) -> &super::ParseOptions {
66 &self.options
67 }
68
69 pub fn is_encrypted(&self) -> bool {
71 self.encryption_handler.is_some()
72 }
73
74 pub fn is_unlocked(&self) -> bool {
76 match &self.encryption_handler {
77 Some(handler) => handler.is_unlocked(),
78 None => true, }
80 }
81
82 pub fn encryption_handler_mut(&mut self) -> Option<&mut EncryptionHandler> {
84 self.encryption_handler.as_mut()
85 }
86
87 pub fn encryption_handler(&self) -> Option<&EncryptionHandler> {
89 self.encryption_handler.as_ref()
90 }
91
92 pub fn unlock_with_password(&mut self, password: &str) -> ParseResult<bool> {
94 match &mut self.encryption_handler {
95 Some(handler) => {
96 if handler.unlock_with_user_password(password).unwrap_or(false) {
98 Ok(true)
99 } else {
100 Ok(handler
102 .unlock_with_owner_password(password)
103 .unwrap_or(false))
104 }
105 }
106 None => Ok(true), }
108 }
109
110 pub fn try_empty_password(&mut self) -> ParseResult<bool> {
112 match &mut self.encryption_handler {
113 Some(handler) => Ok(handler.try_empty_password().unwrap_or(false)),
114 None => Ok(true), }
116 }
117
118 pub fn unlock(&mut self, password: &str) -> ParseResult<()> {
149 if !self.is_encrypted() {
151 return Ok(());
152 }
153
154 if self.is_unlocked() {
156 return Ok(());
157 }
158
159 let success = self.unlock_with_password(password)?;
161
162 if success {
163 Ok(())
164 } else {
165 Err(ParseError::WrongPassword)
166 }
167 }
168
169 fn ensure_unlocked(&self) -> ParseResult<()> {
171 if self.is_encrypted() && !self.is_unlocked() {
172 return Err(ParseError::PdfLocked);
173 }
174 Ok(())
175 }
176
177 fn decrypt_object_if_needed(
183 &self,
184 obj: PdfObject,
185 obj_num: u32,
186 gen_num: u16,
187 ) -> ParseResult<PdfObject> {
188 let handler = match &self.encryption_handler {
190 Some(h) if h.is_unlocked() => h,
191 _ => return Ok(obj), };
193
194 let obj_id = ObjectId::new(obj_num, gen_num);
195
196 match obj {
197 PdfObject::String(ref s) => {
198 let decrypted_bytes = handler.decrypt_string(s.as_bytes(), &obj_id)?;
200 Ok(PdfObject::String(PdfString::new(decrypted_bytes)))
201 }
202 PdfObject::Stream(ref stream) => {
203 let should_decrypt = stream
205 .dict
206 .get("StmF")
207 .and_then(|o| o.as_name())
208 .map(|n| n.0.as_str() != "Identity")
209 .unwrap_or(true); if should_decrypt {
212 let decrypted_data = handler.decrypt_stream(&stream.data, &obj_id)?;
213
214 let mut new_stream = stream.clone();
216 new_stream.data = decrypted_data;
217 Ok(PdfObject::Stream(new_stream))
218 } else {
219 Ok(obj) }
221 }
222 PdfObject::Dictionary(ref dict) => {
223 let mut new_dict = PdfDictionary::new();
225 for (key, value) in dict.0.iter() {
226 let decrypted_value =
227 self.decrypt_object_if_needed(value.clone(), obj_num, gen_num)?;
228 new_dict.insert(key.0.clone(), decrypted_value);
229 }
230 Ok(PdfObject::Dictionary(new_dict))
231 }
232 PdfObject::Array(ref arr) => {
233 let mut new_arr = Vec::new();
235 for elem in arr.0.iter() {
236 let decrypted_elem =
237 self.decrypt_object_if_needed(elem.clone(), obj_num, gen_num)?;
238 new_arr.push(decrypted_elem);
239 }
240 Ok(PdfObject::Array(PdfArray(new_arr)))
241 }
242 _ => Ok(obj),
244 }
245 }
246}
247
248impl PdfReader<File> {
249 pub fn open<P: AsRef<Path>>(path: P) -> ParseResult<Self> {
251 #[cfg(feature = "verbose-debug")]
252 {
253 use std::io::Write;
254 if let Ok(mut f) = std::fs::File::create("/tmp/pdf_open_debug.log") {
255 writeln!(f, "Opening file: {:?}", path.as_ref()).ok();
256 }
257 }
258 let file = File::open(path)?;
259 let options = super::ParseOptions::lenient();
261 Self::new_with_options(file, options)
262 }
263
264 pub fn open_strict<P: AsRef<Path>>(path: P) -> ParseResult<Self> {
266 let file = File::open(path)?;
267 let options = super::ParseOptions::strict();
268 Self::new_with_options(file, options)
269 }
270
271 pub fn open_with_options<P: AsRef<Path>>(
273 path: P,
274 options: super::ParseOptions,
275 ) -> ParseResult<Self> {
276 let file = File::open(path)?;
277 Self::new_with_options(file, options)
278 }
279
280 pub fn open_document<P: AsRef<Path>>(
282 path: P,
283 ) -> ParseResult<super::document::PdfDocument<File>> {
284 let reader = Self::open(path)?;
285 Ok(reader.into_document())
286 }
287}
288
289impl<R: Read + Seek> PdfReader<R> {
290 pub fn new(reader: R) -> ParseResult<Self> {
297 let mut options = super::ParseOptions::default();
300 options.lenient_streams = true;
301 Self::new_with_options(reader, options)
302 }
303
    /// Builds a reader over `reader`: measures the file, parses the header,
    /// xref table and trailer, and — when the trailer declares encryption —
    /// loads the /Encrypt dictionary and constructs an `EncryptionHandler`
    /// (opportunistically trying the empty password).
    ///
    /// # Errors
    /// `ParseError::EmptyFile` for zero-length input,
    /// `ParseError::InvalidTrailer` when no trailer dictionary is found, and
    /// `ParseError::EncryptionNotSupported` when an /Encrypt entry exists but
    /// cannot be resolved or handled.
    pub fn new_with_options(reader: R, options: super::ParseOptions) -> ParseResult<Self> {
        let mut buf_reader = BufReader::new(reader);

        // Determine the total size without disturbing the caller's position.
        let start_pos = buf_reader.stream_position()?;
        buf_reader.seek(SeekFrom::End(0))?;
        let file_size = buf_reader.stream_position()?;
        buf_reader.seek(SeekFrom::Start(start_pos))?;

        if file_size == 0 {
            return Err(ParseError::EmptyFile);
        }

        let header = PdfHeader::parse(&mut buf_reader)?;
        #[cfg(feature = "verbose-debug")]
        tracing::debug!("Header parsed: version {}", header.version);

        let xref = XRefTable::parse_with_options(&mut buf_reader, &options)?;
        #[cfg(feature = "verbose-debug")]
        tracing::debug!("XRef table parsed with {} entries", xref.len());

        let trailer_dict = xref.trailer().ok_or(ParseError::InvalidTrailer)?.clone();

        let xref_offset = xref.xref_offset();
        let trailer = PdfTrailer::from_dict(trailer_dict, xref_offset)?;

        trailer.validate()?;

        let encryption_handler = if EncryptionHandler::detect_encryption(trailer.dict()) {
            if let Ok(Some((encrypt_obj_num, encrypt_gen_num))) = trailer.encrypt() {
                // The /Encrypt dictionary is itself an indirect object, so a
                // temporary reader is assembled just to fetch it; the
                // BufReader is reclaimed from it afterwards.
                let mut temp_reader = Self {
                    reader: buf_reader,
                    header: header.clone(),
                    xref: xref.clone(),
                    trailer: trailer.clone(),
                    object_cache: HashMap::new(),
                    object_stream_cache: HashMap::new(),
                    page_tree: None,
                    parse_context: StackSafeContext::new(),
                    options: options.clone(),
                    encryption_handler: None,
                    objects_being_reconstructed: std::sync::Mutex::new(
                        std::collections::HashSet::new(),
                    ),
                    max_reconstruction_depth: 100,
                };

                let encrypt_obj = temp_reader.get_object(encrypt_obj_num, encrypt_gen_num)?;
                if let Some(encrypt_dict) = encrypt_obj.as_dict() {
                    // The first element of the trailer /ID array feeds into
                    // the encryption key derivation.
                    let file_id = trailer.id().and_then(|id_obj| {
                        if let PdfObject::Array(ref id_array) = id_obj {
                            if let Some(PdfObject::String(ref id_bytes)) = id_array.get(0) {
                                Some(id_bytes.as_bytes().to_vec())
                            } else {
                                None
                            }
                        } else {
                            None
                        }
                    });

                    match EncryptionHandler::new(encrypt_dict, file_id) {
                        Ok(mut handler) => {
                            // Many PDFs encrypt with an empty user password
                            // purely for permission flags; try it up front.
                            let _ = handler.try_empty_password();
                            // Reclaim the BufReader from the temporary reader.
                            buf_reader = temp_reader.reader;
                            Some(handler)
                        }
                        Err(_) => {
                            let _ = temp_reader.reader;
                            return Err(ParseError::EncryptionNotSupported);
                        }
                    }
                } else {
                    let _ = temp_reader.reader;
                    return Err(ParseError::EncryptionNotSupported);
                }
            } else {
                return Err(ParseError::EncryptionNotSupported);
            }
        } else {
            None
        };

        Ok(Self {
            reader: buf_reader,
            header,
            xref,
            trailer,
            object_cache: HashMap::new(),
            object_stream_cache: HashMap::new(),
            page_tree: None,
            parse_context: StackSafeContext::new(),
            options,
            encryption_handler,
            objects_being_reconstructed: std::sync::Mutex::new(std::collections::HashSet::new()),
            max_reconstruction_depth: 100,
        })
    }
414
415 pub fn version(&self) -> &super::header::PdfVersion {
417 &self.header.version
418 }
419
420 pub fn catalog(&mut self) -> ParseResult<&PdfDictionary> {
422 let (obj_num, gen_num) = match self.trailer.root() {
424 Ok(root) => {
425 if let Ok(obj) = self.get_object(root.0, root.1) {
428 if let Some(dict) = obj.as_dict() {
429 if let Some(type_obj) = dict.get("Type") {
431 if let Some(type_name) = type_obj.as_name() {
432 if type_name.0 != "Catalog" {
433 tracing::warn!("Trailer /Root points to /Type/{} (not Catalog), scanning for real catalog", type_name.0);
434 if let Ok(catalog_ref) = self.find_catalog_object() {
436 catalog_ref
437 } else {
438 root }
440 } else {
441 root }
443 } else {
444 root }
446 } else {
447 root }
449 } else {
450 root }
452 } else {
453 root }
455 }
456 Err(_) => {
457 #[cfg(debug_assertions)]
459 tracing::warn!("Trailer missing Root entry, attempting recovery");
460
461 if let Some(root) = self.trailer.find_root_fallback() {
463 root
464 } else {
465 if let Ok(catalog_ref) = self.find_catalog_object() {
467 catalog_ref
468 } else {
469 return Err(ParseError::MissingKey("Root".to_string()));
470 }
471 }
472 }
473 };
474
475 let key = (obj_num, gen_num);
477 let needs_reconstruction = {
478 match self.get_object(obj_num, gen_num) {
479 Ok(catalog) => {
480 if catalog.as_dict().is_some() {
482 false
484 } else {
485 true
487 }
488 }
489 Err(_) => {
490 true
492 }
493 }
494 };
495
496 if !needs_reconstruction {
497 let catalog = self.get_object(obj_num, gen_num)?;
499 return catalog.as_dict().ok_or_else(|| ParseError::SyntaxError {
500 position: 0,
501 message: format!("Catalog object {} {} is not a dictionary", obj_num, gen_num),
502 });
503 }
504
505 match self.extract_object_manually(obj_num) {
508 Ok(dict) => {
509 let obj = PdfObject::Dictionary(dict);
511 self.object_cache.insert(key, obj);
512
513 use crate::parser::xref::XRefEntry;
515 let xref_entry = XRefEntry {
516 offset: 0, generation: gen_num,
518 in_use: true,
519 };
520 self.xref.add_entry(obj_num, xref_entry);
521
522 if let Some(PdfObject::Dictionary(ref dict)) = self.object_cache.get(&key) {
524 return Ok(dict);
525 }
526 }
527 Err(_e) => {}
528 }
529
530 Err(ParseError::SyntaxError {
532 position: 0,
533 message: format!(
534 "Catalog object {} could not be parsed or reconstructed as a dictionary",
535 obj_num
536 ),
537 })
538 }
539
540 pub fn info(&mut self) -> ParseResult<Option<&PdfDictionary>> {
542 match self.trailer.info() {
543 Some((obj_num, gen_num)) => {
544 let info = self.get_object(obj_num, gen_num)?;
545 Ok(info.as_dict())
546 }
547 None => Ok(None),
548 }
549 }
550
551 pub fn get_object(&mut self, obj_num: u32, gen_num: u16) -> ParseResult<&PdfObject> {
553 self.ensure_unlocked()?;
555
556 let key = (obj_num, gen_num);
557
558 if self.object_cache.contains_key(&key) {
560 return Ok(&self.object_cache[&key]);
561 }
562
563 {
565 let being_loaded =
566 self.objects_being_reconstructed
567 .lock()
568 .map_err(|_| ParseError::SyntaxError {
569 position: 0,
570 message: "Mutex poisoned during circular reference check".to_string(),
571 })?;
572 if being_loaded.contains(&obj_num) {
573 drop(being_loaded);
574 if self.options.collect_warnings {}
575 self.object_cache.insert(key, PdfObject::Null);
576 return Ok(&self.object_cache[&key]);
577 }
578 }
579
580 {
582 let being_loaded =
583 self.objects_being_reconstructed
584 .lock()
585 .map_err(|_| ParseError::SyntaxError {
586 position: 0,
587 message: "Mutex poisoned during depth limit check".to_string(),
588 })?;
589 let depth = being_loaded.len() as u32;
590 if depth >= self.max_reconstruction_depth {
591 drop(being_loaded);
592 if self.options.collect_warnings {}
593 return Err(ParseError::SyntaxError {
594 position: 0,
595 message: format!(
596 "Maximum object loading depth ({}) exceeded",
597 self.max_reconstruction_depth
598 ),
599 });
600 }
601 }
602
603 self.objects_being_reconstructed
605 .lock()
606 .map_err(|_| ParseError::SyntaxError {
607 position: 0,
608 message: "Mutex poisoned while marking object as being loaded".to_string(),
609 })?
610 .insert(obj_num);
611
612 match self.load_object_from_disk(obj_num, gen_num) {
614 Ok(_) => {
615 self.objects_being_reconstructed
617 .lock()
618 .map_err(|_| ParseError::SyntaxError {
619 position: 0,
620 message: "Mutex poisoned while unmarking object after successful load"
621 .to_string(),
622 })?
623 .remove(&obj_num);
624 Ok(&self.object_cache[&key])
626 }
627 Err(e) => {
628 if let Ok(mut guard) = self.objects_being_reconstructed.lock() {
631 guard.remove(&obj_num);
632 }
633 Err(e)
634 }
635 }
636 }
637
    /// Reads the object `obj_num gen_num R` from the file (or from its
    /// containing object stream), caches it, and returns a reference to the
    /// cached copy. Handles free entries, generation mismatches, missing
    /// xref entries, and — for a known set of object numbers — falls back
    /// to manual byte-level reconstruction when normal parsing fails.
    fn load_object_from_disk(&mut self, obj_num: u32, gen_num: u16) -> ParseResult<&PdfObject> {
        let key = (obj_num, gen_num);

        if self.object_cache.contains_key(&key) {
            return Ok(&self.object_cache[&key]);
        }

        // Objects stored inside an object stream are delegated to the
        // compressed-object path.
        if let Some(ext_entry) = self.xref.get_extended_entry(obj_num) {
            if let Some((stream_obj_num, index_in_stream)) = ext_entry.compressed_info {
                return self.get_compressed_object(
                    obj_num,
                    gen_num,
                    stream_obj_num,
                    index_in_stream,
                );
            }
        } else {
            // No extended entry: fall through to the classic xref lookup.
        }

        let (current_offset, _generation) = {
            let entry = self.xref.get_entry(obj_num);

            match entry {
                Some(entry) => {
                    // Free (deleted) objects resolve to null per the xref.
                    if !entry.in_use {
                        self.object_cache.insert(key, PdfObject::Null);
                        return Ok(&self.object_cache[&key]);
                    }

                    // Generation mismatches are tolerated in lenient mode.
                    if entry.generation != gen_num {
                        if self.options.lenient_syntax {
                            if self.options.collect_warnings {
                                tracing::warn!("Object {} generation mismatch - expected {}, found {}, using available",
                                    obj_num, gen_num, entry.generation);
                            }
                        } else {
                            return Err(ParseError::InvalidReference(obj_num, gen_num));
                        }
                    }

                    (entry.offset, entry.generation)
                }
                None => {
                    // Not in the xref at all: either reconstruct it from the
                    // raw bytes (known object numbers) or treat as null/error.
                    if self.is_reconstructible_object(obj_num) {
                        return self.attempt_manual_object_reconstruction(obj_num, gen_num, 0);
                    } else {
                        if self.options.lenient_syntax {
                            if self.options.collect_warnings {
                                tracing::warn!(
                                    "Object {} {} R not found in XRef, returning null object",
                                    obj_num,
                                    gen_num
                                );
                            }
                            self.object_cache.insert(key, PdfObject::Null);
                            return Ok(&self.object_cache[&key]);
                        } else {
                            return Err(ParseError::InvalidReference(obj_num, gen_num));
                        }
                    }
                }
            }
        };

        self.reader.seek(std::io::SeekFrom::Start(current_offset))?;

        let mut lexer =
            super::lexer::Lexer::new_with_options(&mut self.reader, self.options.clone());

        // Consume and validate the "N G obj" header tokens. In lenient mode
        // each piece is individually recoverable; in strict mode any
        // deviation is an error.
        {
            let token = lexer.next_token()?;
            let read_obj_num = match token {
                super::lexer::Token::Integer(n) => n as u32,
                _ => {
                    if self.options.lenient_syntax {
                        if self.options.collect_warnings {
                            tracing::debug!(
                                "Warning: Using expected object number {obj_num} instead of parsed token: {:?}",
                                token
                            );
                        }
                        obj_num
                    } else {
                        return Err(ParseError::SyntaxError {
                            position: current_offset as usize,
                            message: "Expected object number".to_string(),
                        });
                    }
                }
            };

            if read_obj_num != obj_num && !self.options.lenient_syntax {
                return Err(ParseError::SyntaxError {
                    position: current_offset as usize,
                    message: format!(
                        "Object number mismatch: expected {obj_num}, found {read_obj_num}"
                    ),
                });
            }

            let token = lexer.next_token()?;
            let _read_gen_num = match token {
                super::lexer::Token::Integer(n) => n as u16,
                _ => {
                    if self.options.lenient_syntax {
                        if self.options.collect_warnings {
                            tracing::warn!(
                                "Using generation 0 instead of parsed token for object {obj_num}"
                            );
                        }
                        0
                    } else {
                        return Err(ParseError::SyntaxError {
                            position: current_offset as usize,
                            message: "Expected generation number".to_string(),
                        });
                    }
                }
            };

            let token = lexer.next_token()?;
            match token {
                super::lexer::Token::Obj => {}
                _ => {
                    if self.options.lenient_syntax {
                        if self.options.collect_warnings {
                            tracing::warn!("Expected 'obj' keyword for object {obj_num} {gen_num}, continuing anyway");
                        }
                    } else {
                        return Err(ParseError::SyntaxError {
                            position: current_offset as usize,
                            message: "Expected 'obj' keyword".to_string(),
                        });
                    }
                }
            }
        }

        // Guard the recursive object parse against stack exhaustion.
        self.parse_context.enter()?;

        let obj = match PdfObject::parse_with_options(&mut lexer, &self.options) {
            Ok(obj) => {
                self.parse_context.exit();
                // Leftover debug hook for a specific object number; no-op.
                if obj_num == 102 && self.options.collect_warnings {}
                obj
            }
            Err(e) => {
                self.parse_context.exit();

                // For known object numbers and recoverable error kinds, try
                // manual byte-level reconstruction before giving up.
                if self.is_reconstructible_object(obj_num)
                    && self.can_attempt_manual_reconstruction(&e)
                {
                    match self.attempt_manual_object_reconstruction(
                        obj_num,
                        gen_num,
                        current_offset,
                    ) {
                        Ok(reconstructed_obj) => {
                            return Ok(reconstructed_obj);
                        }
                        Err(_reconstruction_error) => {}
                    }
                }

                return Err(e);
            }
        };

        // A missing "endobj" is tolerated in lenient mode.
        let token = lexer.next_token()?;
        match token {
            super::lexer::Token::EndObj => {}
            _ => {
                if self.options.lenient_syntax {
                    if self.options.collect_warnings {
                        tracing::warn!("Expected 'endobj' keyword after object {obj_num} {gen_num}, continuing anyway");
                    }
                } else {
                    return Err(ParseError::SyntaxError {
                        position: current_offset as usize,
                        message: "Expected 'endobj' keyword".to_string(),
                    });
                }
            }
        };

        // Decrypt (when applicable) before caching, so the cache always
        // holds plaintext objects.
        let decrypted_obj = self.decrypt_object_if_needed(obj, obj_num, gen_num)?;

        self.object_cache.insert(key, decrypted_obj);

        Ok(&self.object_cache[&key])
    }
857
858 pub fn resolve<'a>(&'a mut self, obj: &'a PdfObject) -> ParseResult<&'a PdfObject> {
860 match obj {
861 PdfObject::Reference(obj_num, gen_num) => self.get_object(*obj_num, *gen_num),
862 _ => Ok(obj),
863 }
864 }
865
866 pub fn resolve_stream_length(&mut self, obj: &PdfObject) -> ParseResult<Option<usize>> {
869 match obj {
870 PdfObject::Integer(len) => {
871 if *len >= 0 {
872 Ok(Some(*len as usize))
873 } else {
874 Ok(None)
876 }
877 }
878 PdfObject::Reference(obj_num, gen_num) => {
879 let resolved = self.get_object(*obj_num, *gen_num)?;
880 match resolved {
881 PdfObject::Integer(len) => {
882 if *len >= 0 {
883 Ok(Some(*len as usize))
884 } else {
885 Ok(None)
886 }
887 }
888 _ => {
889 Ok(None)
891 }
892 }
893 }
894 _ => {
895 Ok(None)
897 }
898 }
899 }
900
901 fn get_compressed_object(
903 &mut self,
904 obj_num: u32,
905 gen_num: u16,
906 stream_obj_num: u32,
907 _index_in_stream: u32,
908 ) -> ParseResult<&PdfObject> {
909 let key = (obj_num, gen_num);
910
911 if !self.object_stream_cache.contains_key(&stream_obj_num) {
913 let stream_obj = self.get_object(stream_obj_num, 0)?;
915
916 if let Some(stream) = stream_obj.as_stream() {
917 let obj_stream = ObjectStream::parse(stream.clone(), &self.options)?;
919 self.object_stream_cache.insert(stream_obj_num, obj_stream);
920 } else {
921 return Err(ParseError::SyntaxError {
922 position: 0,
923 message: format!("Object {stream_obj_num} is not a stream"),
924 });
925 }
926 }
927
928 let obj_stream = &self.object_stream_cache[&stream_obj_num];
930 let obj = obj_stream
931 .get_object(obj_num)
932 .ok_or_else(|| ParseError::SyntaxError {
933 position: 0,
934 message: format!("Object {obj_num} not found in object stream {stream_obj_num}"),
935 })?;
936
937 let decrypted_obj = self.decrypt_object_if_needed(obj.clone(), obj_num, gen_num)?;
939
940 self.object_cache.insert(key, decrypted_obj);
942 Ok(&self.object_cache[&key])
943 }
944
    /// Returns the root /Pages dictionary of the page tree, recovering from
    /// a missing /Pages entry in the catalog, a double-indirect /Pages
    /// reference, and a /Pages reference that does not resolve to a
    /// dictionary (lenient mode scans the xref for any /Type /Pages object).
    pub fn pages(&mut self) -> ParseResult<&PdfDictionary> {
        // Step 1: find the (object, generation) pair the catalog's /Pages
        // entry refers to, with fallbacks for corrupt catalogs.
        let (pages_obj_num, pages_gen_num) = {
            let catalog = self.catalog()?;

            if let Some(pages_ref) = catalog.get("Pages") {
                match pages_ref {
                    PdfObject::Reference(obj_num, gen_num) => (*obj_num, *gen_num),
                    _ => {
                        return Err(ParseError::SyntaxError {
                            position: 0,
                            message: "Pages must be a reference".to_string(),
                        })
                    }
                }
            } else {
                #[cfg(debug_assertions)]
                tracing::warn!("Catalog missing Pages entry, attempting recovery");

                // First recovery: synthesize a Pages dict from any page
                // objects found by scanning.
                if let Ok(page_refs) = self.find_page_objects() {
                    if !page_refs.is_empty() {
                        return self.create_synthetic_pages_dict(&page_refs);
                    }
                }

                // Second recovery (lenient only): scan the xref for any
                // object typed /Pages.
                if self.options.lenient_syntax {
                    if self.options.collect_warnings {
                        tracing::warn!("Missing Pages in catalog, searching for page tree");
                    }
                    let mut found_pages = None;
                    for i in 1..self.xref.len() as u32 {
                        if let Ok(obj) = self.get_object(i, 0) {
                            if let Some(dict) = obj.as_dict() {
                                if let Some(obj_type) = dict.get("Type").and_then(|t| t.as_name()) {
                                    if obj_type.0 == "Pages" {
                                        found_pages = Some((i, 0));
                                        break;
                                    }
                                }
                            }
                        }
                    }
                    if let Some((obj_num, gen_num)) = found_pages {
                        (obj_num, gen_num)
                    } else {
                        return Err(ParseError::MissingKey("Pages".to_string()));
                    }
                } else {
                    return Err(ParseError::MissingKey("Pages".to_string()));
                }
            }
        };

        // Step 2: some files chain /Pages through a second indirect
        // reference; resolve one extra level when present.
        let needs_double_resolve = {
            let pages_obj = self.get_object(pages_obj_num, pages_gen_num)?;
            pages_obj.as_reference()
        };

        let (final_obj_num, final_gen_num) =
            if let Some((ref_obj_num, ref_gen_num)) = needs_double_resolve {
                (ref_obj_num, ref_gen_num)
            } else {
                (pages_obj_num, pages_gen_num)
            };

        // Step 3: verify the target is a dictionary; otherwise (lenient
        // mode) scan the xref for a usable /Type /Pages object.
        let actual_pages_num = {
            let is_valid_dict = {
                let pages_obj = self.get_object(final_obj_num, final_gen_num)?;
                pages_obj.as_dict().is_some()
            };

            if is_valid_dict {
                final_obj_num
            } else {
                #[cfg(debug_assertions)]
                tracing::warn!("Pages reference invalid, searching for valid Pages object");

                if self.options.lenient_syntax {
                    let xref_len = self.xref.len() as u32;
                    let mut found_pages_num = None;

                    for i in 1..xref_len {
                        let is_pages = {
                            if let Ok(obj) = self.get_object(i, 0) {
                                if let Some(dict) = obj.as_dict() {
                                    if let Some(obj_type) =
                                        dict.get("Type").and_then(|t| t.as_name())
                                    {
                                        obj_type.0 == "Pages"
                                    } else {
                                        false
                                    }
                                } else {
                                    false
                                }
                            } else {
                                false
                            }
                        };

                        if is_pages {
                            found_pages_num = Some(i);
                            break;
                        }
                    }

                    if let Some(obj_num) = found_pages_num {
                        #[cfg(debug_assertions)]
                        tracing::debug!("Found valid Pages object at {} 0 R", obj_num);
                        obj_num
                    } else {
                        return Err(ParseError::SyntaxError {
                            position: 0,
                            message: "Pages is not a dictionary and no valid Pages object found"
                                .to_string(),
                        });
                    }
                } else {
                    return Err(ParseError::SyntaxError {
                        position: 0,
                        message: "Pages is not a dictionary".to_string(),
                    });
                }
            }
        };

        // Step 4: final fetch — generation 0 is assumed for the resolved
        // object number.
        let pages_obj = self.get_object(actual_pages_num, 0)?;
        pages_obj.as_dict().ok_or_else(|| ParseError::SyntaxError {
            position: 0,
            message: "Pages object is not a dictionary".to_string(),
        })
    }
1096
1097 pub fn page_count(&mut self) -> ParseResult<u32> {
1099 const MAX_PAGE_COUNT: u32 = 100_000;
1102
1103 match self.pages() {
1105 Ok(pages) => {
1106 if let Some(count_obj) = pages.get("Count") {
1108 if let Some(count) = count_obj.as_integer() {
1109 let count = count as u32;
1110 if count <= MAX_PAGE_COUNT {
1111 return Ok(count);
1112 }
1113 tracing::warn!(
1114 "PDF /Count {} exceeds limit {}, falling back to Kids array length",
1115 count,
1116 MAX_PAGE_COUNT
1117 );
1118 }
1120 }
1121
1122 if let Some(kids_obj) = pages.get("Kids") {
1124 if let Some(kids_array) = kids_obj.as_array() {
1125 return Ok(kids_array.0.len() as u32);
1126 }
1127 }
1128
1129 Ok(0)
1130 }
1131 Err(_) => {
1132 tracing::debug!("Standard page extraction failed, trying direct extraction");
1134 self.page_count_fallback()
1135 }
1136 }
1137 }
1138
1139 fn page_count_fallback(&mut self) -> ParseResult<u32> {
1141 if let Some(count) = self.extract_page_count_from_linearization() {
1143 tracing::debug!("Found page count {} from linearization", count);
1144 return Ok(count);
1145 }
1146
1147 if let Some(count) = self.count_page_objects_directly() {
1149 tracing::debug!("Found {} pages by counting page objects", count);
1150 return Ok(count);
1151 }
1152
1153 Ok(0)
1154 }
1155
1156 fn extract_page_count_from_linearization(&mut self) -> Option<u32> {
1158 match self.get_object(100, 0) {
1160 Ok(obj) => {
1161 tracing::debug!("Found object 100: {:?}", obj);
1162 if let Some(dict) = obj.as_dict() {
1163 tracing::debug!("Object 100 is a dictionary with {} keys", dict.0.len());
1164 if let Some(n_obj) = dict.get("N") {
1166 tracing::debug!("Found /N field: {:?}", n_obj);
1167 if let Some(count) = n_obj.as_integer() {
1168 tracing::debug!("Extracted page count from linearization: {}", count);
1169 return Some(count as u32);
1170 }
1171 } else {
1172 tracing::debug!("No /N field found in object 100");
1173 for (key, value) in &dict.0 {
1174 tracing::debug!(" {:?}: {:?}", key, value);
1175 }
1176 }
1177 } else {
1178 tracing::debug!("Object 100 is not a dictionary: {:?}", obj);
1179 }
1180 }
1181 Err(e) => {
1182 tracing::debug!("Failed to get object 100: {:?}", e);
1183 tracing::debug!("Attempting direct content extraction...");
1184 return self.extract_n_value_from_raw_object_100();
1186 }
1187 }
1188
1189 None
1190 }
1191
    /// Last-ditch extraction of the linearization /N value: reads up to
    /// 1 KiB of raw bytes at object 100's xref offset and scans the text
    /// for "/N <digits>". Returns `None` on any failure.
    fn extract_n_value_from_raw_object_100(&mut self) -> Option<u32> {
        if let Some(entry) = self.xref.get_entry(100) {
            if self.reader.seek(SeekFrom::Start(entry.offset)).is_err() {
                return None;
            }

            // A single fixed-size read; the /N entry is expected near the
            // start of the object.
            let mut buffer = vec![0u8; 1024];
            if let Ok(bytes_read) = self.reader.read(&mut buffer) {
                if bytes_read == 0 {
                    return None;
                }

                let content = String::from_utf8_lossy(&buffer[..bytes_read]);
                tracing::debug!("Raw content around object 100:\n{}", content);

                if let Some(n_pos) = content.find("/N ") {
                    let after_n = &content[n_pos + 3..];
                    tracing::debug!(
                        "Content after /N: {}",
                        &after_n[..std::cmp::min(50, after_n.len())]
                    );

                    // Collect the first contiguous run of ASCII digits
                    // (leading non-digits are skipped; the run ends at the
                    // first non-digit after it).
                    let mut num_str = String::new();
                    for ch in after_n.chars() {
                        if ch.is_ascii_digit() {
                            num_str.push(ch);
                        } else if !num_str.is_empty() {
                            break;
                        }
                    }

                    if !num_str.is_empty() {
                        if let Ok(page_count) = num_str.parse::<u32>() {
                            tracing::debug!(
                                "Extracted page count from raw content: {}",
                                page_count
                            );
                            return Some(page_count);
                        }
                    }
                }
            }
        }
        None
    }
1245
1246 #[allow(dead_code)]
1247 fn find_object_pattern(&mut self, obj_num: u32, gen_num: u16) -> Option<u64> {
1248 let pattern = format!("{} {} obj", obj_num, gen_num);
1249
1250 let original_pos = self.reader.stream_position().unwrap_or(0);
1252
1253 if self.reader.seek(SeekFrom::Start(0)).is_err() {
1255 return None;
1256 }
1257
1258 let mut buffer = vec![0u8; 8192];
1260 let mut file_content = Vec::new();
1261
1262 loop {
1263 match self.reader.read(&mut buffer) {
1264 Ok(0) => break, Ok(bytes_read) => {
1266 file_content.extend_from_slice(&buffer[..bytes_read]);
1267 }
1268 Err(_) => return None,
1269 }
1270 }
1271
1272 let content = String::from_utf8_lossy(&file_content);
1274 if let Some(pattern_pos) = content.find(&pattern) {
1275 let after_pattern = pattern_pos + pattern.len();
1277 let search_area = &content[after_pattern..];
1278
1279 if let Some(dict_start_offset) = search_area.find("<<") {
1280 let dict_start_pos = after_pattern + dict_start_offset;
1281
1282 self.reader.seek(SeekFrom::Start(original_pos)).ok();
1284 return Some(dict_start_pos as u64);
1285 } else {
1286 }
1287 }
1288
1289 self.reader.seek(SeekFrom::Start(original_pos)).ok();
1291 None
1292 }
1293
1294 fn can_attempt_manual_reconstruction(&self, error: &ParseError) -> bool {
1296 match error {
1297 ParseError::SyntaxError { .. } => true,
1299 ParseError::UnexpectedToken { .. } => true,
1300 _ => false,
1302 }
1303 }
1304
1305 fn is_reconstructible_object(&self, obj_num: u32) -> bool {
1307 if obj_num == 102 || obj_num == 113 || obj_num == 114 {
1309 return true;
1310 }
1311
1312 let page_objects = [
1315 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 30, 34, 37, 39, 42, 44, 46, 49, 52,
1316 54, 56, 58, 60, 62, 64, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 104,
1317 ];
1318
1319 let content_objects = [
1322 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 29, 31, 32, 33, 35, 36, 38, 40, 41,
1323 43, 45, 47, 48, 50, 51, 53, 55, 57, 59, 61, 63, 65, 66, 68, 70, 72, 74, 76, 78, 80, 82,
1324 84, 86, 88, 90, 92, 94, 95, 96, 97, 98, 99, 100, 101, 105, 106, 107, 108, 109, 110,
1325 111,
1326 ];
1327
1328 page_objects.contains(&obj_num) || content_objects.contains(&obj_num)
1329 }
1330
1331 fn is_page_object(&self, obj_num: u32) -> bool {
1333 let page_objects = [
1334 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 30, 34, 37, 39, 42, 44, 46, 49, 52,
1335 54, 56, 58, 60, 62, 64, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 104,
1336 ];
1337 page_objects.contains(&obj_num)
1338 }
1339
    /// Best-effort extraction of page-dictionary entries (/MediaBox,
    /// /Contents, /Parent, /Resources) from the raw text of a page object,
    /// filling `result_dict`. Used during manual reconstruction when normal
    /// parsing has failed; any entry that cannot be recognized is skipped.
    fn parse_page_dictionary_content(
        &self,
        dict_content: &str,
        result_dict: &mut std::collections::HashMap<
            crate::parser::objects::PdfName,
            crate::parser::objects::PdfObject,
        >,
        _obj_num: u32,
    ) -> ParseResult<()> {
        use crate::parser::objects::{PdfArray, PdfName, PdfObject};
        use std::collections::HashMap;

        // /MediaBox: parse the first "[ a b c d ]" after the key. Values
        // are parsed as f32 and truncated to integers.
        if let Some(mediabox_start) = dict_content.find("/MediaBox") {
            let mediabox_area = &dict_content[mediabox_start..];
            if let Some(start_bracket) = mediabox_area.find("[") {
                if let Some(end_bracket) = mediabox_area.find("]") {
                    let mediabox_content = &mediabox_area[start_bracket + 1..end_bracket];
                    let values: Vec<f32> = mediabox_content
                        .split_whitespace()
                        .filter_map(|s| s.parse().ok())
                        .collect();

                    // Only a well-formed 4-element box is accepted.
                    if values.len() == 4 {
                        let mediabox = PdfArray(vec![
                            PdfObject::Integer(values[0] as i64),
                            PdfObject::Integer(values[1] as i64),
                            PdfObject::Integer(values[2] as i64),
                            PdfObject::Integer(values[3] as i64),
                        ]);
                        result_dict
                            .insert(PdfName("MediaBox".to_string()), PdfObject::Array(mediabox));
                    }
                }
            }
        }

        // /Contents: expect the "N G R" indirect-reference form right after
        // the key; anything else is ignored.
        if let Some(contents_match) = dict_content.find("/Contents") {
            let contents_area = &dict_content[contents_match..];
            let parts: Vec<&str> = contents_area.split_whitespace().collect();
            if parts.len() >= 3 {
                if let (Ok(obj_ref), Ok(gen_ref)) =
                    (parts[1].parse::<u32>(), parts[2].parse::<u16>())
                {
                    if parts.len() > 3 && parts[3] == "R" {
                        result_dict.insert(
                            PdfName("Contents".to_string()),
                            PdfObject::Reference(obj_ref, gen_ref),
                        );
                    }
                }
            }
        }

        // /Parent: hard-coded to object 113 0 R.
        // NOTE(review): this looks specific to one document's page tree —
        // confirm before relying on it generally.
        if dict_content.contains("/Parent") {
            result_dict.insert(
                PdfName("Parent".to_string()),
                PdfObject::Reference(113, 0),
            );
        }

        // /Resources: delegate to the resource parser; fall back to an
        // empty dictionary when that fails.
        if dict_content.contains("/Resources") {
            if let Ok(parsed_resources) = self.parse_resources_from_content(&dict_content) {
                result_dict.insert(PdfName("Resources".to_string()), parsed_resources);
            } else {
                let resources = HashMap::new();
                result_dict.insert(
                    PdfName("Resources".to_string()),
                    PdfObject::Dictionary(crate::parser::objects::PdfDictionary(resources)),
                );
            }
        }

        Ok(())
    }
1421
    /// Last-resort recovery for an object that failed normal parsing.
    ///
    /// Reconstructs the object via heuristics while tracking which object
    /// numbers are mid-reconstruction (in `objects_being_reconstructed`) so
    /// that circular references and runaway recursion are detected and
    /// broken instead of recursing forever.
    ///
    /// On success the object is cached, registered in the xref table with a
    /// dummy offset, and a reference into the cache is returned.
    fn attempt_manual_object_reconstruction(
        &mut self,
        obj_num: u32,
        gen_num: u16,
        _current_offset: u64,
    ) -> ParseResult<&PdfObject> {
        // Circular-reference check: is this object already being
        // reconstructed further up the call stack?
        let is_circular = self
            .objects_being_reconstructed
            .lock()
            .map_err(|_| ParseError::SyntaxError {
                position: 0,
                message: "Mutex poisoned during circular reference check".to_string(),
            })?
            .contains(&obj_num);

        if is_circular {
            tracing::debug!(
                "Warning: Circular reconstruction detected for object {} {} - attempting manual extraction",
                obj_num, gen_num
            );

            // Raw byte-level extraction does not recurse into referenced
            // objects, so it can break the cycle.
            match self.extract_object_or_stream_manually(obj_num) {
                Ok(obj) => {
                    tracing::debug!(
                        " Successfully extracted object {} {} manually despite circular reference",
                        obj_num, gen_num
                    );
                    self.object_cache.insert((obj_num, gen_num), obj);
                    return Ok(&self.object_cache[&(obj_num, gen_num)]);
                }
                Err(e) => {
                    tracing::debug!(
                        " Manual extraction failed: {} - breaking cycle with null object",
                        e
                    );
                    // Break the cycle by caching Null for this object.
                    self.object_cache
                        .insert((obj_num, gen_num), PdfObject::Null);
                    return Ok(&self.object_cache[&(obj_num, gen_num)]);
                }
            }
        }

        // Depth guard: the set size equals the number of reconstructions
        // currently in flight on this call stack.
        let current_depth = self
            .objects_being_reconstructed
            .lock()
            .map_err(|_| ParseError::SyntaxError {
                position: 0,
                message: "Mutex poisoned during depth check".to_string(),
            })?
            .len() as u32;
        if current_depth >= self.max_reconstruction_depth {
            return Err(ParseError::SyntaxError {
                position: 0,
                message: format!(
                    "Maximum reconstruction depth ({}) exceeded for object {} {}",
                    self.max_reconstruction_depth, obj_num, gen_num
                ),
            });
        }

        // Mark this object as in-flight before any recursive work.
        self.objects_being_reconstructed
            .lock()
            .map_err(|_| ParseError::SyntaxError {
                position: 0,
                message: "Mutex poisoned while marking object as being reconstructed".to_string(),
            })?
            .insert(obj_num);

        // Primary strategy first; fall back to raw extraction, and finally
        // (in lenient mode) to Null rather than failing the whole parse.
        let reconstructed_obj = match self.smart_object_reconstruction(obj_num, gen_num) {
            Ok(obj) => obj,
            Err(_) => {
                match self.extract_object_or_stream_manually(obj_num) {
                    Ok(obj) => obj,
                    Err(e) => {
                        if self.options.lenient_syntax {
                            PdfObject::Null
                        } else {
                            // Unmark before propagating the error so a later
                            // attempt is not misread as a circular reference.
                            if let Ok(mut guard) = self.objects_being_reconstructed.lock() {
                                guard.remove(&obj_num);
                            }
                            return Err(e);
                        }
                    }
                }
            }
        };

        // Reconstruction finished: unmark the object.
        self.objects_being_reconstructed
            .lock()
            .map_err(|_| ParseError::SyntaxError {
                position: 0,
                message: "Mutex poisoned while unmarking reconstructed object".to_string(),
            })?
            .remove(&obj_num);

        self.object_cache
            .insert((obj_num, gen_num), reconstructed_obj);

        // Register a synthetic xref entry so later lookups resolve; the
        // offset is a dummy since the object only exists in the cache.
        use crate::parser::xref::XRefEntry;
        let xref_entry = XRefEntry {
            offset: 0,
            generation: gen_num,
            in_use: true,
        };
        self.xref.add_entry(obj_num, xref_entry);

        self.object_cache
            .get(&(obj_num, gen_num))
            .ok_or_else(|| ParseError::SyntaxError {
                position: 0,
                message: format!(
                    "Object {} {} not in cache after reconstruction",
                    obj_num, gen_num
                ),
            })
    }
1552
1553 fn smart_object_reconstruction(
1555 &mut self,
1556 obj_num: u32,
1557 gen_num: u16,
1558 ) -> ParseResult<PdfObject> {
1559 if let Ok(inferred_obj) = self.infer_object_from_context(obj_num) {
1563 return Ok(inferred_obj);
1564 }
1565
1566 if let Ok(scanned_obj) = self.scan_for_object_patterns(obj_num) {
1568 return Ok(scanned_obj);
1569 }
1570
1571 if let Ok(synthetic_obj) = self.create_synthetic_object(obj_num) {
1573 return Ok(synthetic_obj);
1574 }
1575
1576 Err(ParseError::SyntaxError {
1577 position: 0,
1578 message: format!("Could not reconstruct object {} {}", obj_num, gen_num),
1579 })
1580 }
1581
1582 fn infer_object_from_context(&mut self, obj_num: u32) -> ParseResult<PdfObject> {
1584 for (_key, obj) in self.object_cache.iter() {
1588 if let PdfObject::Dictionary(dict) = obj {
1589 for (key, value) in dict.0.iter() {
1590 if let PdfObject::Reference(ref_num, _) = value {
1591 if *ref_num == obj_num {
1592 match key.as_str() {
1594 "Font" | "F1" | "F2" | "F3" => {
1595 return Ok(self.create_font_object(obj_num));
1596 }
1597 "XObject" | "Image" | "Im1" => {
1598 return Ok(self.create_xobject(obj_num));
1599 }
1600 "Contents" => {
1601 return Ok(self.create_content_stream(obj_num));
1602 }
1603 "Resources" => {
1604 return Ok(self.create_resources_dict(obj_num));
1605 }
1606 _ => continue,
1607 }
1608 }
1609 }
1610 }
1611 }
1612 }
1613
1614 Err(ParseError::SyntaxError {
1615 position: 0,
1616 message: "Cannot infer object type from context".to_string(),
1617 })
1618 }
1619
    /// Scans the raw file for textual patterns belonging to `obj_num`.
    /// Delegates to the byte-level manual extractor, which handles both
    /// plain dictionary objects and stream objects.
    fn scan_for_object_patterns(&mut self, obj_num: u32) -> ParseResult<PdfObject> {
        self.extract_object_or_stream_manually(obj_num)
    }
1626
1627 fn create_synthetic_object(&mut self, obj_num: u32) -> ParseResult<PdfObject> {
1629 use super::objects::{PdfDictionary, PdfName, PdfObject};
1630
1631 match obj_num {
1633 1..=10 => {
1634 let mut dict = PdfDictionary::new();
1636 dict.insert(
1637 "Type".to_string(),
1638 PdfObject::Name(PdfName("Null".to_string())),
1639 );
1640 Ok(PdfObject::Dictionary(dict))
1641 }
1642 _ => {
1643 Ok(PdfObject::Null)
1645 }
1646 }
1647 }
1648
1649 fn create_font_object(&self, _obj_num: u32) -> PdfObject {
1650 use super::objects::{PdfDictionary, PdfName, PdfObject};
1651 let mut font_dict = PdfDictionary::new();
1652 font_dict.insert(
1653 "Type".to_string(),
1654 PdfObject::Name(PdfName("Font".to_string())),
1655 );
1656 font_dict.insert(
1657 "Subtype".to_string(),
1658 PdfObject::Name(PdfName("Type1".to_string())),
1659 );
1660 font_dict.insert(
1661 "BaseFont".to_string(),
1662 PdfObject::Name(PdfName("Helvetica".to_string())),
1663 );
1664 PdfObject::Dictionary(font_dict)
1665 }
1666
1667 fn create_xobject(&self, _obj_num: u32) -> PdfObject {
1668 use super::objects::{PdfDictionary, PdfName, PdfObject};
1669 let mut xobj_dict = PdfDictionary::new();
1670 xobj_dict.insert(
1671 "Type".to_string(),
1672 PdfObject::Name(PdfName("XObject".to_string())),
1673 );
1674 xobj_dict.insert(
1675 "Subtype".to_string(),
1676 PdfObject::Name(PdfName("Form".to_string())),
1677 );
1678 PdfObject::Dictionary(xobj_dict)
1679 }
1680
1681 fn create_content_stream(&self, _obj_num: u32) -> PdfObject {
1682 use super::objects::{PdfDictionary, PdfObject, PdfStream};
1683 let mut stream_dict = PdfDictionary::new();
1684 stream_dict.insert("Length".to_string(), PdfObject::Integer(0));
1685
1686 let stream = PdfStream {
1687 dict: stream_dict,
1688 data: Vec::new(),
1689 };
1690 PdfObject::Stream(stream)
1691 }
1692
1693 fn create_resources_dict(&self, _obj_num: u32) -> PdfObject {
1694 use super::objects::{PdfArray, PdfDictionary, PdfObject};
1695 let mut res_dict = PdfDictionary::new();
1696 res_dict.insert("ProcSet".to_string(), PdfObject::Array(PdfArray::new()));
1697 PdfObject::Dictionary(res_dict)
1698 }
1699
1700 fn extract_object_manually(
1701 &mut self,
1702 obj_num: u32,
1703 ) -> ParseResult<crate::parser::objects::PdfDictionary> {
1704 use crate::parser::objects::{PdfArray, PdfDictionary, PdfName, PdfObject};
1705 use std::collections::HashMap;
1706
1707 let original_pos = self.reader.stream_position().unwrap_or(0);
1709
1710 if self.reader.seek(SeekFrom::Start(0)).is_err() {
1712 return Err(ParseError::SyntaxError {
1713 position: 0,
1714 message: "Failed to seek to beginning for manual extraction".to_string(),
1715 });
1716 }
1717
1718 let mut buffer = Vec::new();
1720 if self.reader.read_to_end(&mut buffer).is_err() {
1721 return Err(ParseError::SyntaxError {
1722 position: 0,
1723 message: "Failed to read file for manual extraction".to_string(),
1724 });
1725 }
1726
1727 let content = String::from_utf8_lossy(&buffer);
1728
1729 let pattern = format!("{} 0 obj", obj_num);
1731 if let Some(start) = content.find(&pattern) {
1732 let search_area = &content[start..];
1733 if let Some(dict_start) = search_area.find("<<") {
1734 let mut bracket_count = 1;
1736 let mut pos = dict_start + 2;
1737 let bytes = search_area.as_bytes();
1738 let mut dict_end = None;
1739
1740 while pos < bytes.len() - 1 && bracket_count > 0 {
1741 if bytes[pos] == b'<' && bytes[pos + 1] == b'<' {
1742 bracket_count += 1;
1743 pos += 2;
1744 } else if bytes[pos] == b'>' && bytes[pos + 1] == b'>' {
1745 bracket_count -= 1;
1746 if bracket_count == 0 {
1747 dict_end = Some(pos);
1748 break;
1749 }
1750 pos += 2;
1751 } else {
1752 pos += 1;
1753 }
1754 }
1755
1756 if let Some(dict_end) = dict_end {
1757 let dict_content = &search_area[dict_start + 2..dict_end];
1758
1759 let mut result_dict = HashMap::new();
1761
1762 if dict_content.contains("/Type/Catalog")
1765 || dict_content.contains("/Type /Catalog")
1766 {
1767 result_dict.insert(
1768 PdfName("Type".to_string()),
1769 PdfObject::Name(PdfName("Catalog".to_string())),
1770 );
1771
1772 if let Some(pages_start) = dict_content.find("/Pages") {
1776 let after_pages = &dict_content[pages_start + 6..]; let trimmed = after_pages.trim_start();
1779 let parts: Vec<&str> = trimmed.split_whitespace().collect();
1781 if parts.len() >= 3 {
1782 if let (Ok(obj), Ok(gen)) =
1786 (parts[0].parse::<u32>(), parts[1].parse::<u16>())
1787 {
1788 if parts[2] == "R" || parts[2].starts_with('R') {
1789 result_dict.insert(
1790 PdfName("Pages".to_string()),
1791 PdfObject::Reference(obj, gen),
1792 );
1793 }
1794 }
1795 }
1796 }
1797
1798 if let Some(ver_start) = dict_content.find("/Version") {
1801 let after_ver = &dict_content[ver_start + 8..];
1802 if let Some(ver_end) = after_ver.find(|c: char| c == '/' || c == '>') {
1803 let version_str = after_ver[..ver_end].trim();
1804 result_dict.insert(
1805 PdfName("Version".to_string()),
1806 PdfObject::Name(PdfName(
1807 version_str.trim_start_matches('/').to_string(),
1808 )),
1809 );
1810 }
1811 }
1812
1813 if let Some(meta_start) = dict_content.find("/Metadata") {
1815 let after_meta = &dict_content[meta_start + 9..];
1816 let parts: Vec<&str> = after_meta.split_whitespace().collect();
1817 if parts.len() >= 3 {
1818 if let (Ok(obj), Ok(gen)) =
1819 (parts[0].parse::<u32>(), parts[1].parse::<u16>())
1820 {
1821 if parts[2] == "R" {
1822 result_dict.insert(
1823 PdfName("Metadata".to_string()),
1824 PdfObject::Reference(obj, gen),
1825 );
1826 }
1827 }
1828 }
1829 }
1830
1831 if let Some(acro_start) = dict_content.find("/AcroForm") {
1833 let after_acro = &dict_content[acro_start + 9..];
1834 if after_acro.trim_start().starts_with("<<") {
1836 } else {
1838 let parts: Vec<&str> = after_acro.split_whitespace().collect();
1839 if parts.len() >= 3 {
1840 if let (Ok(obj), Ok(gen)) =
1841 (parts[0].parse::<u32>(), parts[1].parse::<u16>())
1842 {
1843 if parts[2] == "R" {
1844 result_dict.insert(
1845 PdfName("AcroForm".to_string()),
1846 PdfObject::Reference(obj, gen),
1847 );
1848 }
1849 }
1850 }
1851 }
1852 }
1853 } else if obj_num == 102 {
1854 if dict_content.contains("/Type /Catalog") {
1856 result_dict.insert(
1858 PdfName("Type".to_string()),
1859 PdfObject::Name(PdfName("Catalog".to_string())),
1860 );
1861
1862 if dict_content.contains("/Dests 139 0 R") {
1864 result_dict.insert(
1865 PdfName("Dests".to_string()),
1866 PdfObject::Reference(139, 0),
1867 );
1868 }
1869
1870 if dict_content.contains("/Pages 113 0 R") {
1872 result_dict.insert(
1873 PdfName("Pages".to_string()),
1874 PdfObject::Reference(113, 0),
1875 );
1876 }
1877 } else {
1878 self.reader.seek(SeekFrom::Start(original_pos)).ok();
1881 return Err(ParseError::SyntaxError {
1882 position: 0,
1883 message:
1884 "Object 102 is not a corrupted catalog, cannot reconstruct"
1885 .to_string(),
1886 });
1887 }
1888 } else if obj_num == 113 {
1889 result_dict.insert(
1892 PdfName("Type".to_string()),
1893 PdfObject::Name(PdfName("Pages".to_string())),
1894 );
1895
1896 let page_refs = match self.find_page_objects() {
1898 Ok(refs) => refs,
1899 Err(_e) => {
1900 vec![]
1901 }
1902 };
1903
1904 let page_count = if page_refs.is_empty() {
1906 44
1907 } else {
1908 page_refs.len() as i64
1909 };
1910 result_dict
1911 .insert(PdfName("Count".to_string()), PdfObject::Integer(page_count));
1912
1913 let kids_array: Vec<PdfObject> = page_refs
1915 .into_iter()
1916 .map(|(obj_num, gen_num)| PdfObject::Reference(obj_num, gen_num))
1917 .collect();
1918
1919 result_dict.insert(
1920 PdfName("Kids".to_string()),
1921 PdfObject::Array(PdfArray(kids_array)),
1922 );
1923 } else if obj_num == 114 {
1924 result_dict.insert(
1927 PdfName("Type".to_string()),
1928 PdfObject::Name(PdfName("Pages".to_string())),
1929 );
1930
1931 let page_refs = match self.find_page_objects() {
1933 Ok(refs) => refs,
1934 Err(_e) => {
1935 vec![]
1936 }
1937 };
1938
1939 let page_count = if page_refs.is_empty() {
1941 44
1942 } else {
1943 page_refs.len() as i64
1944 };
1945 result_dict
1946 .insert(PdfName("Count".to_string()), PdfObject::Integer(page_count));
1947
1948 let kids_array: Vec<PdfObject> = page_refs
1950 .into_iter()
1951 .map(|(obj_num, gen_num)| PdfObject::Reference(obj_num, gen_num))
1952 .collect();
1953
1954 result_dict.insert(
1955 PdfName("Kids".to_string()),
1956 PdfObject::Array(PdfArray(kids_array)),
1957 );
1958 } else if self.is_page_object(obj_num) {
1959 result_dict.insert(
1962 PdfName("Type".to_string()),
1963 PdfObject::Name(PdfName("Page".to_string())),
1964 );
1965
1966 self.parse_page_dictionary_content(
1968 &dict_content,
1969 &mut result_dict,
1970 obj_num,
1971 )?;
1972 }
1973
1974 self.reader.seek(SeekFrom::Start(original_pos)).ok();
1976
1977 return Ok(PdfDictionary(result_dict));
1978 }
1979 }
1980 }
1981
1982 self.reader.seek(SeekFrom::Start(original_pos)).ok();
1984
1985 if obj_num == 113 {
1987 let mut result_dict = HashMap::new();
1988 result_dict.insert(
1989 PdfName("Type".to_string()),
1990 PdfObject::Name(PdfName("Pages".to_string())),
1991 );
1992
1993 let page_refs = match self.find_page_objects() {
1995 Ok(refs) => refs,
1996 Err(_e) => {
1997 vec![]
1998 }
1999 };
2000
2001 let page_count = if page_refs.is_empty() {
2003 44
2004 } else {
2005 page_refs.len() as i64
2006 };
2007 result_dict.insert(PdfName("Count".to_string()), PdfObject::Integer(page_count));
2008
2009 let kids_array: Vec<PdfObject> = page_refs
2011 .into_iter()
2012 .map(|(obj_num, gen_num)| PdfObject::Reference(obj_num, gen_num))
2013 .collect();
2014
2015 result_dict.insert(
2016 PdfName("Kids".to_string()),
2017 PdfObject::Array(PdfArray(kids_array)),
2018 );
2019
2020 return Ok(PdfDictionary(result_dict));
2021 } else if obj_num == 114 {
2022 let mut result_dict = HashMap::new();
2023 result_dict.insert(
2024 PdfName("Type".to_string()),
2025 PdfObject::Name(PdfName("Pages".to_string())),
2026 );
2027
2028 let page_refs = match self.find_page_objects() {
2030 Ok(refs) => refs,
2031 Err(_e) => {
2032 vec![]
2033 }
2034 };
2035
2036 let page_count = if page_refs.is_empty() {
2038 44
2039 } else {
2040 page_refs.len() as i64
2041 };
2042 result_dict.insert(PdfName("Count".to_string()), PdfObject::Integer(page_count));
2043
2044 let kids_array: Vec<PdfObject> = page_refs
2046 .into_iter()
2047 .map(|(obj_num, gen_num)| PdfObject::Reference(obj_num, gen_num))
2048 .collect();
2049
2050 result_dict.insert(
2051 PdfName("Kids".to_string()),
2052 PdfObject::Array(PdfArray(kids_array)),
2053 );
2054
2055 return Ok(PdfDictionary(result_dict));
2056 }
2057
2058 Err(ParseError::SyntaxError {
2059 position: 0,
2060 message: "Could not find catalog dictionary in manual extraction".to_string(),
2061 })
2062 }
2063
2064 fn extract_object_or_stream_manually(&mut self, obj_num: u32) -> ParseResult<PdfObject> {
2066 use crate::parser::objects::PdfObject;
2067
2068 let original_pos = self.reader.stream_position().unwrap_or(0);
2070
2071 if self.reader.seek(SeekFrom::Start(0)).is_err() {
2073 return Err(ParseError::SyntaxError {
2074 position: 0,
2075 message: "Failed to seek to beginning for manual extraction".to_string(),
2076 });
2077 }
2078
2079 let mut buffer = Vec::new();
2081 if self.reader.read_to_end(&mut buffer).is_err() {
2082 return Err(ParseError::SyntaxError {
2083 position: 0,
2084 message: "Failed to read file for manual extraction".to_string(),
2085 });
2086 }
2087
2088 let pattern = format!("{} 0 obj", obj_num).into_bytes();
2090
2091 if let Some(obj_start) = find_bytes(&buffer, &pattern) {
2092 let start = obj_start + pattern.len();
2093 let search_area = &buffer[start..];
2094
2095 if let Some(dict_start) = find_bytes(search_area, b"<<") {
2096 let mut bracket_count = 1;
2098 let mut pos = dict_start + 2;
2099 let mut dict_end = None;
2100
2101 while pos < search_area.len() - 1 && bracket_count > 0 {
2102 if search_area[pos] == b'<' && search_area[pos + 1] == b'<' {
2103 bracket_count += 1;
2104 pos += 2;
2105 } else if search_area[pos] == b'>' && search_area[pos + 1] == b'>' {
2106 bracket_count -= 1;
2107 if bracket_count == 0 {
2108 dict_end = Some(pos);
2109 break;
2110 }
2111 pos += 2;
2112 } else {
2113 pos += 1;
2114 }
2115 }
2116
2117 if let Some(dict_end_pos) = dict_end {
2118 let dict_start_abs = dict_start + 2;
2119 let dict_end_abs = dict_end_pos;
2120 let dict_content_bytes = &search_area[dict_start_abs..dict_end_abs];
2121 let dict_content = String::from_utf8_lossy(dict_content_bytes);
2122
2123 let after_dict = &search_area[dict_end_abs + 2..];
2125 if is_immediate_stream_start(after_dict) {
2126 return self.reconstruct_stream_object_bytes(
2128 obj_num,
2129 &dict_content,
2130 after_dict,
2131 );
2132 } else {
2133 return self
2135 .extract_object_manually(obj_num)
2136 .map(|dict| PdfObject::Dictionary(dict));
2137 }
2138 }
2139 }
2140 }
2141
2142 self.reader.seek(SeekFrom::Start(original_pos)).ok();
2144
2145 Err(ParseError::SyntaxError {
2146 position: 0,
2147 message: format!("Could not manually extract object {}", obj_num),
2148 })
2149 }
2150
    /// Rebuilds a stream object (`dict` + raw data) from bytes following a
    /// manually-located object header.
    ///
    /// `dict_content` is the text between `<<` and `>>`; `after_dict` is
    /// everything after the closing `>>`, expected to begin with the
    /// `stream` keyword. Only `/Filter /FlateDecode` and a direct integer
    /// `/Length` are recovered from the dictionary text.
    fn reconstruct_stream_object_bytes(
        &mut self,
        obj_num: u32,
        dict_content: &str,
        after_dict: &[u8],
    ) -> ParseResult<PdfObject> {
        use crate::parser::objects::{PdfDictionary, PdfName, PdfObject, PdfStream};
        use std::collections::HashMap;

        let mut dict = HashMap::new();

        // NOTE(review): only this exact spelling is recognized; variants
        // like "/Filter/FlateDecode" or other filters are not recovered.
        if dict_content.contains("/Filter /FlateDecode") {
            dict.insert(
                PdfName("Filter".to_string()),
                PdfObject::Name(PdfName("FlateDecode".to_string())),
            );
        }

        if let Some(length_start) = dict_content.find("/Length ") {
            let length_part = &dict_content[length_start + 8..];

            // Heuristic: treat the value as an indirect reference when an
            // " R" appears after it; the length is then unknown here and we
            // rely on the endstream scan below.
            let is_indirect_ref =
                length_part.trim().contains(" R") || length_part.trim().contains(" 0 R");

            if is_indirect_ref {
                // Indirect /Length: leave the key out of the rebuilt dict.
            } else if let Some(space_pos) = length_part.find(' ') {
                let length_str = &length_part[..space_pos];
                if let Ok(length) = length_str.parse::<i64>() {
                    dict.insert(PdfName("Length".to_string()), PdfObject::Integer(length));
                }
            } else {
                if let Ok(length) = length_part.trim().parse::<i64>() {
                    dict.insert(PdfName("Length".to_string()), PdfObject::Integer(length));
                }
            }
        } else {
            // No /Length key: the data span is bounded by "endstream" only.
        }

        if let Some(stream_start) = find_bytes(after_dict, b"stream") {
            let stream_start_pos = stream_start + 6; // skip the "stream" keyword
            // Skip the EOL after the keyword: "\n", "\r\n", or bare "\r".
            let stream_data_start = if after_dict.get(stream_start_pos) == Some(&b'\n') {
                stream_start_pos + 1
            } else if after_dict.get(stream_start_pos) == Some(&b'\r') {
                if after_dict.get(stream_start_pos + 1) == Some(&b'\n') {
                    stream_start_pos + 2
                } else {
                    stream_start_pos + 1
                }
            } else {
                stream_start_pos
            };

            // NOTE(review): this finds the FIRST "endstream"; binary stream
            // data containing those bytes would truncate the stream.
            if let Some(endstream_pos) = find_bytes(after_dict, b"endstream") {
                let mut stream_data = &after_dict[stream_data_start..endstream_pos];

                // When a direct /Length was recovered, trust it and trim any
                // trailing bytes (usually an EOL) before "endstream".
                if let Some(PdfObject::Integer(length)) = dict.get(&PdfName("Length".to_string())) {
                    let expected_length = *length as usize;
                    if stream_data.len() > expected_length {
                        stream_data = &stream_data[..expected_length];
                    } else if stream_data.len() < expected_length {
                        tracing::debug!(
                            "WARNING: Stream data ({} bytes) < Length ({} bytes)!",
                            stream_data.len(),
                            expected_length
                        );
                    }
                }

                let stream = PdfStream {
                    dict: PdfDictionary(dict),
                    data: stream_data.to_vec(),
                };

                return Ok(PdfObject::Stream(stream));
            } else {
                // "stream" without "endstream": fall through to the error.
            }
        }

        Err(ParseError::SyntaxError {
            position: 0,
            message: format!("Could not reconstruct stream for object {}", obj_num),
        })
    }
2243
2244 fn parse_resources_from_content(&self, dict_content: &str) -> ParseResult<PdfObject> {
2246 use crate::parser::objects::{PdfDictionary, PdfName, PdfObject};
2247 use std::collections::HashMap;
2248
2249 if let Some(resources_start) = dict_content.find("/Resources") {
2251 if let Some(bracket_start) = dict_content[resources_start..].find("<<") {
2253 let abs_bracket_start = resources_start + bracket_start + 2;
2254
2255 let mut bracket_count = 1;
2257 let mut end_pos = abs_bracket_start;
2258 let chars: Vec<char> = dict_content.chars().collect();
2259
2260 while end_pos < chars.len() && bracket_count > 0 {
2261 if end_pos + 1 < chars.len() {
2262 if chars[end_pos] == '<' && chars[end_pos + 1] == '<' {
2263 bracket_count += 1;
2264 end_pos += 2;
2265 continue;
2266 } else if chars[end_pos] == '>' && chars[end_pos + 1] == '>' {
2267 bracket_count -= 1;
2268 end_pos += 2;
2269 continue;
2270 }
2271 }
2272 end_pos += 1;
2273 }
2274
2275 if bracket_count == 0 {
2276 let resources_content = &dict_content[abs_bracket_start..end_pos - 2];
2277
2278 let mut resources_dict = HashMap::new();
2280
2281 if let Some(font_start) = resources_content.find("/Font") {
2283 if let Some(font_bracket) = resources_content[font_start..].find("<<") {
2284 let abs_font_start = font_start + font_bracket + 2;
2285
2286 let mut font_dict = HashMap::new();
2288
2289 let font_section = &resources_content[abs_font_start..];
2291 let mut pos = 0;
2292 while let Some(f_pos) = font_section[pos..].find("/F") {
2293 let abs_f_pos = pos + f_pos;
2294 if let Some(space_pos) = font_section[abs_f_pos..].find(" ") {
2295 let font_name = &font_section[abs_f_pos..abs_f_pos + space_pos];
2296
2297 let after_name = &font_section[abs_f_pos + space_pos..];
2299 if let Some(r_pos) = after_name.find(" R") {
2300 let ref_part = after_name[..r_pos].trim();
2301 if let Some(parts) = ref_part
2302 .split_whitespace()
2303 .collect::<Vec<&str>>()
2304 .get(0..2)
2305 {
2306 if let (Ok(obj_num), Ok(gen_num)) =
2307 (parts[0].parse::<u32>(), parts[1].parse::<u16>())
2308 {
2309 font_dict.insert(
2310 PdfName(font_name[1..].to_string()), PdfObject::Reference(obj_num, gen_num),
2312 );
2313 }
2314 }
2315 }
2316 }
2317 pos = abs_f_pos + 1;
2318 }
2319
2320 if !font_dict.is_empty() {
2321 resources_dict.insert(
2322 PdfName("Font".to_string()),
2323 PdfObject::Dictionary(PdfDictionary(font_dict)),
2324 );
2325 }
2326 }
2327 }
2328
2329 return Ok(PdfObject::Dictionary(PdfDictionary(resources_dict)));
2330 }
2331 }
2332 }
2333
2334 Err(ParseError::SyntaxError {
2335 position: 0,
2336 message: "Could not parse Resources".to_string(),
2337 })
2338 }
2339
    /// Reads the catalog object's raw bytes straight from its xref offset
    /// and parses the first `<< … >>` span found there, caching the result.
    ///
    /// NOTE(review): the end is located with a plain `find(">>")`, so a
    /// catalog containing a nested dictionary would be truncated at the
    /// inner `>>` — presumably acceptable for the recovery scenarios this
    /// dead-code path targets.
    #[allow(dead_code)]
    fn extract_catalog_directly(
        &mut self,
        obj_num: u32,
        gen_num: u16,
    ) -> ParseResult<&PdfDictionary> {
        if let Some(entry) = self.xref.get_entry(obj_num) {
            if self.reader.seek(SeekFrom::Start(entry.offset)).is_err() {
                return Err(ParseError::SyntaxError {
                    position: 0,
                    message: "Failed to seek to catalog object".to_string(),
                });
            }

            // 2 KiB is assumed sufficient to cover the catalog dictionary.
            let mut buffer = vec![0u8; 2048];
            if let Ok(bytes_read) = self.reader.read(&mut buffer) {
                let content = String::from_utf8_lossy(&buffer[..bytes_read]);
                tracing::debug!("Raw catalog content:\n{}", content);

                if let Some(dict_start) = content.find("<<") {
                    if let Some(dict_end) = content[dict_start..].find(">>") {
                        let dict_content = &content[dict_start..dict_start + dict_end + 2];
                        tracing::debug!("Found dictionary content: {}", dict_content);

                        if let Ok(dict) = self.parse_dictionary_from_string(dict_content) {
                            // Cache the parsed dictionary, then hand back a
                            // reference into the cache.
                            let key = (obj_num, gen_num);
                            self.object_cache.insert(key, PdfObject::Dictionary(dict));

                            if let Some(PdfObject::Dictionary(ref dict)) =
                                self.object_cache.get(&key)
                            {
                                return Ok(dict);
                            }
                        }
                    }
                }
            }
        }

        Err(ParseError::SyntaxError {
            position: 0,
            message: "Failed to extract catalog directly".to_string(),
        })
    }
2391
2392 #[allow(dead_code)]
2393 fn parse_dictionary_from_string(&self, dict_str: &str) -> ParseResult<PdfDictionary> {
2394 use crate::parser::lexer::{Lexer, Token};
2395
2396 let mut cursor = std::io::Cursor::new(dict_str.as_bytes());
2398 let mut lexer = Lexer::new_with_options(&mut cursor, self.options.clone());
2399
2400 match lexer.next_token()? {
2402 Token::DictStart => {
2403 let mut dict = std::collections::HashMap::new();
2404
2405 loop {
2406 let token = lexer.next_token()?;
2407 match token {
2408 Token::DictEnd => break,
2409 Token::Name(key) => {
2410 let value = PdfObject::parse_with_options(&mut lexer, &self.options)?;
2412 dict.insert(crate::parser::objects::PdfName(key), value);
2413 }
2414 _ => {
2415 return Err(ParseError::SyntaxError {
2416 position: 0,
2417 message: "Invalid dictionary format".to_string(),
2418 });
2419 }
2420 }
2421 }
2422
2423 Ok(PdfDictionary(dict))
2424 }
2425 _ => Err(ParseError::SyntaxError {
2426 position: 0,
2427 message: "Expected dictionary start".to_string(),
2428 }),
2429 }
2430 }
2431
2432 fn count_page_objects_directly(&mut self) -> Option<u32> {
2434 let mut page_count = 0;
2435
2436 for obj_num in 1..self.xref.len() as u32 {
2438 if let Ok(obj) = self.get_object(obj_num, 0) {
2439 if let Some(dict) = obj.as_dict() {
2440 if let Some(obj_type) = dict.get("Type").and_then(|t| t.as_name()) {
2441 if obj_type.0 == "Page" {
2442 page_count += 1;
2443 }
2444 }
2445 }
2446 }
2447 }
2448
2449 if page_count > 0 {
2450 Some(page_count)
2451 } else {
2452 None
2453 }
2454 }
2455
2456 pub fn metadata(&mut self) -> ParseResult<DocumentMetadata> {
2458 let mut metadata = DocumentMetadata::default();
2459
2460 if let Some(info_dict) = self.info()? {
2461 if let Some(title) = info_dict.get("Title").and_then(|o| o.as_string()) {
2462 metadata.title = title.as_str().ok().map(|s| s.to_string());
2463 }
2464 if let Some(author) = info_dict.get("Author").and_then(|o| o.as_string()) {
2465 metadata.author = author.as_str().ok().map(|s| s.to_string());
2466 }
2467 if let Some(subject) = info_dict.get("Subject").and_then(|o| o.as_string()) {
2468 metadata.subject = subject.as_str().ok().map(|s| s.to_string());
2469 }
2470 if let Some(keywords) = info_dict.get("Keywords").and_then(|o| o.as_string()) {
2471 metadata.keywords = keywords.as_str().ok().map(|s| s.to_string());
2472 }
2473 if let Some(creator) = info_dict.get("Creator").and_then(|o| o.as_string()) {
2474 metadata.creator = creator.as_str().ok().map(|s| s.to_string());
2475 }
2476 if let Some(producer) = info_dict.get("Producer").and_then(|o| o.as_string()) {
2477 metadata.producer = producer.as_str().ok().map(|s| s.to_string());
2478 }
2479 }
2480
2481 metadata.version = self.version().to_string();
2482 metadata.page_count = self.page_count().ok();
2483
2484 Ok(metadata)
2485 }
2486
2487 fn ensure_page_tree(&mut self) -> ParseResult<()> {
2489 if self.page_tree.is_none() {
2490 let page_count = self.page_count()?;
2491 self.page_tree = Some(super::page_tree::PageTree::new(page_count));
2492 }
2493 Ok(())
2494 }
2495
    /// Placeholder accessor for a single parsed page.
    ///
    /// Always fails after priming the page tree: returning `&ParsedPage`
    /// from a `&mut self` method clashes with the lazily-built cache under
    /// current borrow-checker rules, so page access lives on `PdfDocument`
    /// instead.
    pub fn get_page(&mut self, _index: u32) -> ParseResult<&super::page_tree::ParsedPage> {
        self.ensure_page_tree()?;

        Err(ParseError::SyntaxError {
            position: 0,
            message: "get_page not implemented due to borrow checker constraints. Use PdfDocument instead.".to_string(),
        })
    }
2512
2513 pub fn get_all_pages(&mut self) -> ParseResult<Vec<super::page_tree::ParsedPage>> {
2515 let page_count = self.page_count()?;
2516 let mut pages = Vec::with_capacity(page_count as usize);
2517
2518 for i in 0..page_count {
2519 let page = self.get_page(i)?.clone();
2520 pages.push(page);
2521 }
2522
2523 Ok(pages)
2524 }
2525
    /// Consumes this reader and wraps it in the higher-level `PdfDocument`
    /// API (which provides working page access).
    pub fn into_document(self) -> super::document::PdfDocument<R> {
        super::document::PdfDocument::new(self)
    }
2530
    /// Resets the stack-safe parse context to a fresh state, discarding any
    /// accumulated recursion-tracking bookkeeping.
    pub fn clear_parse_context(&mut self) {
        self.parse_context = StackSafeContext::new();
    }
2535
    /// Mutable access to the stack-safe parse context.
    pub fn parse_context_mut(&mut self) -> &mut StackSafeContext {
        &mut self.parse_context
    }
2540
    /// Whole-file heuristic scan for page objects.
    ///
    /// Walks the file line by line; for every "N 0 obj" header it checks up
    /// to the next 10 lines for a "/Type /Page" marker, stopping early at
    /// the next object header or "endobj". Returns sorted, deduplicated
    /// `(object, generation)` pairs; generation is always 0. Read errors
    /// are swallowed and yield an empty list. The reader position is
    /// restored before scanning.
    fn find_page_objects(&mut self) -> ParseResult<Vec<(u32, u16)>> {
        let original_pos = self.reader.stream_position().unwrap_or(0);

        if self.reader.seek(SeekFrom::Start(0)).is_err() {
            return Ok(vec![]);
        }

        let mut buffer = Vec::new();
        if self.reader.read_to_end(&mut buffer).is_err() {
            return Ok(vec![]);
        }

        self.reader.seek(SeekFrom::Start(original_pos)).ok();

        let content = String::from_utf8_lossy(&buffer);
        let mut page_objects = Vec::new();

        let lines: Vec<&str> = content.lines().collect();

        for (i, line) in lines.iter().enumerate() {
            if line.trim().ends_with(" 0 obj") {
                if let Some(obj_str) = line.trim().strip_suffix(" 0 obj") {
                    if let Ok(obj_num) = obj_str.parse::<u32>() {
                        // Look ahead a few lines for the page marker.
                        for j in 1..=10 {
                            if i + j < lines.len() {
                                let future_line = lines[i + j];
                                if future_line.contains("/Type /Page")
                                    && !future_line.contains("/Type /Pages")
                                {
                                    page_objects.push((obj_num, 0));
                                    break;
                                }
                                // Stop at the next object boundary.
                                if future_line.trim().ends_with(" 0 obj")
                                    || future_line.trim() == "endobj"
                                {
                                    break;
                                }
                            }
                        }
                    }
                }
            }
        }

        page_objects.sort();
        page_objects.dedup();

        Ok(page_objects)
    }
2598
2599 fn find_catalog_object(&mut self) -> ParseResult<(u32, u16)> {
2601 let obj_numbers: Vec<u32> = self.xref.entries().keys().copied().collect();
2606
2607 for obj_num in obj_numbers {
2609 if let Ok(obj) = self.get_object(obj_num, 0) {
2611 if let Some(dict) = obj.as_dict() {
2612 if let Some(type_obj) = dict.get("Type") {
2614 if let Some(type_name) = type_obj.as_name() {
2615 if type_name.0 == "Catalog" {
2616 return Ok((obj_num, 0));
2617 }
2618 if type_name.0 == "Sig"
2620 || type_name.0 == "Pages"
2621 || type_name.0 == "Page"
2622 {
2623 continue;
2624 }
2625 }
2626 }
2627 }
2628 }
2629 }
2630
2631 for obj_num in [1, 2, 3, 4, 5] {
2633 if let Ok(obj) = self.get_object(obj_num, 0) {
2634 if let Some(dict) = obj.as_dict() {
2635 if dict.contains_key("Pages") {
2637 return Ok((obj_num, 0));
2638 }
2639 }
2640 }
2641 }
2642
2643 Err(ParseError::MissingKey(
2644 "Could not find Catalog object".to_string(),
2645 ))
2646 }
2647
    /// Builds a synthetic `/Pages` tree node from candidate page
    /// references, caches it under a reserved key, and returns a reference
    /// to the cached dictionary.
    ///
    /// Candidates are validated first (explicit `/Type /Page`, or presence
    /// of `/MediaBox`/`/Contents` as a fallback); with more than 10 valid
    /// pages the work is delegated to the hierarchical builder.
    fn create_synthetic_pages_dict(
        &mut self,
        page_refs: &[(u32, u16)],
    ) -> ParseResult<&PdfDictionary> {
        use super::objects::{PdfArray, PdfName};

        // Keep only references that actually resolve to page-like objects.
        let mut valid_page_refs = Vec::new();
        for (obj_num, gen_num) in page_refs {
            if let Ok(page_obj) = self.get_object(*obj_num, *gen_num) {
                if let Some(page_dict) = page_obj.as_dict() {
                    if let Some(obj_type) = page_dict.get("Type").and_then(|t| t.as_name()) {
                        if obj_type.0 == "Page" {
                            valid_page_refs.push((*obj_num, *gen_num));
                            continue;
                        }
                    }

                    // No usable /Type, but page-typical keys are enough.
                    if page_dict.contains_key("MediaBox") || page_dict.contains_key("Contents") {
                        valid_page_refs.push((*obj_num, *gen_num));
                    }
                }
            }
        }

        if valid_page_refs.is_empty() {
            return Err(ParseError::SyntaxError {
                position: 0,
                message: "No valid page objects found for synthetic Pages tree".to_string(),
            });
        }

        // Larger documents get a two-level tree instead of one flat node.
        if valid_page_refs.len() > 10 {
            return self.create_hierarchical_pages_tree(&valid_page_refs);
        }

        let mut kids = PdfArray::new();
        for (obj_num, gen_num) in &valid_page_refs {
            kids.push(PdfObject::Reference(*obj_num, *gen_num));
        }

        let mut pages_dict = PdfDictionary::new();
        pages_dict.insert(
            "Type".to_string(),
            PdfObject::Name(PdfName("Pages".to_string())),
        );
        pages_dict.insert("Kids".to_string(), PdfObject::Array(kids));
        pages_dict.insert(
            "Count".to_string(),
            PdfObject::Integer(valid_page_refs.len() as i64),
        );

        // Inherit a MediaBox from one of the first three pages when
        // available (note: the LAST one found among them wins).
        let mut media_box = None;
        for (obj_num, gen_num) in valid_page_refs.iter().take(3) {
            if let Ok(page_obj) = self.get_object(*obj_num, *gen_num) {
                if let Some(page_dict) = page_obj.as_dict() {
                    if let Some(mb) = page_dict.get("MediaBox") {
                        media_box = Some(mb.clone());
                    }
                }
            }
        }

        if let Some(mb) = media_box {
            pages_dict.insert("MediaBox".to_string(), mb);
        } else {
            // Default to US Letter (612 x 792 points).
            let mut mb_array = PdfArray::new();
            mb_array.push(PdfObject::Integer(0));
            mb_array.push(PdfObject::Integer(0));
            mb_array.push(PdfObject::Integer(612));
            mb_array.push(PdfObject::Integer(792));
            pages_dict.insert("MediaBox".to_string(), PdfObject::Array(mb_array));
        }

        // Cache under a reserved key that cannot collide with real objects.
        let synthetic_key = (u32::MAX - 1, 0);
        self.object_cache
            .insert(synthetic_key, PdfObject::Dictionary(pages_dict));

        if let PdfObject::Dictionary(dict) = &self.object_cache[&synthetic_key] {
            Ok(dict)
        } else {
            unreachable!("Just inserted dictionary")
        }
    }
2742
2743 fn create_hierarchical_pages_tree(
2745 &mut self,
2746 page_refs: &[(u32, u16)],
2747 ) -> ParseResult<&PdfDictionary> {
2748 use super::objects::{PdfArray, PdfName};
2749
2750 const PAGES_PER_NODE: usize = 10; let chunks: Vec<&[(u32, u16)]> = page_refs.chunks(PAGES_PER_NODE).collect();
2754 let mut intermediate_nodes = Vec::new();
2755
2756 for (chunk_idx, chunk) in chunks.iter().enumerate() {
2758 let mut kids = PdfArray::new();
2759 for (obj_num, gen_num) in chunk.iter() {
2760 kids.push(PdfObject::Reference(*obj_num, *gen_num));
2761 }
2762
2763 let mut intermediate_dict = PdfDictionary::new();
2764 intermediate_dict.insert(
2765 "Type".to_string(),
2766 PdfObject::Name(PdfName("Pages".to_string())),
2767 );
2768 intermediate_dict.insert("Kids".to_string(), PdfObject::Array(kids));
2769 intermediate_dict.insert("Count".to_string(), PdfObject::Integer(chunk.len() as i64));
2770
2771 let intermediate_key = (u32::MAX - 2 - chunk_idx as u32, 0);
2773 self.object_cache
2774 .insert(intermediate_key, PdfObject::Dictionary(intermediate_dict));
2775
2776 intermediate_nodes.push(intermediate_key);
2777 }
2778
2779 let mut root_kids = PdfArray::new();
2781 for (obj_num, gen_num) in &intermediate_nodes {
2782 root_kids.push(PdfObject::Reference(*obj_num, *gen_num));
2783 }
2784
2785 let mut root_pages_dict = PdfDictionary::new();
2786 root_pages_dict.insert(
2787 "Type".to_string(),
2788 PdfObject::Name(PdfName("Pages".to_string())),
2789 );
2790 root_pages_dict.insert("Kids".to_string(), PdfObject::Array(root_kids));
2791 root_pages_dict.insert(
2792 "Count".to_string(),
2793 PdfObject::Integer(page_refs.len() as i64),
2794 );
2795
2796 if let Some((obj_num, gen_num)) = page_refs.first() {
2798 if let Ok(page_obj) = self.get_object(*obj_num, *gen_num) {
2799 if let Some(page_dict) = page_obj.as_dict() {
2800 if let Some(mb) = page_dict.get("MediaBox") {
2801 root_pages_dict.insert("MediaBox".to_string(), mb.clone());
2802 }
2803 }
2804 }
2805 }
2806
2807 let root_key = (u32::MAX - 1, 0);
2809 self.object_cache
2810 .insert(root_key, PdfObject::Dictionary(root_pages_dict));
2811
2812 if let PdfObject::Dictionary(dict) = &self.object_cache[&root_key] {
2814 Ok(dict)
2815 } else {
2816 unreachable!("Just inserted dictionary")
2817 }
2818 }
2819
2820 pub fn signatures(&mut self) -> ParseResult<Vec<crate::signatures::SignatureField>> {
2850 crate::signatures::detect_signature_fields(self).map_err(|e| ParseError::SyntaxError {
2851 position: 0,
2852 message: format!("Failed to detect signatures: {}", e),
2853 })
2854 }
2855
    /// Verify all signatures in the document against the default trust store.
    ///
    /// Convenience wrapper around
    /// [`Self::verify_signatures_with_trust_store`] using
    /// `TrustStore::default()`.
    pub fn verify_signatures(
        &mut self,
    ) -> ParseResult<Vec<crate::signatures::FullSignatureValidationResult>> {
        self.verify_signatures_with_trust_store(crate::signatures::TrustStore::default())
    }
2895
    /// Verify every signature field in the document against `trust_store`.
    ///
    /// The whole file is read into memory (signature byte ranges refer to
    /// absolute file offsets), each signature is parsed as PKCS#7, its hash
    /// and cryptographic signature are checked, and the signer certificate
    /// is validated against the supplied trust store. Per-signature failures
    /// are recorded in that field's result entry rather than aborting the
    /// whole run.
    ///
    /// # Errors
    /// Returns an error only for I/O failures while reading the document or
    /// when signature-field detection itself fails.
    pub fn verify_signatures_with_trust_store(
        &mut self,
        trust_store: crate::signatures::TrustStore,
    ) -> ParseResult<Vec<crate::signatures::FullSignatureValidationResult>> {
        use crate::signatures::{
            has_incremental_update, parse_pkcs7_signature, validate_certificate, verify_signature,
            FullSignatureValidationResult,
        };

        // Remember the current position so it can be restored afterwards.
        let original_pos = self.reader.stream_position().unwrap_or(0);
        self.reader.seek(SeekFrom::Start(0))?;

        let mut pdf_bytes = Vec::new();
        self.reader.read_to_end(&mut pdf_bytes)?;

        // Best effort: failing to restore the position is not fatal here.
        self.reader.seek(SeekFrom::Start(original_pos)).ok();

        let signature_fields = self.signatures()?;

        let mut results = Vec::new();

        for field in signature_fields {
            // Start pessimistic; individual checks fill in successes below.
            let mut result = FullSignatureValidationResult {
                field: field.clone(),
                signer_name: None,
                signing_time: None,
                hash_valid: false,
                signature_valid: false,
                certificate_result: None,
                has_modifications_after_signing: false,
                errors: Vec::new(),
                warnings: Vec::new(),
            };

            // Data appended after the signed byte range indicates the file
            // was changed after signing (incremental update).
            result.has_modifications_after_signing =
                has_incremental_update(&pdf_bytes, &field.byte_range);

            // An unparseable PKCS#7 blob is recorded as an error for this
            // field; the remaining fields are still processed.
            let parsed_sig = match parse_pkcs7_signature(&field.contents) {
                Ok(sig) => sig,
                Err(e) => {
                    result
                        .errors
                        .push(format!("Failed to parse signature: {}", e));
                    results.push(result);
                    continue;
                }
            };

            result.signing_time = parsed_sig.signing_time.clone();
            result.signer_name = parsed_sig.signer_common_name().ok();

            // Check the document digest and the cryptographic signature.
            match verify_signature(&pdf_bytes, &parsed_sig, &field.byte_range) {
                Ok(verification) => {
                    result.hash_valid = verification.hash_valid;
                    result.signature_valid = verification.signature_valid;
                    if let Some(details) = verification.details {
                        result.warnings.push(details);
                    }
                }
                Err(e) => {
                    result
                        .errors
                        .push(format!("Signature verification failed: {}", e));
                }
            }

            // Certificate problems are reported as warnings, not errors.
            match validate_certificate(&parsed_sig.signer_certificate_der, &trust_store) {
                Ok(cert_result) => {
                    result.certificate_result = Some(cert_result);
                }
                Err(e) => {
                    result
                        .warnings
                        .push(format!("Certificate validation failed: {}", e));
                }
            }

            results.push(result);
        }

        Ok(results)
    }
3017}
3018
/// Document information assembled from the PDF /Info dictionary and the
/// file header. All /Info-derived fields are optional because the
/// dictionary itself and each of its entries are optional in a PDF.
#[derive(Debug, Default, Clone)]
pub struct DocumentMetadata {
    /// /Title entry, when present.
    pub title: Option<String>,
    /// /Author entry, when present.
    pub author: Option<String>,
    /// /Subject entry, when present.
    pub subject: Option<String>,
    /// /Keywords entry, when present.
    pub keywords: Option<String>,
    /// /Creator entry (originating application), when present.
    pub creator: Option<String>,
    /// /Producer entry (converting application), when present.
    pub producer: Option<String>,
    /// /CreationDate entry as its raw string form, when present.
    pub creation_date: Option<String>,
    /// /ModDate entry as its raw string form, when present.
    pub modification_date: Option<String>,
    /// PDF version taken from the file header (e.g. "1.4").
    pub version: String,
    /// Number of pages, when it could be determined.
    pub page_count: Option<u32>,
}
3033
/// Iterator over lines of a string using PDF end-of-line conventions:
/// a line may be terminated by "\r\n", "\n", or a lone "\r". The yielded
/// slices do not include the terminator, and a trailing terminator does
/// not produce an extra empty line.
pub struct EOLIter<'s> {
    // Unconsumed tail of the input string.
    remainder: &'s str,
}
impl<'s> Iterator for EOLIter<'s> {
    type Item = &'s str;

    fn next(&mut self) -> Option<Self::Item> {
        // Exhausted once everything has been consumed.
        if self.remainder.is_empty() {
            return None;
        }

        // Scan for the first EOL byte. ASCII CR/LF bytes are always char
        // boundaries in UTF-8, so byte-index slicing below is safe.
        let bytes = self.remainder.as_bytes();
        let mut line_end = self.remainder.len();
        let mut sep_len = 0;
        for (i, &b) in bytes.iter().enumerate() {
            match b {
                b'\n' => {
                    line_end = i;
                    sep_len = 1;
                    break;
                }
                b'\r' => {
                    line_end = i;
                    // CRLF counts as a single two-byte separator.
                    sep_len = if bytes.get(i + 1) == Some(&b'\n') { 2 } else { 1 };
                    break;
                }
                _ => {}
            }
        }

        let line = &self.remainder[..line_end];
        self.remainder = &self.remainder[line_end + sep_len..];
        Some(line)
    }
}
/// Extension trait adding PDF-style line iteration to string-like types.
pub trait PDFLines: AsRef<str> {
    /// Iterate over lines split on any PDF end-of-line marker
    /// ("\r\n", "\n", or "\r").
    fn pdf_lines(&self) -> EOLIter<'_> {
        EOLIter {
            remainder: self.as_ref(),
        }
    }
}
impl PDFLines for &str {}
impl<'a> PDFLines for std::borrow::Cow<'a, str> {}
impl PDFLines for String {}
3070
3071#[cfg(test)]
3072mod tests {
3073
3074 use super::*;
3075 use crate::parser::objects::{PdfName, PdfString};
3076 use crate::parser::test_helpers::*;
3077 use crate::parser::ParseOptions;
3078 use std::io::Cursor;
3079
    // Reader construction succeeds on a well-formed minimal PDF.
    #[test]
    fn test_reader_construction() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let result = PdfReader::new(cursor);
        assert!(result.is_ok());
    }

    // The minimal fixture's header version is parsed as PDF 1.4.
    #[test]
    fn test_reader_version() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let reader = PdfReader::new(cursor).unwrap();
        assert_eq!(reader.version().major, 1);
        assert_eq!(reader.version().minor, 4);
    }

    // Every supported header version round-trips through the parser.
    #[test]
    fn test_reader_different_versions() {
        let versions = vec![
            "1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7", "2.0",
        ];

        for version in versions {
            let pdf_data = create_pdf_with_version(version);
            let cursor = Cursor::new(pdf_data);
            let reader = PdfReader::new(cursor).unwrap();

            let parts: Vec<&str> = version.split('.').collect();
            assert_eq!(reader.version().major, parts[0].parse::<u8>().unwrap());
            assert_eq!(reader.version().minor, parts[1].parse::<u8>().unwrap());
        }
    }
3113
    // catalog() returns the dictionary with /Type /Catalog.
    #[test]
    fn test_reader_catalog() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let catalog = reader.catalog();
        assert!(catalog.is_ok());

        let catalog_dict = catalog.unwrap();
        assert_eq!(
            catalog_dict.get("Type"),
            Some(&PdfObject::Name(PdfName("Catalog".to_string())))
        );
    }

    // A minimal PDF without an /Info dictionary reports None.
    #[test]
    fn test_reader_info_none() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let info = reader.info().unwrap();
        assert!(info.is_none());
    }

    // A fixture with an /Info dictionary exposes its string entries.
    #[test]
    fn test_reader_info_present() {
        let pdf_data = create_pdf_with_info();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let info = reader.info().unwrap();
        assert!(info.is_some());

        let info_dict = info.unwrap();
        assert_eq!(
            info_dict.get("Title"),
            Some(&PdfObject::String(PdfString(
                "Test PDF".to_string().into_bytes()
            )))
        );
        assert_eq!(
            info_dict.get("Author"),
            Some(&PdfObject::String(PdfString(
                "Test Author".to_string().into_bytes()
            )))
        );
    }
3163
    // Fetching an existing object yields a dictionary for the catalog.
    #[test]
    fn test_reader_get_object() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let obj = reader.get_object(1, 0);
        assert!(obj.is_ok());

        let catalog = obj.unwrap();
        assert!(catalog.as_dict().is_some());
    }

    // An object number that is not in the xref table is an error.
    #[test]
    fn test_reader_get_invalid_object() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let obj = reader.get_object(999, 0);
        assert!(obj.is_err());
    }

    // Object 0 gen 65535 (head of the free list) resolves to Null.
    #[test]
    fn test_reader_get_free_object() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let obj = reader.get_object(0, 65535);
        assert!(obj.is_ok());
        assert_eq!(obj.unwrap(), &PdfObject::Null);
    }
3200
    // resolve() follows an indirect reference to the target object.
    #[test]
    fn test_reader_resolve_reference() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let ref_obj = PdfObject::Reference(1, 0);
        let resolved = reader.resolve(&ref_obj);

        assert!(resolved.is_ok());
        assert!(resolved.unwrap().as_dict().is_some());
    }

    // resolve() on a direct (non-reference) object returns it unchanged.
    #[test]
    fn test_reader_resolve_non_reference() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let int_obj = PdfObject::Integer(42);
        let resolved = reader.resolve(&int_obj).unwrap();

        assert_eq!(resolved, &PdfObject::Integer(42));
    }

    // Fetching the same object twice succeeds (second hit served from cache).
    #[test]
    fn test_reader_cache_behavior() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let obj1 = reader.get_object(1, 0).unwrap();
        assert!(obj1.as_dict().is_some());

        let obj2 = reader.get_object(1, 0).unwrap();
        assert!(obj2.as_dict().is_some());
    }

    // A mismatched generation number is rejected.
    #[test]
    fn test_reader_wrong_generation() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let obj = reader.get_object(1, 99);
        assert!(obj.is_err());
    }
3253
    // Non-PDF bytes are rejected at construction time.
    #[test]
    fn test_reader_invalid_pdf() {
        let invalid_data = b"This is not a PDF file";
        let cursor = Cursor::new(invalid_data.to_vec());
        let result = PdfReader::new(cursor);

        assert!(result.is_err());
    }

    // A garbage xref section makes construction fail.
    #[test]
    fn test_reader_corrupt_xref() {
        let corrupt_pdf = b"%PDF-1.4
1 0 obj
<< /Type /Catalog >>
endobj
xref
corrupted xref table
trailer
<< /Size 2 /Root 1 0 R >>
startxref
24
%%EOF"
        .to_vec();

        let cursor = Cursor::new(corrupt_pdf);
        let result = PdfReader::new(cursor);
        assert!(result.is_err());
    }

    // A file whose trailer keyword is missing entirely fails to parse.
    #[test]
    fn test_reader_missing_trailer() {
        let pdf_no_trailer = b"%PDF-1.4
1 0 obj
<< /Type /Catalog >>
endobj
xref
0 2
0000000000 65535 f
0000000009 00000 n
startxref
24
%%EOF"
        .to_vec();

        let cursor = Cursor::new(pdf_no_trailer);
        let result = PdfReader::new(cursor);
        assert!(result.is_err());
    }

    // Zero-length input is rejected.
    #[test]
    fn test_reader_empty_pdf() {
        let cursor = Cursor::new(Vec::new());
        let result = PdfReader::new(cursor);
        assert!(result.is_err());
    }
3313
    // The minimal fixture apparently carries no countable pages; page_count
    // succeeds and reports zero.
    #[test]
    fn test_reader_page_count() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let count = reader.page_count();
        assert!(count.is_ok());
        assert_eq!(count.unwrap(), 0);
    }

    // Converting the reader into a document keeps page access working.
    #[test]
    fn test_reader_into_document() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let reader = PdfReader::new(cursor).unwrap();

        let document = reader.into_document();
        let page_count = document.page_count();
        assert!(page_count.is_ok());
    }

    // pages() returns the dictionary with /Type /Pages.
    #[test]
    fn test_reader_pages_dict() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let pages = reader.pages();
        assert!(pages.is_ok());
        let pages_dict = pages.unwrap();
        assert_eq!(
            pages_dict.get("Type"),
            Some(&PdfObject::Name(PdfName("Pages".to_string())))
        );
    }

    // A binary-comment marker after the header does not break parsing.
    #[test]
    fn test_reader_pdf_with_binary_data() {
        let pdf_data = create_pdf_with_binary_marker();

        let cursor = Cursor::new(pdf_data);
        let result = PdfReader::new(cursor);
        assert!(result.is_ok());
    }
3360
    // metadata() maps /Info entries and the header version into
    // DocumentMetadata.
    #[test]
    fn test_reader_metadata() {
        let pdf_data = create_pdf_with_info();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let metadata = reader.metadata().unwrap();
        assert_eq!(metadata.title, Some("Test PDF".to_string()));
        assert_eq!(metadata.author, Some("Test Author".to_string()));
        assert_eq!(metadata.subject, Some("Testing".to_string()));
        assert_eq!(metadata.version, "1.4".to_string());
    }

    // Without /Info, the optional fields stay None but version and page
    // count are still populated.
    #[test]
    fn test_reader_metadata_empty() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let metadata = reader.metadata().unwrap();
        assert!(metadata.title.is_none());
        assert!(metadata.author.is_none());
        assert_eq!(metadata.version, "1.4".to_string());
        assert_eq!(metadata.page_count, Some(0));
    }

    // Both a wrong generation and a missing object number are rejected.
    #[test]
    fn test_reader_object_number_mismatch() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let result = reader.get_object(1, 99);
        assert!(result.is_err());

        let result2 = reader.get_object(999, 0);
        assert!(result2.is_err());
    }
3405
    // A fully populated DocumentMetadata keeps every field as assigned.
    #[test]
    fn test_document_metadata_struct() {
        let metadata = DocumentMetadata {
            title: Some("Title".to_string()),
            author: Some("Author".to_string()),
            subject: Some("Subject".to_string()),
            keywords: Some("Keywords".to_string()),
            creator: Some("Creator".to_string()),
            producer: Some("Producer".to_string()),
            creation_date: Some("D:20240101".to_string()),
            modification_date: Some("D:20240102".to_string()),
            version: "1.5".to_string(),
            page_count: Some(10),
        };

        assert_eq!(metadata.title, Some("Title".to_string()));
        assert_eq!(metadata.page_count, Some(10));
    }

    // Default yields all-None optionals and an empty version string.
    #[test]
    fn test_document_metadata_default() {
        let metadata = DocumentMetadata::default();
        assert!(metadata.title.is_none());
        assert!(metadata.author.is_none());
        assert!(metadata.subject.is_none());
        assert!(metadata.keywords.is_none());
        assert!(metadata.creator.is_none());
        assert!(metadata.producer.is_none());
        assert!(metadata.creation_date.is_none());
        assert!(metadata.modification_date.is_none());
        assert_eq!(metadata.version, "".to_string());
        assert!(metadata.page_count.is_none());
    }
3439
3440 #[test]
3441 fn test_document_metadata_clone() {
3442 let metadata = DocumentMetadata {
3443 title: Some("Test".to_string()),
3444 version: "1.4".to_string(),
3445 ..Default::default()
3446 };
3447
3448 let cloned = metadata;
3449 assert_eq!(cloned.title, Some("Test".to_string()));
3450 assert_eq!(cloned.version, "1.4".to_string());
3451 }
3452
    // A trailer missing the required /Root key fails validation.
    #[test]
    fn test_reader_trailer_validation_error() {
        let bad_pdf = b"%PDF-1.4
1 0 obj
<< /Type /Catalog >>
endobj
xref
0 2
0000000000 65535 f
0000000009 00000 n
trailer
<< /Size 2 >>
startxref
46
%%EOF"
        .to_vec();

        let cursor = Cursor::new(bad_pdf);
        let result = PdfReader::new(cursor);
        assert!(result.is_err());
    }

    // Construction works with customized (lenient) parse options.
    #[test]
    fn test_reader_with_options() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut options = ParseOptions::default();
        options.lenient_streams = true;
        options.max_recovery_bytes = 2000;
        options.collect_warnings = true;

        let reader = PdfReader::new_with_options(cursor, options);
        assert!(reader.is_ok());
    }
3490
    // A stream whose /Length understates the real data is rejected in strict
    // mode; this fixture also fails in lenient mode (its xref offsets do not
    // line up with the object positions).
    #[test]
    fn test_lenient_stream_parsing() {
        let pdf_data = b"%PDF-1.4
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R >>
endobj
4 0 obj
<< /Length 10 >>
stream
This is a longer stream than 10 bytes
endstream
endobj
xref
0 5
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000116 00000 n
0000000219 00000 n
trailer
<< /Size 5 /Root 1 0 R >>
startxref
299
%%EOF"
        .to_vec();

        let cursor = Cursor::new(pdf_data.clone());
        let strict_options = ParseOptions::strict();
        let strict_reader = PdfReader::new_with_options(cursor, strict_options);
        assert!(strict_reader.is_err());

        let cursor = Cursor::new(pdf_data);
        let mut options = ParseOptions::default();
        options.lenient_streams = true;
        options.max_recovery_bytes = 1000;
        options.collect_warnings = false;
        let lenient_reader = PdfReader::new_with_options(cursor, options);
        assert!(lenient_reader.is_err());
    }

    // Defaults: strict streams, 1000 recovery bytes, no warning collection.
    #[test]
    fn test_parse_options_default() {
        let options = ParseOptions::default();
        assert!(!options.lenient_streams);
        assert_eq!(options.max_recovery_bytes, 1000);
        assert!(!options.collect_warnings);
    }
3548
3549 #[test]
3550 fn test_parse_options_clone() {
3551 let mut options = ParseOptions::default();
3552 options.lenient_streams = true;
3553 options.max_recovery_bytes = 2000;
3554 options.collect_warnings = true;
3555 let cloned = options;
3556 assert!(cloned.lenient_streams);
3557 assert_eq!(cloned.max_recovery_bytes, 2000);
3558 assert!(cloned.collect_warnings);
3559 }
3560
    // Builds a minimal standard-security-handler /Encrypt dictionary
    // (V1/R2, RC4-era fields) for tests that need one directly.
    #[allow(dead_code)]
    fn create_encrypted_pdf_dict() -> PdfDictionary {
        let mut dict = PdfDictionary::new();
        dict.insert(
            "Filter".to_string(),
            PdfObject::Name(PdfName("Standard".to_string())),
        );
        dict.insert("V".to_string(), PdfObject::Integer(1));
        dict.insert("R".to_string(), PdfObject::Integer(2));
        dict.insert("O".to_string(), PdfObject::String(PdfString(vec![0u8; 32])));
        dict.insert("U".to_string(), PdfObject::String(PdfString(vec![0u8; 32])));
        dict.insert("P".to_string(), PdfObject::Integer(-4));
        dict
    }

    // A small PDF whose trailer references an /Encrypt dictionary, used to
    // exercise the encrypted-document code paths.
    fn create_pdf_with_encryption() -> Vec<u8> {
        b"%PDF-1.4
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >>
endobj
4 0 obj
<< /Filter /Standard /V 1 /R 2 /O (32 bytes of owner password hash data) /U (32 bytes of user password hash data) /P -4 >>
endobj
xref
0 5
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000116 00000 n
0000000201 00000 n
trailer
<< /Size 5 /Root 1 0 R /Encrypt 4 0 R /ID [(file id)] >>
startxref
295
%%EOF"
        .to_vec()
    }
3607
    // Unencrypted files report unencrypted/unlocked; the encrypted fixture
    // fails at construction (see test_reader_encryption_error_handling).
    #[test]
    fn test_reader_encryption_detection() {
        let unencrypted_pdf = create_minimal_pdf();
        let cursor = Cursor::new(unencrypted_pdf);
        let reader = PdfReader::new(cursor).unwrap();
        assert!(!reader.is_encrypted());
        assert!(reader.is_unlocked());

        let encrypted_pdf = create_pdf_with_encryption();
        let cursor = Cursor::new(encrypted_pdf);
        let result = PdfReader::new(cursor);
        assert!(result.is_err());
    }

    // On an unencrypted file, every encryption accessor is a no-op success.
    #[test]
    fn test_reader_encryption_methods_unencrypted() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        assert!(!reader.is_encrypted());
        assert!(reader.is_unlocked());
        assert!(reader.encryption_handler().is_none());
        assert!(reader.encryption_handler_mut().is_none());

        assert!(reader.unlock_with_password("any_password").unwrap());
        assert!(reader.try_empty_password().unwrap());
    }

    // Handler accessors return None when there is no encryption.
    #[test]
    fn test_reader_encryption_handler_access() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        assert!(reader.encryption_handler().is_none());
        assert!(reader.encryption_handler_mut().is_none());

        assert!(!reader.is_encrypted());
        assert!(reader.is_unlocked());
    }

    // Repeated unlock attempts on an unencrypted file all succeed.
    #[test]
    fn test_reader_multiple_password_attempts() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let passwords = vec!["test1", "test2", "admin", "", "password"];
        for password in passwords {
            assert!(reader.unlock_with_password(password).unwrap());
        }

        for _ in 0..5 {
            assert!(reader.try_empty_password().unwrap());
        }
    }

    // Unlock attempts never flip the encryption state of a plain file.
    #[test]
    fn test_reader_encryption_state_consistency() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        assert!(!reader.is_encrypted());
        assert!(reader.is_unlocked());
        assert!(reader.encryption_handler().is_none());

        let _ = reader.unlock_with_password("test");
        assert!(!reader.is_encrypted());
        assert!(reader.is_unlocked());
        assert!(reader.encryption_handler().is_none());

        let _ = reader.try_empty_password();
        assert!(!reader.is_encrypted());
        assert!(reader.is_unlocked());
        assert!(reader.encryption_handler().is_none());
    }
3697
    // Opening the encrypted fixture without a password must fail; any error
    // variant is acceptable, success is not.
    #[test]
    fn test_reader_encryption_error_handling() {
        let encrypted_pdf = create_pdf_with_encryption();
        let cursor = Cursor::new(encrypted_pdf);

        let result = PdfReader::new(cursor);
        match result {
            Err(ParseError::EncryptionNotSupported) => {}
            Err(_) => {}
            Ok(_) => {
                panic!("Should not successfully create reader for encrypted PDF without password");
            }
        }
    }

    // Encryption state is consistent under both strict and lenient options.
    #[test]
    fn test_reader_encryption_with_options() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);

        let strict_options = ParseOptions::strict();
        let strict_reader = PdfReader::new_with_options(cursor, strict_options).unwrap();
        assert!(!strict_reader.is_encrypted());
        assert!(strict_reader.is_unlocked());

        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let lenient_options = ParseOptions::lenient();
        let lenient_reader = PdfReader::new_with_options(cursor, lenient_options).unwrap();
        assert!(!lenient_reader.is_encrypted());
        assert!(lenient_reader.is_unlocked());
    }

    // Edge-case password strings (empty, long, unicode, control chars) are
    // all accepted on an unencrypted file.
    #[test]
    fn test_reader_encryption_integration_edge_cases() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        assert!(reader.unlock_with_password("").unwrap());
        assert!(reader.unlock_with_password(" ").unwrap());
        assert!(reader
            .unlock_with_password("very_long_password_that_exceeds_normal_length")
            .unwrap());
        assert!(reader.unlock_with_password("unicode_test_ñáéíóú").unwrap());

        assert!(reader.unlock_with_password("pass@#$%^&*()").unwrap());
        assert!(reader.unlock_with_password("pass\nwith\nnewlines").unwrap());
        assert!(reader.unlock_with_password("pass\twith\ttabs").unwrap());
    }
3757
    // Stricter edge-case coverage for construction and object access.
    mod rigorous {
        use super::*;

        // Non-PDF bytes must be rejected.
        #[test]
        fn test_reader_invalid_pdf_header() {
            let invalid_data = b"This is not a PDF file";
            let cursor = Cursor::new(invalid_data.to_vec());
            let result = PdfReader::new(cursor);

            assert!(result.is_err(), "Should fail on invalid PDF header");
        }

        // A header cut short before the version must be rejected.
        #[test]
        fn test_reader_truncated_header() {
            let truncated = b"%PDF";
            let cursor = Cursor::new(truncated.to_vec());
            let result = PdfReader::new(cursor);

            assert!(result.is_err(), "Should fail on truncated header");
        }

        // An empty input must be rejected.
        #[test]
        fn test_reader_empty_file() {
            let empty = Vec::new();
            let cursor = Cursor::new(empty);
            let result = PdfReader::new(cursor);

            assert!(result.is_err(), "Should fail on empty file");
        }

        // A malformed version may be accepted or rejected; if accepted, the
        // version accessor must still be callable without panicking.
        #[test]
        fn test_reader_malformed_version() {
            let malformed = b"%PDF-X.Y\n%%\xE2\xE3\xCF\xD3\n";
            let cursor = Cursor::new(malformed.to_vec());
            let result = PdfReader::new(cursor);

            if let Ok(reader) = result {
                let _version = reader.version();
            }
        }
3807
        // A missing object number is an error, never a silent Null.
        #[test]
        fn test_reader_get_nonexistent_object() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);
            let mut reader = PdfReader::new(cursor).unwrap();

            let result = reader.get_object(999, 0);

            assert!(result.is_err(), "Should fail when object doesn't exist");
        }

        // A wrong generation number may error; this only checks it does not
        // panic.
        #[test]
        fn test_reader_get_object_wrong_generation() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);
            let mut reader = PdfReader::new(cursor).unwrap();

            let result = reader.get_object(1, 99);

            if let Err(e) = result {
                let _ = e;
            }
        }

        // resolve() returns a direct object unchanged.
        #[test]
        fn test_resolve_direct_object() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);
            let mut reader = PdfReader::new(cursor).unwrap();

            let direct_obj = PdfObject::Integer(42);

            let resolved = reader.resolve(&direct_obj).unwrap();

            assert_eq!(resolved, &PdfObject::Integer(42));
        }
3854
        // Following the catalog's /Pages reference lands on a /Pages dict.
        #[test]
        fn test_resolve_reference() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);
            let mut reader = PdfReader::new(cursor).unwrap();

            // Copy the reference out first so the catalog borrow ends
            // before resolve() takes &mut self again.
            let pages_ref = {
                let catalog = reader.catalog().unwrap();
                if let Some(PdfObject::Reference(obj_num, gen_num)) = catalog.get("Pages") {
                    PdfObject::Reference(*obj_num, *gen_num)
                } else {
                    panic!("Catalog /Pages must be a Reference");
                }
            };

            let resolved = reader.resolve(&pages_ref).unwrap();

            if let PdfObject::Dictionary(dict) = resolved {
                assert_eq!(
                    dict.get("Type"),
                    Some(&PdfObject::Name(PdfName("Pages".to_string())))
                );
            } else {
                panic!("Expected dictionary, got: {:?}", resolved);
            }
        }

        // An unencrypted file never reports as encrypted.
        #[test]
        fn test_is_encrypted_on_unencrypted() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);
            let reader = PdfReader::new(cursor).unwrap();

            assert!(
                !reader.is_encrypted(),
                "Minimal PDF should not be encrypted"
            );
        }

        // An unencrypted file is always considered unlocked.
        #[test]
        fn test_is_unlocked_on_unencrypted() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);
            let reader = PdfReader::new(cursor).unwrap();

            assert!(reader.is_unlocked(), "Unencrypted PDF should be unlocked");
        }

        // try_empty_password is a harmless success on plain files.
        #[test]
        fn test_try_empty_password_on_unencrypted() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);
            let mut reader = PdfReader::new(cursor).unwrap();

            let result = reader.try_empty_password();
            assert!(result.is_ok());
        }
3921
        // A well-formed file parses even under strict options.
        #[test]
        fn test_reader_with_strict_options() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);

            let options = ParseOptions::strict();
            let result = PdfReader::new_with_options(cursor, options);

            assert!(result.is_ok(), "Minimal PDF should parse in strict mode");
        }

        // ...and under lenient options.
        #[test]
        fn test_reader_with_lenient_options() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);

            let options = ParseOptions::lenient();
            let result = PdfReader::new_with_options(cursor, options);

            assert!(result.is_ok(), "Minimal PDF should parse in lenient mode");
        }

        // options() exposes the options the reader was built with.
        #[test]
        fn test_reader_options_accessible() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);

            let options = ParseOptions::lenient();
            let reader = PdfReader::new_with_options(cursor, options.clone()).unwrap();

            let reader_options = reader.options();
            assert_eq!(reader_options.strict_mode, options.strict_mode);
        }

        // The catalog carries the mandatory /Type and /Pages entries.
        #[test]
        fn test_catalog_has_required_fields() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);
            let mut reader = PdfReader::new(cursor).unwrap();

            let catalog = reader.catalog().unwrap();

            assert_eq!(
                catalog.get("Type"),
                Some(&PdfObject::Name(PdfName("Catalog".to_string()))),
                "Catalog must have /Type /Catalog"
            );

            assert!(
                catalog.contains_key("Pages"),
                "Catalog must have /Pages entry"
            );
        }
3986
3987 #[test]
3988 fn test_info_fields_when_present() {
3989 let pdf_data = create_pdf_with_info();
3990 let cursor = Cursor::new(pdf_data);
3991 let mut reader = PdfReader::new(cursor).unwrap();
3992
3993 let info = reader.info().unwrap();
3994 assert!(info.is_some(), "PDF should have Info dictionary");
3995
3996 let info_dict = info.unwrap();
3997
3998 assert!(info_dict.contains_key("Title"), "Info should have Title");
4000 assert!(info_dict.contains_key("Author"), "Info should have Author");
4001 }
4002
4003 #[test]
4004 fn test_info_none_when_absent() {
4005 let pdf_data = create_minimal_pdf();
4006 let cursor = Cursor::new(pdf_data);
4007 let mut reader = PdfReader::new(cursor).unwrap();
4008
4009 let info = reader.info().unwrap();
4010 assert!(info.is_none(), "Minimal PDF should not have Info");
4011 }
4012
4013 #[test]
4018 fn test_version_exact_values() {
4019 let pdf_data = create_pdf_with_version("1.7");
4020 let cursor = Cursor::new(pdf_data);
4021 let reader = PdfReader::new(cursor).unwrap();
4022
4023 let version = reader.version();
4024 assert_eq!(version.major, 1, "Major version must be exact");
4025 assert_eq!(version.minor, 7, "Minor version must be exact");
4026 }
4027
4028 #[test]
4029 fn test_version_pdf_20() {
4030 let pdf_data = create_pdf_with_version("2.0");
4031 let cursor = Cursor::new(pdf_data);
4032 let reader = PdfReader::new(cursor).unwrap();
4033
4034 let version = reader.version();
4035 assert_eq!(version.major, 2, "PDF 2.0 major version");
4036 assert_eq!(version.minor, 0, "PDF 2.0 minor version");
4037 }
4038
4039 #[test]
4044 fn test_pages_returns_pages_dict() {
4045 let pdf_data = create_minimal_pdf();
4046 let cursor = Cursor::new(pdf_data);
4047 let mut reader = PdfReader::new(cursor).unwrap();
4048
4049 let pages_dict = reader
4050 .pages()
4051 .expect("pages() must return Pages dictionary");
4052
4053 assert_eq!(
4054 pages_dict.get("Type"),
4055 Some(&PdfObject::Name(PdfName("Pages".to_string()))),
4056 "Pages dict must have /Type /Pages"
4057 );
4058 }
4059
4060 #[test]
4061 fn test_page_count_minimal_pdf() {
4062 let pdf_data = create_minimal_pdf();
4063 let cursor = Cursor::new(pdf_data);
4064 let mut reader = PdfReader::new(cursor).unwrap();
4065
4066 let count = reader.page_count().expect("page_count() must succeed");
4067 assert_eq!(count, 0, "Minimal PDF has 0 pages");
4068 }
4069
4070 #[test]
4071 fn test_page_count_with_info_pdf() {
4072 let pdf_data = create_pdf_with_info();
4073 let cursor = Cursor::new(pdf_data);
4074 let mut reader = PdfReader::new(cursor).unwrap();
4075
4076 let count = reader.page_count().expect("page_count() must succeed");
4077 assert_eq!(count, 0, "create_pdf_with_info() has Count 0 in Pages dict");
4078 }
4079
4080 #[test]
4085 fn test_metadata_minimal_pdf() {
4086 let pdf_data = create_minimal_pdf();
4087 let cursor = Cursor::new(pdf_data);
4088 let mut reader = PdfReader::new(cursor).unwrap();
4089
4090 let meta = reader.metadata().expect("metadata() must succeed");
4091
4092 assert!(meta.title.is_none(), "Minimal PDF has no title");
4094 assert!(meta.author.is_none(), "Minimal PDF has no author");
4095 }
4096
4097 #[test]
4098 fn test_metadata_with_info() {
4099 let pdf_data = create_pdf_with_info();
4100 let cursor = Cursor::new(pdf_data);
4101 let mut reader = PdfReader::new(cursor).unwrap();
4102
4103 let meta = reader.metadata().expect("metadata() must succeed");
4104
4105 assert!(meta.title.is_some(), "PDF with Info has title");
4106 assert_eq!(meta.title.unwrap(), "Test PDF", "Title must match");
4107 assert!(meta.author.is_some(), "PDF with Info has author");
4108 assert_eq!(meta.author.unwrap(), "Test Author", "Author must match");
4109 }
4110
4111 #[test]
4116 fn test_resolve_stream_length_direct_integer() {
4117 let pdf_data = create_minimal_pdf();
4118 let cursor = Cursor::new(pdf_data);
4119 let mut reader = PdfReader::new(cursor).unwrap();
4120
4121 let length_obj = PdfObject::Integer(100);
4123
4124 let length = reader
4125 .resolve_stream_length(&length_obj)
4126 .expect("resolve_stream_length must succeed");
4127 assert_eq!(length, Some(100), "Direct integer must be resolved");
4128 }
4129
4130 #[test]
4131 fn test_resolve_stream_length_negative_integer() {
4132 let pdf_data = create_minimal_pdf();
4133 let cursor = Cursor::new(pdf_data);
4134 let mut reader = PdfReader::new(cursor).unwrap();
4135
4136 let length_obj = PdfObject::Integer(-10);
4138
4139 let length = reader
4140 .resolve_stream_length(&length_obj)
4141 .expect("resolve_stream_length must succeed");
4142 assert_eq!(length, None, "Negative integer returns None");
4143 }
4144
4145 #[test]
4146 fn test_resolve_stream_length_non_integer() {
4147 let pdf_data = create_minimal_pdf();
4148 let cursor = Cursor::new(pdf_data);
4149 let mut reader = PdfReader::new(cursor).unwrap();
4150
4151 let name_obj = PdfObject::Name(PdfName("Test".to_string()));
4153
4154 let length = reader
4155 .resolve_stream_length(&name_obj)
4156 .expect("resolve_stream_length must succeed");
4157 assert_eq!(length, None, "Non-integer object returns None");
4158 }
4159
4160 #[test]
4165 fn test_get_all_pages_empty_pdf() {
4166 let pdf_data = create_minimal_pdf();
4167 let cursor = Cursor::new(pdf_data);
4168 let mut reader = PdfReader::new(cursor).unwrap();
4169
4170 let pages = reader
4171 .get_all_pages()
4172 .expect("get_all_pages() must succeed");
4173 assert_eq!(pages.len(), 0, "Minimal PDF has 0 pages");
4174 }
4175
4176 #[test]
4177 fn test_get_all_pages_with_info() {
4178 let pdf_data = create_pdf_with_info();
4179 let cursor = Cursor::new(pdf_data);
4180 let mut reader = PdfReader::new(cursor).unwrap();
4181
4182 let pages = reader
4183 .get_all_pages()
4184 .expect("get_all_pages() must succeed");
4185 assert_eq!(
4186 pages.len(),
4187 0,
4188 "create_pdf_with_info() has 0 pages (Count 0)"
4189 );
4190 }
4191
4192 #[test]
4197 fn test_into_document_consumes_reader() {
4198 let pdf_data = create_minimal_pdf();
4199 let cursor = Cursor::new(pdf_data);
4200 let reader = PdfReader::new(cursor).unwrap();
4201
4202 let document = reader.into_document();
4203
4204 let version = document.version().expect("Document must have version");
4206 assert!(
4207 version.starts_with("1."),
4208 "Document must have PDF 1.x version, got: {}",
4209 version
4210 );
4211
4212 let page_count = document
4214 .page_count()
4215 .expect("Document must allow page_count()");
4216 assert_eq!(
4217 page_count, 0,
4218 "Minimal PDF has 0 pages (Count 0 in test helper)"
4219 );
4220 }
4221
4222 #[test]
4227 fn test_clear_parse_context() {
4228 let pdf_data = create_minimal_pdf();
4229 let cursor = Cursor::new(pdf_data);
4230 let mut reader = PdfReader::new(cursor).unwrap();
4231
4232 reader.clear_parse_context();
4234
4235 let version = reader.version();
4237 assert_eq!(version.major, 1, "Reader must still work after clear");
4238 }
4239
4240 #[test]
4241 fn test_parse_context_mut_accessible() {
4242 let pdf_data = create_minimal_pdf();
4243 let cursor = Cursor::new(pdf_data);
4244 let mut reader = PdfReader::new(cursor).unwrap();
4245
4246 let context = reader.parse_context_mut();
4247
4248 let initial_depth = context.depth;
4250 assert_eq!(initial_depth, 0, "Parse context must start with depth 0");
4251
4252 assert!(
4254 context.max_depth > 0,
4255 "Parse context must have positive max_depth"
4256 );
4257 }
4258
4259 #[test]
4264 fn test_find_bytes_basic() {
4265 let haystack = b"Hello World";
4266 let needle = b"World";
4267 let pos = find_bytes(haystack, needle);
4268 assert_eq!(pos, Some(6), "Must find 'World' at position 6");
4269 }
4270
4271 #[test]
4272 fn test_find_bytes_not_found() {
4273 let haystack = b"Hello World";
4274 let needle = b"Rust";
4275 let pos = find_bytes(haystack, needle);
4276 assert_eq!(pos, None, "Must return None when not found");
4277 }
4278
4279 #[test]
4280 fn test_find_bytes_at_start() {
4281 let haystack = b"Hello World";
4282 let needle = b"Hello";
4283 let pos = find_bytes(haystack, needle);
4284 assert_eq!(pos, Some(0), "Must find at position 0");
4285 }
4286
4287 #[test]
4288 fn test_is_immediate_stream_start_with_stream() {
4289 let data = b"stream\ndata";
4290 assert!(
4291 is_immediate_stream_start(data),
4292 "Must detect 'stream' at start"
4293 );
4294 }
4295
4296 #[test]
4297 fn test_is_immediate_stream_start_with_whitespace() {
4298 let data = b" \n\tstream\ndata";
4299 assert!(
4300 is_immediate_stream_start(data),
4301 "Must detect 'stream' after whitespace"
4302 );
4303 }
4304
4305 #[test]
4306 fn test_is_immediate_stream_start_no_stream() {
4307 let data = b"endobj";
4308 assert!(
4309 !is_immediate_stream_start(data),
4310 "Must return false when 'stream' absent"
4311 );
4312 }
4313 }
4314}