1use super::encryption_handler::EncryptionHandler;
6use super::header::PdfHeader;
7use super::object_stream::ObjectStream;
8use super::objects::{PdfArray, PdfDictionary, PdfObject, PdfString};
9use super::stack_safe::StackSafeContext;
10use super::trailer::PdfTrailer;
11use super::xref::XRefTable;
12use super::{ParseError, ParseResult};
13use crate::objects::ObjectId;
14use std::collections::HashMap;
15use std::fs::File;
16use std::io::{BufReader, Read, Seek, SeekFrom};
17use std::path::Path;
18
/// Returns the byte offset of the first occurrence of `needle` within
/// `haystack`, or `None` if it never appears.
///
/// An empty needle matches at offset 0 (mirroring `str::find("")`); the
/// explicit guard also avoids calling `slice::windows(0)`, which panics.
fn find_bytes(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    if needle.is_empty() {
        return Some(0);
    }
    haystack
        .windows(needle.len())
        .position(|window| window == needle)
}
25
/// Reports whether `data`, after skipping any leading PDF whitespace
/// (space, tab, CR, LF), begins with the keyword `stream`.
fn is_immediate_stream_start(data: &[u8]) -> bool {
    let skipped = data
        .iter()
        .take_while(|&&byte| matches!(byte, b' ' | b'\t' | b'\n' | b'\r'))
        .count();
    data[skipped..].starts_with(b"stream")
}
38
/// Stateful reader over a seekable PDF byte source.
///
/// Holds the parsed header, cross-reference table and trailer, plus caches
/// for already-materialized objects and decoded object streams.
pub struct PdfReader<R: Read + Seek> {
    /// Buffered access to the underlying PDF bytes.
    reader: BufReader<R>,
    /// Parsed `%PDF-x.y` file header.
    header: PdfHeader,
    /// Cross-reference table mapping object numbers to file offsets.
    xref: XRefTable,
    /// Trailer dictionary (Root / Info / Encrypt / ID references).
    trailer: PdfTrailer,
    /// Cache of parsed objects keyed by (object number, generation).
    object_cache: HashMap<(u32, u16), PdfObject>,
    /// Cache of decoded object streams keyed by stream object number.
    object_stream_cache: HashMap<u32, ObjectStream>,
    /// Lazily-built page tree, if one has been constructed.
    page_tree: Option<super::page_tree::PageTree>,
    /// Recursion-depth guard used while parsing nested objects.
    parse_context: StackSafeContext,
    /// Parsing behavior switches (lenient vs. strict, warning collection, ...).
    options: super::ParseOptions,
    /// Present when the trailer declares encryption; `None` for plain files.
    encryption_handler: Option<EncryptionHandler>,
    /// Object numbers currently mid-load; used to detect circular references.
    /// NOTE(review): all uses in this file go through `&mut self` methods, so
    /// the Mutex looks like interior bookkeeping rather than真 shared access —
    /// confirm whether cross-thread sharing is actually intended.
    objects_being_reconstructed: std::sync::Mutex<std::collections::HashSet<u32>>,
    /// Upper bound on simultaneous in-flight object loads (cycle/depth guard).
    max_reconstruction_depth: u32,
}
62
impl<R: Read + Seek> PdfReader<R> {
    /// Returns the parse options this reader was created with.
    pub fn options(&self) -> &super::ParseOptions {
        &self.options
    }

    /// True when the trailer declared an /Encrypt dictionary.
    pub fn is_encrypted(&self) -> bool {
        self.encryption_handler.is_some()
    }

    /// True when the file is either unencrypted or has been unlocked with a
    /// valid password.
    pub fn is_unlocked(&self) -> bool {
        match &self.encryption_handler {
            Some(handler) => handler.is_unlocked(),
            None => true,
        }
    }

    /// Mutable access to the encryption handler, if the file is encrypted.
    pub fn encryption_handler_mut(&mut self) -> Option<&mut EncryptionHandler> {
        self.encryption_handler.as_mut()
    }

    /// Shared access to the encryption handler, if the file is encrypted.
    pub fn encryption_handler(&self) -> Option<&EncryptionHandler> {
        self.encryption_handler.as_ref()
    }

    /// Tries `password` first as the user password, then as the owner
    /// password. Returns `Ok(true)` on success, and always `Ok(true)` for
    /// unencrypted files. Handler errors are treated as "wrong password".
    pub fn unlock_with_password(&mut self, password: &str) -> ParseResult<bool> {
        match &mut self.encryption_handler {
            Some(handler) => {
                if handler.unlock_with_user_password(password).unwrap_or(false) {
                    Ok(true)
                } else {
                    // User password failed; fall back to the owner password.
                    Ok(handler
                        .unlock_with_owner_password(password)
                        .unwrap_or(false))
                }
            }
            None => Ok(true),
        }
    }

    /// Attempts the empty user password (very common in "encrypted" PDFs).
    /// Always `Ok(true)` for unencrypted files.
    pub fn try_empty_password(&mut self) -> ParseResult<bool> {
        match &mut self.encryption_handler {
            Some(handler) => Ok(handler.try_empty_password().unwrap_or(false)),
            None => Ok(true),
        }
    }

    /// Unlocks the document with `password`, or returns `WrongPassword`.
    /// A no-op for unencrypted or already-unlocked files.
    pub fn unlock(&mut self, password: &str) -> ParseResult<()> {
        if !self.is_encrypted() {
            return Ok(());
        }

        if self.is_unlocked() {
            return Ok(());
        }

        let success = self.unlock_with_password(password)?;

        if success {
            Ok(())
        } else {
            Err(ParseError::WrongPassword)
        }
    }

    /// Errors with `PdfLocked` when the file is encrypted but not unlocked.
    fn ensure_unlocked(&self) -> ParseResult<()> {
        if self.is_encrypted() && !self.is_unlocked() {
            return Err(ParseError::PdfLocked);
        }
        Ok(())
    }

    /// Recursively decrypts `obj` (strings, stream data, and both inside
    /// dictionaries/arrays) using the document's unlocked handler.
    ///
    /// Returns the object unchanged when the file is unencrypted or still
    /// locked. `obj_num`/`gen_num` identify the top-level containing object,
    /// which salts the per-object decryption key.
    fn decrypt_object_if_needed(
        &self,
        obj: PdfObject,
        obj_num: u32,
        gen_num: u16,
    ) -> ParseResult<PdfObject> {
        let handler = match &self.encryption_handler {
            Some(h) if h.is_unlocked() => h,
            _ => return Ok(obj),
        };

        let obj_id = ObjectId::new(obj_num, gen_num);

        match obj {
            PdfObject::String(ref s) => {
                let decrypted_bytes = handler.decrypt_string(s.as_bytes(), &obj_id)?;
                Ok(PdfObject::String(PdfString::new(decrypted_bytes)))
            }
            PdfObject::Stream(ref stream) => {
                // A stream filter of /Identity means "do not decrypt".
                // NOTE(review): /StmF normally lives in the Encrypt dict, not
                // on the stream itself — confirm this per-stream check is the
                // intended interpretation.
                let should_decrypt = stream
                    .dict
                    .get("StmF")
                    .and_then(|o| o.as_name())
                    .map(|n| n.0.as_str() != "Identity")
                    .unwrap_or(true);
                if should_decrypt {
                    let decrypted_data = handler.decrypt_stream(&stream.data, &obj_id)?;

                    let mut new_stream = stream.clone();
                    new_stream.data = decrypted_data;
                    Ok(PdfObject::Stream(new_stream))
                } else {
                    Ok(obj)
                }
            }
            PdfObject::Dictionary(ref dict) => {
                // Rebuild the dictionary with each value decrypted in turn.
                let mut new_dict = PdfDictionary::new();
                for (key, value) in dict.0.iter() {
                    let decrypted_value =
                        self.decrypt_object_if_needed(value.clone(), obj_num, gen_num)?;
                    new_dict.insert(key.0.clone(), decrypted_value);
                }
                Ok(PdfObject::Dictionary(new_dict))
            }
            PdfObject::Array(ref arr) => {
                // Rebuild the array with each element decrypted in turn.
                let mut new_arr = Vec::new();
                for elem in arr.0.iter() {
                    let decrypted_elem =
                        self.decrypt_object_if_needed(elem.clone(), obj_num, gen_num)?;
                    new_arr.push(decrypted_elem);
                }
                Ok(PdfObject::Array(PdfArray(new_arr)))
            }
            // Numbers, names, booleans, null, references: nothing to decrypt.
            _ => Ok(obj),
        }
    }
}
247
248impl PdfReader<File> {
249 pub fn open<P: AsRef<Path>>(path: P) -> ParseResult<Self> {
251 use std::io::Write;
252 let mut debug_file = std::fs::File::create("/tmp/pdf_open_debug.log").ok();
253 if let Some(ref mut f) = debug_file {
254 writeln!(f, "Opening file: {:?}", path.as_ref()).ok();
255 }
256 let file = File::open(path)?;
257 if let Some(ref mut f) = debug_file {
258 writeln!(f, "File opened successfully").ok();
259 }
260 let options = super::ParseOptions::lenient();
262 Self::new_with_options(file, options)
263 }
264
265 pub fn open_strict<P: AsRef<Path>>(path: P) -> ParseResult<Self> {
267 let file = File::open(path)?;
268 let options = super::ParseOptions::strict();
269 Self::new_with_options(file, options)
270 }
271
272 pub fn open_with_options<P: AsRef<Path>>(
274 path: P,
275 options: super::ParseOptions,
276 ) -> ParseResult<Self> {
277 let file = File::open(path)?;
278 Self::new_with_options(file, options)
279 }
280
281 pub fn open_document<P: AsRef<Path>>(
283 path: P,
284 ) -> ParseResult<super::document::PdfDocument<File>> {
285 let reader = Self::open(path)?;
286 Ok(reader.into_document())
287 }
288}
289
290impl<R: Read + Seek> PdfReader<R> {
    /// Creates a reader over `reader` using default options with
    /// `lenient_streams` enabled.
    ///
    /// # Errors
    /// Propagates any error from header/xref/trailer parsing.
    pub fn new(reader: R) -> ParseResult<Self> {
        let mut options = super::ParseOptions::default();
        // The default constructor is intentionally forgiving about slightly
        // malformed stream data; use `new_with_options` for strict behavior.
        options.lenient_streams = true;
        Self::new_with_options(reader, options)
    }
304
305 pub fn new_with_options(reader: R, options: super::ParseOptions) -> ParseResult<Self> {
307 let mut buf_reader = BufReader::new(reader);
308
309 let start_pos = buf_reader.stream_position()?;
311 buf_reader.seek(SeekFrom::End(0))?;
312 let file_size = buf_reader.stream_position()?;
313 buf_reader.seek(SeekFrom::Start(start_pos))?;
314
315 if file_size == 0 {
316 return Err(ParseError::EmptyFile);
317 }
318
319 use std::io::Write;
321 let mut debug_file = std::fs::File::create("/tmp/pdf_debug.log").ok();
322 if let Some(ref mut f) = debug_file {
323 writeln!(f, "Parsing PDF header...").ok();
324 }
325 let header = PdfHeader::parse(&mut buf_reader)?;
326 if let Some(ref mut f) = debug_file {
327 writeln!(f, "Header parsed: version {}", header.version).ok();
328 }
329
330 if let Some(ref mut f) = debug_file {
332 writeln!(f, "Parsing XRef table...").ok();
333 }
334 let xref = XRefTable::parse_with_options(&mut buf_reader, &options)?;
335 if let Some(ref mut f) = debug_file {
336 writeln!(f, "XRef table parsed with {} entries", xref.len()).ok();
337 }
338
339 let trailer_dict = xref.trailer().ok_or(ParseError::InvalidTrailer)?.clone();
341
342 let xref_offset = xref.xref_offset();
343 let trailer = PdfTrailer::from_dict(trailer_dict, xref_offset)?;
344
345 trailer.validate()?;
347
348 let encryption_handler = if EncryptionHandler::detect_encryption(trailer.dict()) {
350 if let Ok(Some((encrypt_obj_num, encrypt_gen_num))) = trailer.encrypt() {
351 let mut temp_reader = Self {
353 reader: buf_reader,
354 header: header.clone(),
355 xref: xref.clone(),
356 trailer: trailer.clone(),
357 object_cache: HashMap::new(),
358 object_stream_cache: HashMap::new(),
359 page_tree: None,
360 parse_context: StackSafeContext::new(),
361 options: options.clone(),
362 encryption_handler: None,
363 objects_being_reconstructed: std::sync::Mutex::new(
364 std::collections::HashSet::new(),
365 ),
366 max_reconstruction_depth: 100,
367 };
368
369 let encrypt_obj = temp_reader.get_object(encrypt_obj_num, encrypt_gen_num)?;
371 if let Some(encrypt_dict) = encrypt_obj.as_dict() {
372 let file_id = trailer.id().and_then(|id_obj| {
374 if let PdfObject::Array(ref id_array) = id_obj {
375 if let Some(PdfObject::String(ref id_bytes)) = id_array.get(0) {
376 Some(id_bytes.as_bytes().to_vec())
377 } else {
378 None
379 }
380 } else {
381 None
382 }
383 });
384
385 match EncryptionHandler::new(encrypt_dict, file_id) {
386 Ok(mut handler) => {
387 let _ = handler.try_empty_password();
389 buf_reader = temp_reader.reader;
391 Some(handler)
392 }
393 Err(_) => {
394 let _ = temp_reader.reader;
396 return Err(ParseError::EncryptionNotSupported);
397 }
398 }
399 } else {
400 let _ = temp_reader.reader;
401 return Err(ParseError::EncryptionNotSupported);
402 }
403 } else {
404 return Err(ParseError::EncryptionNotSupported);
405 }
406 } else {
407 None
408 };
409
410 Ok(Self {
411 reader: buf_reader,
412 header,
413 xref,
414 trailer,
415 object_cache: HashMap::new(),
416 object_stream_cache: HashMap::new(),
417 page_tree: None,
418 parse_context: StackSafeContext::new(),
419 options,
420 encryption_handler,
421 objects_being_reconstructed: std::sync::Mutex::new(std::collections::HashSet::new()),
422 max_reconstruction_depth: 100,
423 })
424 }
425
    /// Returns the PDF version declared in the file header.
    pub fn version(&self) -> &super::header::PdfVersion {
        &self.header.version
    }
430
    /// Locates and returns the document catalog (root) dictionary.
    ///
    /// Resolution ladder:
    /// 1. Follow the trailer's /Root reference; if the target's /Type is not
    ///    `Catalog`, scan the file for the real catalog object.
    /// 2. If /Root is missing, try the trailer's fallback search, then a
    ///    whole-file catalog scan.
    /// 3. If the chosen object cannot be parsed as a dictionary, attempt a
    ///    manual byte-level extraction and patch the cache/xref with it.
    ///
    /// # Errors
    /// `MissingKey("Root")` when no catalog reference can be recovered, or a
    /// `SyntaxError` when the catalog object is not a dictionary.
    pub fn catalog(&mut self) -> ParseResult<&PdfDictionary> {
        let (obj_num, gen_num) = match self.trailer.root() {
            Ok(root) => {
                // /Root exists; verify it really points at a catalog. Every
                // inconclusive branch falls back to trusting `root` as-is.
                if let Ok(obj) = self.get_object(root.0, root.1) {
                    if let Some(dict) = obj.as_dict() {
                        if let Some(type_obj) = dict.get("Type") {
                            if let Some(type_name) = type_obj.as_name() {
                                if type_name.0 != "Catalog" {
                                    tracing::warn!("Trailer /Root points to /Type/{} (not Catalog), scanning for real catalog", type_name.0);
                                    if let Ok(catalog_ref) = self.find_catalog_object() {
                                        catalog_ref
                                    } else {
                                        root
                                    }
                                } else {
                                    root
                                }
                            } else {
                                root
                            }
                        } else {
                            root
                        }
                    } else {
                        root
                    }
                } else {
                    root
                }
            }
            Err(_) => {
                #[cfg(debug_assertions)]
                tracing::warn!("Trailer missing Root entry, attempting recovery");

                if let Some(root) = self.trailer.find_root_fallback() {
                    root
                } else {
                    if let Ok(catalog_ref) = self.find_catalog_object() {
                        catalog_ref
                    } else {
                        return Err(ParseError::MissingKey("Root".to_string()));
                    }
                }
            }
        };

        let key = (obj_num, gen_num);
        // Probe in a separate scope so the borrow from `get_object` ends
        // before deciding whether to reconstruct.
        let needs_reconstruction = {
            match self.get_object(obj_num, gen_num) {
                Ok(catalog) => {
                    if catalog.as_dict().is_some() {
                        false
                    } else {
                        true
                    }
                }
                Err(_) => {
                    true
                }
            }
        };

        if !needs_reconstruction {
            let catalog = self.get_object(obj_num, gen_num)?;
            return catalog.as_dict().ok_or_else(|| ParseError::SyntaxError {
                position: 0,
                message: format!("Catalog object {} {} is not a dictionary", obj_num, gen_num),
            });
        }

        // Normal parsing failed: try raw byte-level extraction and install
        // the result into the cache plus a synthetic xref entry.
        match self.extract_object_manually(obj_num) {
            Ok(dict) => {
                let obj = PdfObject::Dictionary(dict);
                self.object_cache.insert(key, obj);

                use crate::parser::xref::XRefEntry;
                // Offset 0 is a placeholder — the object now lives only in
                // the cache; the entry just keeps the xref self-consistent.
                let xref_entry = XRefEntry {
                    offset: 0,
                    generation: gen_num,
                    in_use: true,
                };
                self.xref.add_entry(obj_num, xref_entry);

                if let Some(PdfObject::Dictionary(ref dict)) = self.object_cache.get(&key) {
                    return Ok(dict);
                }
            }
            Err(_e) => {}
        }

        Err(ParseError::SyntaxError {
            position: 0,
            message: format!(
                "Catalog object {} could not be parsed or reconstructed as a dictionary",
                obj_num
            ),
        })
    }
550
551 pub fn info(&mut self) -> ParseResult<Option<&PdfDictionary>> {
553 match self.trailer.info() {
554 Some((obj_num, gen_num)) => {
555 let info = self.get_object(obj_num, gen_num)?;
556 Ok(info.as_dict())
557 }
558 None => Ok(None),
559 }
560 }
561
562 pub fn get_object(&mut self, obj_num: u32, gen_num: u16) -> ParseResult<&PdfObject> {
564 self.ensure_unlocked()?;
566
567 let key = (obj_num, gen_num);
568
569 if self.object_cache.contains_key(&key) {
571 return Ok(&self.object_cache[&key]);
572 }
573
574 {
576 let being_loaded =
577 self.objects_being_reconstructed
578 .lock()
579 .map_err(|_| ParseError::SyntaxError {
580 position: 0,
581 message: "Mutex poisoned during circular reference check".to_string(),
582 })?;
583 if being_loaded.contains(&obj_num) {
584 drop(being_loaded);
585 if self.options.collect_warnings {}
586 self.object_cache.insert(key, PdfObject::Null);
587 return Ok(&self.object_cache[&key]);
588 }
589 }
590
591 {
593 let being_loaded =
594 self.objects_being_reconstructed
595 .lock()
596 .map_err(|_| ParseError::SyntaxError {
597 position: 0,
598 message: "Mutex poisoned during depth limit check".to_string(),
599 })?;
600 let depth = being_loaded.len() as u32;
601 if depth >= self.max_reconstruction_depth {
602 drop(being_loaded);
603 if self.options.collect_warnings {}
604 return Err(ParseError::SyntaxError {
605 position: 0,
606 message: format!(
607 "Maximum object loading depth ({}) exceeded",
608 self.max_reconstruction_depth
609 ),
610 });
611 }
612 }
613
614 self.objects_being_reconstructed
616 .lock()
617 .map_err(|_| ParseError::SyntaxError {
618 position: 0,
619 message: "Mutex poisoned while marking object as being loaded".to_string(),
620 })?
621 .insert(obj_num);
622
623 match self.load_object_from_disk(obj_num, gen_num) {
625 Ok(_) => {
626 self.objects_being_reconstructed
628 .lock()
629 .map_err(|_| ParseError::SyntaxError {
630 position: 0,
631 message: "Mutex poisoned while unmarking object after successful load"
632 .to_string(),
633 })?
634 .remove(&obj_num);
635 Ok(&self.object_cache[&key])
637 }
638 Err(e) => {
639 if let Ok(mut guard) = self.objects_being_reconstructed.lock() {
642 guard.remove(&obj_num);
643 }
644 Err(e)
645 }
646 }
647 }
648
    /// Loads an object directly from the file (or from its containing object
    /// stream) and caches it. `get_object` is the public entry point that
    /// adds the cycle/depth guards around this.
    fn load_object_from_disk(&mut self, obj_num: u32, gen_num: u16) -> ParseResult<&PdfObject> {
        let key = (obj_num, gen_num);

        if self.object_cache.contains_key(&key) {
            return Ok(&self.object_cache[&key]);
        }

        // Compressed objects live inside an object stream rather than at a
        // file offset of their own; dispatch to the stream path.
        if let Some(ext_entry) = self.xref.get_extended_entry(obj_num) {
            if let Some((stream_obj_num, index_in_stream)) = ext_entry.compressed_info {
                return self.get_compressed_object(
                    obj_num,
                    gen_num,
                    stream_obj_num,
                    index_in_stream,
                );
            }
        } else {
        }

        let (current_offset, _generation) = {
            let entry = self.xref.get_entry(obj_num);

            match entry {
                Some(entry) => {
                    if !entry.in_use {
                        // A free xref entry resolves to the null object.
                        self.object_cache.insert(key, PdfObject::Null);
                        return Ok(&self.object_cache[&key]);
                    }

                    // Generation mismatch: tolerated (with optional warning)
                    // in lenient mode, a hard error otherwise.
                    if entry.generation != gen_num {
                        if self.options.lenient_syntax {
                            if self.options.collect_warnings {
                                tracing::warn!("Object {} generation mismatch - expected {}, found {}, using available",
                                    obj_num, gen_num, entry.generation);
                            }
                        } else {
                            return Err(ParseError::InvalidReference(obj_num, gen_num));
                        }
                    }

                    (entry.offset, entry.generation)
                }
                None => {
                    // Not in the xref at all: reconstruct known objects;
                    // otherwise null (lenient) or InvalidReference (strict).
                    if self.is_reconstructible_object(obj_num) {
                        return self.attempt_manual_object_reconstruction(obj_num, gen_num, 0);
                    } else {
                        if self.options.lenient_syntax {
                            if self.options.collect_warnings {
                                tracing::warn!(
                                    "Object {} {} R not found in XRef, returning null object",
                                    obj_num,
                                    gen_num
                                );
                            }
                            self.object_cache.insert(key, PdfObject::Null);
                            return Ok(&self.object_cache[&key]);
                        } else {
                            return Err(ParseError::InvalidReference(obj_num, gen_num));
                        }
                    }
                }
            }
        };

        self.reader.seek(std::io::SeekFrom::Start(current_offset))?;

        let mut lexer =
            super::lexer::Lexer::new_with_options(&mut self.reader, self.options.clone());

        // Validate the "N G obj" header, tolerantly under lenient_syntax.
        {
            let token = lexer.next_token()?;
            let read_obj_num = match token {
                super::lexer::Token::Integer(n) => n as u32,
                _ => {
                    if self.options.lenient_syntax {
                        if self.options.collect_warnings {
                            tracing::debug!(
                                "Warning: Using expected object number {obj_num} instead of parsed token: {:?}",
                                token
                            );
                        }
                        obj_num
                    } else {
                        return Err(ParseError::SyntaxError {
                            position: current_offset as usize,
                            message: "Expected object number".to_string(),
                        });
                    }
                }
            };

            if read_obj_num != obj_num && !self.options.lenient_syntax {
                return Err(ParseError::SyntaxError {
                    position: current_offset as usize,
                    message: format!(
                        "Object number mismatch: expected {obj_num}, found {read_obj_num}"
                    ),
                });
            }

            let token = lexer.next_token()?;
            let _read_gen_num = match token {
                super::lexer::Token::Integer(n) => n as u16,
                _ => {
                    if self.options.lenient_syntax {
                        if self.options.collect_warnings {
                            tracing::warn!(
                                "Using generation 0 instead of parsed token for object {obj_num}"
                            );
                        }
                        0
                    } else {
                        return Err(ParseError::SyntaxError {
                            position: current_offset as usize,
                            message: "Expected generation number".to_string(),
                        });
                    }
                }
            };

            let token = lexer.next_token()?;
            match token {
                super::lexer::Token::Obj => {}
                _ => {
                    if self.options.lenient_syntax {
                        if self.options.collect_warnings {
                            tracing::warn!("Expected 'obj' keyword for object {obj_num} {gen_num}, continuing anyway");
                        }
                    } else {
                        return Err(ParseError::SyntaxError {
                            position: current_offset as usize,
                            message: "Expected 'obj' keyword".to_string(),
                        });
                    }
                }
            }
        }

        // Parse the object body under the stack-depth guard; `exit` must be
        // called on both the success and the error path.
        self.parse_context.enter()?;

        let obj = match PdfObject::parse_with_options(&mut lexer, &self.options) {
            Ok(obj) => {
                self.parse_context.exit();
                // NOTE(review): leftover debug hook for object 102 — the
                // block is empty and has no effect.
                if obj_num == 102 && self.options.collect_warnings {}
                obj
            }
            Err(e) => {
                self.parse_context.exit();

                // Parsing failed: fall back to manual reconstruction for
                // recoverable errors on known-reconstructible objects.
                if self.is_reconstructible_object(obj_num)
                    && self.can_attempt_manual_reconstruction(&e)
                {
                    match self.attempt_manual_object_reconstruction(
                        obj_num,
                        gen_num,
                        current_offset,
                    ) {
                        Ok(reconstructed_obj) => {
                            return Ok(reconstructed_obj);
                        }
                        Err(_reconstruction_error) => {}
                    }
                }

                return Err(e);
            }
        };

        // Expect the closing "endobj"; tolerated when lenient.
        let token = lexer.next_token()?;
        match token {
            super::lexer::Token::EndObj => {}
            _ => {
                if self.options.lenient_syntax {
                    if self.options.collect_warnings {
                        tracing::warn!("Expected 'endobj' keyword after object {obj_num} {gen_num}, continuing anyway");
                    }
                } else {
                    return Err(ParseError::SyntaxError {
                        position: current_offset as usize,
                        message: "Expected 'endobj' keyword".to_string(),
                    });
                }
            }
        };

        // Decrypt (if the document is encrypted) before caching.
        let decrypted_obj = self.decrypt_object_if_needed(obj, obj_num, gen_num)?;

        self.object_cache.insert(key, decrypted_obj);

        Ok(&self.object_cache[&key])
    }
868
869 pub fn resolve<'a>(&'a mut self, obj: &'a PdfObject) -> ParseResult<&'a PdfObject> {
871 match obj {
872 PdfObject::Reference(obj_num, gen_num) => self.get_object(*obj_num, *gen_num),
873 _ => Ok(obj),
874 }
875 }
876
877 pub fn resolve_stream_length(&mut self, obj: &PdfObject) -> ParseResult<Option<usize>> {
880 match obj {
881 PdfObject::Integer(len) => {
882 if *len >= 0 {
883 Ok(Some(*len as usize))
884 } else {
885 Ok(None)
887 }
888 }
889 PdfObject::Reference(obj_num, gen_num) => {
890 let resolved = self.get_object(*obj_num, *gen_num)?;
891 match resolved {
892 PdfObject::Integer(len) => {
893 if *len >= 0 {
894 Ok(Some(*len as usize))
895 } else {
896 Ok(None)
897 }
898 }
899 _ => {
900 Ok(None)
902 }
903 }
904 }
905 _ => {
906 Ok(None)
908 }
909 }
910 }
911
    /// Fetches object `obj_num` out of the object stream `stream_obj_num`,
    /// decoding and caching the stream on first use.
    fn get_compressed_object(
        &mut self,
        obj_num: u32,
        gen_num: u16,
        stream_obj_num: u32,
        _index_in_stream: u32,
    ) -> ParseResult<&PdfObject> {
        let key = (obj_num, gen_num);

        // Decode the containing object stream once and cache it.
        if !self.object_stream_cache.contains_key(&stream_obj_num) {
            // Object streams themselves always have generation 0.
            let stream_obj = self.get_object(stream_obj_num, 0)?;

            if let Some(stream) = stream_obj.as_stream() {
                let obj_stream = ObjectStream::parse(stream.clone(), &self.options)?;
                self.object_stream_cache.insert(stream_obj_num, obj_stream);
            } else {
                return Err(ParseError::SyntaxError {
                    position: 0,
                    message: format!("Object {stream_obj_num} is not a stream"),
                });
            }
        }

        // Lookup is by object number; the xref's index hint is unused here.
        let obj_stream = &self.object_stream_cache[&stream_obj_num];
        let obj = obj_stream
            .get_object(obj_num)
            .ok_or_else(|| ParseError::SyntaxError {
                position: 0,
                message: format!("Object {obj_num} not found in object stream {stream_obj_num}"),
            })?;

        let decrypted_obj = self.decrypt_object_if_needed(obj.clone(), obj_num, gen_num)?;

        self.object_cache.insert(key, decrypted_obj);
        Ok(&self.object_cache[&key])
    }
955
    /// Locates and returns the page-tree root (/Pages) dictionary.
    ///
    /// Recovery ladder when the catalog's /Pages entry is missing or broken:
    /// synthesize a Pages dictionary from discovered page objects, scan the
    /// xref for any /Type /Pages dictionary (lenient mode only), or fail.
    pub fn pages(&mut self) -> ParseResult<&PdfDictionary> {
        // Resolve the /Pages reference out of the catalog; the inner block
        // scopes keep borrows of `self` short.
        let (pages_obj_num, pages_gen_num) = {
            let catalog = self.catalog()?;

            if let Some(pages_ref) = catalog.get("Pages") {
                match pages_ref {
                    PdfObject::Reference(obj_num, gen_num) => (*obj_num, *gen_num),
                    _ => {
                        return Err(ParseError::SyntaxError {
                            position: 0,
                            message: "Pages must be a reference".to_string(),
                        })
                    }
                }
            } else {
                #[cfg(debug_assertions)]
                tracing::warn!("Catalog missing Pages entry, attempting recovery");

                // Try to rebuild a Pages dictionary from individual pages.
                if let Ok(page_refs) = self.find_page_objects() {
                    if !page_refs.is_empty() {
                        return self.create_synthetic_pages_dict(&page_refs);
                    }
                }

                if self.options.lenient_syntax {
                    if self.options.collect_warnings {
                        tracing::warn!("Missing Pages in catalog, searching for page tree");
                    }
                    // Brute-force scan: first object whose /Type is /Pages.
                    let mut found_pages = None;
                    for i in 1..self.xref.len() as u32 {
                        if let Ok(obj) = self.get_object(i, 0) {
                            if let Some(dict) = obj.as_dict() {
                                if let Some(obj_type) = dict.get("Type").and_then(|t| t.as_name()) {
                                    if obj_type.0 == "Pages" {
                                        found_pages = Some((i, 0));
                                        break;
                                    }
                                }
                            }
                        }
                    }
                    if let Some((obj_num, gen_num)) = found_pages {
                        (obj_num, gen_num)
                    } else {
                        return Err(ParseError::MissingKey("Pages".to_string()));
                    }
                } else {
                    return Err(ParseError::MissingKey("Pages".to_string()));
                }
            }
        };

        // A reference may point at another reference; follow one extra hop.
        let needs_double_resolve = {
            let pages_obj = self.get_object(pages_obj_num, pages_gen_num)?;
            pages_obj.as_reference()
        };

        let (final_obj_num, final_gen_num) =
            if let Some((ref_obj_num, ref_gen_num)) = needs_double_resolve {
                (ref_obj_num, ref_gen_num)
            } else {
                (pages_obj_num, pages_gen_num)
            };

        let actual_pages_num = {
            let is_valid_dict = {
                let pages_obj = self.get_object(final_obj_num, final_gen_num)?;
                pages_obj.as_dict().is_some()
            };

            if is_valid_dict {
                final_obj_num
            } else {
                #[cfg(debug_assertions)]
                tracing::warn!("Pages reference invalid, searching for valid Pages object");

                if self.options.lenient_syntax {
                    // Scan every object for a /Type /Pages dictionary.
                    let xref_len = self.xref.len() as u32;
                    let mut found_pages_num = None;

                    for i in 1..xref_len {
                        let is_pages = {
                            if let Ok(obj) = self.get_object(i, 0) {
                                if let Some(dict) = obj.as_dict() {
                                    if let Some(obj_type) =
                                        dict.get("Type").and_then(|t| t.as_name())
                                    {
                                        obj_type.0 == "Pages"
                                    } else {
                                        false
                                    }
                                } else {
                                    false
                                }
                            } else {
                                false
                            }
                        };

                        if is_pages {
                            found_pages_num = Some(i);
                            break;
                        }
                    }

                    if let Some(obj_num) = found_pages_num {
                        #[cfg(debug_assertions)]
                        tracing::debug!("Found valid Pages object at {} 0 R", obj_num);
                        obj_num
                    } else {
                        return Err(ParseError::SyntaxError {
                            position: 0,
                            message: "Pages is not a dictionary and no valid Pages object found"
                                .to_string(),
                        });
                    }
                } else {
                    return Err(ParseError::SyntaxError {
                        position: 0,
                        message: "Pages is not a dictionary".to_string(),
                    });
                }
            }
        };

        // NOTE(review): generation is forced to 0 here even when the
        // original reference carried a non-zero generation — confirm that
        // this is intended.
        let pages_obj = self.get_object(actual_pages_num, 0)?;
        pages_obj.as_dict().ok_or_else(|| ParseError::SyntaxError {
            position: 0,
            message: "Pages object is not a dictionary".to_string(),
        })
    }
1107
1108 pub fn page_count(&mut self) -> ParseResult<u32> {
1110 const MAX_PAGE_COUNT: u32 = 100_000;
1113
1114 match self.pages() {
1116 Ok(pages) => {
1117 if let Some(count_obj) = pages.get("Count") {
1119 if let Some(count) = count_obj.as_integer() {
1120 let count = count as u32;
1121 if count <= MAX_PAGE_COUNT {
1122 return Ok(count);
1123 }
1124 tracing::warn!(
1125 "PDF /Count {} exceeds limit {}, falling back to Kids array length",
1126 count,
1127 MAX_PAGE_COUNT
1128 );
1129 }
1131 }
1132
1133 if let Some(kids_obj) = pages.get("Kids") {
1135 if let Some(kids_array) = kids_obj.as_array() {
1136 return Ok(kids_array.0.len() as u32);
1137 }
1138 }
1139
1140 Ok(0)
1141 }
1142 Err(_) => {
1143 tracing::debug!("Standard page extraction failed, trying direct extraction");
1145 self.page_count_fallback()
1146 }
1147 }
1148 }
1149
1150 fn page_count_fallback(&mut self) -> ParseResult<u32> {
1152 if let Some(count) = self.extract_page_count_from_linearization() {
1154 tracing::debug!("Found page count {} from linearization", count);
1155 return Ok(count);
1156 }
1157
1158 if let Some(count) = self.count_page_objects_directly() {
1160 tracing::debug!("Found {} pages by counting page objects", count);
1161 return Ok(count);
1162 }
1163
1164 Ok(0)
1165 }
1166
    /// Attempts to read the page count from the /N entry of what is assumed
    /// to be the linearization dictionary.
    ///
    /// NOTE(review): the linearization dictionary is looked up as object
    /// `100 0` — a hard-coded, document-specific guess (linearized PDFs
    /// normally place it in the first object of the file, whatever its
    /// number). Confirm this heuristic is intentional.
    fn extract_page_count_from_linearization(&mut self) -> Option<u32> {
        match self.get_object(100, 0) {
            Ok(obj) => {
                tracing::debug!("Found object 100: {:?}", obj);
                if let Some(dict) = obj.as_dict() {
                    tracing::debug!("Object 100 is a dictionary with {} keys", dict.0.len());
                    // /N in a linearization dictionary is the page count.
                    if let Some(n_obj) = dict.get("N") {
                        tracing::debug!("Found /N field: {:?}", n_obj);
                        if let Some(count) = n_obj.as_integer() {
                            tracing::debug!("Extracted page count from linearization: {}", count);
                            return Some(count as u32);
                        }
                    } else {
                        // Dump the keys to help diagnose what object 100 is.
                        tracing::debug!("No /N field found in object 100");
                        for (key, value) in &dict.0 {
                            tracing::debug!(" {:?}: {:?}", key, value);
                        }
                    }
                } else {
                    tracing::debug!("Object 100 is not a dictionary: {:?}", obj);
                }
            }
            Err(e) => {
                tracing::debug!("Failed to get object 100: {:?}", e);
                tracing::debug!("Attempting direct content extraction...");
                // Normal parsing failed: scrape the raw bytes instead.
                return self.extract_n_value_from_raw_object_100();
            }
        }

        None
    }
1202
1203 fn extract_n_value_from_raw_object_100(&mut self) -> Option<u32> {
1204 if let Some(entry) = self.xref.get_entry(100) {
1206 if self.reader.seek(SeekFrom::Start(entry.offset)).is_err() {
1208 return None;
1209 }
1210
1211 let mut buffer = vec![0u8; 1024];
1213 if let Ok(bytes_read) = self.reader.read(&mut buffer) {
1214 if bytes_read == 0 {
1215 return None;
1216 }
1217
1218 let content = String::from_utf8_lossy(&buffer[..bytes_read]);
1220 tracing::debug!("Raw content around object 100:\n{}", content);
1221
1222 if let Some(n_pos) = content.find("/N ") {
1224 let after_n = &content[n_pos + 3..];
1225 tracing::debug!(
1226 "Content after /N: {}",
1227 &after_n[..std::cmp::min(50, after_n.len())]
1228 );
1229
1230 let mut num_str = String::new();
1232 for ch in after_n.chars() {
1233 if ch.is_ascii_digit() {
1234 num_str.push(ch);
1235 } else if !num_str.is_empty() {
1236 break;
1238 }
1239 }
1241
1242 if !num_str.is_empty() {
1243 if let Ok(page_count) = num_str.parse::<u32>() {
1244 tracing::debug!(
1245 "Extracted page count from raw content: {}",
1246 page_count
1247 );
1248 return Some(page_count);
1249 }
1250 }
1251 }
1252 }
1253 }
1254 None
1255 }
1256
1257 #[allow(dead_code)]
1258 fn find_object_pattern(&mut self, obj_num: u32, gen_num: u16) -> Option<u64> {
1259 let pattern = format!("{} {} obj", obj_num, gen_num);
1260
1261 let original_pos = self.reader.stream_position().unwrap_or(0);
1263
1264 if self.reader.seek(SeekFrom::Start(0)).is_err() {
1266 return None;
1267 }
1268
1269 let mut buffer = vec![0u8; 8192];
1271 let mut file_content = Vec::new();
1272
1273 loop {
1274 match self.reader.read(&mut buffer) {
1275 Ok(0) => break, Ok(bytes_read) => {
1277 file_content.extend_from_slice(&buffer[..bytes_read]);
1278 }
1279 Err(_) => return None,
1280 }
1281 }
1282
1283 let content = String::from_utf8_lossy(&file_content);
1285 if let Some(pattern_pos) = content.find(&pattern) {
1286 let after_pattern = pattern_pos + pattern.len();
1288 let search_area = &content[after_pattern..];
1289
1290 if let Some(dict_start_offset) = search_area.find("<<") {
1291 let dict_start_pos = after_pattern + dict_start_offset;
1292
1293 self.reader.seek(SeekFrom::Start(original_pos)).ok();
1295 return Some(dict_start_pos as u64);
1296 } else {
1297 }
1298 }
1299
1300 self.reader.seek(SeekFrom::Start(original_pos)).ok();
1302 None
1303 }
1304
1305 fn can_attempt_manual_reconstruction(&self, error: &ParseError) -> bool {
1307 match error {
1308 ParseError::SyntaxError { .. } => true,
1310 ParseError::UnexpectedToken { .. } => true,
1311 _ => false,
1313 }
1314 }
1315
    /// Whether manual reconstruction should be attempted for `obj_num`.
    ///
    /// NOTE(review): the object-number lists below (tree nodes 102/113/114,
    /// page objects, content objects) are hard-coded for one specific
    /// document layout — this is a recovery heuristic for a known file, not
    /// general logic. Confirm whether it should stay in mainline code.
    fn is_reconstructible_object(&self, obj_num: u32) -> bool {
        // Known page-tree / catalog-adjacent objects for the target layout.
        if obj_num == 102 || obj_num == 113 || obj_num == 114 {
            return true;
        }

        // Objects believed to be /Type /Page dictionaries.
        let page_objects = [
            1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 30, 34, 37, 39, 42, 44, 46, 49, 52,
            54, 56, 58, 60, 62, 64, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 104,
        ];

        // Objects believed to be content streams / resources.
        let content_objects = [
            2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 29, 31, 32, 33, 35, 36, 38, 40, 41,
            43, 45, 47, 48, 50, 51, 53, 55, 57, 59, 61, 63, 65, 66, 68, 70, 72, 74, 76, 78, 80, 82,
            84, 86, 88, 90, 92, 94, 95, 96, 97, 98, 99, 100, 101, 105, 106, 107, 108, 109, 110,
            111,
        ];

        page_objects.contains(&obj_num) || content_objects.contains(&obj_num)
    }
1341
1342 fn is_page_object(&self, obj_num: u32) -> bool {
1344 let page_objects = [
1345 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 30, 34, 37, 39, 42, 44, 46, 49, 52,
1346 54, 56, 58, 60, 62, 64, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 104,
1347 ];
1348 page_objects.contains(&obj_num)
1349 }
1350
1351 fn parse_page_dictionary_content(
1353 &self,
1354 dict_content: &str,
1355 result_dict: &mut std::collections::HashMap<
1356 crate::parser::objects::PdfName,
1357 crate::parser::objects::PdfObject,
1358 >,
1359 _obj_num: u32,
1360 ) -> ParseResult<()> {
1361 use crate::parser::objects::{PdfArray, PdfName, PdfObject};
1362 use std::collections::HashMap;
1363
1364 if let Some(mediabox_start) = dict_content.find("/MediaBox") {
1366 let mediabox_area = &dict_content[mediabox_start..];
1367 if let Some(start_bracket) = mediabox_area.find("[") {
1368 if let Some(end_bracket) = mediabox_area.find("]") {
1369 let mediabox_content = &mediabox_area[start_bracket + 1..end_bracket];
1370 let values: Vec<f32> = mediabox_content
1371 .split_whitespace()
1372 .filter_map(|s| s.parse().ok())
1373 .collect();
1374
1375 if values.len() == 4 {
1376 let mediabox = PdfArray(vec![
1377 PdfObject::Integer(values[0] as i64),
1378 PdfObject::Integer(values[1] as i64),
1379 PdfObject::Integer(values[2] as i64),
1380 PdfObject::Integer(values[3] as i64),
1381 ]);
1382 result_dict
1383 .insert(PdfName("MediaBox".to_string()), PdfObject::Array(mediabox));
1384 }
1385 }
1386 }
1387 }
1388
1389 if let Some(contents_match) = dict_content.find("/Contents") {
1391 let contents_area = &dict_content[contents_match..];
1392 let parts: Vec<&str> = contents_area.split_whitespace().collect();
1394 if parts.len() >= 3 {
1395 if let (Ok(obj_ref), Ok(gen_ref)) =
1396 (parts[1].parse::<u32>(), parts[2].parse::<u16>())
1397 {
1398 if parts.len() > 3 && parts[3] == "R" {
1399 result_dict.insert(
1400 PdfName("Contents".to_string()),
1401 PdfObject::Reference(obj_ref, gen_ref),
1402 );
1403 }
1404 }
1405 }
1406 }
1407
1408 if dict_content.contains("/Parent") {
1410 result_dict.insert(
1411 PdfName("Parent".to_string()),
1412 PdfObject::Reference(113, 0), );
1414 }
1415
1416 if dict_content.contains("/Resources") {
1418 if let Ok(parsed_resources) = self.parse_resources_from_content(&dict_content) {
1419 result_dict.insert(PdfName("Resources".to_string()), parsed_resources);
1420 } else {
1421 let resources = HashMap::new();
1423 result_dict.insert(
1424 PdfName("Resources".to_string()),
1425 PdfObject::Dictionary(crate::parser::objects::PdfDictionary(resources)),
1426 );
1427 }
1428 }
1429
1430 Ok(())
1431 }
1432
    /// Reconstruct an object whose normal parse failed, guarding against
    /// circular and runaway reconstruction.
    ///
    /// `objects_being_reconstructed` tracks in-flight object numbers:
    /// re-entering for the same number signals a circular reference, and the
    /// set's size doubles as the current reconstruction depth. The result is
    /// cached (and given a placeholder xref entry) before a reference into
    /// `object_cache` is returned.
    fn attempt_manual_object_reconstruction(
        &mut self,
        obj_num: u32,
        gen_num: u16,
        _current_offset: u64,
    ) -> ParseResult<&PdfObject> {
        // A poisoned mutex is surfaced as a parse error rather than a panic.
        let is_circular = self
            .objects_being_reconstructed
            .lock()
            .map_err(|_| ParseError::SyntaxError {
                position: 0,
                message: "Mutex poisoned during circular reference check".to_string(),
            })?
            .contains(&obj_num);

        if is_circular {
            tracing::debug!(
                "Warning: Circular reconstruction detected for object {} {} - attempting manual extraction",
                obj_num, gen_num
            );

            // Break the cycle: try a raw byte-level extraction, and if even
            // that fails, cache Null so the recursion terminates.
            match self.extract_object_or_stream_manually(obj_num) {
                Ok(obj) => {
                    tracing::debug!(
                        " Successfully extracted object {} {} manually despite circular reference",
                        obj_num, gen_num
                    );
                    self.object_cache.insert((obj_num, gen_num), obj);
                    return Ok(&self.object_cache[&(obj_num, gen_num)]);
                }
                Err(e) => {
                    tracing::debug!(
                        " Manual extraction failed: {} - breaking cycle with null object",
                        e
                    );
                    self.object_cache
                        .insert((obj_num, gen_num), PdfObject::Null);
                    return Ok(&self.object_cache[&(obj_num, gen_num)]);
                }
            }
        }

        // Depth = number of in-flight reconstructions; bail out before the
        // recovery recursion can run away.
        let current_depth = self
            .objects_being_reconstructed
            .lock()
            .map_err(|_| ParseError::SyntaxError {
                position: 0,
                message: "Mutex poisoned during depth check".to_string(),
            })?
            .len() as u32;
        if current_depth >= self.max_reconstruction_depth {
            return Err(ParseError::SyntaxError {
                position: 0,
                message: format!(
                    "Maximum reconstruction depth ({}) exceeded for object {} {}",
                    self.max_reconstruction_depth, obj_num, gen_num
                ),
            });
        }

        // Mark this object as in-flight for the duration of reconstruction.
        self.objects_being_reconstructed
            .lock()
            .map_err(|_| ParseError::SyntaxError {
                position: 0,
                message: "Mutex poisoned while marking object as being reconstructed".to_string(),
            })?
            .insert(obj_num);

        // Heuristic reconstruction first, then raw extraction; in lenient
        // mode an unrecoverable object degrades to Null instead of an error.
        let reconstructed_obj = match self.smart_object_reconstruction(obj_num, gen_num) {
            Ok(obj) => obj,
            Err(_) => {
                match self.extract_object_or_stream_manually(obj_num) {
                    Ok(obj) => obj,
                    Err(e) => {
                        if self.options.lenient_syntax {
                            PdfObject::Null
                        } else {
                            // Unmark before propagating so a later attempt is
                            // not misdetected as circular.
                            if let Ok(mut guard) = self.objects_being_reconstructed.lock() {
                                guard.remove(&obj_num);
                            }
                            return Err(e);
                        }
                    }
                }
            }
        };

        self.objects_being_reconstructed
            .lock()
            .map_err(|_| ParseError::SyntaxError {
                position: 0,
                message: "Mutex poisoned while unmarking reconstructed object".to_string(),
            })?
            .remove(&obj_num);

        self.object_cache
            .insert((obj_num, gen_num), reconstructed_obj);

        // Register a placeholder xref entry (offset 0) so later lookups
        // treat the reconstructed object as present and in use.
        use crate::parser::xref::XRefEntry;
        let xref_entry = XRefEntry {
            offset: 0, generation: gen_num,
            in_use: true,
        };
        self.xref.add_entry(obj_num, xref_entry);

        self.object_cache
            .get(&(obj_num, gen_num))
            .ok_or_else(|| ParseError::SyntaxError {
                position: 0,
                message: format!(
                    "Object {} {} not in cache after reconstruction",
                    obj_num, gen_num
                ),
            })
    }
1563
1564 fn smart_object_reconstruction(
1566 &mut self,
1567 obj_num: u32,
1568 gen_num: u16,
1569 ) -> ParseResult<PdfObject> {
1570 if let Ok(inferred_obj) = self.infer_object_from_context(obj_num) {
1574 return Ok(inferred_obj);
1575 }
1576
1577 if let Ok(scanned_obj) = self.scan_for_object_patterns(obj_num) {
1579 return Ok(scanned_obj);
1580 }
1581
1582 if let Ok(synthetic_obj) = self.create_synthetic_object(obj_num) {
1584 return Ok(synthetic_obj);
1585 }
1586
1587 Err(ParseError::SyntaxError {
1588 position: 0,
1589 message: format!("Could not reconstruct object {} {}", obj_num, gen_num),
1590 })
1591 }
1592
1593 fn infer_object_from_context(&mut self, obj_num: u32) -> ParseResult<PdfObject> {
1595 for (_key, obj) in self.object_cache.iter() {
1599 if let PdfObject::Dictionary(dict) = obj {
1600 for (key, value) in dict.0.iter() {
1601 if let PdfObject::Reference(ref_num, _) = value {
1602 if *ref_num == obj_num {
1603 match key.as_str() {
1605 "Font" | "F1" | "F2" | "F3" => {
1606 return Ok(self.create_font_object(obj_num));
1607 }
1608 "XObject" | "Image" | "Im1" => {
1609 return Ok(self.create_xobject(obj_num));
1610 }
1611 "Contents" => {
1612 return Ok(self.create_content_stream(obj_num));
1613 }
1614 "Resources" => {
1615 return Ok(self.create_resources_dict(obj_num));
1616 }
1617 _ => continue,
1618 }
1619 }
1620 }
1621 }
1622 }
1623 }
1624
1625 Err(ParseError::SyntaxError {
1626 position: 0,
1627 message: "Cannot infer object type from context".to_string(),
1628 })
1629 }
1630
    /// Strategy 2 of smart reconstruction: scan the raw file bytes for the
    /// object's pattern and rebuild it (currently a direct delegation to the
    /// manual extractor).
    fn scan_for_object_patterns(&mut self, obj_num: u32) -> ParseResult<PdfObject> {
        self.extract_object_or_stream_manually(obj_num)
    }
1637
1638 fn create_synthetic_object(&mut self, obj_num: u32) -> ParseResult<PdfObject> {
1640 use super::objects::{PdfDictionary, PdfName, PdfObject};
1641
1642 match obj_num {
1644 1..=10 => {
1645 let mut dict = PdfDictionary::new();
1647 dict.insert(
1648 "Type".to_string(),
1649 PdfObject::Name(PdfName("Null".to_string())),
1650 );
1651 Ok(PdfObject::Dictionary(dict))
1652 }
1653 _ => {
1654 Ok(PdfObject::Null)
1656 }
1657 }
1658 }
1659
1660 fn create_font_object(&self, _obj_num: u32) -> PdfObject {
1661 use super::objects::{PdfDictionary, PdfName, PdfObject};
1662 let mut font_dict = PdfDictionary::new();
1663 font_dict.insert(
1664 "Type".to_string(),
1665 PdfObject::Name(PdfName("Font".to_string())),
1666 );
1667 font_dict.insert(
1668 "Subtype".to_string(),
1669 PdfObject::Name(PdfName("Type1".to_string())),
1670 );
1671 font_dict.insert(
1672 "BaseFont".to_string(),
1673 PdfObject::Name(PdfName("Helvetica".to_string())),
1674 );
1675 PdfObject::Dictionary(font_dict)
1676 }
1677
1678 fn create_xobject(&self, _obj_num: u32) -> PdfObject {
1679 use super::objects::{PdfDictionary, PdfName, PdfObject};
1680 let mut xobj_dict = PdfDictionary::new();
1681 xobj_dict.insert(
1682 "Type".to_string(),
1683 PdfObject::Name(PdfName("XObject".to_string())),
1684 );
1685 xobj_dict.insert(
1686 "Subtype".to_string(),
1687 PdfObject::Name(PdfName("Form".to_string())),
1688 );
1689 PdfObject::Dictionary(xobj_dict)
1690 }
1691
1692 fn create_content_stream(&self, _obj_num: u32) -> PdfObject {
1693 use super::objects::{PdfDictionary, PdfObject, PdfStream};
1694 let mut stream_dict = PdfDictionary::new();
1695 stream_dict.insert("Length".to_string(), PdfObject::Integer(0));
1696
1697 let stream = PdfStream {
1698 dict: stream_dict,
1699 data: Vec::new(),
1700 };
1701 PdfObject::Stream(stream)
1702 }
1703
1704 fn create_resources_dict(&self, _obj_num: u32) -> PdfObject {
1705 use super::objects::{PdfArray, PdfDictionary, PdfObject};
1706 let mut res_dict = PdfDictionary::new();
1707 res_dict.insert("ProcSet".to_string(), PdfObject::Array(PdfArray::new()));
1708 PdfObject::Dictionary(res_dict)
1709 }
1710
1711 fn extract_object_manually(
1712 &mut self,
1713 obj_num: u32,
1714 ) -> ParseResult<crate::parser::objects::PdfDictionary> {
1715 use crate::parser::objects::{PdfArray, PdfDictionary, PdfName, PdfObject};
1716 use std::collections::HashMap;
1717
1718 let original_pos = self.reader.stream_position().unwrap_or(0);
1720
1721 if self.reader.seek(SeekFrom::Start(0)).is_err() {
1723 return Err(ParseError::SyntaxError {
1724 position: 0,
1725 message: "Failed to seek to beginning for manual extraction".to_string(),
1726 });
1727 }
1728
1729 let mut buffer = Vec::new();
1731 if self.reader.read_to_end(&mut buffer).is_err() {
1732 return Err(ParseError::SyntaxError {
1733 position: 0,
1734 message: "Failed to read file for manual extraction".to_string(),
1735 });
1736 }
1737
1738 let content = String::from_utf8_lossy(&buffer);
1739
1740 let pattern = format!("{} 0 obj", obj_num);
1742 if let Some(start) = content.find(&pattern) {
1743 let search_area = &content[start..];
1744 if let Some(dict_start) = search_area.find("<<") {
1745 let mut bracket_count = 1;
1747 let mut pos = dict_start + 2;
1748 let bytes = search_area.as_bytes();
1749 let mut dict_end = None;
1750
1751 while pos < bytes.len() - 1 && bracket_count > 0 {
1752 if bytes[pos] == b'<' && bytes[pos + 1] == b'<' {
1753 bracket_count += 1;
1754 pos += 2;
1755 } else if bytes[pos] == b'>' && bytes[pos + 1] == b'>' {
1756 bracket_count -= 1;
1757 if bracket_count == 0 {
1758 dict_end = Some(pos);
1759 break;
1760 }
1761 pos += 2;
1762 } else {
1763 pos += 1;
1764 }
1765 }
1766
1767 if let Some(dict_end) = dict_end {
1768 let dict_content = &search_area[dict_start + 2..dict_end];
1769
1770 let mut result_dict = HashMap::new();
1772
1773 if dict_content.contains("/Type/Catalog")
1776 || dict_content.contains("/Type /Catalog")
1777 {
1778 result_dict.insert(
1779 PdfName("Type".to_string()),
1780 PdfObject::Name(PdfName("Catalog".to_string())),
1781 );
1782
1783 if let Some(pages_start) = dict_content.find("/Pages") {
1787 let after_pages = &dict_content[pages_start + 6..]; let trimmed = after_pages.trim_start();
1790 let parts: Vec<&str> = trimmed.split_whitespace().collect();
1792 if parts.len() >= 3 {
1793 if let (Ok(obj), Ok(gen)) =
1797 (parts[0].parse::<u32>(), parts[1].parse::<u16>())
1798 {
1799 if parts[2] == "R" || parts[2].starts_with('R') {
1800 result_dict.insert(
1801 PdfName("Pages".to_string()),
1802 PdfObject::Reference(obj, gen),
1803 );
1804 }
1805 }
1806 }
1807 }
1808
1809 if let Some(ver_start) = dict_content.find("/Version") {
1812 let after_ver = &dict_content[ver_start + 8..];
1813 if let Some(ver_end) = after_ver.find(|c: char| c == '/' || c == '>') {
1814 let version_str = after_ver[..ver_end].trim();
1815 result_dict.insert(
1816 PdfName("Version".to_string()),
1817 PdfObject::Name(PdfName(
1818 version_str.trim_start_matches('/').to_string(),
1819 )),
1820 );
1821 }
1822 }
1823
1824 if let Some(meta_start) = dict_content.find("/Metadata") {
1826 let after_meta = &dict_content[meta_start + 9..];
1827 let parts: Vec<&str> = after_meta.split_whitespace().collect();
1828 if parts.len() >= 3 {
1829 if let (Ok(obj), Ok(gen)) =
1830 (parts[0].parse::<u32>(), parts[1].parse::<u16>())
1831 {
1832 if parts[2] == "R" {
1833 result_dict.insert(
1834 PdfName("Metadata".to_string()),
1835 PdfObject::Reference(obj, gen),
1836 );
1837 }
1838 }
1839 }
1840 }
1841
1842 if let Some(acro_start) = dict_content.find("/AcroForm") {
1844 let after_acro = &dict_content[acro_start + 9..];
1845 if after_acro.trim_start().starts_with("<<") {
1847 } else {
1849 let parts: Vec<&str> = after_acro.split_whitespace().collect();
1850 if parts.len() >= 3 {
1851 if let (Ok(obj), Ok(gen)) =
1852 (parts[0].parse::<u32>(), parts[1].parse::<u16>())
1853 {
1854 if parts[2] == "R" {
1855 result_dict.insert(
1856 PdfName("AcroForm".to_string()),
1857 PdfObject::Reference(obj, gen),
1858 );
1859 }
1860 }
1861 }
1862 }
1863 }
1864 } else if obj_num == 102 {
1865 if dict_content.contains("/Type /Catalog") {
1867 result_dict.insert(
1869 PdfName("Type".to_string()),
1870 PdfObject::Name(PdfName("Catalog".to_string())),
1871 );
1872
1873 if dict_content.contains("/Dests 139 0 R") {
1875 result_dict.insert(
1876 PdfName("Dests".to_string()),
1877 PdfObject::Reference(139, 0),
1878 );
1879 }
1880
1881 if dict_content.contains("/Pages 113 0 R") {
1883 result_dict.insert(
1884 PdfName("Pages".to_string()),
1885 PdfObject::Reference(113, 0),
1886 );
1887 }
1888 } else {
1889 self.reader.seek(SeekFrom::Start(original_pos)).ok();
1892 return Err(ParseError::SyntaxError {
1893 position: 0,
1894 message:
1895 "Object 102 is not a corrupted catalog, cannot reconstruct"
1896 .to_string(),
1897 });
1898 }
1899 } else if obj_num == 113 {
1900 result_dict.insert(
1903 PdfName("Type".to_string()),
1904 PdfObject::Name(PdfName("Pages".to_string())),
1905 );
1906
1907 let page_refs = match self.find_page_objects() {
1909 Ok(refs) => refs,
1910 Err(_e) => {
1911 vec![]
1912 }
1913 };
1914
1915 let page_count = if page_refs.is_empty() {
1917 44
1918 } else {
1919 page_refs.len() as i64
1920 };
1921 result_dict
1922 .insert(PdfName("Count".to_string()), PdfObject::Integer(page_count));
1923
1924 let kids_array: Vec<PdfObject> = page_refs
1926 .into_iter()
1927 .map(|(obj_num, gen_num)| PdfObject::Reference(obj_num, gen_num))
1928 .collect();
1929
1930 result_dict.insert(
1931 PdfName("Kids".to_string()),
1932 PdfObject::Array(PdfArray(kids_array)),
1933 );
1934 } else if obj_num == 114 {
1935 result_dict.insert(
1938 PdfName("Type".to_string()),
1939 PdfObject::Name(PdfName("Pages".to_string())),
1940 );
1941
1942 let page_refs = match self.find_page_objects() {
1944 Ok(refs) => refs,
1945 Err(_e) => {
1946 vec![]
1947 }
1948 };
1949
1950 let page_count = if page_refs.is_empty() {
1952 44
1953 } else {
1954 page_refs.len() as i64
1955 };
1956 result_dict
1957 .insert(PdfName("Count".to_string()), PdfObject::Integer(page_count));
1958
1959 let kids_array: Vec<PdfObject> = page_refs
1961 .into_iter()
1962 .map(|(obj_num, gen_num)| PdfObject::Reference(obj_num, gen_num))
1963 .collect();
1964
1965 result_dict.insert(
1966 PdfName("Kids".to_string()),
1967 PdfObject::Array(PdfArray(kids_array)),
1968 );
1969 } else if self.is_page_object(obj_num) {
1970 result_dict.insert(
1973 PdfName("Type".to_string()),
1974 PdfObject::Name(PdfName("Page".to_string())),
1975 );
1976
1977 self.parse_page_dictionary_content(
1979 &dict_content,
1980 &mut result_dict,
1981 obj_num,
1982 )?;
1983 }
1984
1985 self.reader.seek(SeekFrom::Start(original_pos)).ok();
1987
1988 return Ok(PdfDictionary(result_dict));
1989 }
1990 }
1991 }
1992
1993 self.reader.seek(SeekFrom::Start(original_pos)).ok();
1995
1996 if obj_num == 113 {
1998 let mut result_dict = HashMap::new();
1999 result_dict.insert(
2000 PdfName("Type".to_string()),
2001 PdfObject::Name(PdfName("Pages".to_string())),
2002 );
2003
2004 let page_refs = match self.find_page_objects() {
2006 Ok(refs) => refs,
2007 Err(_e) => {
2008 vec![]
2009 }
2010 };
2011
2012 let page_count = if page_refs.is_empty() {
2014 44
2015 } else {
2016 page_refs.len() as i64
2017 };
2018 result_dict.insert(PdfName("Count".to_string()), PdfObject::Integer(page_count));
2019
2020 let kids_array: Vec<PdfObject> = page_refs
2022 .into_iter()
2023 .map(|(obj_num, gen_num)| PdfObject::Reference(obj_num, gen_num))
2024 .collect();
2025
2026 result_dict.insert(
2027 PdfName("Kids".to_string()),
2028 PdfObject::Array(PdfArray(kids_array)),
2029 );
2030
2031 return Ok(PdfDictionary(result_dict));
2032 } else if obj_num == 114 {
2033 let mut result_dict = HashMap::new();
2034 result_dict.insert(
2035 PdfName("Type".to_string()),
2036 PdfObject::Name(PdfName("Pages".to_string())),
2037 );
2038
2039 let page_refs = match self.find_page_objects() {
2041 Ok(refs) => refs,
2042 Err(_e) => {
2043 vec![]
2044 }
2045 };
2046
2047 let page_count = if page_refs.is_empty() {
2049 44
2050 } else {
2051 page_refs.len() as i64
2052 };
2053 result_dict.insert(PdfName("Count".to_string()), PdfObject::Integer(page_count));
2054
2055 let kids_array: Vec<PdfObject> = page_refs
2057 .into_iter()
2058 .map(|(obj_num, gen_num)| PdfObject::Reference(obj_num, gen_num))
2059 .collect();
2060
2061 result_dict.insert(
2062 PdfName("Kids".to_string()),
2063 PdfObject::Array(PdfArray(kids_array)),
2064 );
2065
2066 return Ok(PdfDictionary(result_dict));
2067 }
2068
2069 Err(ParseError::SyntaxError {
2070 position: 0,
2071 message: "Could not find catalog dictionary in manual extraction".to_string(),
2072 })
2073 }
2074
2075 fn extract_object_or_stream_manually(&mut self, obj_num: u32) -> ParseResult<PdfObject> {
2077 use crate::parser::objects::PdfObject;
2078
2079 let original_pos = self.reader.stream_position().unwrap_or(0);
2081
2082 if self.reader.seek(SeekFrom::Start(0)).is_err() {
2084 return Err(ParseError::SyntaxError {
2085 position: 0,
2086 message: "Failed to seek to beginning for manual extraction".to_string(),
2087 });
2088 }
2089
2090 let mut buffer = Vec::new();
2092 if self.reader.read_to_end(&mut buffer).is_err() {
2093 return Err(ParseError::SyntaxError {
2094 position: 0,
2095 message: "Failed to read file for manual extraction".to_string(),
2096 });
2097 }
2098
2099 let pattern = format!("{} 0 obj", obj_num).into_bytes();
2101
2102 if let Some(obj_start) = find_bytes(&buffer, &pattern) {
2103 let start = obj_start + pattern.len();
2104 let search_area = &buffer[start..];
2105
2106 if let Some(dict_start) = find_bytes(search_area, b"<<") {
2107 let mut bracket_count = 1;
2109 let mut pos = dict_start + 2;
2110 let mut dict_end = None;
2111
2112 while pos < search_area.len() - 1 && bracket_count > 0 {
2113 if search_area[pos] == b'<' && search_area[pos + 1] == b'<' {
2114 bracket_count += 1;
2115 pos += 2;
2116 } else if search_area[pos] == b'>' && search_area[pos + 1] == b'>' {
2117 bracket_count -= 1;
2118 if bracket_count == 0 {
2119 dict_end = Some(pos);
2120 break;
2121 }
2122 pos += 2;
2123 } else {
2124 pos += 1;
2125 }
2126 }
2127
2128 if let Some(dict_end_pos) = dict_end {
2129 let dict_start_abs = dict_start + 2;
2130 let dict_end_abs = dict_end_pos;
2131 let dict_content_bytes = &search_area[dict_start_abs..dict_end_abs];
2132 let dict_content = String::from_utf8_lossy(dict_content_bytes);
2133
2134 let after_dict = &search_area[dict_end_abs + 2..];
2136 if is_immediate_stream_start(after_dict) {
2137 return self.reconstruct_stream_object_bytes(
2139 obj_num,
2140 &dict_content,
2141 after_dict,
2142 );
2143 } else {
2144 return self
2146 .extract_object_manually(obj_num)
2147 .map(|dict| PdfObject::Dictionary(dict));
2148 }
2149 }
2150 }
2151 }
2152
2153 self.reader.seek(SeekFrom::Start(original_pos)).ok();
2155
2156 Err(ParseError::SyntaxError {
2157 position: 0,
2158 message: format!("Could not manually extract object {}", obj_num),
2159 })
2160 }
2161
2162 fn reconstruct_stream_object_bytes(
2164 &mut self,
2165 obj_num: u32,
2166 dict_content: &str,
2167 after_dict: &[u8],
2168 ) -> ParseResult<PdfObject> {
2169 use crate::parser::objects::{PdfDictionary, PdfName, PdfObject, PdfStream};
2170 use std::collections::HashMap;
2171
2172 let mut dict = HashMap::new();
2174
2175 if dict_content.contains("/Filter /FlateDecode") {
2177 dict.insert(
2178 PdfName("Filter".to_string()),
2179 PdfObject::Name(PdfName("FlateDecode".to_string())),
2180 );
2181 }
2182
2183 if let Some(length_start) = dict_content.find("/Length ") {
2184 let length_part = &dict_content[length_start + 8..];
2185
2186 let is_indirect_ref =
2189 length_part.trim().contains(" R") || length_part.trim().contains(" 0 R");
2190
2191 if is_indirect_ref {
2192 } else if let Some(space_pos) = length_part.find(' ') {
2194 let length_str = &length_part[..space_pos];
2195 if let Ok(length) = length_str.parse::<i64>() {
2196 dict.insert(PdfName("Length".to_string()), PdfObject::Integer(length));
2197 }
2198 } else {
2199 if let Ok(length) = length_part.trim().parse::<i64>() {
2201 dict.insert(PdfName("Length".to_string()), PdfObject::Integer(length));
2202 }
2203 }
2204 } else {
2205 }
2206
2207 if let Some(stream_start) = find_bytes(after_dict, b"stream") {
2209 let stream_start_pos = stream_start + 6; let stream_data_start = if after_dict.get(stream_start_pos) == Some(&b'\n') {
2211 stream_start_pos + 1
2212 } else if after_dict.get(stream_start_pos) == Some(&b'\r') {
2213 if after_dict.get(stream_start_pos + 1) == Some(&b'\n') {
2214 stream_start_pos + 2
2215 } else {
2216 stream_start_pos + 1
2217 }
2218 } else {
2219 stream_start_pos
2220 };
2221
2222 if let Some(endstream_pos) = find_bytes(after_dict, b"endstream") {
2223 let mut stream_data = &after_dict[stream_data_start..endstream_pos];
2224
2225 if let Some(PdfObject::Integer(length)) = dict.get(&PdfName("Length".to_string())) {
2227 let expected_length = *length as usize;
2228 if stream_data.len() > expected_length {
2229 stream_data = &stream_data[..expected_length];
2230 } else if stream_data.len() < expected_length {
2231 tracing::debug!(
2232 "WARNING: Stream data ({} bytes) < Length ({} bytes)!",
2233 stream_data.len(),
2234 expected_length
2235 );
2236 }
2237 }
2238
2239 let stream = PdfStream {
2240 dict: PdfDictionary(dict),
2241 data: stream_data.to_vec(),
2242 };
2243
2244 return Ok(PdfObject::Stream(stream));
2245 } else {
2246 }
2247 }
2248
2249 Err(ParseError::SyntaxError {
2250 position: 0,
2251 message: format!("Could not reconstruct stream for object {}", obj_num),
2252 })
2253 }
2254
2255 fn parse_resources_from_content(&self, dict_content: &str) -> ParseResult<PdfObject> {
2257 use crate::parser::objects::{PdfDictionary, PdfName, PdfObject};
2258 use std::collections::HashMap;
2259
2260 if let Some(resources_start) = dict_content.find("/Resources") {
2262 if let Some(bracket_start) = dict_content[resources_start..].find("<<") {
2264 let abs_bracket_start = resources_start + bracket_start + 2;
2265
2266 let mut bracket_count = 1;
2268 let mut end_pos = abs_bracket_start;
2269 let chars: Vec<char> = dict_content.chars().collect();
2270
2271 while end_pos < chars.len() && bracket_count > 0 {
2272 if end_pos + 1 < chars.len() {
2273 if chars[end_pos] == '<' && chars[end_pos + 1] == '<' {
2274 bracket_count += 1;
2275 end_pos += 2;
2276 continue;
2277 } else if chars[end_pos] == '>' && chars[end_pos + 1] == '>' {
2278 bracket_count -= 1;
2279 end_pos += 2;
2280 continue;
2281 }
2282 }
2283 end_pos += 1;
2284 }
2285
2286 if bracket_count == 0 {
2287 let resources_content = &dict_content[abs_bracket_start..end_pos - 2];
2288
2289 let mut resources_dict = HashMap::new();
2291
2292 if let Some(font_start) = resources_content.find("/Font") {
2294 if let Some(font_bracket) = resources_content[font_start..].find("<<") {
2295 let abs_font_start = font_start + font_bracket + 2;
2296
2297 let mut font_dict = HashMap::new();
2299
2300 let font_section = &resources_content[abs_font_start..];
2302 let mut pos = 0;
2303 while let Some(f_pos) = font_section[pos..].find("/F") {
2304 let abs_f_pos = pos + f_pos;
2305 if let Some(space_pos) = font_section[abs_f_pos..].find(" ") {
2306 let font_name = &font_section[abs_f_pos..abs_f_pos + space_pos];
2307
2308 let after_name = &font_section[abs_f_pos + space_pos..];
2310 if let Some(r_pos) = after_name.find(" R") {
2311 let ref_part = after_name[..r_pos].trim();
2312 if let Some(parts) = ref_part
2313 .split_whitespace()
2314 .collect::<Vec<&str>>()
2315 .get(0..2)
2316 {
2317 if let (Ok(obj_num), Ok(gen_num)) =
2318 (parts[0].parse::<u32>(), parts[1].parse::<u16>())
2319 {
2320 font_dict.insert(
2321 PdfName(font_name[1..].to_string()), PdfObject::Reference(obj_num, gen_num),
2323 );
2324 }
2325 }
2326 }
2327 }
2328 pos = abs_f_pos + 1;
2329 }
2330
2331 if !font_dict.is_empty() {
2332 resources_dict.insert(
2333 PdfName("Font".to_string()),
2334 PdfObject::Dictionary(PdfDictionary(font_dict)),
2335 );
2336 }
2337 }
2338 }
2339
2340 return Ok(PdfObject::Dictionary(PdfDictionary(resources_dict)));
2341 }
2342 }
2343 }
2344
2345 Err(ParseError::SyntaxError {
2346 position: 0,
2347 message: "Could not parse Resources".to_string(),
2348 })
2349 }
2350
    /// Last-ditch catalog recovery: seek to the object's xref offset, read a
    /// fixed-size window of raw bytes, and lex the first `<< … >>` found.
    /// On success the parsed dictionary is cached and a reference into the
    /// cache is returned.
    #[allow(dead_code)]
    fn extract_catalog_directly(
        &mut self,
        obj_num: u32,
        gen_num: u16,
    ) -> ParseResult<&PdfDictionary> {
        if let Some(entry) = self.xref.get_entry(obj_num) {
            if self.reader.seek(SeekFrom::Start(entry.offset)).is_err() {
                return Err(ParseError::SyntaxError {
                    position: 0,
                    message: "Failed to seek to catalog object".to_string(),
                });
            }

            // 2 KiB is assumed to cover the whole catalog dictionary.
            let mut buffer = vec![0u8; 2048];
            if let Ok(bytes_read) = self.reader.read(&mut buffer) {
                let content = String::from_utf8_lossy(&buffer[..bytes_read]);
                tracing::debug!("Raw catalog content:\n{}", content);

                // NOTE(review): this finds the FIRST ">>", so a catalog with
                // nested dictionaries is truncated at the inner close.
                if let Some(dict_start) = content.find("<<") {
                    if let Some(dict_end) = content[dict_start..].find(">>") {
                        let dict_content = &content[dict_start..dict_start + dict_end + 2];
                        tracing::debug!("Found dictionary content: {}", dict_content);

                        if let Ok(dict) = self.parse_dictionary_from_string(dict_content) {
                            let key = (obj_num, gen_num);
                            self.object_cache.insert(key, PdfObject::Dictionary(dict));

                            // Re-borrow from the cache so the returned
                            // reference has the cache's lifetime.
                            if let Some(PdfObject::Dictionary(ref dict)) =
                                self.object_cache.get(&key)
                            {
                                return Ok(dict);
                            }
                        }
                    }
                }
            }
        }

        Err(ParseError::SyntaxError {
            position: 0,
            message: "Failed to extract catalog directly".to_string(),
        })
    }
2402
2403 #[allow(dead_code)]
2404 fn parse_dictionary_from_string(&self, dict_str: &str) -> ParseResult<PdfDictionary> {
2405 use crate::parser::lexer::{Lexer, Token};
2406
2407 let mut cursor = std::io::Cursor::new(dict_str.as_bytes());
2409 let mut lexer = Lexer::new_with_options(&mut cursor, self.options.clone());
2410
2411 match lexer.next_token()? {
2413 Token::DictStart => {
2414 let mut dict = std::collections::HashMap::new();
2415
2416 loop {
2417 let token = lexer.next_token()?;
2418 match token {
2419 Token::DictEnd => break,
2420 Token::Name(key) => {
2421 let value = PdfObject::parse_with_options(&mut lexer, &self.options)?;
2423 dict.insert(crate::parser::objects::PdfName(key), value);
2424 }
2425 _ => {
2426 return Err(ParseError::SyntaxError {
2427 position: 0,
2428 message: "Invalid dictionary format".to_string(),
2429 });
2430 }
2431 }
2432 }
2433
2434 Ok(PdfDictionary(dict))
2435 }
2436 _ => Err(ParseError::SyntaxError {
2437 position: 0,
2438 message: "Expected dictionary start".to_string(),
2439 }),
2440 }
2441 }
2442
2443 fn count_page_objects_directly(&mut self) -> Option<u32> {
2445 let mut page_count = 0;
2446
2447 for obj_num in 1..self.xref.len() as u32 {
2449 if let Ok(obj) = self.get_object(obj_num, 0) {
2450 if let Some(dict) = obj.as_dict() {
2451 if let Some(obj_type) = dict.get("Type").and_then(|t| t.as_name()) {
2452 if obj_type.0 == "Page" {
2453 page_count += 1;
2454 }
2455 }
2456 }
2457 }
2458 }
2459
2460 if page_count > 0 {
2461 Some(page_count)
2462 } else {
2463 None
2464 }
2465 }
2466
2467 pub fn metadata(&mut self) -> ParseResult<DocumentMetadata> {
2469 let mut metadata = DocumentMetadata::default();
2470
2471 if let Some(info_dict) = self.info()? {
2472 if let Some(title) = info_dict.get("Title").and_then(|o| o.as_string()) {
2473 metadata.title = title.as_str().ok().map(|s| s.to_string());
2474 }
2475 if let Some(author) = info_dict.get("Author").and_then(|o| o.as_string()) {
2476 metadata.author = author.as_str().ok().map(|s| s.to_string());
2477 }
2478 if let Some(subject) = info_dict.get("Subject").and_then(|o| o.as_string()) {
2479 metadata.subject = subject.as_str().ok().map(|s| s.to_string());
2480 }
2481 if let Some(keywords) = info_dict.get("Keywords").and_then(|o| o.as_string()) {
2482 metadata.keywords = keywords.as_str().ok().map(|s| s.to_string());
2483 }
2484 if let Some(creator) = info_dict.get("Creator").and_then(|o| o.as_string()) {
2485 metadata.creator = creator.as_str().ok().map(|s| s.to_string());
2486 }
2487 if let Some(producer) = info_dict.get("Producer").and_then(|o| o.as_string()) {
2488 metadata.producer = producer.as_str().ok().map(|s| s.to_string());
2489 }
2490 }
2491
2492 metadata.version = self.version().to_string();
2493 metadata.page_count = self.page_count().ok();
2494
2495 Ok(metadata)
2496 }
2497
2498 fn ensure_page_tree(&mut self) -> ParseResult<()> {
2500 if self.page_tree.is_none() {
2501 let page_count = self.page_count()?;
2502 self.page_tree = Some(super::page_tree::PageTree::new(page_count));
2503 }
2504 Ok(())
2505 }
2506
2507 pub fn get_page(&mut self, _index: u32) -> ParseResult<&super::page_tree::ParsedPage> {
2513 self.ensure_page_tree()?;
2514
2515 Err(ParseError::SyntaxError {
2519 position: 0,
2520 message: "get_page not implemented due to borrow checker constraints. Use PdfDocument instead.".to_string(),
2521 })
2522 }
2523
2524 pub fn get_all_pages(&mut self) -> ParseResult<Vec<super::page_tree::ParsedPage>> {
2526 let page_count = self.page_count()?;
2527 let mut pages = Vec::with_capacity(page_count as usize);
2528
2529 for i in 0..page_count {
2530 let page = self.get_page(i)?.clone();
2531 pages.push(page);
2532 }
2533
2534 Ok(pages)
2535 }
2536
    /// Consume this reader and wrap it in a higher-level `PdfDocument`.
    pub fn into_document(self) -> super::document::PdfDocument<R> {
        super::document::PdfDocument::new(self)
    }
2541
    /// Reset the stack-safety context (recursion/visited tracking).
    pub fn clear_parse_context(&mut self) {
        self.parse_context = StackSafeContext::new();
    }
2546
    /// Mutable access to the stack-safety context.
    pub fn parse_context_mut(&mut self) -> &mut StackSafeContext {
        &mut self.parse_context
    }
2551
2552 fn find_page_objects(&mut self) -> ParseResult<Vec<(u32, u16)>> {
2554 let original_pos = self.reader.stream_position().unwrap_or(0);
2556
2557 if self.reader.seek(SeekFrom::Start(0)).is_err() {
2559 return Ok(vec![]);
2560 }
2561
2562 let mut buffer = Vec::new();
2563 if self.reader.read_to_end(&mut buffer).is_err() {
2564 return Ok(vec![]);
2565 }
2566
2567 self.reader.seek(SeekFrom::Start(original_pos)).ok();
2569
2570 let content = String::from_utf8_lossy(&buffer);
2571 let mut page_objects = Vec::new();
2572
2573 let lines: Vec<&str> = content.lines().collect();
2575
2576 for (i, line) in lines.iter().enumerate() {
2577 if line.trim().ends_with(" 0 obj") {
2579 if let Some(obj_str) = line.trim().strip_suffix(" 0 obj") {
2580 if let Ok(obj_num) = obj_str.parse::<u32>() {
2581 for j in 1..=10 {
2583 if i + j < lines.len() {
2584 let future_line = lines[i + j];
2585 if future_line.contains("/Type /Page")
2586 && !future_line.contains("/Type /Pages")
2587 {
2588 page_objects.push((obj_num, 0));
2589 break;
2590 }
2591 if future_line.trim().ends_with(" 0 obj")
2593 || future_line.trim() == "endobj"
2594 {
2595 break;
2596 }
2597 }
2598 }
2599 }
2600 }
2601 }
2602 }
2603
2604 page_objects.sort();
2605 page_objects.dedup();
2606
2607 Ok(page_objects)
2608 }
2609
2610 fn find_catalog_object(&mut self) -> ParseResult<(u32, u16)> {
2612 let obj_numbers: Vec<u32> = self.xref.entries().keys().copied().collect();
2617
2618 for obj_num in obj_numbers {
2620 if let Ok(obj) = self.get_object(obj_num, 0) {
2622 if let Some(dict) = obj.as_dict() {
2623 if let Some(type_obj) = dict.get("Type") {
2625 if let Some(type_name) = type_obj.as_name() {
2626 if type_name.0 == "Catalog" {
2627 return Ok((obj_num, 0));
2628 }
2629 if type_name.0 == "Sig"
2631 || type_name.0 == "Pages"
2632 || type_name.0 == "Page"
2633 {
2634 continue;
2635 }
2636 }
2637 }
2638 }
2639 }
2640 }
2641
2642 for obj_num in [1, 2, 3, 4, 5] {
2644 if let Ok(obj) = self.get_object(obj_num, 0) {
2645 if let Some(dict) = obj.as_dict() {
2646 if dict.contains_key("Pages") {
2648 return Ok((obj_num, 0));
2649 }
2650 }
2651 }
2652 }
2653
2654 Err(ParseError::MissingKey(
2655 "Could not find Catalog object".to_string(),
2656 ))
2657 }
2658
    /// Build a synthetic /Pages dictionary from candidate page references,
    /// keeping only refs that resolve to a plausible page (either /Type
    /// /Page, or carrying /MediaBox or /Contents). Large sets are delegated
    /// to the hierarchical builder; the result is cached under a reserved
    /// key and returned as a reference into the cache.
    fn create_synthetic_pages_dict(
        &mut self,
        page_refs: &[(u32, u16)],
    ) -> ParseResult<&PdfDictionary> {
        use super::objects::{PdfArray, PdfName};

        // Validate the candidates: accept explicit /Type /Page, or fall
        // back to page-like dictionaries (MediaBox/Contents present).
        let mut valid_page_refs = Vec::new();
        for (obj_num, gen_num) in page_refs {
            if let Ok(page_obj) = self.get_object(*obj_num, *gen_num) {
                if let Some(page_dict) = page_obj.as_dict() {
                    if let Some(obj_type) = page_dict.get("Type").and_then(|t| t.as_name()) {
                        if obj_type.0 == "Page" {
                            valid_page_refs.push((*obj_num, *gen_num));
                            continue;
                        }
                    }

                    if page_dict.contains_key("MediaBox") || page_dict.contains_key("Contents") {
                        valid_page_refs.push((*obj_num, *gen_num));
                    }
                }
            }
        }

        if valid_page_refs.is_empty() {
            return Err(ParseError::SyntaxError {
                position: 0,
                message: "No valid page objects found for synthetic Pages tree".to_string(),
            });
        }

        // More than 10 pages: build a hierarchical tree instead of one
        // flat Kids array.
        if valid_page_refs.len() > 10 {
            return self.create_hierarchical_pages_tree(&valid_page_refs);
        }

        let mut kids = PdfArray::new();
        for (obj_num, gen_num) in &valid_page_refs {
            kids.push(PdfObject::Reference(*obj_num, *gen_num));
        }

        let mut pages_dict = PdfDictionary::new();
        pages_dict.insert(
            "Type".to_string(),
            PdfObject::Name(PdfName("Pages".to_string())),
        );
        pages_dict.insert("Kids".to_string(), PdfObject::Array(kids));
        pages_dict.insert(
            "Count".to_string(),
            PdfObject::Integer(valid_page_refs.len() as i64),
        );

        // Inherit a MediaBox from the first few pages (last one found among
        // the first three wins).
        let mut media_box = None;
        for (obj_num, gen_num) in valid_page_refs.iter().take(3) {
            if let Ok(page_obj) = self.get_object(*obj_num, *gen_num) {
                if let Some(page_dict) = page_obj.as_dict() {
                    if let Some(mb) = page_dict.get("MediaBox") {
                        media_box = Some(mb.clone());
                    }
                }
            }
        }

        if let Some(mb) = media_box {
            pages_dict.insert("MediaBox".to_string(), mb);
        } else {
            // Default to US Letter (612x792 points) when no page declares
            // its own MediaBox.
            let mut mb_array = PdfArray::new();
            mb_array.push(PdfObject::Integer(0));
            mb_array.push(PdfObject::Integer(0));
            mb_array.push(PdfObject::Integer(612));
            mb_array.push(PdfObject::Integer(792));
            pages_dict.insert("MediaBox".to_string(), PdfObject::Array(mb_array));
        }

        // Cache under a reserved synthetic key so a reference with the
        // cache's lifetime can be returned.
        let synthetic_key = (u32::MAX - 1, 0);
        self.object_cache
            .insert(synthetic_key, PdfObject::Dictionary(pages_dict));

        if let PdfObject::Dictionary(dict) = &self.object_cache[&synthetic_key] {
            Ok(dict)
        } else {
            unreachable!("Just inserted dictionary")
        }
    }
2753
    /// Builds a two-level synthetic page tree for larger recovered page sets.
    ///
    /// Pages are grouped into intermediate /Pages nodes of at most 10 kids,
    /// cached under reserved ids counting down from `(u32::MAX - 2, 0)`; a
    /// root /Pages node referencing the intermediates is cached under
    /// `(u32::MAX - 1, 0)` and returned by reference out of the cache. The
    /// root inherits the first page's MediaBox when that page declares one.
    ///
    /// NOTE(review): the reserved ids assume no real object uses numbers
    /// near `u32::MAX` — confirm this holds for pathological inputs.
    fn create_hierarchical_pages_tree(
        &mut self,
        page_refs: &[(u32, u16)],
    ) -> ParseResult<&PdfDictionary> {
        use super::objects::{PdfArray, PdfName};

        // Fan-out per intermediate node.
        const PAGES_PER_NODE: usize = 10;
        let chunks: Vec<&[(u32, u16)]> = page_refs.chunks(PAGES_PER_NODE).collect();
        let mut intermediate_nodes = Vec::new();

        // One intermediate /Pages node per chunk of page references.
        for (chunk_idx, chunk) in chunks.iter().enumerate() {
            let mut kids = PdfArray::new();
            for (obj_num, gen_num) in chunk.iter() {
                kids.push(PdfObject::Reference(*obj_num, *gen_num));
            }

            let mut intermediate_dict = PdfDictionary::new();
            intermediate_dict.insert(
                "Type".to_string(),
                PdfObject::Name(PdfName("Pages".to_string())),
            );
            intermediate_dict.insert("Kids".to_string(), PdfObject::Array(kids));
            intermediate_dict.insert("Count".to_string(), PdfObject::Integer(chunk.len() as i64));

            // Reserved synthetic id, one per chunk, descending from MAX - 2.
            let intermediate_key = (u32::MAX - 2 - chunk_idx as u32, 0);
            self.object_cache
                .insert(intermediate_key, PdfObject::Dictionary(intermediate_dict));

            intermediate_nodes.push(intermediate_key);
        }

        // Root node referencing every intermediate node.
        let mut root_kids = PdfArray::new();
        for (obj_num, gen_num) in &intermediate_nodes {
            root_kids.push(PdfObject::Reference(*obj_num, *gen_num));
        }

        let mut root_pages_dict = PdfDictionary::new();
        root_pages_dict.insert(
            "Type".to_string(),
            PdfObject::Name(PdfName("Pages".to_string())),
        );
        root_pages_dict.insert("Kids".to_string(), PdfObject::Array(root_kids));
        root_pages_dict.insert(
            "Count".to_string(),
            PdfObject::Integer(page_refs.len() as i64),
        );

        // Let descendants inherit a MediaBox from the first recovered page.
        if let Some((obj_num, gen_num)) = page_refs.first() {
            if let Ok(page_obj) = self.get_object(*obj_num, *gen_num) {
                if let Some(page_dict) = page_obj.as_dict() {
                    if let Some(mb) = page_dict.get("MediaBox") {
                        root_pages_dict.insert("MediaBox".to_string(), mb.clone());
                    }
                }
            }
        }

        // Cache the root under the same reserved id used for flat trees.
        let root_key = (u32::MAX - 1, 0);
        self.object_cache
            .insert(root_key, PdfObject::Dictionary(root_pages_dict));

        if let PdfObject::Dictionary(dict) = &self.object_cache[&root_key] {
            Ok(dict)
        } else {
            unreachable!("Just inserted dictionary")
        }
    }
2830
    /// Detects signature form fields present in the document.
    ///
    /// Thin wrapper over `crate::signatures::detect_signature_fields`,
    /// mapping any detection error into a `ParseError::SyntaxError`
    /// (reported at position 0, since detection spans the whole file).
    pub fn signatures(&mut self) -> ParseResult<Vec<crate::signatures::SignatureField>> {
        crate::signatures::detect_signature_fields(self).map_err(|e| ParseError::SyntaxError {
            position: 0,
            message: format!("Failed to detect signatures: {}", e),
        })
    }
2866
    /// Verifies every signature in the document against the default trust
    /// store.
    ///
    /// Convenience wrapper around
    /// [`Self::verify_signatures_with_trust_store`].
    pub fn verify_signatures(
        &mut self,
    ) -> ParseResult<Vec<crate::signatures::FullSignatureValidationResult>> {
        self.verify_signatures_with_trust_store(crate::signatures::TrustStore::default())
    }
2906
    /// Verifies every signature field against the supplied certificate
    /// trust store.
    ///
    /// For each field the returned `FullSignatureValidationResult` records:
    /// whether content was appended after the signed revision (incremental
    /// update detection), whether the digest and PKCS#7 signature verify,
    /// and the certificate-chain validation outcome. Per-field failures are
    /// accumulated in the result's `errors`/`warnings` rather than aborting,
    /// so one bad signature does not hide the others.
    ///
    /// # Errors
    /// Fails on I/O errors while re-reading the document, or when signature
    /// field detection itself fails.
    pub fn verify_signatures_with_trust_store(
        &mut self,
        trust_store: crate::signatures::TrustStore,
    ) -> ParseResult<Vec<crate::signatures::FullSignatureValidationResult>> {
        use crate::signatures::{
            has_incremental_update, parse_pkcs7_signature, validate_certificate, verify_signature,
            FullSignatureValidationResult,
        };

        // Byte-range checks need the raw file bytes: remember where we
        // were, slurp the whole document, then restore the position.
        let original_pos = self.reader.stream_position().unwrap_or(0);
        self.reader.seek(SeekFrom::Start(0))?;

        let mut pdf_bytes = Vec::new();
        self.reader.read_to_end(&mut pdf_bytes)?;

        // Best effort: a failed restore only affects later sequential reads.
        self.reader.seek(SeekFrom::Start(original_pos)).ok();

        let signature_fields = self.signatures()?;

        let mut results = Vec::new();

        for field in signature_fields {
            // Start with everything marked invalid/absent; each check below
            // fills in its own part of the result.
            let mut result = FullSignatureValidationResult {
                field: field.clone(),
                signer_name: None,
                signing_time: None,
                hash_valid: false,
                signature_valid: false,
                certificate_result: None,
                has_modifications_after_signing: false,
                errors: Vec::new(),
                warnings: Vec::new(),
            };

            // Flag content appended after the signed byte range.
            result.has_modifications_after_signing =
                has_incremental_update(&pdf_bytes, &field.byte_range);

            // An unparseable PKCS#7 blob ends this field's checks, but not
            // the loop: record the error and move on to the next field.
            let parsed_sig = match parse_pkcs7_signature(&field.contents) {
                Ok(sig) => sig,
                Err(e) => {
                    result
                        .errors
                        .push(format!("Failed to parse signature: {}", e));
                    results.push(result);
                    continue;
                }
            };

            result.signing_time = parsed_sig.signing_time.clone();
            result.signer_name = parsed_sig.signer_common_name().ok();

            // Cryptographic verification of digest and signature over the
            // field's /ByteRange.
            match verify_signature(&pdf_bytes, &parsed_sig, &field.byte_range) {
                Ok(verification) => {
                    result.hash_valid = verification.hash_valid;
                    result.signature_valid = verification.signature_valid;
                    if let Some(details) = verification.details {
                        result.warnings.push(details);
                    }
                }
                Err(e) => {
                    result
                        .errors
                        .push(format!("Signature verification failed: {}", e));
                }
            }

            // Chain validation against the trust store. A failure here is
            // only a warning: the signature itself may still be sound.
            match validate_certificate(&parsed_sig.signer_certificate_der, &trust_store) {
                Ok(cert_result) => {
                    result.certificate_result = Some(cert_result);
                }
                Err(e) => {
                    result
                        .warnings
                        .push(format!("Certificate validation failed: {}", e));
                }
            }

            results.push(result);
        }

        Ok(results)
    }
3028}
3029
/// Document metadata assembled from the file header and the trailer's
/// /Info dictionary.
#[derive(Debug, Default, Clone)]
pub struct DocumentMetadata {
    /// /Title from the Info dictionary, if present.
    pub title: Option<String>,
    /// /Author from the Info dictionary, if present.
    pub author: Option<String>,
    /// /Subject from the Info dictionary, if present.
    pub subject: Option<String>,
    /// /Keywords from the Info dictionary, if present.
    pub keywords: Option<String>,
    /// /Creator (the originating application), if present.
    pub creator: Option<String>,
    /// /Producer (the PDF-generating application), if present.
    pub producer: Option<String>,
    /// Creation date in PDF date-string form (e.g. "D:20240101"), if present.
    pub creation_date: Option<String>,
    /// Modification date in PDF date-string form, if present.
    pub modification_date: Option<String>,
    /// PDF version taken from the file header, e.g. "1.4".
    pub version: String,
    /// Total page count, when the page tree could be resolved.
    pub page_count: Option<u32>,
}
3044
/// Iterator over the lines of a string, recognizing all three PDF
/// end-of-line conventions: CRLF ("\r\n"), LF ("\n"), and bare CR ("\r").
pub struct EOLIter<'s> {
    remainder: &'s str,
}
impl<'s> Iterator for EOLIter<'s> {
    type Item = &'s str;

    /// Yields the next line without its terminator. A CRLF pair counts as
    /// a single terminator; a trailing line with no terminator is still
    /// yielded, and a trailing terminator does not produce an empty line.
    fn next(&mut self) -> Option<Self::Item> {
        if self.remainder.is_empty() {
            return None;
        }

        match self.remainder.find(|c| c == '\r' || c == '\n') {
            Some(idx) => {
                let line = &self.remainder[..idx];
                let rest = &self.remainder[idx..];
                // "\r\n" is one terminator, not two.
                let sep_len = if rest.starts_with("\r\n") { 2 } else { 1 };
                self.remainder = &rest[sep_len..];
                Some(line)
            }
            None => {
                // No terminator left: emit everything and finish.
                let last = self.remainder;
                self.remainder = "";
                Some(last)
            }
        }
    }
}
/// Extension trait providing [`PDFLines::pdf_lines`], a line iterator that
/// honors PDF end-of-line conventions (CRLF, LF, and bare CR) — unlike
/// `str::lines`, which does not treat a lone CR as a line break.
pub trait PDFLines: AsRef<str> {
    /// Returns an iterator over the lines of `self`, splitting on any of
    /// the three PDF end-of-line markers.
    fn pdf_lines(&self) -> EOLIter<'_> {
        EOLIter {
            remainder: self.as_ref(),
        }
    }
}
impl PDFLines for &str {}
impl<'a> PDFLines for std::borrow::Cow<'a, str> {}
impl PDFLines for String {}
3081
3082#[cfg(test)]
3083mod tests {
3084
3085 use super::*;
3086 use crate::parser::objects::{PdfName, PdfString};
3087 use crate::parser::test_helpers::*;
3088 use crate::parser::ParseOptions;
3089 use std::io::Cursor;
3090
3091 #[test]
3092 fn test_reader_construction() {
3093 let pdf_data = create_minimal_pdf();
3094 let cursor = Cursor::new(pdf_data);
3095 let result = PdfReader::new(cursor);
3096 assert!(result.is_ok());
3097 }
3098
3099 #[test]
3100 fn test_reader_version() {
3101 let pdf_data = create_minimal_pdf();
3102 let cursor = Cursor::new(pdf_data);
3103 let reader = PdfReader::new(cursor).unwrap();
3104 assert_eq!(reader.version().major, 1);
3105 assert_eq!(reader.version().minor, 4);
3106 }
3107
3108 #[test]
3109 fn test_reader_different_versions() {
3110 let versions = vec![
3111 "1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7", "2.0",
3112 ];
3113
3114 for version in versions {
3115 let pdf_data = create_pdf_with_version(version);
3116 let cursor = Cursor::new(pdf_data);
3117 let reader = PdfReader::new(cursor).unwrap();
3118
3119 let parts: Vec<&str> = version.split('.').collect();
3120 assert_eq!(reader.version().major, parts[0].parse::<u8>().unwrap());
3121 assert_eq!(reader.version().minor, parts[1].parse::<u8>().unwrap());
3122 }
3123 }
3124
3125 #[test]
3126 fn test_reader_catalog() {
3127 let pdf_data = create_minimal_pdf();
3128 let cursor = Cursor::new(pdf_data);
3129 let mut reader = PdfReader::new(cursor).unwrap();
3130
3131 let catalog = reader.catalog();
3132 assert!(catalog.is_ok());
3133
3134 let catalog_dict = catalog.unwrap();
3135 assert_eq!(
3136 catalog_dict.get("Type"),
3137 Some(&PdfObject::Name(PdfName("Catalog".to_string())))
3138 );
3139 }
3140
3141 #[test]
3142 fn test_reader_info_none() {
3143 let pdf_data = create_minimal_pdf();
3144 let cursor = Cursor::new(pdf_data);
3145 let mut reader = PdfReader::new(cursor).unwrap();
3146
3147 let info = reader.info().unwrap();
3148 assert!(info.is_none());
3149 }
3150
3151 #[test]
3152 fn test_reader_info_present() {
3153 let pdf_data = create_pdf_with_info();
3154 let cursor = Cursor::new(pdf_data);
3155 let mut reader = PdfReader::new(cursor).unwrap();
3156
3157 let info = reader.info().unwrap();
3158 assert!(info.is_some());
3159
3160 let info_dict = info.unwrap();
3161 assert_eq!(
3162 info_dict.get("Title"),
3163 Some(&PdfObject::String(PdfString(
3164 "Test PDF".to_string().into_bytes()
3165 )))
3166 );
3167 assert_eq!(
3168 info_dict.get("Author"),
3169 Some(&PdfObject::String(PdfString(
3170 "Test Author".to_string().into_bytes()
3171 )))
3172 );
3173 }
3174
3175 #[test]
3176 fn test_reader_get_object() {
3177 let pdf_data = create_minimal_pdf();
3178 let cursor = Cursor::new(pdf_data);
3179 let mut reader = PdfReader::new(cursor).unwrap();
3180
3181 let obj = reader.get_object(1, 0);
3183 assert!(obj.is_ok());
3184
3185 let catalog = obj.unwrap();
3186 assert!(catalog.as_dict().is_some());
3187 }
3188
3189 #[test]
3190 fn test_reader_get_invalid_object() {
3191 let pdf_data = create_minimal_pdf();
3192 let cursor = Cursor::new(pdf_data);
3193 let mut reader = PdfReader::new(cursor).unwrap();
3194
3195 let obj = reader.get_object(999, 0);
3197 assert!(obj.is_err());
3198 }
3199
3200 #[test]
3201 fn test_reader_get_free_object() {
3202 let pdf_data = create_minimal_pdf();
3203 let cursor = Cursor::new(pdf_data);
3204 let mut reader = PdfReader::new(cursor).unwrap();
3205
3206 let obj = reader.get_object(0, 65535);
3208 assert!(obj.is_ok());
3209 assert_eq!(obj.unwrap(), &PdfObject::Null);
3210 }
3211
3212 #[test]
3213 fn test_reader_resolve_reference() {
3214 let pdf_data = create_minimal_pdf();
3215 let cursor = Cursor::new(pdf_data);
3216 let mut reader = PdfReader::new(cursor).unwrap();
3217
3218 let ref_obj = PdfObject::Reference(1, 0);
3220 let resolved = reader.resolve(&ref_obj);
3221
3222 assert!(resolved.is_ok());
3223 assert!(resolved.unwrap().as_dict().is_some());
3224 }
3225
3226 #[test]
3227 fn test_reader_resolve_non_reference() {
3228 let pdf_data = create_minimal_pdf();
3229 let cursor = Cursor::new(pdf_data);
3230 let mut reader = PdfReader::new(cursor).unwrap();
3231
3232 let int_obj = PdfObject::Integer(42);
3234 let resolved = reader.resolve(&int_obj).unwrap();
3235
3236 assert_eq!(resolved, &PdfObject::Integer(42));
3237 }
3238
3239 #[test]
3240 fn test_reader_cache_behavior() {
3241 let pdf_data = create_minimal_pdf();
3242 let cursor = Cursor::new(pdf_data);
3243 let mut reader = PdfReader::new(cursor).unwrap();
3244
3245 let obj1 = reader.get_object(1, 0).unwrap();
3247 assert!(obj1.as_dict().is_some());
3248
3249 let obj2 = reader.get_object(1, 0).unwrap();
3251 assert!(obj2.as_dict().is_some());
3252 }
3253
3254 #[test]
3255 fn test_reader_wrong_generation() {
3256 let pdf_data = create_minimal_pdf();
3257 let cursor = Cursor::new(pdf_data);
3258 let mut reader = PdfReader::new(cursor).unwrap();
3259
3260 let obj = reader.get_object(1, 99);
3262 assert!(obj.is_err());
3263 }
3264
3265 #[test]
3266 fn test_reader_invalid_pdf() {
3267 let invalid_data = b"This is not a PDF file";
3268 let cursor = Cursor::new(invalid_data.to_vec());
3269 let result = PdfReader::new(cursor);
3270
3271 assert!(result.is_err());
3272 }
3273
3274 #[test]
3275 fn test_reader_corrupt_xref() {
3276 let corrupt_pdf = b"%PDF-1.4
32771 0 obj
3278<< /Type /Catalog >>
3279endobj
3280xref
3281corrupted xref table
3282trailer
3283<< /Size 2 /Root 1 0 R >>
3284startxref
328524
3286%%EOF"
3287 .to_vec();
3288
3289 let cursor = Cursor::new(corrupt_pdf);
3290 let result = PdfReader::new(cursor);
3291 assert!(result.is_err());
3294 }
3295
3296 #[test]
3297 fn test_reader_missing_trailer() {
3298 let pdf_no_trailer = b"%PDF-1.4
32991 0 obj
3300<< /Type /Catalog >>
3301endobj
3302xref
33030 2
33040000000000 65535 f
33050000000009 00000 n
3306startxref
330724
3308%%EOF"
3309 .to_vec();
3310
3311 let cursor = Cursor::new(pdf_no_trailer);
3312 let result = PdfReader::new(cursor);
3313 assert!(result.is_err());
3316 }
3317
3318 #[test]
3319 fn test_reader_empty_pdf() {
3320 let cursor = Cursor::new(Vec::new());
3321 let result = PdfReader::new(cursor);
3322 assert!(result.is_err());
3323 }
3324
3325 #[test]
3326 fn test_reader_page_count() {
3327 let pdf_data = create_minimal_pdf();
3328 let cursor = Cursor::new(pdf_data);
3329 let mut reader = PdfReader::new(cursor).unwrap();
3330
3331 let count = reader.page_count();
3332 assert!(count.is_ok());
3333 assert_eq!(count.unwrap(), 0); }
3335
3336 #[test]
3337 fn test_reader_into_document() {
3338 let pdf_data = create_minimal_pdf();
3339 let cursor = Cursor::new(pdf_data);
3340 let reader = PdfReader::new(cursor).unwrap();
3341
3342 let document = reader.into_document();
3343 let page_count = document.page_count();
3345 assert!(page_count.is_ok());
3346 }
3347
3348 #[test]
3349 fn test_reader_pages_dict() {
3350 let pdf_data = create_minimal_pdf();
3351 let cursor = Cursor::new(pdf_data);
3352 let mut reader = PdfReader::new(cursor).unwrap();
3353
3354 let pages = reader.pages();
3355 assert!(pages.is_ok());
3356 let pages_dict = pages.unwrap();
3357 assert_eq!(
3358 pages_dict.get("Type"),
3359 Some(&PdfObject::Name(PdfName("Pages".to_string())))
3360 );
3361 }
3362
3363 #[test]
3364 fn test_reader_pdf_with_binary_data() {
3365 let pdf_data = create_pdf_with_binary_marker();
3366
3367 let cursor = Cursor::new(pdf_data);
3368 let result = PdfReader::new(cursor);
3369 assert!(result.is_ok());
3370 }
3371
3372 #[test]
3373 fn test_reader_metadata() {
3374 let pdf_data = create_pdf_with_info();
3375 let cursor = Cursor::new(pdf_data);
3376 let mut reader = PdfReader::new(cursor).unwrap();
3377
3378 let metadata = reader.metadata().unwrap();
3379 assert_eq!(metadata.title, Some("Test PDF".to_string()));
3380 assert_eq!(metadata.author, Some("Test Author".to_string()));
3381 assert_eq!(metadata.subject, Some("Testing".to_string()));
3382 assert_eq!(metadata.version, "1.4".to_string());
3383 }
3384
3385 #[test]
3386 fn test_reader_metadata_empty() {
3387 let pdf_data = create_minimal_pdf();
3388 let cursor = Cursor::new(pdf_data);
3389 let mut reader = PdfReader::new(cursor).unwrap();
3390
3391 let metadata = reader.metadata().unwrap();
3392 assert!(metadata.title.is_none());
3393 assert!(metadata.author.is_none());
3394 assert_eq!(metadata.version, "1.4".to_string());
3395 assert_eq!(metadata.page_count, Some(0));
3396 }
3397
3398 #[test]
3399 fn test_reader_object_number_mismatch() {
3400 let pdf_data = create_minimal_pdf();
3404 let cursor = Cursor::new(pdf_data);
3405 let mut reader = PdfReader::new(cursor).unwrap();
3406
3407 let result = reader.get_object(1, 99);
3410 assert!(result.is_err());
3411
3412 let result2 = reader.get_object(999, 0);
3414 assert!(result2.is_err());
3415 }
3416
3417 #[test]
3418 fn test_document_metadata_struct() {
3419 let metadata = DocumentMetadata {
3420 title: Some("Title".to_string()),
3421 author: Some("Author".to_string()),
3422 subject: Some("Subject".to_string()),
3423 keywords: Some("Keywords".to_string()),
3424 creator: Some("Creator".to_string()),
3425 producer: Some("Producer".to_string()),
3426 creation_date: Some("D:20240101".to_string()),
3427 modification_date: Some("D:20240102".to_string()),
3428 version: "1.5".to_string(),
3429 page_count: Some(10),
3430 };
3431
3432 assert_eq!(metadata.title, Some("Title".to_string()));
3433 assert_eq!(metadata.page_count, Some(10));
3434 }
3435
3436 #[test]
3437 fn test_document_metadata_default() {
3438 let metadata = DocumentMetadata::default();
3439 assert!(metadata.title.is_none());
3440 assert!(metadata.author.is_none());
3441 assert!(metadata.subject.is_none());
3442 assert!(metadata.keywords.is_none());
3443 assert!(metadata.creator.is_none());
3444 assert!(metadata.producer.is_none());
3445 assert!(metadata.creation_date.is_none());
3446 assert!(metadata.modification_date.is_none());
3447 assert_eq!(metadata.version, "".to_string());
3448 assert!(metadata.page_count.is_none());
3449 }
3450
3451 #[test]
3452 fn test_document_metadata_clone() {
3453 let metadata = DocumentMetadata {
3454 title: Some("Test".to_string()),
3455 version: "1.4".to_string(),
3456 ..Default::default()
3457 };
3458
3459 let cloned = metadata;
3460 assert_eq!(cloned.title, Some("Test".to_string()));
3461 assert_eq!(cloned.version, "1.4".to_string());
3462 }
3463
3464 #[test]
3465 fn test_reader_trailer_validation_error() {
3466 let bad_pdf = b"%PDF-1.4
34681 0 obj
3469<< /Type /Catalog >>
3470endobj
3471xref
34720 2
34730000000000 65535 f
34740000000009 00000 n
3475trailer
3476<< /Size 2 >>
3477startxref
347846
3479%%EOF"
3480 .to_vec();
3481
3482 let cursor = Cursor::new(bad_pdf);
3483 let result = PdfReader::new(cursor);
3484 assert!(result.is_err());
3487 }
3488
3489 #[test]
3490 fn test_reader_with_options() {
3491 let pdf_data = create_minimal_pdf();
3492 let cursor = Cursor::new(pdf_data);
3493 let mut options = ParseOptions::default();
3494 options.lenient_streams = true;
3495 options.max_recovery_bytes = 2000;
3496 options.collect_warnings = true;
3497
3498 let reader = PdfReader::new_with_options(cursor, options);
3499 assert!(reader.is_ok());
3500 }
3501
3502 #[test]
3503 fn test_lenient_stream_parsing() {
3504 let pdf_data = b"%PDF-1.4
35061 0 obj
3507<< /Type /Catalog /Pages 2 0 R >>
3508endobj
35092 0 obj
3510<< /Type /Pages /Kids [3 0 R] /Count 1 >>
3511endobj
35123 0 obj
3513<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R >>
3514endobj
35154 0 obj
3516<< /Length 10 >>
3517stream
3518This is a longer stream than 10 bytes
3519endstream
3520endobj
3521xref
35220 5
35230000000000 65535 f
35240000000009 00000 n
35250000000058 00000 n
35260000000116 00000 n
35270000000219 00000 n
3528trailer
3529<< /Size 5 /Root 1 0 R >>
3530startxref
3531299
3532%%EOF"
3533 .to_vec();
3534
3535 let cursor = Cursor::new(pdf_data.clone());
3537 let strict_options = ParseOptions::strict();
3538 let strict_reader = PdfReader::new_with_options(cursor, strict_options);
3539 assert!(strict_reader.is_err());
3541
3542 let cursor = Cursor::new(pdf_data);
3544 let mut options = ParseOptions::default();
3545 options.lenient_streams = true;
3546 options.max_recovery_bytes = 1000;
3547 options.collect_warnings = false;
3548 let lenient_reader = PdfReader::new_with_options(cursor, options);
3549 assert!(lenient_reader.is_err());
3550 }
3551
3552 #[test]
3553 fn test_parse_options_default() {
3554 let options = ParseOptions::default();
3555 assert!(!options.lenient_streams);
3556 assert_eq!(options.max_recovery_bytes, 1000);
3557 assert!(!options.collect_warnings);
3558 }
3559
3560 #[test]
3561 fn test_parse_options_clone() {
3562 let mut options = ParseOptions::default();
3563 options.lenient_streams = true;
3564 options.max_recovery_bytes = 2000;
3565 options.collect_warnings = true;
3566 let cloned = options;
3567 assert!(cloned.lenient_streams);
3568 assert_eq!(cloned.max_recovery_bytes, 2000);
3569 assert!(cloned.collect_warnings);
3570 }
3571
3572 #[allow(dead_code)]
3575 fn create_encrypted_pdf_dict() -> PdfDictionary {
3576 let mut dict = PdfDictionary::new();
3577 dict.insert(
3578 "Filter".to_string(),
3579 PdfObject::Name(PdfName("Standard".to_string())),
3580 );
3581 dict.insert("V".to_string(), PdfObject::Integer(1));
3582 dict.insert("R".to_string(), PdfObject::Integer(2));
3583 dict.insert("O".to_string(), PdfObject::String(PdfString(vec![0u8; 32])));
3584 dict.insert("U".to_string(), PdfObject::String(PdfString(vec![0u8; 32])));
3585 dict.insert("P".to_string(), PdfObject::Integer(-4));
3586 dict
3587 }
3588
3589 fn create_pdf_with_encryption() -> Vec<u8> {
3590 b"%PDF-1.4
35921 0 obj
3593<< /Type /Catalog /Pages 2 0 R >>
3594endobj
35952 0 obj
3596<< /Type /Pages /Kids [3 0 R] /Count 1 >>
3597endobj
35983 0 obj
3599<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >>
3600endobj
36014 0 obj
3602<< /Filter /Standard /V 1 /R 2 /O (32 bytes of owner password hash data) /U (32 bytes of user password hash data) /P -4 >>
3603endobj
3604xref
36050 5
36060000000000 65535 f
36070000000009 00000 n
36080000000058 00000 n
36090000000116 00000 n
36100000000201 00000 n
3611trailer
3612<< /Size 5 /Root 1 0 R /Encrypt 4 0 R /ID [(file id)] >>
3613startxref
3614295
3615%%EOF"
3616 .to_vec()
3617 }
3618
3619 #[test]
3620 fn test_reader_encryption_detection() {
3621 let unencrypted_pdf = create_minimal_pdf();
3623 let cursor = Cursor::new(unencrypted_pdf);
3624 let reader = PdfReader::new(cursor).unwrap();
3625 assert!(!reader.is_encrypted());
3626 assert!(reader.is_unlocked()); let encrypted_pdf = create_pdf_with_encryption();
3630 let cursor = Cursor::new(encrypted_pdf);
3631 let result = PdfReader::new(cursor);
3632 assert!(result.is_err());
3634 }
3635
3636 #[test]
3637 fn test_reader_encryption_methods_unencrypted() {
3638 let pdf_data = create_minimal_pdf();
3639 let cursor = Cursor::new(pdf_data);
3640 let mut reader = PdfReader::new(cursor).unwrap();
3641
3642 assert!(!reader.is_encrypted());
3644 assert!(reader.is_unlocked());
3645 assert!(reader.encryption_handler().is_none());
3646 assert!(reader.encryption_handler_mut().is_none());
3647
3648 assert!(reader.unlock_with_password("any_password").unwrap());
3650 assert!(reader.try_empty_password().unwrap());
3651 }
3652
3653 #[test]
3654 fn test_reader_encryption_handler_access() {
3655 let pdf_data = create_minimal_pdf();
3656 let cursor = Cursor::new(pdf_data);
3657 let mut reader = PdfReader::new(cursor).unwrap();
3658
3659 assert!(reader.encryption_handler().is_none());
3661 assert!(reader.encryption_handler_mut().is_none());
3662
3663 assert!(!reader.is_encrypted());
3665 assert!(reader.is_unlocked());
3666 }
3667
3668 #[test]
3669 fn test_reader_multiple_password_attempts() {
3670 let pdf_data = create_minimal_pdf();
3671 let cursor = Cursor::new(pdf_data);
3672 let mut reader = PdfReader::new(cursor).unwrap();
3673
3674 let passwords = vec!["test1", "test2", "admin", "", "password"];
3676 for password in passwords {
3677 assert!(reader.unlock_with_password(password).unwrap());
3678 }
3679
3680 for _ in 0..5 {
3682 assert!(reader.try_empty_password().unwrap());
3683 }
3684 }
3685
3686 #[test]
3687 fn test_reader_encryption_state_consistency() {
3688 let pdf_data = create_minimal_pdf();
3689 let cursor = Cursor::new(pdf_data);
3690 let mut reader = PdfReader::new(cursor).unwrap();
3691
3692 assert!(!reader.is_encrypted());
3694 assert!(reader.is_unlocked());
3695 assert!(reader.encryption_handler().is_none());
3696
3697 let _ = reader.unlock_with_password("test");
3699 assert!(!reader.is_encrypted());
3700 assert!(reader.is_unlocked());
3701 assert!(reader.encryption_handler().is_none());
3702
3703 let _ = reader.try_empty_password();
3704 assert!(!reader.is_encrypted());
3705 assert!(reader.is_unlocked());
3706 assert!(reader.encryption_handler().is_none());
3707 }
3708
3709 #[test]
3710 fn test_reader_encryption_error_handling() {
3711 let encrypted_pdf = create_pdf_with_encryption();
3713 let cursor = Cursor::new(encrypted_pdf);
3714
3715 let result = PdfReader::new(cursor);
3717 match result {
3718 Err(ParseError::EncryptionNotSupported) => {
3719 }
3721 Err(_) => {
3722 }
3724 Ok(_) => {
3725 panic!("Should not successfully create reader for encrypted PDF without password");
3726 }
3727 }
3728 }
3729
3730 #[test]
3731 fn test_reader_encryption_with_options() {
3732 let pdf_data = create_minimal_pdf();
3733 let cursor = Cursor::new(pdf_data);
3734
3735 let strict_options = ParseOptions::strict();
3737 let strict_reader = PdfReader::new_with_options(cursor, strict_options).unwrap();
3738 assert!(!strict_reader.is_encrypted());
3739 assert!(strict_reader.is_unlocked());
3740
3741 let pdf_data = create_minimal_pdf();
3742 let cursor = Cursor::new(pdf_data);
3743 let lenient_options = ParseOptions::lenient();
3744 let lenient_reader = PdfReader::new_with_options(cursor, lenient_options).unwrap();
3745 assert!(!lenient_reader.is_encrypted());
3746 assert!(lenient_reader.is_unlocked());
3747 }
3748
3749 #[test]
3750 fn test_reader_encryption_integration_edge_cases() {
3751 let pdf_data = create_minimal_pdf();
3752 let cursor = Cursor::new(pdf_data);
3753 let mut reader = PdfReader::new(cursor).unwrap();
3754
3755 assert!(reader.unlock_with_password("").unwrap());
3757 assert!(reader.unlock_with_password(" ").unwrap()); assert!(reader
3759 .unlock_with_password("very_long_password_that_exceeds_normal_length")
3760 .unwrap());
3761 assert!(reader.unlock_with_password("unicode_test_ñáéíóú").unwrap());
3762
3763 assert!(reader.unlock_with_password("pass@#$%^&*()").unwrap());
3765 assert!(reader.unlock_with_password("pass\nwith\nnewlines").unwrap());
3766 assert!(reader.unlock_with_password("pass\twith\ttabs").unwrap());
3767 }
3768
3769 mod rigorous {
3770 use super::*;
3771
3772 #[test]
3777 fn test_reader_invalid_pdf_header() {
3778 let invalid_data = b"This is not a PDF file";
3780 let cursor = Cursor::new(invalid_data.to_vec());
3781 let result = PdfReader::new(cursor);
3782
3783 assert!(result.is_err(), "Should fail on invalid PDF header");
3784 }
3785
3786 #[test]
3787 fn test_reader_truncated_header() {
3788 let truncated = b"%PDF";
3790 let cursor = Cursor::new(truncated.to_vec());
3791 let result = PdfReader::new(cursor);
3792
3793 assert!(result.is_err(), "Should fail on truncated header");
3794 }
3795
3796 #[test]
3797 fn test_reader_empty_file() {
3798 let empty = Vec::new();
3799 let cursor = Cursor::new(empty);
3800 let result = PdfReader::new(cursor);
3801
3802 assert!(result.is_err(), "Should fail on empty file");
3803 }
3804
3805 #[test]
3806 fn test_reader_malformed_version() {
3807 let malformed = b"%PDF-X.Y\n%%\xE2\xE3\xCF\xD3\n";
3809 let cursor = Cursor::new(malformed.to_vec());
3810 let result = PdfReader::new(cursor);
3811
3812 if let Ok(reader) = result {
3814 let _version = reader.version();
3816 }
3817 }
3818
3819 #[test]
3820 fn test_reader_get_nonexistent_object() {
3821 let pdf_data = create_minimal_pdf();
3822 let cursor = Cursor::new(pdf_data);
3823 let mut reader = PdfReader::new(cursor).unwrap();
3824
3825 let result = reader.get_object(999, 0);
3827
3828 assert!(result.is_err(), "Should fail when object doesn't exist");
3829 }
3830
3831 #[test]
3832 fn test_reader_get_object_wrong_generation() {
3833 let pdf_data = create_minimal_pdf();
3834 let cursor = Cursor::new(pdf_data);
3835 let mut reader = PdfReader::new(cursor).unwrap();
3836
3837 let result = reader.get_object(1, 99);
3839
3840 if let Err(e) = result {
3842 let _ = e;
3844 }
3845 }
3846
3847 #[test]
3852 fn test_resolve_direct_object() {
3853 let pdf_data = create_minimal_pdf();
3854 let cursor = Cursor::new(pdf_data);
3855 let mut reader = PdfReader::new(cursor).unwrap();
3856
3857 let direct_obj = PdfObject::Integer(42);
3859
3860 let resolved = reader.resolve(&direct_obj).unwrap();
3861
3862 assert_eq!(resolved, &PdfObject::Integer(42));
3864 }
3865
3866 #[test]
3867 fn test_resolve_reference() {
3868 let pdf_data = create_minimal_pdf();
3869 let cursor = Cursor::new(pdf_data);
3870 let mut reader = PdfReader::new(cursor).unwrap();
3871
3872 let pages_ref = {
3874 let catalog = reader.catalog().unwrap();
3875 if let Some(PdfObject::Reference(obj_num, gen_num)) = catalog.get("Pages") {
3876 PdfObject::Reference(*obj_num, *gen_num)
3877 } else {
3878 panic!("Catalog /Pages must be a Reference");
3879 }
3880 };
3881
3882 let resolved = reader.resolve(&pages_ref).unwrap();
3884
3885 if let PdfObject::Dictionary(dict) = resolved {
3887 assert_eq!(
3888 dict.get("Type"),
3889 Some(&PdfObject::Name(PdfName("Pages".to_string())))
3890 );
3891 } else {
3892 panic!("Expected dictionary, got: {:?}", resolved);
3893 }
3894 }
3895
3896 #[test]
3901 fn test_is_encrypted_on_unencrypted() {
3902 let pdf_data = create_minimal_pdf();
3903 let cursor = Cursor::new(pdf_data);
3904 let reader = PdfReader::new(cursor).unwrap();
3905
3906 assert!(
3907 !reader.is_encrypted(),
3908 "Minimal PDF should not be encrypted"
3909 );
3910 }
3911
3912 #[test]
3913 fn test_is_unlocked_on_unencrypted() {
3914 let pdf_data = create_minimal_pdf();
3915 let cursor = Cursor::new(pdf_data);
3916 let reader = PdfReader::new(cursor).unwrap();
3917
3918 assert!(reader.is_unlocked(), "Unencrypted PDF should be unlocked");
3920 }
3921
3922 #[test]
3923 fn test_try_empty_password_on_unencrypted() {
3924 let pdf_data = create_minimal_pdf();
3925 let cursor = Cursor::new(pdf_data);
3926 let mut reader = PdfReader::new(cursor).unwrap();
3927
3928 let result = reader.try_empty_password();
3930 assert!(result.is_ok());
3931 }
3932
3933 #[test]
3938 fn test_reader_with_strict_options() {
3939 let pdf_data = create_minimal_pdf();
3940 let cursor = Cursor::new(pdf_data);
3941
3942 let options = ParseOptions::strict();
3943 let result = PdfReader::new_with_options(cursor, options);
3944
3945 assert!(result.is_ok(), "Minimal PDF should parse in strict mode");
3946 }
3947
3948 #[test]
3949 fn test_reader_with_lenient_options() {
3950 let pdf_data = create_minimal_pdf();
3951 let cursor = Cursor::new(pdf_data);
3952
3953 let options = ParseOptions::lenient();
3954 let result = PdfReader::new_with_options(cursor, options);
3955
3956 assert!(result.is_ok(), "Minimal PDF should parse in lenient mode");
3957 }
3958
3959 #[test]
3960 fn test_reader_options_accessible() {
3961 let pdf_data = create_minimal_pdf();
3962 let cursor = Cursor::new(pdf_data);
3963
3964 let options = ParseOptions::lenient();
3965 let reader = PdfReader::new_with_options(cursor, options.clone()).unwrap();
3966
3967 let reader_options = reader.options();
3969 assert_eq!(reader_options.strict_mode, options.strict_mode);
3970 }
3971
3972 #[test]
3977 fn test_catalog_has_required_fields() {
3978 let pdf_data = create_minimal_pdf();
3979 let cursor = Cursor::new(pdf_data);
3980 let mut reader = PdfReader::new(cursor).unwrap();
3981
3982 let catalog = reader.catalog().unwrap();
3983
3984 assert_eq!(
3986 catalog.get("Type"),
3987 Some(&PdfObject::Name(PdfName("Catalog".to_string()))),
3988 "Catalog must have /Type /Catalog"
3989 );
3990
3991 assert!(
3993 catalog.contains_key("Pages"),
3994 "Catalog must have /Pages entry"
3995 );
3996 }
3997
3998 #[test]
3999 fn test_info_fields_when_present() {
4000 let pdf_data = create_pdf_with_info();
4001 let cursor = Cursor::new(pdf_data);
4002 let mut reader = PdfReader::new(cursor).unwrap();
4003
4004 let info = reader.info().unwrap();
4005 assert!(info.is_some(), "PDF should have Info dictionary");
4006
4007 let info_dict = info.unwrap();
4008
4009 assert!(info_dict.contains_key("Title"), "Info should have Title");
4011 assert!(info_dict.contains_key("Author"), "Info should have Author");
4012 }
4013
4014 #[test]
4015 fn test_info_none_when_absent() {
4016 let pdf_data = create_minimal_pdf();
4017 let cursor = Cursor::new(pdf_data);
4018 let mut reader = PdfReader::new(cursor).unwrap();
4019
4020 let info = reader.info().unwrap();
4021 assert!(info.is_none(), "Minimal PDF should not have Info");
4022 }
4023
4024 #[test]
4029 fn test_version_exact_values() {
4030 let pdf_data = create_pdf_with_version("1.7");
4031 let cursor = Cursor::new(pdf_data);
4032 let reader = PdfReader::new(cursor).unwrap();
4033
4034 let version = reader.version();
4035 assert_eq!(version.major, 1, "Major version must be exact");
4036 assert_eq!(version.minor, 7, "Minor version must be exact");
4037 }
4038
4039 #[test]
4040 fn test_version_pdf_20() {
4041 let pdf_data = create_pdf_with_version("2.0");
4042 let cursor = Cursor::new(pdf_data);
4043 let reader = PdfReader::new(cursor).unwrap();
4044
4045 let version = reader.version();
4046 assert_eq!(version.major, 2, "PDF 2.0 major version");
4047 assert_eq!(version.minor, 0, "PDF 2.0 minor version");
4048 }
4049
4050 #[test]
4055 fn test_pages_returns_pages_dict() {
4056 let pdf_data = create_minimal_pdf();
4057 let cursor = Cursor::new(pdf_data);
4058 let mut reader = PdfReader::new(cursor).unwrap();
4059
4060 let pages_dict = reader
4061 .pages()
4062 .expect("pages() must return Pages dictionary");
4063
4064 assert_eq!(
4065 pages_dict.get("Type"),
4066 Some(&PdfObject::Name(PdfName("Pages".to_string()))),
4067 "Pages dict must have /Type /Pages"
4068 );
4069 }
4070
4071 #[test]
4072 fn test_page_count_minimal_pdf() {
4073 let pdf_data = create_minimal_pdf();
4074 let cursor = Cursor::new(pdf_data);
4075 let mut reader = PdfReader::new(cursor).unwrap();
4076
4077 let count = reader.page_count().expect("page_count() must succeed");
4078 assert_eq!(count, 0, "Minimal PDF has 0 pages");
4079 }
4080
4081 #[test]
4082 fn test_page_count_with_info_pdf() {
4083 let pdf_data = create_pdf_with_info();
4084 let cursor = Cursor::new(pdf_data);
4085 let mut reader = PdfReader::new(cursor).unwrap();
4086
4087 let count = reader.page_count().expect("page_count() must succeed");
4088 assert_eq!(count, 0, "create_pdf_with_info() has Count 0 in Pages dict");
4089 }
4090
4091 #[test]
4096 fn test_metadata_minimal_pdf() {
4097 let pdf_data = create_minimal_pdf();
4098 let cursor = Cursor::new(pdf_data);
4099 let mut reader = PdfReader::new(cursor).unwrap();
4100
4101 let meta = reader.metadata().expect("metadata() must succeed");
4102
4103 assert!(meta.title.is_none(), "Minimal PDF has no title");
4105 assert!(meta.author.is_none(), "Minimal PDF has no author");
4106 }
4107
4108 #[test]
4109 fn test_metadata_with_info() {
4110 let pdf_data = create_pdf_with_info();
4111 let cursor = Cursor::new(pdf_data);
4112 let mut reader = PdfReader::new(cursor).unwrap();
4113
4114 let meta = reader.metadata().expect("metadata() must succeed");
4115
4116 assert!(meta.title.is_some(), "PDF with Info has title");
4117 assert_eq!(meta.title.unwrap(), "Test PDF", "Title must match");
4118 assert!(meta.author.is_some(), "PDF with Info has author");
4119 assert_eq!(meta.author.unwrap(), "Test Author", "Author must match");
4120 }
4121
4122 #[test]
4127 fn test_resolve_stream_length_direct_integer() {
4128 let pdf_data = create_minimal_pdf();
4129 let cursor = Cursor::new(pdf_data);
4130 let mut reader = PdfReader::new(cursor).unwrap();
4131
4132 let length_obj = PdfObject::Integer(100);
4134
4135 let length = reader
4136 .resolve_stream_length(&length_obj)
4137 .expect("resolve_stream_length must succeed");
4138 assert_eq!(length, Some(100), "Direct integer must be resolved");
4139 }
4140
4141 #[test]
4142 fn test_resolve_stream_length_negative_integer() {
4143 let pdf_data = create_minimal_pdf();
4144 let cursor = Cursor::new(pdf_data);
4145 let mut reader = PdfReader::new(cursor).unwrap();
4146
4147 let length_obj = PdfObject::Integer(-10);
4149
4150 let length = reader
4151 .resolve_stream_length(&length_obj)
4152 .expect("resolve_stream_length must succeed");
4153 assert_eq!(length, None, "Negative integer returns None");
4154 }
4155
4156 #[test]
4157 fn test_resolve_stream_length_non_integer() {
4158 let pdf_data = create_minimal_pdf();
4159 let cursor = Cursor::new(pdf_data);
4160 let mut reader = PdfReader::new(cursor).unwrap();
4161
4162 let name_obj = PdfObject::Name(PdfName("Test".to_string()));
4164
4165 let length = reader
4166 .resolve_stream_length(&name_obj)
4167 .expect("resolve_stream_length must succeed");
4168 assert_eq!(length, None, "Non-integer object returns None");
4169 }
4170
4171 #[test]
4176 fn test_get_all_pages_empty_pdf() {
4177 let pdf_data = create_minimal_pdf();
4178 let cursor = Cursor::new(pdf_data);
4179 let mut reader = PdfReader::new(cursor).unwrap();
4180
4181 let pages = reader
4182 .get_all_pages()
4183 .expect("get_all_pages() must succeed");
4184 assert_eq!(pages.len(), 0, "Minimal PDF has 0 pages");
4185 }
4186
4187 #[test]
4188 fn test_get_all_pages_with_info() {
4189 let pdf_data = create_pdf_with_info();
4190 let cursor = Cursor::new(pdf_data);
4191 let mut reader = PdfReader::new(cursor).unwrap();
4192
4193 let pages = reader
4194 .get_all_pages()
4195 .expect("get_all_pages() must succeed");
4196 assert_eq!(
4197 pages.len(),
4198 0,
4199 "create_pdf_with_info() has 0 pages (Count 0)"
4200 );
4201 }
4202
4203 #[test]
4208 fn test_into_document_consumes_reader() {
4209 let pdf_data = create_minimal_pdf();
4210 let cursor = Cursor::new(pdf_data);
4211 let reader = PdfReader::new(cursor).unwrap();
4212
4213 let document = reader.into_document();
4214
4215 let version = document.version().expect("Document must have version");
4217 assert!(
4218 version.starts_with("1."),
4219 "Document must have PDF 1.x version, got: {}",
4220 version
4221 );
4222
4223 let page_count = document
4225 .page_count()
4226 .expect("Document must allow page_count()");
4227 assert_eq!(
4228 page_count, 0,
4229 "Minimal PDF has 0 pages (Count 0 in test helper)"
4230 );
4231 }
4232
4233 #[test]
4238 fn test_clear_parse_context() {
4239 let pdf_data = create_minimal_pdf();
4240 let cursor = Cursor::new(pdf_data);
4241 let mut reader = PdfReader::new(cursor).unwrap();
4242
4243 reader.clear_parse_context();
4245
4246 let version = reader.version();
4248 assert_eq!(version.major, 1, "Reader must still work after clear");
4249 }
4250
4251 #[test]
4252 fn test_parse_context_mut_accessible() {
4253 let pdf_data = create_minimal_pdf();
4254 let cursor = Cursor::new(pdf_data);
4255 let mut reader = PdfReader::new(cursor).unwrap();
4256
4257 let context = reader.parse_context_mut();
4258
4259 let initial_depth = context.depth;
4261 assert_eq!(initial_depth, 0, "Parse context must start with depth 0");
4262
4263 assert!(
4265 context.max_depth > 0,
4266 "Parse context must have positive max_depth"
4267 );
4268 }
4269
4270 #[test]
4275 fn test_find_bytes_basic() {
4276 let haystack = b"Hello World";
4277 let needle = b"World";
4278 let pos = find_bytes(haystack, needle);
4279 assert_eq!(pos, Some(6), "Must find 'World' at position 6");
4280 }
4281
4282 #[test]
4283 fn test_find_bytes_not_found() {
4284 let haystack = b"Hello World";
4285 let needle = b"Rust";
4286 let pos = find_bytes(haystack, needle);
4287 assert_eq!(pos, None, "Must return None when not found");
4288 }
4289
4290 #[test]
4291 fn test_find_bytes_at_start() {
4292 let haystack = b"Hello World";
4293 let needle = b"Hello";
4294 let pos = find_bytes(haystack, needle);
4295 assert_eq!(pos, Some(0), "Must find at position 0");
4296 }
4297
4298 #[test]
4299 fn test_is_immediate_stream_start_with_stream() {
4300 let data = b"stream\ndata";
4301 assert!(
4302 is_immediate_stream_start(data),
4303 "Must detect 'stream' at start"
4304 );
4305 }
4306
4307 #[test]
4308 fn test_is_immediate_stream_start_with_whitespace() {
4309 let data = b" \n\tstream\ndata";
4310 assert!(
4311 is_immediate_stream_start(data),
4312 "Must detect 'stream' after whitespace"
4313 );
4314 }
4315
4316 #[test]
4317 fn test_is_immediate_stream_start_no_stream() {
4318 let data = b"endobj";
4319 assert!(
4320 !is_immediate_stream_start(data),
4321 "Must return false when 'stream' absent"
4322 );
4323 }
4324 }
4325}