1use super::xref_stream;
6use super::xref_types::{XRefEntryInfo, XRefEntryType};
7use super::{ParseError, ParseOptions, ParseResult};
8use crate::parser::reader::PDFLines;
9use std::collections::HashMap;
10use std::io::{BufRead, BufReader, Read, Seek, SeekFrom};
11
/// A single cross-reference entry for one indirect object.
///
/// Every field is a plain integer or boolean, so the type is `Copy` and
/// implements full `Eq` (not just `PartialEq`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct XRefEntry {
    /// Byte offset of the object for in-use entries. Free entries decoded
    /// from an xref stream store the next free object number here, and
    /// placeholders for compressed objects store `0`.
    pub offset: u64,
    /// Generation number recorded for the object.
    pub generation: u16,
    /// `true` for in-use (`n`) entries, `false` for free (`f`) entries.
    pub in_use: bool,
}
22
23#[derive(Debug, Clone, PartialEq)]
25pub struct XRefEntryExt {
26 pub basic: XRefEntry,
28 pub compressed_info: Option<(u32, u32)>, }
31
32#[derive(Debug, Clone)]
34pub struct XRefTable {
35 entries: HashMap<u32, XRefEntry>,
37 extended_entries: HashMap<u32, XRefEntryExt>,
39 trailer: Option<super::objects::PdfDictionary>,
41 xref_offset: u64,
43}
44
45impl Default for XRefTable {
46 fn default() -> Self {
47 Self::new()
48 }
49}
50
51impl XRefTable {
52 pub fn new() -> Self {
54 Self {
55 entries: HashMap::new(),
56 extended_entries: HashMap::new(),
57 trailer: None,
58 xref_offset: 0,
59 }
60 }
61
62 pub fn entries(&self) -> &HashMap<u32, XRefEntry> {
64 &self.entries
65 }
66
67 pub fn parse<R: Read + Seek>(reader: &mut BufReader<R>) -> ParseResult<Self> {
69 Self::parse_with_options(reader, &super::ParseOptions::default())
70 }
71
72 pub fn parse_with_options<R: Read + Seek>(
74 reader: &mut BufReader<R>,
75 options: &super::ParseOptions,
76 ) -> ParseResult<Self> {
77 match Self::parse_with_incremental_updates_options(reader, options) {
79 Ok(table) => Ok(table),
80 Err(e) => {
81 if options.lenient_syntax {
82 eprintln!("Primary XRef parsing failed: {e:?}, attempting recovery");
83
84 reader.seek(SeekFrom::Start(0))?;
86 Self::parse_with_recovery_options(reader, options)
87 } else {
88 Err(e)
89 }
90 }
91 }
92 }
93
94 #[allow(dead_code)]
96 fn parse_with_incremental_updates<R: Read + Seek>(
97 reader: &mut BufReader<R>,
98 ) -> ParseResult<Self> {
99 Self::parse_with_incremental_updates_options(reader, &super::ParseOptions::default())
100 }
101
102 fn parse_with_incremental_updates_options<R: Read + Seek>(
104 reader: &mut BufReader<R>,
105 options: &super::ParseOptions,
106 ) -> ParseResult<Self> {
107 let xref_offset = Self::find_xref_offset(reader)?;
109
110 let mut merged_table = Self::new();
112 let mut current_offset = Some(xref_offset);
113 let mut visited_offsets = std::collections::HashSet::new();
114
115 while let Some(offset) = current_offset {
116 if visited_offsets.contains(&offset) {
118 eprintln!("Circular reference in XRef chain at offset {offset}");
119 break;
120 }
121 visited_offsets.insert(offset);
122
123 reader.seek(SeekFrom::Start(offset))?;
125 let table = Self::parse_primary_with_options(reader, options)?;
126
127 let prev_offset = table
129 .trailer
130 .as_ref()
131 .and_then(|t| t.get("Prev"))
132 .and_then(|obj| obj.as_integer())
133 .map(|i| i as u64);
134
135 for (obj_num, entry) in table.entries {
137 merged_table.entries.entry(obj_num).or_insert(entry);
138 }
139 for (obj_num, ext_entry) in table.extended_entries {
140 merged_table
141 .extended_entries
142 .entry(obj_num)
143 .or_insert(ext_entry);
144 }
145
146 if merged_table.trailer.is_none() {
148 merged_table.trailer = table.trailer;
149 merged_table.xref_offset = table.xref_offset;
150 }
151
152 current_offset = prev_offset;
153 }
154
155 Ok(merged_table)
156 }
157
158 #[allow(dead_code)]
160 fn parse_primary<R: Read + Seek>(reader: &mut BufReader<R>) -> ParseResult<Self> {
161 Self::parse_primary_with_options(reader, &super::ParseOptions::default())
162 }
163
164 fn parse_primary_with_options<R: Read + Seek>(
166 reader: &mut BufReader<R>,
167 options: &super::ParseOptions,
168 ) -> ParseResult<Self> {
169 let mut table = Self::new();
170
171 let saved_pos = reader.stream_position()?;
174
175 reader.seek(SeekFrom::Start(0))?;
177 if let Ok(xref_offset) = Self::find_linearized_xref(reader) {
178 eprintln!("Found linearized PDF with XRef at offset {xref_offset}");
179
180 Self::validate_offset(reader, xref_offset)?;
182
183 table.xref_offset = xref_offset;
184 reader.seek(SeekFrom::Start(xref_offset))?;
185 } else {
186 reader.seek(SeekFrom::Start(saved_pos))?;
188
189 let xref_offset = Self::find_xref_offset(reader)?;
191
192 Self::validate_offset(reader, xref_offset)?;
194
195 table.xref_offset = xref_offset;
196 reader.seek(SeekFrom::Start(xref_offset))?;
197 }
198
199 let mut line = String::new();
201 let pos = reader.stream_position()?;
202 reader.read_line(&mut line)?;
203
204 if line.trim() == "xref" {
205 Self::parse_traditional_xref_with_options(reader, &mut table, options)?;
207 } else {
208 eprintln!(
209 "Not a traditional xref, checking for xref stream. Line: {:?}",
210 line.trim()
211 );
212
213 reader.seek(SeekFrom::Start(pos))?;
215
216 let mut lexer = super::lexer::Lexer::new_with_options(&mut *reader, options.clone());
218
219 let obj_num = match lexer.next_token()? {
221 super::lexer::Token::Integer(n) => n as u32,
222 _ => return Err(ParseError::InvalidXRef),
223 };
224
225 eprintln!("Found object {obj_num} at xref position");
226
227 let _gen_num = match lexer.next_token()? {
228 super::lexer::Token::Integer(n) => n as u16,
229 _ => return Err(ParseError::InvalidXRef),
230 };
231
232 match lexer.next_token()? {
233 super::lexer::Token::Obj => {}
234 _ => return Err(ParseError::InvalidXRef),
235 };
236
237 let obj = super::objects::PdfObject::parse_with_options(&mut lexer, options)?;
239
240 if let Some(stream) = obj.as_stream() {
241 if stream
243 .dict
244 .get("Type")
245 .and_then(|o| o.as_name())
246 .map(|n| n.as_str())
247 == Some("XRef")
248 {
249 eprintln!("Parsing XRef stream");
250
251 let decoded_data = match stream.decode(options) {
253 Ok(data) => data,
254 Err(e) => {
255 eprintln!(
256 "XRef stream decode failed: {e:?}, attempting raw data fallback"
257 );
258
259 if !stream.data.is_empty() {
262 eprintln!(
263 "Using raw stream data ({} bytes) as fallback",
264 stream.data.len()
265 );
266 stream.data.clone()
267 } else {
268 eprintln!("No raw stream data available, triggering recovery mode");
269 return Err(e);
270 }
271 }
272 };
273
274 let xref_stream_parser = xref_stream::XRefStream::parse(
276 &mut *reader,
277 stream.dict.clone(),
278 decoded_data,
279 options,
280 )?;
281
282 let entries = xref_stream_parser.to_xref_entries()?;
284 eprintln!("XRef stream parsed, found {} entries", entries.len());
285
286 for (obj_num, entry) in entries {
288 match entry {
289 xref_stream::XRefEntry::Free {
290 next_free_object,
291 generation,
292 } => {
293 table.entries.insert(
294 obj_num,
295 XRefEntry {
296 offset: next_free_object as u64,
297 generation,
298 in_use: false,
299 },
300 );
301 }
302 xref_stream::XRefEntry::InUse { offset, generation } => {
303 table.entries.insert(
304 obj_num,
305 XRefEntry {
306 offset,
307 generation,
308 in_use: true,
309 },
310 );
311 }
312 xref_stream::XRefEntry::Compressed {
313 stream_object_number,
314 index_within_stream,
315 } => {
316 eprintln!(
317 "DEBUG: Adding compressed object {} -> stream {} index {}",
318 obj_num, stream_object_number, index_within_stream
319 );
320 let ext_entry = XRefEntryExt {
322 basic: XRefEntry {
323 offset: 0,
324 generation: 0,
325 in_use: true,
326 },
327 compressed_info: Some((
328 stream_object_number,
329 index_within_stream,
330 )),
331 };
332 table.extended_entries.insert(obj_num, ext_entry);
333 table.entries.insert(
334 obj_num,
335 XRefEntry {
336 offset: 0,
337 generation: 0,
338 in_use: true,
339 },
340 );
341 }
342 }
343 }
344
345 table.trailer = Some(xref_stream_parser.trailer_dict().clone());
347 } else {
348 return Err(ParseError::InvalidXRef);
349 }
350 } else {
351 return Err(ParseError::InvalidXRef);
352 }
353 }
354
355 Ok(table)
356 }
357
358 #[allow(dead_code)]
360 fn parse_traditional_xref<R: Read + Seek>(
361 reader: &mut BufReader<R>,
362 table: &mut XRefTable,
363 ) -> ParseResult<()> {
364 Self::parse_traditional_xref_with_options(reader, table, &super::ParseOptions::default())
365 }
366
367 fn parse_traditional_xref_with_options<R: Read + Seek>(
369 reader: &mut BufReader<R>,
370 table: &mut XRefTable,
371 options: &super::ParseOptions,
372 ) -> ParseResult<()> {
373 let mut line = String::new();
374
375 loop {
377 line.clear();
378 reader.read_line(&mut line)?;
379 let trimmed_line = line.trim();
380
381 if trimmed_line.is_empty() || trimmed_line.starts_with('%') {
383 continue;
384 }
385
386 if trimmed_line == "trailer" {
388 break;
389 }
390
391 if trimmed_line.starts_with("<<") {
393 eprintln!("Warning: Found trailer dictionary without 'trailer' keyword");
394 break;
396 }
397
398 let parts: Vec<&str> = trimmed_line.split_whitespace().collect();
400 if parts.len() != 2 {
401 return Err(ParseError::InvalidXRef);
403 }
404
405 let first_obj_num = parts[0]
406 .parse::<u32>()
407 .map_err(|_| ParseError::InvalidXRef)?;
408 let count = parts[1]
409 .parse::<u32>()
410 .map_err(|_| ParseError::InvalidXRef)?;
411
412 let mut entries_parsed = 0;
415 let mut i = 0;
416 while i < count {
417 line.clear();
418 let bytes_read = reader.read_line(&mut line)?;
419 let trimmed = line.trim();
420
421 if trimmed.starts_with('%') {
423 continue;
424 }
425
426 if bytes_read == 0 || trimmed == "trailer" {
428 eprintln!(
429 "Warning: XRef subsection incomplete - expected {count} entries but found only {entries_parsed}"
430 );
431 if line.trim() == "trailer" {
433 break;
435 }
436 break;
437 }
438
439 match Self::parse_xref_entry(&line) {
440 Ok(entry) => {
441 table.entries.insert(first_obj_num + i, entry);
442 entries_parsed += 1;
443 }
444 Err(_) => {
445 eprintln!(
446 "Warning: Invalid XRef entry at position {}: {:?}",
447 i,
448 line.trim()
449 );
450 }
452 }
453 i += 1;
454 }
455 }
457
458 let mut lexer = super::lexer::Lexer::new_with_options(reader, options.clone());
460 let trailer_obj = super::objects::PdfObject::parse_with_options(&mut lexer, options)?;
461 table.trailer = trailer_obj.as_dict().cloned();
464
465 if let Some(trailer) = &table.trailer {
467 if let Some(size_obj) = trailer.get("Size") {
468 if let Some(expected_size) = size_obj.as_integer() {
469 if let Some(max_obj_num) = table.entries.keys().max() {
472 let max_expected = (*max_obj_num + 1) as i64;
473 if max_expected > expected_size {
474 eprintln!(
475 "Warning: XRef table has object {} but trailer Size is only {}",
476 max_obj_num, expected_size
477 );
478 return Err(ParseError::InvalidXRef);
480 }
481 }
482 }
483 }
484 }
485
486 Ok(())
490 }
491
492 fn find_linearized_xref<R: Read + Seek>(reader: &mut BufReader<R>) -> ParseResult<u64> {
494 reader.seek(SeekFrom::Start(0))?;
496 let mut header = String::new();
497 reader.read_line(&mut header)?;
498
499 if !header.starts_with("%PDF-") {
500 return Err(ParseError::InvalidHeader);
501 }
502
503 let mut line = String::new();
505 reader.read_line(&mut line)?;
506
507 let pos = reader.stream_position()?;
510 let mut buffer = vec![0u8; 1024];
511 let bytes_read = reader.read(&mut buffer)?;
512 buffer.truncate(bytes_read);
513
514 let content = String::from_utf8_lossy(&buffer);
515
516 eprintln!(
519 "Checking for linearized PDF, first 100 chars: {:?}",
520 &content.chars().take(100).collect::<String>()
521 );
522
523 if content.contains("/Linearized") {
524 if let Some(xref_pos) = content.find("xref") {
530 return Ok(pos + xref_pos as u64);
531 }
532
533 if content.contains("/Type/XRef") || content.contains("/Type /XRef") {
535 if let Some(obj_pos) = content.find(" obj") {
538 let after_first_obj = &content[obj_pos + 4..];
540 if let Some(next_obj) = after_first_obj.find(" obj") {
541 let second_obj_start = pos + (obj_pos + 4 + next_obj - 10) as u64;
543 return Ok(second_obj_start);
544 }
545 }
546 }
547 }
548
549 Err(ParseError::InvalidXRef)
550 }
551
552 fn find_xref_offset<R: Read + Seek>(reader: &mut BufReader<R>) -> ParseResult<u64> {
554 reader.seek(SeekFrom::End(0))?;
556 let file_size = reader.stream_position()?;
557
558 let read_size = std::cmp::min(1024, file_size);
560 reader.seek(SeekFrom::End(-(read_size as i64)))?;
561
562 let mut buffer = vec![0u8; read_size as usize];
563 reader.read_exact(&mut buffer)?;
564
565 let content = String::from_utf8_lossy(&buffer);
567
568 let debug_content = content.chars().take(200).collect::<String>();
570 eprintln!("XRef search in last {read_size} bytes: {debug_content:?}");
571
572 let mut lines = content.pdf_lines();
573
574 while let Some(line) = lines.next() {
576 if line.trim() == "startxref" {
577 if let Some(offset_line) = lines.next() {
579 let offset = offset_line
580 .trim()
581 .parse::<u64>()
582 .map_err(|_| ParseError::InvalidXRef)?;
583 return Ok(offset);
584 }
585 }
586 }
587
588 Err(ParseError::InvalidXRef)
589 }
590
591 #[allow(dead_code)]
593 fn parse_with_recovery<R: Read + Seek>(reader: &mut BufReader<R>) -> ParseResult<Self> {
594 Self::parse_with_recovery_options(reader, &super::ParseOptions::default())
595 }
596
597 fn parse_with_recovery_options<R: Read + Seek>(
599 reader: &mut BufReader<R>,
600 _options: &super::ParseOptions,
601 ) -> ParseResult<Self> {
602 let mut recovery_options = _options.clone();
604 recovery_options.lenient_syntax = true;
605 recovery_options.collect_warnings = true;
606 recovery_options.recover_from_stream_errors = true;
607 let mut table = Self::new();
608
609 let mut buffer = Vec::new();
611 reader.read_to_end(&mut buffer)?;
612 let content = String::from_utf8_lossy(&buffer);
613
614 eprintln!("XRef recovery: scanning {} bytes for objects", buffer.len());
615
616 let mut xref_root_candidate = None;
618 if let Some(root_match) = extract_root_from_xref_stream(&content) {
619 xref_root_candidate = Some(root_match);
620 eprintln!("XRef recovery: Found Root {} in XRef stream", root_match);
621 }
622
623 let mut objects_found = 0;
624 let mut object_streams = Vec::new();
625
626 let mut pos = 0;
629 while pos < content.len() {
630 let remaining = &content[pos..];
632
633 if let Some(obj_pos) = remaining.find("obj") {
635 let abs_pos = pos + obj_pos;
637 if abs_pos < 4 {
638 pos += obj_pos + 3;
639 continue;
640 }
641
642 let line_start = content[..abs_pos]
645 .rfind(['\n', '\r'])
646 .map(|p| p + 1)
647 .unwrap_or(0);
648 let line_end = abs_pos + 3; if line_end <= content.len() {
652 let line = &content[line_start..line_end];
653
654 if let Some((obj_num, gen_num)) = Self::parse_obj_header(line.trim()) {
655 let offset = line_start;
656
657 if !table.entries.contains_key(&obj_num) {
659 table.add_entry(
660 obj_num,
661 XRefEntry {
662 offset: offset as u64,
663 generation: gen_num,
664 in_use: true,
665 },
666 );
667 objects_found += 1;
668
669 let obj_end_pos = line_end;
671 if obj_end_pos + 200 < buffer.len() {
673 let search_bytes = &buffer[obj_end_pos..obj_end_pos + 200];
674 if let Some(stream_pos) =
675 search_bytes.windows(6).position(|w| w == b"stream")
676 {
677 let check_bytes =
679 &buffer[obj_end_pos..obj_end_pos + stream_pos];
680 let check_str = String::from_utf8_lossy(check_bytes);
681 if check_str.contains("/Type") && check_str.contains("/ObjStm")
682 {
683 object_streams.push(obj_num);
684 eprintln!(
685 "XRef recovery: found object stream at object {obj_num}"
686 );
687 }
688 }
689 }
690 }
691 }
692 }
693
694 pos = abs_pos + 3;
695 } else {
696 break;
697 }
698 }
699
700 eprintln!(
701 "XRef recovery: found {} objects and {} object streams",
702 objects_found,
703 object_streams.len()
704 );
705
706 if objects_found == 0 {
707 return Err(ParseError::InvalidXRef);
708 }
709
710 let mut trailer = super::objects::PdfDictionary::new();
715 trailer.insert(
716 "Size".to_string(),
717 super::objects::PdfObject::Integer(table.len() as i64),
718 );
719
720 let mut catalog_candidate = None;
722
723 if let Some(xref_root) = xref_root_candidate {
725 if table.entries.contains_key(&xref_root) {
726 catalog_candidate = Some(xref_root);
727 eprintln!("Using Root {} from XRef stream as catalog", xref_root);
728 } else {
729 eprintln!(
730 "Warning: XRef Root {} not found in object table, searching manually",
731 xref_root
732 );
733 }
734 }
735
736 if catalog_candidate.is_none() {
738 catalog_candidate = find_catalog_by_content(&table, &buffer, &content);
739 }
740
741 if catalog_candidate.is_none() {
743 for obj_num in [1, 2, 3, 4, 5] {
744 if table.entries.contains_key(&obj_num) {
745 catalog_candidate = Some(obj_num);
746 eprintln!("Using fallback catalog candidate: object {}", obj_num);
747 break;
748 }
749 }
750 }
751
752 if catalog_candidate.is_none() && !table.entries.is_empty() {
754 catalog_candidate = Some(*table.entries.keys().min().unwrap_or(&1));
755 eprintln!(
756 "Using last resort catalog candidate: object {}",
757 catalog_candidate.unwrap()
758 );
759 }
760
761 if let Some(root_obj) = catalog_candidate {
762 trailer.insert(
763 "Root".to_string(),
764 super::objects::PdfObject::Reference(root_obj, 0),
765 );
766 }
767
768 table.set_trailer(trailer);
769
770 Ok(table)
771 }
772
773 fn parse_obj_header(line: &str) -> Option<(u32, u16)> {
775 let parts: Vec<&str> = line.split_whitespace().collect();
776
777 if parts.len() >= 3 && parts[2] == "obj" {
778 if let (Ok(obj_num), Ok(gen_num)) = (parts[0].parse::<u32>(), parts[1].parse::<u16>()) {
779 return Some((obj_num, gen_num));
780 }
781 }
782
783 None
784 }
785
786 fn validate_offset<R: Read + Seek>(reader: &mut BufReader<R>, offset: u64) -> ParseResult<()> {
788 let file_size = reader.seek(SeekFrom::End(0))?;
790
791 if offset >= file_size {
792 #[cfg(debug_assertions)]
793 eprintln!("Warning: XRef offset {offset} exceeds file size {file_size}");
794 return Err(ParseError::InvalidXRef);
795 }
796
797 reader.seek(SeekFrom::Start(offset))?;
799 let mut peek = [0u8; 20];
800 let read_bytes = reader.read(&mut peek)?;
801
802 if read_bytes == 0 {
803 #[cfg(debug_assertions)]
804 eprintln!("Warning: XRef offset {offset} points to EOF");
805 return Err(ParseError::InvalidXRef);
806 }
807
808 let content = String::from_utf8_lossy(&peek[..read_bytes]);
810 if !content.starts_with("xref") && !content.chars().next().unwrap_or(' ').is_ascii_digit() {
811 #[cfg(debug_assertions)]
812 eprintln!(
813 "Warning: XRef offset {} does not point to valid XRef content: {:?}",
814 offset,
815 &content[..std::cmp::min(10, content.len())]
816 );
817 }
819
820 Ok(())
821 }
822
823 fn parse_xref_entry(line: &str) -> ParseResult<XRefEntry> {
825 let line = line.trim();
826
827 if line.len() >= 18 {
829 if let Ok(entry) = Self::parse_xref_entry_standard(line) {
830 return Ok(entry);
831 }
832 }
833
834 Self::parse_xref_entry_flexible(line)
836 }
837
838 fn parse_xref_entry_standard(line: &str) -> ParseResult<XRefEntry> {
840 if line.len() < 18 {
843 return Err(ParseError::InvalidXRef);
844 }
845
846 let offset_str = &line[0..10];
847 let gen_str = &line[11..16];
848 let flag = line.chars().nth(17);
849
850 let offset = offset_str
851 .trim()
852 .parse::<u64>()
853 .map_err(|_| ParseError::InvalidXRef)?;
854 let generation = gen_str
855 .trim()
856 .parse::<u16>()
857 .map_err(|_| ParseError::InvalidXRef)?;
858
859 let in_use = match flag {
860 Some('n') => true,
861 Some('f') => false,
862 _ => return Err(ParseError::InvalidXRef),
863 };
864
865 Ok(XRefEntry {
866 offset,
867 generation,
868 in_use,
869 })
870 }
871
872 fn parse_xref_entry_flexible(line: &str) -> ParseResult<XRefEntry> {
874 let parts: Vec<&str> = line.split_whitespace().collect();
882
883 if parts.is_empty() {
884 return Err(ParseError::InvalidXRef);
885 }
886
887 let offset = parts[0]
889 .parse::<u64>()
890 .map_err(|_| ParseError::InvalidXRef)?;
891
892 let (generation, flag_from_gen) = if parts.len() >= 2 {
894 let gen_part = parts[1];
895 if gen_part == "n" || gen_part == "f" {
897 (0, gen_part.chars().next())
899 } else if gen_part.ends_with('n') || gen_part.ends_with('f') {
900 let flag_char = gen_part
902 .chars()
903 .last()
904 .expect("String should have at least one character after ends_with check");
905 let gen_str = &gen_part[..gen_part.len() - 1];
906 if gen_str.is_empty() {
907 (0, Some(flag_char))
909 } else {
910 let gen = gen_str
911 .parse::<u16>()
912 .map_err(|_| ParseError::InvalidXRef)?;
913 (gen, Some(flag_char))
914 }
915 } else {
916 let gen = gen_part
918 .parse::<u16>()
919 .map_err(|_| ParseError::InvalidXRef)?;
920 (gen, None)
921 }
922 } else {
923 (0, None)
924 };
925
926 let in_use = if let Some(flag_char) = flag_from_gen {
928 match flag_char {
930 'n' => true,
931 'f' => false,
932 _ => true, }
934 } else if parts.len() >= 3 {
935 match parts[2].chars().next() {
937 Some('n') => true,
938 Some('f') => false,
939 _ => {
940 #[cfg(debug_assertions)]
942 eprintln!("Warning: Invalid xref flag '{}', assuming 'n'", parts[2]);
943 true
944 }
945 }
946 } else {
947 true
949 };
950
951 Ok(XRefEntry {
952 offset,
953 generation,
954 in_use,
955 })
956 }
957
958 pub fn get_entry(&self, obj_num: u32) -> Option<&XRefEntry> {
960 self.entries.get(&obj_num)
961 }
962
963 pub fn get_entry_mut(&mut self, obj_num: u32) -> Option<&mut XRefEntry> {
965 self.entries.get_mut(&obj_num)
966 }
967
968 pub fn trailer(&self) -> Option<&super::objects::PdfDictionary> {
970 self.trailer.as_ref()
971 }
972
973 pub fn xref_offset(&self) -> u64 {
975 self.xref_offset
976 }
977
978 pub fn len(&self) -> usize {
980 self.entries.len()
981 }
982
983 pub fn is_empty(&self) -> bool {
985 self.entries.is_empty()
986 }
987
988 pub fn iter(&self) -> impl Iterator<Item = (&u32, &XRefEntry)> {
990 self.entries.iter()
991 }
992
993 pub fn get_extended_entry(&self, obj_num: u32) -> Option<&XRefEntryExt> {
995 self.extended_entries.get(&obj_num)
996 }
997
998 pub fn is_compressed(&self, obj_num: u32) -> bool {
1000 self.extended_entries
1001 .get(&obj_num)
1002 .map(|e| e.compressed_info.is_some())
1003 .unwrap_or(false)
1004 }
1005
1006 pub fn add_entry(&mut self, obj_num: u32, entry: XRefEntry) {
1008 self.entries.insert(obj_num, entry);
1009 }
1010
1011 pub fn set_trailer(&mut self, trailer: super::objects::PdfDictionary) {
1013 self.trailer = Some(trailer);
1014 }
1015
1016 pub fn add_extended_entry(&mut self, obj_num: u32, entry: XRefEntryExt) {
1018 self.extended_entries.insert(obj_num, entry);
1019 }
1020}
1021
1022#[derive(Debug, Clone)]
1025pub struct XRefStream {
1026 stream: super::objects::PdfStream,
1028 entries: HashMap<u32, XRefEntry>,
1030 extended_entries: HashMap<u32, XRefEntryExt>,
1032}
1033
1034impl XRefStream {
1035 pub fn parse(stream: super::objects::PdfStream) -> ParseResult<Self> {
1037 let mut xref_stream = Self {
1038 stream,
1039 entries: HashMap::new(),
1040 extended_entries: HashMap::new(),
1041 };
1042
1043 xref_stream.decode_entries()?;
1044 Ok(xref_stream)
1045 }
1046
1047 fn decode_entries(&mut self) -> ParseResult<()> {
1049 let dict = &self.stream.dict;
1051
1052 let size = dict
1054 .get("Size")
1055 .and_then(|obj| obj.as_integer())
1056 .ok_or_else(|| ParseError::MissingKey("Size".to_string()))?;
1057
1058 let index = match dict.get("Index") {
1060 Some(obj) => {
1061 let array = obj.as_array().ok_or_else(|| ParseError::SyntaxError {
1062 position: 0,
1063 message: "Index must be an array".to_string(),
1064 })?;
1065
1066 let mut pairs = Vec::new();
1068 for chunk in array.0.chunks(2) {
1069 if chunk.len() != 2 {
1070 return Err(ParseError::SyntaxError {
1071 position: 0,
1072 message: "Index array must have even number of elements".to_string(),
1073 });
1074 }
1075 let first = chunk[0]
1076 .as_integer()
1077 .ok_or_else(|| ParseError::SyntaxError {
1078 position: 0,
1079 message: "Index values must be integers".to_string(),
1080 })? as u32;
1081 let count = chunk[1]
1082 .as_integer()
1083 .ok_or_else(|| ParseError::SyntaxError {
1084 position: 0,
1085 message: "Index values must be integers".to_string(),
1086 })? as u32;
1087 pairs.push((first, count));
1088 }
1089 pairs
1090 }
1091 None => {
1092 vec![(0, size as u32)]
1094 }
1095 };
1096
1097 let w_array = dict
1099 .get("W")
1100 .and_then(|obj| obj.as_array())
1101 .ok_or_else(|| ParseError::MissingKey("W".to_string()))?;
1102
1103 if w_array.len() != 3 {
1104 return Err(ParseError::SyntaxError {
1105 position: 0,
1106 message: "W array must have exactly 3 elements".to_string(),
1107 });
1108 }
1109
1110 let w: Vec<usize> = w_array
1111 .0
1112 .iter()
1113 .map(|obj| {
1114 obj.as_integer()
1115 .ok_or_else(|| ParseError::SyntaxError {
1116 position: 0,
1117 message: "W values must be integers".to_string(),
1118 })
1119 .map(|i| i as usize)
1120 })
1121 .collect::<ParseResult<Vec<_>>>()?;
1122
1123 let data = self.stream.decode(&ParseOptions::default())?;
1125 let mut offset = 0;
1126
1127 for (first_obj_num, count) in index {
1129 for i in 0..count {
1130 if offset + w[0] + w[1] + w[2] > data.len() {
1131 return Err(ParseError::SyntaxError {
1132 position: 0,
1133 message: "Xref stream data truncated".to_string(),
1134 });
1135 }
1136
1137 let field1 = Self::read_field(&data[offset..], w[0]);
1139 offset += w[0];
1140
1141 let field2 = Self::read_field(&data[offset..], w[1]);
1142 offset += w[1];
1143
1144 let field3 = Self::read_field(&data[offset..], w[2]);
1145 offset += w[2];
1146
1147 let entry_info =
1149 XRefEntryInfo::new(XRefEntryType::from_value(field1), field2, field3);
1150
1151 let entry = match entry_info.entry_type {
1153 XRefEntryType::Free => XRefEntry {
1154 offset: entry_info.field2,
1155 generation: entry_info.field3 as u16,
1156 in_use: false,
1157 },
1158 XRefEntryType::Uncompressed => XRefEntry {
1159 offset: entry_info.field2,
1160 generation: entry_info.field3 as u16,
1161 in_use: true,
1162 },
1163 XRefEntryType::Compressed => {
1164 let ext_entry = XRefEntryExt {
1166 basic: XRefEntry {
1167 offset: 0,
1168 generation: 0,
1169 in_use: true,
1170 },
1171 compressed_info: entry_info.get_compressed_info(),
1172 };
1173 self.extended_entries
1174 .insert(first_obj_num + i, ext_entry.clone());
1175 ext_entry.basic
1176 }
1177 XRefEntryType::Custom(_type_num) => {
1178 #[cfg(debug_assertions)]
1181 eprintln!(
1182 "Note: Custom xref entry type {} for object {} (treating as in-use)",
1183 _type_num,
1184 first_obj_num + i
1185 );
1186
1187 let ext_entry = XRefEntryExt {
1189 basic: XRefEntry {
1190 offset: entry_info.field2,
1191 generation: entry_info.field3 as u16,
1192 in_use: entry_info.entry_type.is_in_use(),
1193 },
1194 compressed_info: None,
1195 };
1196 self.extended_entries
1197 .insert(first_obj_num + i, ext_entry.clone());
1198 ext_entry.basic
1199 }
1200 };
1201
1202 self.entries.insert(first_obj_num + i, entry);
1203 }
1204 }
1205
1206 Ok(())
1207 }
1208
1209 fn read_field(data: &[u8], width: usize) -> u64 {
1211 let mut value = 0u64;
1212 for i in 0..width {
1213 if i < data.len() {
1214 value = (value << 8) | (data[i] as u64);
1215 }
1216 }
1217 value
1218 }
1219
1220 pub fn get_entry(&self, obj_num: u32) -> Option<&XRefEntry> {
1222 self.entries.get(&obj_num)
1223 }
1224
1225 pub fn trailer(&self) -> &super::objects::PdfDictionary {
1227 &self.stream.dict
1228 }
1229}
1230
1231#[cfg(test)]
1232mod tests {
1233 use super::*;
1234
1235 use crate::parser::objects::{PdfDictionary, PdfObject};
1236 use std::io::Cursor;
1237
1238 #[test]
1239 fn test_parse_xref_entry() {
1240 let entry1 = XRefTable::parse_xref_entry("0000000000 65535 f ").unwrap();
1241 assert_eq!(entry1.offset, 0);
1242 assert_eq!(entry1.generation, 65535);
1243 assert!(!entry1.in_use);
1244
1245 let entry2 = XRefTable::parse_xref_entry("0000000017 00000 n ").unwrap();
1246 assert_eq!(entry2.offset, 17);
1247 assert_eq!(entry2.generation, 0);
1248 assert!(entry2.in_use);
1249 }
1250
1251 #[test]
1252 fn test_parse_xref_entry_flexible() {
1253 let entry1 = XRefTable::parse_xref_entry("17 0 n").unwrap();
1257 assert_eq!(entry1.offset, 17);
1258 assert_eq!(entry1.generation, 0);
1259 assert!(entry1.in_use);
1260
1261 let entry2 = XRefTable::parse_xref_entry("123 5 f").unwrap();
1263 assert_eq!(entry2.offset, 123);
1264 assert_eq!(entry2.generation, 5);
1265 assert!(!entry2.in_use);
1266
1267 let entry3 = XRefTable::parse_xref_entry("456 n").unwrap();
1269 assert_eq!(entry3.offset, 456);
1270 assert_eq!(entry3.generation, 0);
1271 assert!(entry3.in_use);
1272
1273 let entry4 = XRefTable::parse_xref_entry("789 2").unwrap();
1275 assert_eq!(entry4.offset, 789);
1276 assert_eq!(entry4.generation, 2);
1277 assert!(entry4.in_use);
1278
1279 let entry5 = XRefTable::parse_xref_entry("1000 0n").unwrap();
1281 assert_eq!(entry5.offset, 1000);
1282 assert_eq!(entry5.generation, 0);
1283 assert!(entry5.in_use);
1284
1285 let entry6 = XRefTable::parse_xref_entry("2000 1f").unwrap();
1286 assert_eq!(entry6.offset, 2000);
1287 assert_eq!(entry6.generation, 1);
1288 assert!(!entry6.in_use);
1289
1290 let entry7 = XRefTable::parse_xref_entry("3000\t0\tn").unwrap();
1292 assert_eq!(entry7.offset, 3000);
1293 assert_eq!(entry7.generation, 0);
1294 assert!(entry7.in_use);
1295 }
1296
1297 #[test]
1298 fn test_parse_xref_entry_invalid_flag_fallback() {
1299 let entry = XRefTable::parse_xref_entry("100 0 x").unwrap();
1301 assert_eq!(entry.offset, 100);
1302 assert_eq!(entry.generation, 0);
1303 assert!(entry.in_use); }
1305
1306 #[test]
1307 fn test_parse_xref_entry_malformed() {
1308 let result = XRefTable::parse_xref_entry("");
1310 assert!(result.is_err());
1311
1312 let result = XRefTable::parse_xref_entry("abc 0 n");
1314 assert!(result.is_err());
1315
1316 let result = XRefTable::parse_xref_entry(" ");
1318 assert!(result.is_err());
1319 }
1320
1321 #[test]
1322 fn test_xref_table_new() {
1323 let table = XRefTable::new();
1324 assert!(table.entries.is_empty());
1325 assert!(table.extended_entries.is_empty());
1326 assert!(table.trailer.is_none());
1327 assert_eq!(table.xref_offset, 0);
1328 }
1329
1330 #[test]
1331 fn test_xref_table_default() {
1332 let table = XRefTable::default();
1333 assert!(table.entries.is_empty());
1334 assert!(table.extended_entries.is_empty());
1335 assert!(table.trailer.is_none());
1336 }
1337
1338 #[test]
1339 fn test_xref_entry_struct() {
1340 let entry = XRefEntry {
1341 offset: 12345,
1342 generation: 7,
1343 in_use: true,
1344 };
1345 assert_eq!(entry.offset, 12345);
1346 assert_eq!(entry.generation, 7);
1347 assert!(entry.in_use);
1348 }
1349
1350 #[test]
1351 fn test_xref_entry_equality() {
1352 let entry1 = XRefEntry {
1353 offset: 100,
1354 generation: 0,
1355 in_use: true,
1356 };
1357 let entry2 = XRefEntry {
1358 offset: 100,
1359 generation: 0,
1360 in_use: true,
1361 };
1362 assert_eq!(entry1, entry2);
1363 }
1364
1365 #[test]
1366 fn test_xref_entry_clone() {
1367 let entry = XRefEntry {
1368 offset: 999,
1369 generation: 3,
1370 in_use: false,
1371 };
1372 let cloned = entry;
1373 assert_eq!(cloned.offset, 999);
1374 assert_eq!(cloned.generation, 3);
1375 assert!(!cloned.in_use);
1376 }
1377
1378 #[test]
1379 fn test_xref_entry_ext() {
1380 let ext_entry = XRefEntryExt {
1381 basic: XRefEntry {
1382 offset: 500,
1383 generation: 0,
1384 in_use: true,
1385 },
1386 compressed_info: Some((10, 5)),
1387 };
1388 assert_eq!(ext_entry.basic.offset, 500);
1389 assert_eq!(ext_entry.compressed_info, Some((10, 5)));
1390 }
1391
1392 #[test]
1393 fn test_xref_entry_ext_no_compression() {
1394 let ext_entry = XRefEntryExt {
1395 basic: XRefEntry {
1396 offset: 1000,
1397 generation: 1,
1398 in_use: true,
1399 },
1400 compressed_info: None,
1401 };
1402 assert!(ext_entry.compressed_info.is_none());
1403 }
1404
1405 #[test]
1406 fn test_add_entry() {
1407 let mut table = XRefTable::new();
1408 table.add_entry(
1409 5,
1410 XRefEntry {
1411 offset: 1000,
1412 generation: 0,
1413 in_use: true,
1414 },
1415 );
1416 assert_eq!(table.entries.len(), 1);
1417 assert!(table.entries.contains_key(&5));
1418 }
1419
1420 #[test]
1421 fn test_get_entry() {
1422 let mut table = XRefTable::new();
1423 let entry = XRefEntry {
1424 offset: 2000,
1425 generation: 1,
1426 in_use: true,
1427 };
1428 table.add_entry(10, entry);
1429
1430 let retrieved = table.get_entry(10);
1431 assert!(retrieved.is_some());
1432 assert_eq!(retrieved.unwrap().offset, 2000);
1433
1434 let missing = table.get_entry(999);
1435 assert!(missing.is_none());
1436 }
1437
1438 #[test]
1439 fn test_set_trailer() {
1440 let mut table = XRefTable::new();
1441 let mut trailer = PdfDictionary::new();
1442 trailer.insert("Size".to_string(), PdfObject::Integer(10));
1443
1444 table.set_trailer(trailer.clone());
1445 assert!(table.trailer.is_some());
1446 assert_eq!(
1447 table.trailer().unwrap().get("Size"),
1448 Some(&PdfObject::Integer(10))
1449 );
1450 }
1451
1452 #[test]
1453 fn test_parse_xref_entry_invalid() {
1454 let result = XRefTable::parse_xref_entry("0000000000 65535");
1456 assert!(result.is_ok()); let result = XRefTable::parse_xref_entry("not_a_number 65535 f ");
1460 assert!(result.is_err());
1461
1462 let result = XRefTable::parse_xref_entry("0000000000 65535 x ");
1464 assert!(result.is_ok()); assert!(result.unwrap().in_use); }
1467
1468 #[test]
1469 fn test_parse_xref_entry_various_offsets() {
1470 let entry = XRefTable::parse_xref_entry("0000000001 00000 n ").unwrap();
1472 assert_eq!(entry.offset, 1);
1473
1474 let entry = XRefTable::parse_xref_entry("9999999999 00000 n ").unwrap();
1476 assert_eq!(entry.offset, 9999999999);
1477
1478 let entry = XRefTable::parse_xref_entry("0000000000 65535 f ").unwrap();
1480 assert_eq!(entry.generation, 65535);
1481 }
1482
1483 #[test]
1484 fn test_add_extended_entry() {
1485 let mut table = XRefTable::new();
1486 let ext_entry = XRefEntryExt {
1487 basic: XRefEntry {
1488 offset: 0,
1489 generation: 0,
1490 in_use: true,
1491 },
1492 compressed_info: Some((5, 10)),
1493 };
1494
1495 table.add_extended_entry(15, ext_entry.clone());
1496 assert_eq!(table.extended_entries.len(), 1);
1497 assert!(table.extended_entries.contains_key(&15));
1498 }
1499
1500 #[test]
1501 fn test_get_extended_entry() {
1502 let mut table = XRefTable::new();
1503 let ext_entry = XRefEntryExt {
1504 basic: XRefEntry {
1505 offset: 0,
1506 generation: 0,
1507 in_use: true,
1508 },
1509 compressed_info: Some((20, 3)),
1510 };
1511
1512 table.add_extended_entry(7, ext_entry);
1513
1514 let retrieved = table.get_extended_entry(7);
1515 assert!(retrieved.is_some());
1516 assert_eq!(retrieved.unwrap().compressed_info, Some((20, 3)));
1517 }
1518
1519 #[test]
1520 fn test_xref_offset() {
1521 let mut table = XRefTable::new();
1522 assert_eq!(table.xref_offset(), 0);
1523
1524 table.xref_offset = 12345;
1525 assert_eq!(table.xref_offset(), 12345);
1526 }
1527
1528 #[test]
1529 fn test_find_xref_offset_simple() {
1530 let pdf_data = b"startxref\n12345\n%%EOF";
1531 let cursor = Cursor::new(pdf_data.to_vec());
1532 let mut reader = BufReader::new(cursor);
1533
1534 let offset = XRefTable::find_xref_offset(&mut reader).unwrap();
1535 assert_eq!(offset, 12345);
1536 }
1537
1538 #[test]
1539 fn test_find_xref_offset_with_spaces() {
1540 let pdf_data = b"startxref \n 12345 \n%%EOF";
1541 let cursor = Cursor::new(pdf_data.to_vec());
1542 let mut reader = BufReader::new(cursor);
1543
1544 let offset = XRefTable::find_xref_offset(&mut reader).unwrap();
1545 assert_eq!(offset, 12345);
1546 }
1547
1548 #[test]
1549 fn test_find_xref_offset_missing() {
1550 let pdf_data = b"no startxref here";
1551 let cursor = Cursor::new(pdf_data.to_vec());
1552 let mut reader = BufReader::new(cursor);
1553
1554 let result = XRefTable::find_xref_offset(&mut reader);
1555 assert!(result.is_err());
1556 }
1557
1558 #[test]
1559 fn test_trailer_getter() {
1560 let mut table = XRefTable::new();
1561 assert!(table.trailer().is_none());
1562
1563 let trailer = PdfDictionary::new();
1564 table.set_trailer(trailer);
1565 assert!(table.trailer().is_some());
1566 }
1567
1568 #[test]
1569 fn test_xref_table_clone() {
1570 let mut table = XRefTable::new();
1571 table.add_entry(
1572 1,
1573 XRefEntry {
1574 offset: 100,
1575 generation: 0,
1576 in_use: true,
1577 },
1578 );
1579 table.xref_offset = 5000;
1580
1581 let cloned = table.clone();
1582 assert_eq!(cloned.entries.len(), 1);
1583 assert_eq!(cloned.xref_offset, 5000);
1584 }
1585
1586 #[test]
1587 fn test_parse_obj_header() {
1588 assert_eq!(XRefTable::parse_obj_header("1 0 obj"), Some((1, 0)));
1590 assert_eq!(XRefTable::parse_obj_header("123 5 obj"), Some((123, 5)));
1591 assert_eq!(
1592 XRefTable::parse_obj_header(" 42 3 obj "),
1593 Some((42, 3))
1594 );
1595
1596 assert_eq!(XRefTable::parse_obj_header("1 obj"), None);
1598 assert_eq!(XRefTable::parse_obj_header("abc 0 obj"), None);
1599 assert_eq!(XRefTable::parse_obj_header("1 0 object"), None);
1600 assert_eq!(XRefTable::parse_obj_header(""), None);
1601 }
1602
1603 #[test]
1604 fn test_xref_recovery_parsing() {
1605 let pdf_content =
1607 b"1 0 obj\n<< /Type /Catalog >>\nendobj\n2 0 obj\n<< /Type /Page >>\nendobj\n";
1608 let mut reader = BufReader::new(Cursor::new(pdf_content));
1609
1610 let table = XRefTable::parse_with_recovery(&mut reader).unwrap();
1611
1612 assert_eq!(table.len(), 2);
1614 assert!(table.get_entry(1).is_some());
1615 assert!(table.get_entry(2).is_some());
1616
1617 assert!(table.get_entry(1).unwrap().in_use);
1619 assert!(table.get_entry(2).unwrap().in_use);
1620 }
1621
1622 #[test]
1623 fn test_xref_recovery_no_objects() {
1624 let pdf_content = b"This is not a PDF file\nNo objects here\n";
1626 let mut reader = BufReader::new(Cursor::new(pdf_content));
1627
1628 let result = XRefTable::parse_with_recovery(&mut reader);
1629 assert!(result.is_err());
1630 }
1631
1632 #[test]
1633 fn test_offset_validation() {
1634 let pdf_data = b"small file";
1635 let mut reader = BufReader::new(Cursor::new(pdf_data));
1636
1637 assert!(XRefTable::validate_offset(&mut reader, 5).is_ok());
1639
1640 assert!(XRefTable::validate_offset(&mut reader, 100).is_err());
1642
1643 assert!(XRefTable::validate_offset(&mut reader, 10).is_err());
1645 }
1646
1647 #[test]
1648 fn test_xref_parse_with_fallback() {
1649 let pdf_content =
1651 b"1 0 obj\n<< /Type /Catalog >>\nendobj\n2 0 obj\n<< /Type /Page >>\nendobj\n";
1652 let mut reader = BufReader::new(Cursor::new(pdf_content));
1653
1654 let result = XRefTable::parse(&mut reader);
1657 assert!(result.is_err());
1658 if let Err(e) = result {
1659 assert!(matches!(e, ParseError::InvalidXRef));
1660 }
1661 }
1662
1663 #[test]
1664 fn test_xref_entry_creation() {
1665 let entry = XRefEntry {
1666 offset: 1234,
1667 generation: 5,
1668 in_use: true,
1669 };
1670
1671 assert_eq!(entry.offset, 1234);
1672 assert_eq!(entry.generation, 5);
1673 assert!(entry.in_use);
1674 }
1675
1676 #[test]
1677 fn test_xref_entry_ext_creation() {
1678 let basic = XRefEntry {
1679 offset: 5000,
1680 generation: 0,
1681 in_use: true,
1682 };
1683
1684 let ext = XRefEntryExt {
1685 basic: basic.clone(),
1686 compressed_info: Some((10, 3)),
1687 };
1688
1689 assert_eq!(ext.basic.offset, 5000);
1690 assert_eq!(ext.compressed_info, Some((10, 3)));
1691 }
1692
1693 #[test]
1694 fn test_xref_table_new_advanced() {
1695 let table = XRefTable::new();
1696 assert_eq!(table.entries.len(), 0);
1697 assert_eq!(table.extended_entries.len(), 0);
1698 assert!(table.trailer.is_none());
1699 assert_eq!(table.xref_offset, 0);
1700 }
1701
1702 #[test]
1703 fn test_xref_table_default_advanced() {
1704 let table = XRefTable::default();
1705 assert_eq!(table.entries.len(), 0);
1706 assert!(table.trailer.is_none());
1707 }
1708
1709 #[test]
1710 fn test_xref_table_add_entry() {
1711 let mut table = XRefTable::new();
1712
1713 let entry1 = XRefEntry {
1714 offset: 100,
1715 generation: 0,
1716 in_use: true,
1717 };
1718 table.add_entry(1, entry1);
1719 let entry2 = XRefEntry {
1720 offset: 200,
1721 generation: 1,
1722 in_use: false,
1723 };
1724 table.add_entry(2, entry2);
1725
1726 assert_eq!(table.len(), 2);
1727
1728 let entry1 = table.get_entry(1).unwrap();
1729 assert_eq!(entry1.offset, 100);
1730 assert_eq!(entry1.generation, 0);
1731 assert!(entry1.in_use);
1732
1733 let entry2 = table.get_entry(2).unwrap();
1734 assert_eq!(entry2.offset, 200);
1735 assert_eq!(entry2.generation, 1);
1736 assert!(!entry2.in_use);
1737 }
1738
1739 #[test]
1740 fn test_xref_table_add_extended_entry() {
1741 let mut table = XRefTable::new();
1742
1743 let basic_entry = XRefEntry {
1744 offset: 0,
1745 generation: 0,
1746 in_use: true,
1747 };
1748
1749 let extended_entry = XRefEntryExt {
1750 basic: basic_entry,
1751 compressed_info: Some((10, 2)),
1752 };
1753
1754 table.add_extended_entry(5, extended_entry);
1755
1756 let ext = table.get_extended_entry(5);
1758 assert!(ext.is_some());
1759 if let Some(ext) = ext {
1760 assert_eq!(ext.compressed_info, Some((10, 2)));
1761 }
1762
1763 assert!(table.is_compressed(5));
1764 }
1765
1766 #[test]
1767 fn test_xref_table_get_nonexistent() {
1768 let table = XRefTable::new();
1769 assert!(table.get_entry(999).is_none());
1770 assert!(table.get_extended_entry(999).is_none());
1771 }
1772
1773 #[test]
1774 fn test_xref_table_update_entry() {
1775 let mut table = XRefTable::new();
1776
1777 let entry1 = XRefEntry {
1779 offset: 100,
1780 generation: 0,
1781 in_use: true,
1782 };
1783 table.add_entry(1, entry1);
1784
1785 let entry2 = XRefEntry {
1787 offset: 200,
1788 generation: 1,
1789 in_use: false,
1790 };
1791 table.add_entry(1, entry2);
1792
1793 let entry = table.get_entry(1).unwrap();
1795 assert_eq!(entry.offset, 200);
1796 assert_eq!(entry.generation, 1);
1797 assert!(!entry.in_use);
1798 }
1799
1800 #[test]
1801 fn test_xref_table_set_trailer() {
1802 let mut table = XRefTable::new();
1803 assert!(table.trailer.is_none());
1804
1805 let mut trailer = PdfDictionary::new();
1806 trailer.insert("Size".to_string(), PdfObject::Integer(10));
1807
1808 table.set_trailer(trailer.clone());
1809 assert!(table.trailer.is_some());
1810 assert_eq!(table.trailer(), Some(&trailer));
1811 }
1812
1813 #[test]
1814 fn test_xref_table_offset() {
1815 let table = XRefTable::new();
1816 assert_eq!(table.xref_offset(), 0);
1817 }
1818
1819 #[test]
1820 fn test_parse_xref_entry_invalid_static() {
1821 let invalid_lines = vec![
1822 "not a valid entry".to_string(),
1823 "12345 abcde n".to_string(), ];
1825
1826 for line in invalid_lines {
1827 let result = XRefTable::parse_xref_entry(&line);
1828 assert!(result.is_err());
1829 }
1830
1831 let result = XRefTable::parse_xref_entry("12345 00000");
1833 assert!(result.is_ok());
1834 let entry = result.unwrap();
1835 assert_eq!(entry.offset, 12345);
1836 assert_eq!(entry.generation, 0);
1837 assert!(entry.in_use); }
1839
1840 #[test]
1841 fn test_xref_entry_operations() {
1842 let mut table = XRefTable::new();
1843
1844 let entry1 = XRefEntry {
1846 offset: 1234,
1847 generation: 5,
1848 in_use: true,
1849 };
1850
1851 let entry2 = XRefEntry {
1852 offset: 5678,
1853 generation: 10,
1854 in_use: false,
1855 };
1856
1857 table.add_entry(1, entry1);
1858 table.add_entry(2, entry2);
1859
1860 assert_eq!(table.len(), 2);
1861
1862 let retrieved1 = table.get_entry(1).unwrap();
1863 assert_eq!(retrieved1.offset, 1234);
1864 assert_eq!(retrieved1.generation, 5);
1865 assert!(retrieved1.in_use);
1866
1867 let retrieved2 = table.get_entry(2).unwrap();
1868 assert_eq!(retrieved2.offset, 5678);
1869 assert_eq!(retrieved2.generation, 10);
1870 assert!(!retrieved2.in_use);
1871 }
1872
1873 #[test]
1874 fn test_parse_xref_with_comments() {
1875 let pdf_content = b"%PDF-1.4\n\
18761 0 obj\n<< /Type /Catalog >>\nendobj\n\
1877xref\n\
1878% This is a comment\n\
18790 2\n\
18800000000000 65535 f \n\
18810000000015 00000 n \n\
1882% Another comment\n\
1883trailer\n\
1884<< /Size 2 /Root 1 0 R >>\n\
1885startxref\n\
188645\n\
1887%%EOF";
1888
1889 let mut reader = BufReader::new(Cursor::new(pdf_content));
1890 reader.seek(SeekFrom::Start(45)).unwrap(); let result = XRefTable::parse(&mut reader);
1893 assert!(result.is_ok());
1894 let table = result.unwrap();
1895 assert_eq!(table.len(), 2);
1896 }
1897
1898 #[test]
1899 fn test_parse_multiple_xref_sections() {
1900 let pdf_content = b"%PDF-1.4\n\
19011 0 obj\n<< /Type /Catalog >>\nendobj\n\
19022 0 obj\n<< /Type /Page >>\nendobj\n\
1903xref\n\
19040 2\n\
19050000000000 65535 f \n\
19060000000015 00000 n \n\
19075 2\n\
19080000000100 00000 n \n\
19090000000200 00000 n \n\
1910trailer\n\
1911<< /Size 7 /Root 1 0 R >>\n\
1912startxref\n\
191378\n\
1914%%EOF";
1915
1916 let mut reader = BufReader::new(Cursor::new(pdf_content));
1917 reader.seek(SeekFrom::Start(78)).unwrap(); let result = XRefTable::parse(&mut reader);
1920 assert!(result.is_ok());
1921 let table = result.unwrap();
1922 assert_eq!(table.len(), 4);
1924 assert!(table.get_entry(0).is_some());
1925 assert!(table.get_entry(1).is_some());
1926 assert!(table.get_entry(5).is_some());
1927 assert!(table.get_entry(6).is_some());
1928 }
1929
1930 #[test]
1931 fn test_parse_xref_with_prev() {
1932 let pdf_content = b"%PDF-1.4\n\
1934% First xref at 15\n\
1935xref\n\
19360 2\n\
19370000000000 65535 f \n\
19380000000100 00000 n \n\
1939trailer\n\
1940<< /Size 2 >>\n\
1941% Second xref at 100\n\
1942xref\n\
19432 1\n\
19440000000200 00000 n \n\
1945trailer\n\
1946<< /Size 3 /Prev 15 >>\n\
1947startxref\n\
1948100\n\
1949%%EOF";
1950
1951 let mut reader = BufReader::new(Cursor::new(pdf_content));
1952 let options = ParseOptions {
1953 lenient_syntax: true,
1954 ..Default::default()
1955 };
1956
1957 let result = XRefTable::parse_with_options(&mut reader, &options);
1958 assert!(result.is_ok() || result.is_err());
1960 }
1961
1962 #[test]
1963 fn test_invalid_xref_format() {
1964 let pdf_content = b"xref\ninvalid content\ntrailer";
1965 let mut reader = BufReader::new(Cursor::new(pdf_content));
1966
1967 let result = XRefTable::parse(&mut reader);
1968 assert!(result.is_err());
1969 }
1970
1971 #[test]
1972 fn test_xref_entry_overflow() {
1973 let mut table = XRefTable::new();
1974
1975 let entry = XRefEntry {
1977 offset: u64::MAX,
1978 generation: u16::MAX,
1979 in_use: true,
1980 };
1981 table.add_entry(u32::MAX, entry);
1982
1983 let entry = table.get_entry(u32::MAX).unwrap();
1984 assert_eq!(entry.offset, u64::MAX);
1985 assert_eq!(entry.generation, u16::MAX);
1986 }
1987
1988 #[test]
1989 fn test_xref_table_operations() {
1990 let mut table = XRefTable::new();
1991
1992 let entry1 = XRefEntry {
1994 offset: 100,
1995 generation: 0,
1996 in_use: true,
1997 };
1998
1999 let entry2 = XRefEntry {
2000 offset: 200,
2001 generation: 0,
2002 in_use: true,
2003 };
2004
2005 table.add_entry(1, entry1);
2006 table.add_entry(2, entry2);
2007
2008 assert_eq!(table.len(), 2);
2009 assert!(table.get_entry(1).is_some());
2010 assert!(table.get_entry(2).is_some());
2011 assert!(table.get_entry(3).is_none());
2012 }
2013
2014 #[test]
2015 fn test_xref_table_merge() {
2016 let mut table1 = XRefTable::new();
2017 let entry1 = XRefEntry {
2018 offset: 100,
2019 generation: 0,
2020 in_use: true,
2021 };
2022 table1.add_entry(1, entry1);
2023 let entry2 = XRefEntry {
2024 offset: 200,
2025 generation: 0,
2026 in_use: true,
2027 };
2028 table1.add_entry(2, entry2);
2029
2030 let mut table2 = XRefTable::new();
2031 let entry3 = XRefEntry {
2032 offset: 250,
2033 generation: 1,
2034 in_use: true,
2035 }; table2.add_entry(2, entry3);
2037 let entry4 = XRefEntry {
2038 offset: 300,
2039 generation: 0,
2040 in_use: true,
2041 }; table2.add_entry(3, entry4);
2043
2044 for i in 2..=3 {
2047 if let Some(entry) = table2.get_entry(i) {
2048 table1.add_entry(
2049 i,
2050 XRefEntry {
2051 offset: entry.offset,
2052 generation: entry.generation,
2053 in_use: entry.in_use,
2054 },
2055 );
2056 }
2057 }
2058
2059 assert_eq!(table1.len(), 3);
2060
2061 let entry2 = table1.get_entry(2).unwrap();
2063 assert_eq!(entry2.offset, 250);
2064 assert_eq!(entry2.generation, 1);
2065
2066 assert!(table1.get_entry(3).is_some());
2068 }
2069
2070 #[test]
2071 fn test_xref_recovery_with_stream() {
2072 let pdf_content = b"1 0 obj\n<< /Type /ObjStm /N 2 /First 10 >>\nstream\n12345678901 0 2 0\nendstream\nendobj\n";
2073 let mut reader = BufReader::new(Cursor::new(pdf_content));
2074
2075 let result = XRefTable::parse_with_recovery(&mut reader);
2076 assert!(result.is_ok() || result.is_err());
2078 }
2079
2080 #[test]
2081 fn test_xref_entry_equality_advanced() {
2082 let entry1 = XRefEntry {
2083 offset: 100,
2084 generation: 0,
2085 in_use: true,
2086 };
2087
2088 let entry2 = XRefEntry {
2089 offset: 100,
2090 generation: 0,
2091 in_use: true,
2092 };
2093
2094 let entry3 = XRefEntry {
2095 offset: 200,
2096 generation: 0,
2097 in_use: true,
2098 };
2099
2100 assert_eq!(entry1, entry2);
2101 assert_ne!(entry1, entry3);
2102 }
2103
2104 #[test]
2105 fn test_parse_options_effect() {
2106 let pdf_content = b"xref 0 1 invalid";
2107 let mut reader = BufReader::new(Cursor::new(pdf_content));
2108
2109 let strict_options = ParseOptions {
2111 lenient_syntax: false,
2112 ..Default::default()
2113 };
2114 let result = XRefTable::parse_with_options(&mut reader, &strict_options);
2115 assert!(result.is_err());
2116
2117 reader.seek(SeekFrom::Start(0)).unwrap();
2119 let lenient_options = ParseOptions {
2120 lenient_syntax: true,
2121 ..Default::default()
2122 };
2123 let result = XRefTable::parse_with_options(&mut reader, &lenient_options);
2124 assert!(result.is_err() || result.is_ok());
2126 }
2127
2128 #[test]
2129 fn test_circular_reference_detection() {
2130 let pdf_content = b"%PDF-1.4\n\
2132xref\n\
21330 1\n\
21340000000000 65535 f \n\
2135trailer\n\
2136<< /Size 1 /Prev 10 >>\n\
2137startxref\n\
213810\n\
2139%%EOF";
2140
2141 let mut reader = BufReader::new(Cursor::new(pdf_content));
2142
2143 let result = XRefTable::parse_with_incremental_updates(&mut reader);
2145 assert!(result.is_ok() || result.is_err());
2147 }
2148
2149 #[test]
2150 fn test_linearized_xref_detection() {
2151 let pdf_content = b"%PDF-1.4\n\
21531 0 obj\n\
2154<< /Linearized 1 /L 1234 /H [100 200] /O 5 /E 500 /N 10 /T 600 >>\n\
2155endobj\n\
2156xref\n\
21570 2\n\
21580000000000 65535 f \n\
21590000000009 00000 n \n\
2160trailer\n\
2161<< /Size 2 >>\n\
2162startxref\n\
216363\n\
2164%%EOF";
2165
2166 let mut reader = BufReader::new(Cursor::new(pdf_content));
2167
2168 let result = XRefTable::find_linearized_xref(&mut reader);
2170 assert!(result.is_ok());
2171
2172 let xref_pos = result.unwrap();
2175 assert_eq!(
2176 xref_pos, 90,
2177 "Expected xref at position 90, got {}",
2178 xref_pos
2179 );
2180 }
2181
2182 #[test]
2183 fn test_xref_stream_parsing() {
2184 let pdf_content = b"%PDF-1.5\n\
21871 0 obj\n\
2188<< /Type /XRef /Size 3 /W [1 2 1] /Length 12 >>\n\
2189stream\n\
2190\x00\x00\x00\x00\
2191\x01\x00\x10\x00\
2192\x01\x00\x20\x00\
2193endstream\n\
2194endobj\n\
2195startxref\n\
21969\n\
2197%%EOF";
2198
2199 let mut reader = BufReader::new(Cursor::new(pdf_content));
2200 reader.seek(SeekFrom::Start(9)).unwrap();
2201
2202 let result = XRefTable::parse(&mut reader);
2204 assert!(result.is_err() || result.is_ok());
2206 }
2207
2208 #[test]
2209 fn test_xref_validation_max_object_exceeds_size() {
2210 let pdf_content = b"%PDF-1.4\n\
2212xref\n\
22130 1\n\
22140000000000 65535 f \n\
221510 1\n\
22160000000100 00000 n \n\
2217trailer\n\
2218<< /Size 5 /Root 1 0 R >>\n\
2219startxref\n\
22209\n\
2221%%EOF";
2222
2223 let mut reader = BufReader::new(Cursor::new(pdf_content));
2224 reader.seek(SeekFrom::Start(9)).unwrap();
2225
2226 let result = XRefTable::parse(&mut reader);
2228 assert!(result.is_err());
2229 }
2230
2231 #[test]
2232 fn test_parse_with_options_lenient_vs_strict() {
2233 let pdf_content = b"%PDF-1.4\n\
2235xref\n\
22360 2\n\
22370000000000 65535 f \n\
22380000000015 00000 n \n\
2239trailer\n\
2240<< /Size 2 >>\n\
2241startxref\n\
22429\n\
2243%%EOF";
2244
2245 let mut reader = BufReader::new(Cursor::new(pdf_content));
2246
2247 let strict_options = ParseOptions {
2249 lenient_syntax: false,
2250 recover_from_stream_errors: false,
2251 ..Default::default()
2252 };
2253 reader.seek(SeekFrom::Start(9)).unwrap();
2254 let strict_result = XRefTable::parse_with_options(&mut reader, &strict_options);
2255
2256 let lenient_options = ParseOptions {
2258 lenient_syntax: true,
2259 recover_from_stream_errors: true,
2260 ..Default::default()
2261 };
2262 reader.seek(SeekFrom::Start(9)).unwrap();
2263 let lenient_result = XRefTable::parse_with_options(&mut reader, &lenient_options);
2264
2265 assert!(strict_result.is_ok());
2267 assert!(lenient_result.is_ok());
2268 }
2269
2270 #[test]
2271 fn test_xref_entry_with_attached_flag() {
2272 let entry1 = XRefTable::parse_xref_entry("12345 0n");
2274 assert!(entry1.is_ok());
2275 let entry1 = entry1.unwrap();
2276 assert_eq!(entry1.offset, 12345);
2277 assert_eq!(entry1.generation, 0);
2278 assert!(entry1.in_use);
2279
2280 let entry2 = XRefTable::parse_xref_entry("54321 1f");
2281 assert!(entry2.is_ok());
2282 let entry2 = entry2.unwrap();
2283 assert_eq!(entry2.offset, 54321);
2284 assert_eq!(entry2.generation, 1);
2285 assert!(!entry2.in_use);
2286 }
2287
2288 #[test]
2289 fn test_find_xref_offset_edge_cases() {
2290 use std::io::{BufReader, Cursor};
2292
2293 let content = b"garbage\nstartxref \n 123 \n%%EOF";
2295 let mut reader = BufReader::new(Cursor::new(content));
2296 let result = XRefTable::find_xref_offset(&mut reader);
2297 assert_eq!(result.unwrap(), 123);
2298
2299 let content = b"startxref\n999\n%%EOF";
2301 let mut reader = BufReader::new(Cursor::new(content));
2302 let result = XRefTable::find_xref_offset(&mut reader);
2303 assert_eq!(result.unwrap(), 999);
2304
2305 let content = b"startxref\n456";
2307 let mut reader = BufReader::new(Cursor::new(content));
2308 let result = XRefTable::find_xref_offset(&mut reader);
2309 assert!(result.is_ok() || result.is_err());
2311
2312 let content = b"some content\n%%EOF";
2314 let mut reader = BufReader::new(Cursor::new(content));
2315 let result = XRefTable::find_xref_offset(&mut reader);
2316 assert!(result.is_err());
2317 }
2318
2319 #[test]
2320 fn test_xref_subsection_incomplete() {
2321 let pdf_content = b"%PDF-1.4\n\
2323xref\n\
23240 5\n\
23250000000000 65535 f \n\
23260000000015 00000 n \n\
2327trailer\n\
2328<< /Size 5 >>\n\
2329startxref\n\
23309\n\
2331%%EOF";
2332
2333 let mut reader = BufReader::new(Cursor::new(pdf_content));
2334 reader.seek(SeekFrom::Start(9)).unwrap();
2335
2336 let result = XRefTable::parse(&mut reader);
2338 assert!(result.is_err() || result.is_ok());
2340 }
2341}
2342
/// Scans PDF text for a cross-reference stream object and returns the object
/// number of its `/Root` reference.
///
/// An XRef object is recognised when a line containing ` obj` is directly
/// followed by a line containing `/Type /XRef`. From there until a line
/// containing `endobj`, each line is searched for a `/Root` key followed by
/// whitespace and a number. Any amount of ASCII whitespace (spaces or tabs)
/// may separate the key from the number, and the number may end the line.
///
/// Returns `None` when no XRef object is found or none of its lines carries
/// a parsable `/Root` object number. Logs the extracted number to stderr.
fn extract_root_from_xref_stream(content: &str) -> Option<u32> {
    const ROOT_KEY: &str = "/Root";

    // Collected up front because detecting the object start needs one line
    // of look-ahead.
    let lines: Vec<&str> = content.lines().collect();
    let mut in_xref_obj = false;

    for (i, line) in lines.iter().enumerate() {
        let opens_xref_obj = line.contains(" obj")
            && lines
                .get(i + 1)
                .map_or(false, |next| next.contains("/Type /XRef"));
        if opens_xref_obj {
            in_xref_obj = true;
            continue;
        }

        if !in_xref_obj {
            continue;
        }

        if line.contains("endobj") {
            in_xref_obj = false;
            continue;
        }

        for (key_pos, _) in line.match_indices(ROOT_KEY) {
            let after_key = &line[key_pos + ROOT_KEY.len()..];

            // Require a separator so longer names such as `/RootFoo` are
            // not mistaken for the `/Root` key.
            if !after_key.starts_with(|c: char| c.is_ascii_whitespace()) {
                continue;
            }

            let root_obj = after_key
                .split_ascii_whitespace()
                .next()
                .and_then(|token| token.parse::<u32>().ok());
            if let Some(root_obj) = root_obj {
                eprintln!("Extracted Root {} from XRef stream", root_obj);
                return Some(root_obj);
            }
        }
    }

    None
}
2388
2389fn find_catalog_by_content(table: &XRefTable, buffer: &[u8], content: &str) -> Option<u32> {
2391 for (obj_num, entry) in &table.entries {
2392 if entry.in_use {
2393 let offset = entry.offset as usize;
2394 if offset < buffer.len() {
2395 if let Some(obj_start) = content[offset..].find(&format!("{} 0 obj", obj_num)) {
2397 let absolute_start = offset + obj_start;
2398
2399 if let Some(endobj_pos) = content[absolute_start..].find("endobj") {
2401 let absolute_end = absolute_start + endobj_pos;
2402 let obj_content = &content[absolute_start..absolute_end];
2403
2404 if obj_content.contains("/Type /Catalog") {
2406 eprintln!(
2407 "Found catalog candidate at object {} (validated structure)",
2408 obj_num
2409 );
2410 return Some(*obj_num);
2411 }
2412 }
2413 }
2414 }
2415 }
2416 }
2417
2418 eprintln!("No valid catalog found by content search");
2419 None
2420}