1use super::xref_stream;
6use super::xref_types::{XRefEntryInfo, XRefEntryType};
7use super::{ParseError, ParseOptions, ParseResult};
8use crate::parser::reader::PDFLines;
9use std::collections::HashMap;
10use std::io::{BufRead, BufReader, Read, Seek, SeekFrom};
11
/// Returns the index of the first occurrence of `pattern` inside `buffer`.
///
/// An empty `pattern` matches at index 0 (the same answer `str::find("")`
/// gives). The explicit guard is required because `slice::windows(0)` panics.
/// A pattern longer than the buffer simply yields `None`.
fn find_byte_pattern(buffer: &[u8], pattern: &[u8]) -> Option<usize> {
    if pattern.is_empty() {
        return Some(0);
    }
    buffer
        .windows(pattern.len())
        .position(|window| window == pattern)
}
27
/// Returns the index of the last occurrence of `pattern` inside `buffer`.
///
/// An empty `pattern` matches at `buffer.len()` (the same answer
/// `str::rfind("")` gives). The explicit guard is required because
/// `slice::windows(0)` panics.
fn rfind_byte_pattern(buffer: &[u8], pattern: &[u8]) -> Option<usize> {
    if pattern.is_empty() {
        return Some(buffer.len());
    }
    buffer
        .windows(pattern.len())
        .rposition(|window| window == pattern)
}
34
/// Interprets `line_bytes` as an indirect-object header of the form
/// `<object number> <generation> obj`.
///
/// The bytes are decoded lossily as UTF-8 and split on whitespace. Only the
/// first three tokens are inspected; anything after them is ignored. Returns
/// `None` when fewer than three tokens are present, when the third token is
/// not exactly `obj`, or when either number fails to parse (`u32` for the
/// object number, `u16` for the generation).
fn parse_obj_header_bytes(line_bytes: &[u8]) -> Option<(u32, u16)> {
    let text = String::from_utf8_lossy(line_bytes);
    let mut tokens = text.split_whitespace();

    let number = tokens.next()?;
    let generation = tokens.next()?;
    let keyword = tokens.next()?;
    if keyword != "obj" {
        return None;
    }

    Some((number.parse::<u32>().ok()?, generation.parse::<u16>().ok()?))
}
51
/// One `N G obj` header located by scanning the raw file bytes.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct ObjHeader {
    /// Object number parsed from the header line.
    obj_num: u32,
    /// Generation number parsed from the header line.
    generation: u16,
    /// Absolute file offset of the start of the line that holds the header.
    offset: u64,
}
60
61fn scan_window_for_headers(
66 window: &[u8],
67 window_base: u64,
68 out: &mut Vec<ObjHeader>,
69 seen: &mut std::collections::BTreeSet<u64>,
70) {
71 let mut pos = 0;
72 while pos < window.len() {
73 let Some(obj_rel) = find_byte_pattern(&window[pos..], b"obj") else {
74 break;
75 };
76 let abs = pos + obj_rel; if abs < 4 {
80 pos = abs + 3;
81 continue;
82 }
83
84 let line_start = window[..abs]
85 .iter()
86 .rposition(|&b| b == b'\n' || b == b'\r')
87 .map(|p| p + 1)
88 .unwrap_or(0);
89 let line_bytes = &window[line_start..abs + 3];
90
91 if let Some((obj_num, generation)) = parse_obj_header_bytes(line_bytes) {
92 let offset = window_base + line_start as u64;
93 if seen.insert(offset) {
94 out.push(ObjHeader {
95 obj_num,
96 generation,
97 offset,
98 });
99 }
100 }
101
102 pos = abs + 3;
103 }
104}
105
/// Scans the whole stream for `N G obj` headers using 64 KiB read chunks.
///
/// Thin wrapper around `scan_object_headers_chunked`; see that function for
/// the scanning details.
fn scan_object_headers<R: Read + Seek>(reader: &mut R) -> ParseResult<Vec<ObjHeader>> {
    scan_object_headers_chunked(reader, 64 * 1024)
}
116
117fn scan_object_headers_chunked<R: Read + Seek>(
121 reader: &mut R,
122 chunk_size: usize,
123) -> ParseResult<Vec<ObjHeader>> {
124 let chunk_size = chunk_size.max(1);
125 const CARRY_CAP: usize = 1024;
128
129 reader.seek(SeekFrom::Start(0))?;
130
131 let mut headers: Vec<ObjHeader> = Vec::new();
132 let mut seen: std::collections::BTreeSet<u64> = std::collections::BTreeSet::new();
133 let mut carry: Vec<u8> = Vec::new();
134 let mut window_base: u64 = 0; let mut chunk = vec![0u8; chunk_size];
136
137 loop {
138 let mut filled = 0;
140 while filled < chunk_size {
141 let n = reader.read(&mut chunk[filled..])?;
142 if n == 0 {
143 break;
144 }
145 filled += n;
146 }
147 let eof = filled == 0;
148 if eof && carry.is_empty() {
149 break;
150 }
151
152 let mut window = std::mem::take(&mut carry);
154 window.extend_from_slice(&chunk[..filled]);
155
156 scan_window_for_headers(&window, window_base, &mut headers, &mut seen);
157
158 if eof {
159 break;
160 }
161
162 let last_nl = window.iter().rposition(|&b| b == b'\n' || b == b'\r');
165 let mut start = last_nl.map(|p| p + 1).unwrap_or(0);
166 if window.len() - start > CARRY_CAP {
167 start = window.len() - CARRY_CAP;
168 }
169 window_base += start as u64;
170 carry = window[start..].to_vec();
171 }
172
173 headers.sort_by_key(|h| h.offset);
174 Ok(headers)
175}
176
177fn read_window_at<R: Read + Seek>(reader: &mut R, offset: u64, max: usize) -> ParseResult<Vec<u8>> {
180 reader.seek(SeekFrom::Start(offset))?;
181 let mut buf = vec![0u8; max];
182 let mut filled = 0;
183 while filled < max {
184 let n = reader.read(&mut buf[filled..])?;
185 if n == 0 {
186 break;
187 }
188 filled += n;
189 }
190 buf.truncate(filled);
191 Ok(buf)
192}
193
194fn read_tail<R: Read + Seek>(reader: &mut R, max: usize) -> ParseResult<(u64, Vec<u8>)> {
197 let len = reader.seek(SeekFrom::End(0))?;
198 let start = len.saturating_sub(max as u64);
199 let bytes = read_window_at(reader, start, (len - start) as usize)?;
200 Ok((start, bytes))
201}
202
203fn read_object_content<R: Read + Seek>(
207 reader: &mut R,
208 obj_num: u32,
209 offset: u64,
210) -> ParseResult<Option<String>> {
211 const OBJ_WINDOW: usize = 64 * 1024;
212 let window = read_window_at(reader, offset, OBJ_WINDOW)?;
213 let obj_pattern = format!("{obj_num} 0 obj");
214 let Some(obj_start) = find_byte_pattern(&window, obj_pattern.as_bytes()) else {
215 return Ok(None);
216 };
217 let Some(endobj_rel) = find_byte_pattern(&window[obj_start..], b"endobj") else {
218 return Ok(None);
219 };
220 let content_bytes = &window[obj_start..obj_start + endobj_rel];
221 Ok(Some(String::from_utf8_lossy(content_bytes).into_owned()))
222}
223
/// Reads one line terminated by `\n`, `\r`, or `\r\n` (or by end of input).
///
/// `buf` is cleared first and then receives the line content without its
/// terminator, decoded lossily as UTF-8. The return value is the number of
/// bytes consumed from `reader`, terminator included; `0` means end of input.
///
/// Two boundary cases are handled explicitly:
/// * a `\r\n` pair split across two buffer refills is still consumed as a
///   single terminator, so no phantom empty line follows;
/// * the raw bytes of the line are accumulated and decoded once, so a
///   multi-byte UTF-8 sequence split across refills is decoded intact.
///
/// # Errors
/// Propagates any I/O error from `reader`.
fn read_pdf_line<R: BufRead>(reader: &mut R, buf: &mut String) -> std::io::Result<usize> {
    buf.clear();
    let mut raw: Vec<u8> = Vec::new();
    let mut total_bytes = 0;

    loop {
        let available = reader.fill_buf()?;
        if available.is_empty() {
            // End of input: whatever was gathered so far is the last line.
            break;
        }

        match available.iter().position(|&b| b == b'\r' || b == b'\n') {
            Some(i) => {
                raw.extend_from_slice(&available[..i]);

                let ended_with_cr = available[i] == b'\r';
                let has_lookahead = i + 1 < available.len();
                let mut consume_len = i + 1;
                if ended_with_cr && has_lookahead && available[i + 1] == b'\n' {
                    consume_len += 1;
                }
                reader.consume(consume_len);
                total_bytes += consume_len;

                // The CR was the last buffered byte: the matching LF, if any,
                // only becomes visible after the next refill.
                if ended_with_cr && !has_lookahead && reader.fill_buf()?.first() == Some(&b'\n') {
                    reader.consume(1);
                    total_bytes += 1;
                }
                break;
            }
            None => {
                let len = available.len();
                raw.extend_from_slice(available);
                reader.consume(len);
                total_bytes += len;
            }
        }
    }

    buf.push_str(&String::from_utf8_lossy(&raw));
    Ok(total_bytes)
}
278
/// A single cross-reference entry for one object number.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct XRefEntry {
    /// Byte offset of the object for in-use entries. Free entries coming from
    /// an xref stream store the next free object number here, and entries for
    /// compressed objects are stored with offset 0.
    pub offset: u64,
    /// Generation number of the entry.
    pub generation: u16,
    /// `true` for in-use (`n`) entries, `false` for free (`f`) entries.
    pub in_use: bool,
}
291
/// Cross-reference entry extended with object-stream information.
#[derive(Debug, Clone, PartialEq)]
pub struct XRefEntryExt {
    /// The plain entry data.
    pub basic: XRefEntry,
    /// For compressed objects: `(stream object number, index within stream)`.
    pub compressed_info: Option<(u32, u32)>,
}
300
/// In-memory cross-reference table of a PDF file.
#[derive(Debug, Clone)]
pub struct XRefTable {
    /// Entries keyed by object number.
    entries: HashMap<u32, XRefEntry>,
    /// Extended entries (used for compressed objects), keyed by object number.
    extended_entries: HashMap<u32, XRefEntryExt>,
    /// Trailer dictionary, once one has been parsed or reconstructed.
    trailer: Option<super::objects::PdfDictionary>,
    /// File offset at which this table's xref section was read.
    xref_offset: u64,
}
313
/// `Default` yields the same empty table as [`XRefTable::new`].
impl Default for XRefTable {
    fn default() -> Self {
        Self::new()
    }
}
319
320impl XRefTable {
/// Creates an empty table: no entries, no trailer, `xref_offset` of 0.
pub fn new() -> Self {
    Self {
        entries: HashMap::new(),
        extended_entries: HashMap::new(),
        trailer: None,
        xref_offset: 0,
    }
}
330
/// Returns the map of all entries, keyed by object number.
pub fn entries(&self) -> &HashMap<u32, XRefEntry> {
    &self.entries
}
335
/// Parses the cross-reference data of `reader` using default parse options.
///
/// Equivalent to `parse_with_options(reader, &ParseOptions::default())`.
pub fn parse<R: Read + Seek>(reader: &mut BufReader<R>) -> ParseResult<Self> {
    Self::parse_with_options(reader, &super::ParseOptions::default())
}
340
341 pub fn parse_with_options<R: Read + Seek>(
343 reader: &mut BufReader<R>,
344 options: &super::ParseOptions,
345 ) -> ParseResult<Self> {
346 match Self::parse_with_incremental_updates_options(reader, options) {
348 Ok(table) => Ok(table),
349 Err(e) => {
350 if options.lenient_syntax {
351 tracing::warn!("Primary XRef parsing failed: {e:?}, attempting recovery");
352
353 reader.seek(SeekFrom::Start(0))?;
355 Self::parse_with_recovery_options(reader, options)
356 } else {
357 Err(e)
358 }
359 }
360 }
361 }
362
/// Convenience wrapper: follows the xref chain using default parse options.
#[allow(dead_code)]
fn parse_with_incremental_updates<R: Read + Seek>(
    reader: &mut BufReader<R>,
) -> ParseResult<Self> {
    Self::parse_with_incremental_updates_options(reader, &super::ParseOptions::default())
}
370
371 fn parse_with_incremental_updates_options<R: Read + Seek>(
373 reader: &mut BufReader<R>,
374 options: &super::ParseOptions,
375 ) -> ParseResult<Self> {
376 let xref_offset = Self::find_xref_offset(reader)?;
378
379 let mut merged_table = Self::new();
381 let mut current_offset = Some(xref_offset);
382 let mut visited_offsets = std::collections::HashSet::new();
383
384 while let Some(offset) = current_offset {
385 if visited_offsets.contains(&offset) {
387 tracing::debug!(
388 "Circular reference in XRef chain at offset {} (already visited)",
389 offset
390 );
391 break;
392 }
393 visited_offsets.insert(offset);
394
395 reader.seek(SeekFrom::Start(offset))?;
397 let table = Self::parse_primary_with_options(reader, options)?;
398
399 let prev_offset = table
401 .trailer
402 .as_ref()
403 .and_then(|t| t.get("Prev"))
404 .and_then(|obj| obj.as_integer())
405 .map(|i| i as u64);
406
407 if let Some(_prev) = prev_offset {
408 } else {
409 }
410
411 let _regular_count = table.entries.len();
413 let _extended_count = table.extended_entries.len();
414
415 for (obj_num, entry) in table.entries {
416 merged_table.entries.entry(obj_num).or_insert(entry);
417 }
418 for (obj_num, ext_entry) in table.extended_entries {
419 merged_table
420 .extended_entries
421 .entry(obj_num)
422 .or_insert(ext_entry);
423 }
424
425 if merged_table.trailer.is_none() {
427 merged_table.trailer = table.trailer;
428 merged_table.xref_offset = table.xref_offset;
429 }
430
431 current_offset = prev_offset;
432 }
433
434 if options.lenient_syntax || options.collect_warnings {
438 if let Err(e) = Self::scan_and_fill_missing_objects(reader, &mut merged_table) {
446 tracing::debug!("scan_and_fill_missing_objects failed (non-fatal): {e}");
447 }
448 }
449
450 Ok(merged_table)
451 }
452
/// Convenience wrapper: parses the xref section at the current reader
/// position using default parse options.
#[allow(dead_code)]
fn parse_primary<R: Read + Seek>(reader: &mut BufReader<R>) -> ParseResult<Self> {
    Self::parse_primary_with_options(reader, &super::ParseOptions::default())
}
458
/// Parses one xref section located at the reader's current position.
///
/// Two layouts are accepted:
/// * a traditional table, recognised by a first line that trims to `xref`;
/// * a cross-reference stream, i.e. an indirect object `N G obj` whose value
///   is a stream with `/Type /XRef`.
///
/// The returned table records the starting position as `xref_offset`.
///
/// # Errors
/// Returns `ParseError::InvalidXRef` when the data is neither layout, and
/// propagates I/O, lexer, object-parsing and xref-stream errors.
fn parse_primary_with_options<R: Read + Seek>(
    reader: &mut BufReader<R>,
    options: &super::ParseOptions,
) -> ParseResult<Self> {
    let mut table = Self::new();

    let xref_offset = reader.stream_position()?;
    table.xref_offset = xref_offset;

    let mut line = String::new();
    // Remember where the first line starts so we can rewind if it is not `xref`.
    let pos = reader.stream_position()?;
    read_pdf_line(reader, &mut line)?;

    if line.trim() == "xref" {
        // Traditional table: subsections followed by the trailer dictionary.
        Self::parse_traditional_xref_with_options(reader, &mut table, options)?;
    } else {
        tracing::debug!(
            "Not a traditional xref, checking for xref stream. Line: {:?}",
            line.trim()
        );

        // Rewind and re-read the same bytes as an indirect object.
        reader.seek(SeekFrom::Start(pos))?;

        let mut lexer = super::lexer::Lexer::new_with_options(&mut *reader, options.clone());

        // Expect the object header tokens: <number> <generation> obj.
        let obj_num = match lexer.next_token()? {
            super::lexer::Token::Integer(n) => n as u32,
            _ => return Err(ParseError::InvalidXRef),
        };

        tracing::debug!("Found object {obj_num} at xref position");

        let _gen_num = match lexer.next_token()? {
            super::lexer::Token::Integer(n) => n as u16,
            _ => return Err(ParseError::InvalidXRef),
        };

        match lexer.next_token()? {
            super::lexer::Token::Obj => {}
            _ => return Err(ParseError::InvalidXRef),
        };

        let obj = super::objects::PdfObject::parse_with_options(&mut lexer, options)?;

        if let Some(stream) = obj.as_stream() {
            // Only a stream whose /Type is the name XRef qualifies.
            if stream
                .dict
                .get("Type")
                .and_then(|o| o.as_name())
                .map(|n| n.as_str())
                == Some("XRef")
            {
                tracing::debug!("Parsing XRef stream");

                let decoded_data = match stream.decode(options) {
                    Ok(data) => data,
                    Err(e) => {
                        tracing::debug!(
                            "XRef stream decode failed: {e:?}, attempting raw data fallback"
                        );

                        // Fall back to the undecoded bytes when there are any;
                        // otherwise surface the decode error to the caller.
                        if !stream.data.is_empty() {
                            tracing::debug!(
                                "Using raw stream data ({} bytes) as fallback",
                                stream.data.len()
                            );
                            stream.data.clone()
                        } else {
                            tracing::debug!(
                                "No raw stream data available, triggering recovery mode"
                            );
                            return Err(e);
                        }
                    }
                };

                let xref_stream_parser = xref_stream::XRefStream::parse(
                    &mut *reader,
                    stream.dict.clone(),
                    decoded_data,
                    options,
                )?;

                let entries = xref_stream_parser.to_xref_entries()?;
                tracing::debug!("XRef stream parsed, found {} entries", entries.len());

                // Convert the stream-specific entry variants into table entries.
                for (obj_num, entry) in entries {
                    match entry {
                        xref_stream::XRefEntry::Free {
                            next_free_object,
                            generation,
                        } => {
                            // Free entries keep the next free object number in `offset`.
                            table.entries.insert(
                                obj_num,
                                XRefEntry {
                                    offset: next_free_object as u64,
                                    generation,
                                    in_use: false,
                                },
                            );
                        }
                        xref_stream::XRefEntry::InUse { offset, generation } => {
                            table.entries.insert(
                                obj_num,
                                XRefEntry {
                                    offset,
                                    generation,
                                    in_use: true,
                                },
                            );
                        }
                        xref_stream::XRefEntry::Compressed {
                            stream_object_number,
                            index_within_stream,
                        } => {
                            // Compressed objects are recorded twice: the extended
                            // entry holds the object-stream location, and a
                            // placeholder (offset 0, generation 0, in use) goes
                            // into the regular map.
                            let ext_entry = XRefEntryExt {
                                basic: XRefEntry {
                                    offset: 0,
                                    generation: 0,
                                    in_use: true,
                                },
                                compressed_info: Some((
                                    stream_object_number,
                                    index_within_stream,
                                )),
                            };
                            table.extended_entries.insert(obj_num, ext_entry);
                            table.entries.insert(
                                obj_num,
                                XRefEntry {
                                    offset: 0,
                                    generation: 0,
                                    in_use: true,
                                },
                            );
                        }
                    }
                }

                // The trailer dictionary comes from the xref stream parser.
                table.trailer = Some(xref_stream_parser.trailer_dict().clone());
            } else {
                return Err(ParseError::InvalidXRef);
            }
        } else {
            return Err(ParseError::InvalidXRef);
        }
    }

    Ok(table)
}
633
/// Convenience wrapper: parses a traditional xref table into `table` using
/// default parse options.
#[allow(dead_code)]
fn parse_traditional_xref<R: Read + Seek>(
    reader: &mut BufReader<R>,
    table: &mut XRefTable,
) -> ParseResult<()> {
    Self::parse_traditional_xref_with_options(reader, table, &super::ParseOptions::default())
}
642
643 fn parse_traditional_xref_with_options<R: Read + Seek>(
645 reader: &mut BufReader<R>,
646 table: &mut XRefTable,
647 options: &super::ParseOptions,
648 ) -> ParseResult<()> {
649 let mut line = String::new();
650 let mut trailer_dict_offset: Option<u64> = None;
651
652 loop {
655 line.clear();
656 let line_start_pos = reader.stream_position()?;
657 read_pdf_line(reader, &mut line)?;
658 let trimmed_line = line.trim();
659
660 if trimmed_line.is_empty() || trimmed_line.starts_with('%') {
662 continue;
663 }
664
665 if trimmed_line == "trailer" {
669 break;
671 }
672 if let Some(dict_pos) = trimmed_line.find("<<") {
673 if trimmed_line.starts_with("trailer") {
674 let trailer_keyword_start =
677 trimmed_line.as_ptr() as usize - line.as_ptr() as usize;
678 trailer_dict_offset =
679 Some(line_start_pos + (trailer_keyword_start + dict_pos) as u64);
680 break;
681 }
682 }
683
684 if trimmed_line.starts_with("<<") {
686 tracing::warn!(" Found trailer dictionary without 'trailer' keyword");
687 trailer_dict_offset = Some(line_start_pos);
689 break;
690 }
691
692 let parts: Vec<&str> = trimmed_line.split_whitespace().collect();
694 if parts.len() != 2 {
695 return Err(ParseError::InvalidXRef);
697 }
698
699 let first_obj_num = parts[0]
700 .parse::<u32>()
701 .map_err(|_| ParseError::InvalidXRef)?;
702 let count = parts[1]
703 .parse::<u32>()
704 .map_err(|_| ParseError::InvalidXRef)?;
705
706 let mut entries_parsed = 0;
709 let mut i = 0;
710 while i < count {
711 line.clear();
712 let bytes_read = read_pdf_line(reader, &mut line)?;
713 let trimmed = line.trim();
714
715 if trimmed.starts_with('%') {
717 continue;
718 }
719
720 if bytes_read == 0 || trimmed == "trailer" {
722 tracing::debug!(
723 "Warning: XRef subsection incomplete - expected {count} entries but found only {entries_parsed}"
724 );
725 if line.trim() == "trailer" {
727 break;
729 }
730 break;
731 }
732
733 match Self::parse_xref_entry(&line) {
734 Ok(entry) => {
735 table.entries.insert(first_obj_num + i, entry);
736 entries_parsed += 1;
737 }
738 Err(_) => {
739 tracing::debug!(
740 "Warning: Invalid XRef entry at position {}: {:?}",
741 i,
742 line.trim()
743 );
744 }
746 }
747 i += 1;
748 }
749 }
751
752 if let Some(offset) = trailer_dict_offset {
756 reader.seek(SeekFrom::Start(offset))?;
757 }
758 let mut lexer = super::lexer::Lexer::new_with_options(reader, options.clone());
759 let trailer_obj = super::objects::PdfObject::parse_with_options(&mut lexer, options)?;
760 table.trailer = trailer_obj.as_dict().cloned();
763
764 if let Some(trailer) = &table.trailer {
766 if let Some(size_obj) = trailer.get("Size") {
767 if let Some(expected_size) = size_obj.as_integer() {
768 if let Some(max_obj_num) = table.entries.keys().max() {
771 let max_expected = (*max_obj_num + 1) as i64;
772 if max_expected > expected_size {
773 tracing::debug!(
774 "Warning: XRef table has object {} but trailer Size is only {}",
775 max_obj_num,
776 expected_size
777 );
778 return Err(ParseError::InvalidXRef);
780 }
781 }
782 }
783 }
784 }
785
786 Ok(())
790 }
791
792 #[allow(dead_code)]
799 fn find_linearized_xref<R: Read + Seek>(reader: &mut BufReader<R>) -> ParseResult<u64> {
800 reader.seek(SeekFrom::Start(0))?;
802 let mut header = String::new();
803 reader.read_line(&mut header)?;
804
805 if !header.starts_with("%PDF-") {
806 return Err(ParseError::InvalidHeader);
807 }
808
809 let mut line = String::new();
811 reader.read_line(&mut line)?;
812
813 let pos = reader.stream_position()?;
816 let mut buffer = vec![0u8; 1024];
817 let bytes_read = reader.read(&mut buffer)?;
818 buffer.truncate(bytes_read);
819
820 tracing::debug!(
824 "Checking for linearized PDF, first 100 bytes: {:?}",
825 String::from_utf8_lossy(&buffer[..buffer.len().min(100)])
826 );
827
828 if find_byte_pattern(&buffer, b"/Linearized").is_some() {
830 if let Some(xref_pos) = find_byte_pattern(&buffer, b"xref") {
836 return Ok(pos + xref_pos as u64);
837 }
838
839 if find_byte_pattern(&buffer, b"/Type/XRef").is_some()
841 || find_byte_pattern(&buffer, b"/Type /XRef").is_some()
842 {
843 if let Some(obj_pos) = find_byte_pattern(&buffer, b" obj") {
846 let search_from = obj_pos + 4;
848 if search_from < buffer.len() {
849 let after_first_obj = &buffer[search_from..];
850 if let Some(next_obj) = find_byte_pattern(after_first_obj, b" obj") {
851 let second_obj_start =
853 pos + (search_from + next_obj).saturating_sub(10) as u64;
854 return Ok(second_obj_start);
855 }
856 }
857 }
858 }
859 }
860
861 Err(ParseError::InvalidXRef)
862 }
863
864 fn find_xref_offset<R: Read + Seek>(reader: &mut BufReader<R>) -> ParseResult<u64> {
866 reader.seek(SeekFrom::End(0))?;
868 let file_size = reader.stream_position()?;
869
870 let read_size = std::cmp::min(1024, file_size);
872 reader.seek(SeekFrom::End(-(read_size as i64)))?;
873
874 let mut buffer = vec![0u8; read_size as usize];
875 reader.read_exact(&mut buffer)?;
876
877 let content = String::from_utf8_lossy(&buffer);
879
880 let debug_content = content.chars().take(200).collect::<String>();
882 tracing::debug!("XRef search in last {read_size} bytes: {debug_content:?}");
883
884 let mut lines = content.pdf_lines();
885
886 let mut last_offset = None;
893 while let Some(line) = lines.next() {
894 if line.trim() == "startxref" {
895 if let Some(offset_line) = lines.next() {
897 if let Ok(offset) = offset_line.trim().parse::<u64>() {
898 last_offset = Some(offset);
899 }
900 }
901 }
902 }
903
904 last_offset.ok_or(ParseError::InvalidXRef)
905 }
906
907 fn scan_and_fill_missing_objects<R: Read + Seek>(
909 reader: &mut BufReader<R>,
910 table: &mut Self,
911 ) -> ParseResult<()> {
912 for header in scan_object_headers(reader)? {
917 if !table.entries.contains_key(&header.obj_num)
918 && !table.extended_entries.contains_key(&header.obj_num)
919 {
920 table.add_entry(
921 header.obj_num,
922 XRefEntry {
923 offset: header.offset,
924 generation: header.generation,
925 in_use: true,
926 },
927 );
928 }
929 }
930
931 Ok(())
932 }
933
/// Convenience wrapper: rebuilds the table by scanning the file, using
/// default parse options.
#[allow(dead_code)]
fn parse_with_recovery<R: Read + Seek>(reader: &mut BufReader<R>) -> ParseResult<Self> {
    Self::parse_with_recovery_options(reader, &super::ParseOptions::default())
}
939
/// Rebuilds a cross-reference table without relying on the file's own xref
/// data, by scanning the raw bytes for object headers.
///
/// Steps:
/// 1. Scan the file for `N G obj` headers and register each object number
///    (the first header found for a number is kept).
/// 2. Synthesise a trailer with `/Size` set to the number of entries found.
/// 3. Pick a `/Root` (catalog) object through a cascade of fallbacks:
///    a root hint extracted from the file tail, a content-based search,
///    objects 1 to 5, every object in ascending order, a raw search of the
///    file tail for `/Type/Catalog`, and finally the first object that is
///    not a signature.
///
/// `_options` is currently unused.
///
/// # Errors
/// `ParseError::InvalidXRef` when the scan finds no objects. I/O errors are
/// propagated.
fn parse_with_recovery_options<R: Read + Seek>(
    reader: &mut BufReader<R>,
    _options: &super::ParseOptions,
) -> ParseResult<Self> {
    // How much of the file tail to inspect for a root hint / a raw catalog.
    const ROOT_TAIL: usize = 256 * 1024;
    const CATALOG_TAIL: usize = 100 * 1024;

    let mut table = Self::new();

    // Step 1: register every scanned header. Headers arrive sorted by
    // offset, so the earliest occurrence of an object number is kept.
    // NOTE(review): for incrementally updated files a later occurrence may
    // be the current revision; confirm "first wins" is intended.
    let headers = scan_object_headers(reader)?;
    for h in &headers {
        if !table.entries.contains_key(&h.obj_num) {
            table.add_entry(
                h.obj_num,
                XRefEntry {
                    offset: h.offset,
                    generation: h.generation,
                    in_use: true,
                },
            );
        }
    }

    if table.entries.is_empty() {
        return Err(ParseError::InvalidXRef);
    }
    tracing::debug!("XRef recovery: found {} objects", table.len());

    // Root hint taken from the last ROOT_TAIL bytes of the file.
    let (_, root_tail) = read_tail(reader, ROOT_TAIL)?;
    let root_tail_str = String::from_utf8_lossy(&root_tail);
    let xref_root_candidate = extract_root_from_xref_stream(&root_tail_str);

    // Step 2: synthetic trailer.
    // NOTE(review): /Size is set to the entry count, not to the highest
    // object number + 1; confirm this is what consumers expect.
    let mut trailer = super::objects::PdfDictionary::new();
    trailer.insert(
        "Size".to_string(),
        super::objects::PdfObject::Integer(table.len() as i64),
    );

    let mut catalog_candidate = None;

    // Fallback 1: trust the root hint only if that object was actually found.
    if let Some(xref_root) = xref_root_candidate {
        if table.entries.contains_key(&xref_root) {
            catalog_candidate = Some(xref_root);
            tracing::debug!("Using Root {} from XRef stream as catalog", xref_root);
        } else {
            tracing::debug!(
                "Warning: XRef Root {} not found in object table, searching manually",
                xref_root
            );
        }
    }

    // Fallback 2: content-based catalog search.
    if catalog_candidate.is_none() {
        catalog_candidate = find_catalog_by_content(reader, &table)?;
    }

    // Fallback 3: probe objects 1..=5, skipping signature objects, and accept
    // the first one that mentions a catalog type or `/Pages`.
    if catalog_candidate.is_none() {
        for obj_num in [1, 2, 3, 4, 5] {
            let offset = match table.entries.get(&obj_num) {
                Some(entry) if entry.in_use => entry.offset,
                _ => continue,
            };
            if let Some(content) = read_object_content(reader, obj_num, offset)? {
                if content.contains("/Type/Sig") || content.contains("/Type /Sig") {
                    tracing::debug!("Skipping object {} (Type: Sig)", obj_num);
                    continue;
                }
                if content.contains("/Type/Catalog")
                    || content.contains("/Type /Catalog")
                    || content.contains("/Pages")
                {
                    catalog_candidate = Some(obj_num);
                    tracing::debug!(
                        "Using fallback catalog candidate: object {} (validated)",
                        obj_num
                    );
                    break;
                }
            }
        }
    }

    if catalog_candidate.is_none() && !table.entries.is_empty() {
        // Fallback 4: walk every object in ascending object-number order.
        tracing::debug!(
            "Last resort: Scanning all {} objects for any with /Pages or /Catalog",
            table.entries.len()
        );

        let mut obj_numbers: Vec<u32> = table.entries.keys().copied().collect();
        obj_numbers.sort_unstable();

        for obj_num in obj_numbers {
            let offset = match table.entries.get(&obj_num) {
                Some(entry) if entry.in_use => entry.offset,
                _ => continue,
            };
            if let Some(content) = read_object_content(reader, obj_num, offset)? {
                if content.contains("/Type/Sig") || content.contains("/Type /Sig") {
                    continue;
                }
                if content.contains("/Type/Catalog") || content.contains("/Type /Catalog") {
                    catalog_candidate = Some(obj_num);
                    tracing::debug!(
                        "Last resort: Found catalog at object {} (/Type/Catalog)",
                        obj_num
                    );
                    break;
                } else if content.contains("/Pages") {
                    catalog_candidate = Some(obj_num);
                    tracing::debug!(
                        "Last resort: Found catalog at object {} (has /Pages)",
                        obj_num
                    );
                    break;
                }
            }
        }

        if catalog_candidate.is_none() {
            // Fallback 5: search the raw file tail for the last
            // `/Type/Catalog` and recover the object number from the
            // ` 0 obj` header within the 200 bytes before it.
            tracing::debug!("Extreme last resort: Scanning last 100KB for /Type/Catalog");

            let (_, search_buffer) = read_tail(reader, CATALOG_TAIL)?;
            if let Some(catalog_pos) = rfind_byte_pattern(&search_buffer, b"/Type/Catalog") {
                let local_search_start = catalog_pos.saturating_sub(200);
                let search_area = &search_buffer[local_search_start..catalog_pos];

                if let Some(obj_pattern_pos) = rfind_byte_pattern(search_area, b" 0 obj") {
                    let before_obj = &search_area[..obj_pattern_pos];
                    let before_obj_str = String::from_utf8_lossy(before_obj);
                    let trimmed = before_obj_str.trim_end();

                    // The object number is the trailing run of ASCII digits.
                    if let Some(digit_start) = trimmed.rfind(|c: char| !c.is_ascii_digit()) {
                        let num_str = trimmed[digit_start + 1..].trim();
                        if !num_str.is_empty() {
                            if let Ok(obj_num) = num_str.parse::<u32>() {
                                tracing::debug!(
                                    "Extreme last resort: Found /Type/Catalog at object {}",
                                    obj_num
                                );
                                catalog_candidate = Some(obj_num);
                            }
                        }
                    } else if let Ok(obj_num) = trimmed.trim().parse::<u32>() {
                        // The whole prefix consists of digits.
                        tracing::debug!(
                            "Extreme last resort: Found /Type/Catalog at object {}",
                            obj_num
                        );
                        catalog_candidate = Some(obj_num);
                    }
                }
            } else {
                tracing::debug!("Extreme last resort: No /Type/Catalog found in last 100KB");
            }
        }

        if catalog_candidate.is_none() {
            // Fallback 6: take the lowest-numbered readable object that is
            // not a signature.
            tracing::warn!(" Could not find any catalog object, using first non-signature object as absolute last resort");
            let mut obj_numbers: Vec<u32> = table.entries.keys().copied().collect();
            obj_numbers.sort_unstable();
            for obj_num in obj_numbers {
                let offset = match table.entries.get(&obj_num) {
                    Some(entry) => entry.offset,
                    None => continue,
                };
                if let Some(content) = read_object_content(reader, obj_num, offset)? {
                    if !content.contains("/Type/Sig") && !content.contains("/Type /Sig") {
                        catalog_candidate = Some(obj_num);
                        tracing::debug!("Using object {} as absolute last resort", obj_num);
                        break;
                    }
                }
            }
        }
    }

    // Step 3: record the chosen root (always with generation 0); the trailer
    // is left without /Root if every fallback failed.
    if let Some(root_obj) = catalog_candidate {
        trailer.insert(
            "Root".to_string(),
            super::objects::PdfObject::Reference(root_obj, 0),
        );
    }

    table.set_trailer(trailer);

    Ok(table)
}
1144
1145 #[allow(dead_code)]
1151 fn validate_offset<R: Read + Seek>(reader: &mut BufReader<R>, offset: u64) -> ParseResult<()> {
1152 let file_size = reader.seek(SeekFrom::End(0))?;
1154
1155 if offset >= file_size {
1156 #[cfg(debug_assertions)]
1157 tracing::warn!(" XRef offset {offset} exceeds file size {file_size}");
1158 return Err(ParseError::InvalidXRef);
1159 }
1160
1161 reader.seek(SeekFrom::Start(offset))?;
1163 let mut peek = [0u8; 20];
1164 let read_bytes = reader.read(&mut peek)?;
1165
1166 if read_bytes == 0 {
1167 #[cfg(debug_assertions)]
1168 tracing::warn!(" XRef offset {offset} points to EOF");
1169 return Err(ParseError::InvalidXRef);
1170 }
1171
1172 let peek_slice = &peek[..read_bytes];
1175 let starts_with_xref = peek_slice.len() >= 4 && &peek_slice[..4] == b"xref";
1176 let starts_with_digit = peek_slice.first().map_or(false, |&b| b.is_ascii_digit());
1177
1178 if !starts_with_xref && !starts_with_digit {
1179 #[cfg(debug_assertions)]
1180 {
1181 let debug_len = std::cmp::min(10, read_bytes);
1182 let debug_content = String::from_utf8_lossy(&peek[..debug_len]);
1183 tracing::debug!(
1184 "Warning: XRef offset {} does not point to valid XRef content: {:?}",
1185 offset,
1186 debug_content
1187 );
1188 }
1189 }
1191
1192 Ok(())
1193 }
1194
1195 fn parse_xref_entry(line: &str) -> ParseResult<XRefEntry> {
1197 let line = line.trim();
1198
1199 if line.len() >= 18 {
1201 if let Ok(entry) = Self::parse_xref_entry_standard(line) {
1202 return Ok(entry);
1203 }
1204 }
1205
1206 Self::parse_xref_entry_flexible(line)
1208 }
1209
1210 fn parse_xref_entry_standard(line: &str) -> ParseResult<XRefEntry> {
1212 if line.len() < 18 {
1215 return Err(ParseError::InvalidXRef);
1216 }
1217
1218 let offset_str = &line[0..10];
1219 let gen_str = &line[11..16];
1220 let flag = line.chars().nth(17);
1221
1222 let offset = offset_str
1223 .trim()
1224 .parse::<u64>()
1225 .map_err(|_| ParseError::InvalidXRef)?;
1226 let generation = gen_str
1227 .trim()
1228 .parse::<u16>()
1229 .map_err(|_| ParseError::InvalidXRef)?;
1230
1231 let in_use = match flag {
1232 Some('n') => true,
1233 Some('f') => false,
1234 _ => return Err(ParseError::InvalidXRef),
1235 };
1236
1237 Ok(XRefEntry {
1238 offset,
1239 generation,
1240 in_use,
1241 })
1242 }
1243
1244 fn parse_xref_entry_flexible(line: &str) -> ParseResult<XRefEntry> {
1246 let parts: Vec<&str> = line.split_whitespace().collect();
1254
1255 if parts.is_empty() {
1256 return Err(ParseError::InvalidXRef);
1257 }
1258
1259 let offset = parts[0]
1261 .parse::<u64>()
1262 .map_err(|_| ParseError::InvalidXRef)?;
1263
1264 let (generation, flag_from_gen) = if parts.len() >= 2 {
1266 let gen_part = parts[1];
1267 if gen_part == "n" || gen_part == "f" {
1269 (0, gen_part.chars().next())
1271 } else if gen_part.ends_with('n') || gen_part.ends_with('f') {
1272 let flag_char = gen_part.chars().last().ok_or(ParseError::InvalidXRef)?;
1274 let gen_str = &gen_part[..gen_part.len() - 1];
1275 if gen_str.is_empty() {
1276 (0, Some(flag_char))
1278 } else {
1279 let gen = gen_str
1280 .parse::<u16>()
1281 .map_err(|_| ParseError::InvalidXRef)?;
1282 (gen, Some(flag_char))
1283 }
1284 } else {
1285 let gen = gen_part
1287 .parse::<u16>()
1288 .map_err(|_| ParseError::InvalidXRef)?;
1289 (gen, None)
1290 }
1291 } else {
1292 (0, None)
1293 };
1294
1295 let in_use = if let Some(flag_char) = flag_from_gen {
1297 match flag_char {
1299 'n' => true,
1300 'f' => false,
1301 _ => true, }
1303 } else if parts.len() >= 3 {
1304 match parts[2].chars().next() {
1306 Some('n') => true,
1307 Some('f') => false,
1308 _ => {
1309 #[cfg(debug_assertions)]
1311 tracing::warn!(" Invalid xref flag '{}', assuming 'n'", parts[2]);
1312 true
1313 }
1314 }
1315 } else {
1316 true
1318 };
1319
1320 Ok(XRefEntry {
1321 offset,
1322 generation,
1323 in_use,
1324 })
1325 }
1326
/// Returns the entry for `obj_num`, or `None` if the table has none.
pub fn get_entry(&self, obj_num: u32) -> Option<&XRefEntry> {
    self.entries.get(&obj_num)
}
1331
/// Returns a mutable reference to the entry for `obj_num`, if present.
pub fn get_entry_mut(&mut self, obj_num: u32) -> Option<&mut XRefEntry> {
    self.entries.get_mut(&obj_num)
}
1336
/// Returns the trailer dictionary, if one has been set.
pub fn trailer(&self) -> Option<&super::objects::PdfDictionary> {
    self.trailer.as_ref()
}
1341
/// Returns the file offset recorded for this table's xref section
/// (0 for a table that was not read from an xref section).
pub fn xref_offset(&self) -> u64 {
    self.xref_offset
}
1346
/// Returns the number of entries in the regular entry map.
pub fn len(&self) -> usize {
    self.entries.len()
}
1351
/// Returns `true` when the regular entry map holds no entries.
pub fn is_empty(&self) -> bool {
    self.entries.is_empty()
}
1356
/// Iterates over `(object number, entry)` pairs in unspecified order
/// (the entries are stored in a `HashMap`).
pub fn iter(&self) -> impl Iterator<Item = (&u32, &XRefEntry)> {
    self.entries.iter()
}
1361
/// Returns the extended entry for `obj_num`, if one exists.
pub fn get_extended_entry(&self, obj_num: u32) -> Option<&XRefEntryExt> {
    self.extended_entries.get(&obj_num)
}
1366
1367 pub fn is_compressed(&self, obj_num: u32) -> bool {
1369 self.extended_entries
1370 .get(&obj_num)
1371 .map(|e| e.compressed_info.is_some())
1372 .unwrap_or(false)
1373 }
1374
/// Inserts `entry` for `obj_num`, replacing any existing entry.
pub fn add_entry(&mut self, obj_num: u32, entry: XRefEntry) {
    self.entries.insert(obj_num, entry);
}
1379
/// Sets the trailer dictionary, replacing any previous one.
pub fn set_trailer(&mut self, trailer: super::objects::PdfDictionary) {
    self.trailer = Some(trailer);
}
1384
/// Inserts the extended entry for `obj_num`, replacing any existing one.
/// The regular entry map is not touched.
pub fn add_extended_entry(&mut self, obj_num: u32, entry: XRefEntryExt) {
    self.extended_entries.insert(obj_num, entry);
}
1389}
1390
/// A cross-reference stream together with the entries decoded from it.
#[derive(Debug, Clone)]
pub struct XRefStream {
    /// The underlying stream object (dictionary and data).
    stream: super::objects::PdfStream,
    /// Decoded entries keyed by object number.
    entries: HashMap<u32, XRefEntry>,
    /// Decoded extended entries keyed by object number.
    extended_entries: HashMap<u32, XRefEntryExt>,
}
1402
1403impl XRefStream {
/// Builds an `XRefStream` from `stream` and decodes its entries right away.
///
/// # Errors
/// Returns whatever error `decode_entries` reports.
pub fn parse(stream: super::objects::PdfStream) -> ParseResult<Self> {
    // Start with empty maps; `decode_entries` fills them.
    let mut xref_stream = Self {
        stream,
        entries: HashMap::new(),
        extended_entries: HashMap::new(),
    };

    xref_stream.decode_entries()?;
    Ok(xref_stream)
}
1415
1416 fn decode_entries(&mut self) -> ParseResult<()> {
1418 let dict = &self.stream.dict;
1420
1421 let size = dict
1423 .get("Size")
1424 .and_then(|obj| obj.as_integer())
1425 .ok_or_else(|| ParseError::MissingKey("Size".to_string()))?;
1426
1427 let index = match dict.get("Index") {
1429 Some(obj) => {
1430 let array = obj.as_array().ok_or_else(|| ParseError::SyntaxError {
1431 position: 0,
1432 message: "Index must be an array".to_string(),
1433 })?;
1434
1435 let mut pairs = Vec::new();
1437 for chunk in array.0.chunks(2) {
1438 if chunk.len() != 2 {
1439 return Err(ParseError::SyntaxError {
1440 position: 0,
1441 message: "Index array must have even number of elements".to_string(),
1442 });
1443 }
1444 let first = chunk[0]
1445 .as_integer()
1446 .ok_or_else(|| ParseError::SyntaxError {
1447 position: 0,
1448 message: "Index values must be integers".to_string(),
1449 })? as u32;
1450 let count = chunk[1]
1451 .as_integer()
1452 .ok_or_else(|| ParseError::SyntaxError {
1453 position: 0,
1454 message: "Index values must be integers".to_string(),
1455 })? as u32;
1456 pairs.push((first, count));
1457 }
1458 pairs
1459 }
1460 None => {
1461 vec![(0, size as u32)]
1463 }
1464 };
1465
1466 let w_array = dict
1468 .get("W")
1469 .and_then(|obj| obj.as_array())
1470 .ok_or_else(|| ParseError::MissingKey("W".to_string()))?;
1471
1472 if w_array.len() != 3 {
1473 return Err(ParseError::SyntaxError {
1474 position: 0,
1475 message: "W array must have exactly 3 elements".to_string(),
1476 });
1477 }
1478
1479 let w: Vec<usize> = w_array
1480 .0
1481 .iter()
1482 .map(|obj| {
1483 obj.as_integer()
1484 .ok_or_else(|| ParseError::SyntaxError {
1485 position: 0,
1486 message: "W values must be integers".to_string(),
1487 })
1488 .map(|i| i as usize)
1489 })
1490 .collect::<ParseResult<Vec<_>>>()?;
1491
1492 let data = self.stream.decode(&ParseOptions::default())?;
1494 let mut offset = 0;
1495
1496 for (first_obj_num, count) in index {
1498 for i in 0..count {
1499 if offset + w[0] + w[1] + w[2] > data.len() {
1500 return Err(ParseError::SyntaxError {
1501 position: 0,
1502 message: "Xref stream data truncated".to_string(),
1503 });
1504 }
1505
1506 let field1 = Self::read_field(&data[offset..], w[0]);
1508 offset += w[0];
1509
1510 let field2 = Self::read_field(&data[offset..], w[1]);
1511 offset += w[1];
1512
1513 let field3 = Self::read_field(&data[offset..], w[2]);
1514 offset += w[2];
1515
1516 let entry_info =
1518 XRefEntryInfo::new(XRefEntryType::from_value(field1), field2, field3);
1519
1520 let entry = match entry_info.entry_type {
1522 XRefEntryType::Free => XRefEntry {
1523 offset: entry_info.field2,
1524 generation: entry_info.field3 as u16,
1525 in_use: false,
1526 },
1527 XRefEntryType::Uncompressed => XRefEntry {
1528 offset: entry_info.field2,
1529 generation: entry_info.field3 as u16,
1530 in_use: true,
1531 },
1532 XRefEntryType::Compressed => {
1533 let ext_entry = XRefEntryExt {
1535 basic: XRefEntry {
1536 offset: 0,
1537 generation: 0,
1538 in_use: true,
1539 },
1540 compressed_info: entry_info.get_compressed_info(),
1541 };
1542 self.extended_entries
1543 .insert(first_obj_num + i, ext_entry.clone());
1544 ext_entry.basic
1545 }
1546 XRefEntryType::Custom(_type_num) => {
1547 #[cfg(debug_assertions)]
1550 tracing::debug!(
1551 "Note: Custom xref entry type {} for object {} (treating as in-use)",
1552 _type_num,
1553 first_obj_num + i
1554 );
1555
1556 let ext_entry = XRefEntryExt {
1558 basic: XRefEntry {
1559 offset: entry_info.field2,
1560 generation: entry_info.field3 as u16,
1561 in_use: entry_info.entry_type.is_in_use(),
1562 },
1563 compressed_info: None,
1564 };
1565 self.extended_entries
1566 .insert(first_obj_num + i, ext_entry.clone());
1567 ext_entry.basic
1568 }
1569 };
1570
1571 self.entries.insert(first_obj_num + i, entry);
1572 }
1573 }
1574
1575 Ok(())
1576 }
1577
1578 fn read_field(data: &[u8], width: usize) -> u64 {
1580 let mut value = 0u64;
1581 for i in 0..width {
1582 if i < data.len() {
1583 value = (value << 8) | (data[i] as u64);
1584 }
1585 }
1586 value
1587 }
1588
1589 pub fn get_entry(&self, obj_num: u32) -> Option<&XRefEntry> {
1591 self.entries.get(&obj_num)
1592 }
1593
1594 pub fn trailer(&self) -> &super::objects::PdfDictionary {
1596 &self.stream.dict
1597 }
1598}
1599
1600#[cfg(test)]
1601mod tests {
1602 use super::*;
1603
1604 use crate::parser::objects::{PdfDictionary, PdfObject};
1605 use std::io::Cursor;
1606
1607 #[test]
1610 fn test_scan_object_headers_finds_simple_headers() {
1611 let mut buf = Vec::new();
1612 buf.extend_from_slice(b"%PDF-1.7\n");
1613 let off1 = buf.len() as u64;
1614 buf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog >>\nendobj\n");
1615 let off2 = buf.len() as u64;
1616 buf.extend_from_slice(b"2 0 obj\n<< /Type /Pages >>\nendobj\n");
1617 let off10 = buf.len() as u64;
1618 buf.extend_from_slice(b"10 0 obj\n<< /Length 0 >>\nendobj\n");
1619
1620 let mut cursor = Cursor::new(buf);
1621 let headers = scan_object_headers(&mut cursor).unwrap();
1622
1623 assert_eq!(
1624 headers,
1625 vec![
1626 ObjHeader {
1627 obj_num: 1,
1628 generation: 0,
1629 offset: off1
1630 },
1631 ObjHeader {
1632 obj_num: 2,
1633 generation: 0,
1634 offset: off2
1635 },
1636 ObjHeader {
1637 obj_num: 10,
1638 generation: 0,
1639 offset: off10
1640 },
1641 ]
1642 );
1643 }
1644
1645 #[test]
1646 fn test_scan_object_headers_chunk_invariant_across_boundaries() {
1647 let mut buf = Vec::new();
1650 buf.extend_from_slice(b"%PDF-1.7\n");
1651 let mut expected: Vec<(u32, u64)> = Vec::new();
1652 for i in 1..=50u32 {
1653 for _ in 0..(i as usize % 7) {
1654 buf.push(b' ');
1655 }
1656 buf.push(b'\n');
1657 expected.push((i, buf.len() as u64));
1658 buf.extend_from_slice(format!("{i} 0 obj\n<< /N {i} >>\nendobj\n").as_bytes());
1659 }
1660
1661 let reference =
1663 scan_object_headers_chunked(&mut Cursor::new(buf.clone()), buf.len().max(1)).unwrap();
1664
1665 let got: Vec<(u32, u64)> = reference.iter().map(|h| (h.obj_num, h.offset)).collect();
1666 assert_eq!(
1667 got, expected,
1668 "reference scan disagrees with hand-computed offsets"
1669 );
1670
1671 for cs in [1usize, 2, 3, 7, 13, 16, 64, 256] {
1672 let chunked = scan_object_headers_chunked(&mut Cursor::new(buf.clone()), cs).unwrap();
1673 assert_eq!(chunked, reference, "scan mismatch at chunk_size={cs}");
1674 }
1675 }
1676
1677 #[test]
1678 fn test_scan_object_headers_ignores_endobj_keyword() {
1679 let mut buf = Vec::new();
1681 buf.extend_from_slice(b"%PDF\n");
1682 let off = buf.len() as u64;
1683 buf.extend_from_slice(b"7 0 obj\n<< >>\nendobj\nendobj\n");
1684
1685 let headers = scan_object_headers(&mut Cursor::new(buf)).unwrap();
1686 assert_eq!(
1687 headers,
1688 vec![ObjHeader {
1689 obj_num: 7,
1690 generation: 0,
1691 offset: off
1692 }]
1693 );
1694 }
1695
1696 #[test]
1697 fn test_scan_object_headers_carry_truncation_no_newline_run() {
1698 let mut buf = Vec::new();
1703 buf.extend_from_slice(b"%PDF-1.7\n");
1704
1705 let filler: Vec<u8> = (0..2000u32)
1707 .map(|i| if i % 5 == 0 { b' ' } else { b'7' })
1708 .collect();
1709
1710 buf.extend_from_slice(&filler);
1712 buf.push(b'\n');
1713 let off_a = buf.len() as u64;
1714 buf.extend_from_slice(b"5 0 obj\n<< >>\nendobj\n");
1715
1716 buf.extend_from_slice(&filler);
1719 buf.extend_from_slice(b"7 0 obj\n<< >>\nendobj\n");
1720
1721 let reference =
1722 scan_object_headers_chunked(&mut Cursor::new(buf.clone()), buf.len().max(1)).unwrap();
1723
1724 for cs in [16usize, 64, 256] {
1726 let chunked = scan_object_headers_chunked(&mut Cursor::new(buf.clone()), cs).unwrap();
1727 assert_eq!(
1728 chunked, reference,
1729 "carry-truncation mismatch at chunk_size={cs}"
1730 );
1731 }
1732
1733 assert!(reference
1736 .iter()
1737 .any(|h| h.obj_num == 5 && h.offset == off_a));
1738 assert!(!reference.iter().any(|h| h.obj_num == 7));
1739 }
1740
1741 #[test]
1742 fn test_scan_object_headers_empty_input() {
1743 let headers = scan_object_headers(&mut Cursor::new(Vec::new())).unwrap();
1744 assert!(headers.is_empty());
1745 }
1746
1747 #[test]
1748 fn test_scan_object_headers_reads_in_bounded_chunks() {
1749 struct MaxReadReader<R> {
1751 inner: R,
1752 max_read: usize,
1753 }
1754 impl<R: Read> Read for MaxReadReader<R> {
1755 fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
1756 self.max_read = self.max_read.max(buf.len());
1757 self.inner.read(buf)
1758 }
1759 }
1760 impl<R: Seek> Seek for MaxReadReader<R> {
1761 fn seek(&mut self, p: SeekFrom) -> std::io::Result<u64> {
1762 self.inner.seek(p)
1763 }
1764 }
1765
1766 let mut buf = Vec::new();
1767 buf.extend_from_slice(b"%PDF\n");
1768 for i in 1..=2000u32 {
1769 buf.extend_from_slice(format!("{i} 0 obj\n<< >>\nendobj\n").as_bytes());
1770 }
1771 let total = buf.len();
1772 assert!(
1773 total > 8192,
1774 "fixture must exceed the chunk size to be meaningful"
1775 );
1776
1777 let mut r = MaxReadReader {
1778 inner: Cursor::new(buf),
1779 max_read: 0,
1780 };
1781 let headers = scan_object_headers_chunked(&mut r, 4096).unwrap();
1782
1783 assert_eq!(headers.len(), 2000);
1784 assert_eq!(headers[0].obj_num, 1);
1785 assert_eq!(headers[1999].obj_num, 2000);
1786 assert!(
1789 r.max_read <= 4096,
1790 "scanner requested {} bytes in a single read (chunk=4096, file={total}); not bounded",
1791 r.max_read
1792 );
1793 }
1794
1795 #[test]
1796 fn test_scan_and_fill_adds_missing_preserves_present() {
1797 let mut buf = Vec::new();
1798 buf.extend_from_slice(b"%PDF-1.7\n");
1799 let off1 = buf.len() as u64;
1800 buf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog >>\nendobj\n");
1801 buf.extend_from_slice(b"2 0 obj\n<< /Type /Pages >>\nendobj\n");
1802 let off3 = buf.len() as u64;
1803 buf.extend_from_slice(b"3 0 obj\n<< >>\nendobj\n");
1804
1805 let mut table = XRefTable::new();
1806 table.add_entry(
1808 2,
1809 XRefEntry {
1810 offset: 99999,
1811 generation: 0,
1812 in_use: true,
1813 },
1814 );
1815
1816 let mut reader = BufReader::new(Cursor::new(buf));
1817 XRefTable::scan_and_fill_missing_objects(&mut reader, &mut table).unwrap();
1818
1819 assert_eq!(table.get_entry(1).map(|e| e.offset), Some(off1));
1821 assert_eq!(table.get_entry(3).map(|e| e.offset), Some(off3));
1822 assert_eq!(table.get_entry(2).map(|e| e.offset), Some(99999));
1824 }
1825
1826 #[test]
1827 fn test_recovery_finds_objects_and_catalog_root() {
1828 let mut buf = Vec::new();
1829 buf.extend_from_slice(b"%PDF-1.7\n");
1830 let off1 = buf.len() as u64;
1831 buf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
1832 let off2 = buf.len() as u64;
1833 buf.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n");
1834 let off3 = buf.len() as u64;
1835 buf.extend_from_slice(b"3 0 obj\n<< /Type /Page /Parent 2 0 R >>\nendobj\n");
1836
1837 let mut reader = BufReader::new(Cursor::new(buf));
1838 let table =
1839 XRefTable::parse_with_recovery_options(&mut reader, &ParseOptions::default()).unwrap();
1840
1841 assert_eq!(table.get_entry(1).map(|e| e.offset), Some(off1));
1842 assert_eq!(table.get_entry(2).map(|e| e.offset), Some(off2));
1843 assert_eq!(table.get_entry(3).map(|e| e.offset), Some(off3));
1844
1845 let root = table.trailer().and_then(|t| t.get("Root")).cloned();
1847 assert_eq!(root, Some(PdfObject::Reference(1, 0)));
1848 }
1849
1850 #[test]
1851 fn test_recovery_uses_root_from_xref_stream() {
1852 let mut buf = Vec::new();
1853 buf.extend_from_slice(b"%PDF-1.7\n");
1854 buf.extend_from_slice(b"5 0 obj\n<< /Type /Catalog /Pages 6 0 R >>\nendobj\n");
1855 buf.extend_from_slice(b"6 0 obj\n<< /Type /Pages /Count 0 >>\nendobj\n");
1856 buf.extend_from_slice(
1858 b"9 0 obj\n<< /Type /XRef /Root 5 0 R /Size 10 >>\nstream\n....\nendstream\nendobj\n",
1859 );
1860
1861 let mut reader = BufReader::new(Cursor::new(buf));
1862 let table =
1863 XRefTable::parse_with_recovery_options(&mut reader, &ParseOptions::default()).unwrap();
1864
1865 let root = table.trailer().and_then(|t| t.get("Root")).cloned();
1866 assert_eq!(root, Some(PdfObject::Reference(5, 0)));
1867 }
1868
1869 #[test]
1870 fn test_recovery_empty_when_no_objects() {
1871 let mut reader = BufReader::new(Cursor::new(b"%PDF-1.7\nnothing useful here\n".to_vec()));
1872 let result = XRefTable::parse_with_recovery_options(&mut reader, &ParseOptions::default());
1873 assert!(matches!(result, Err(ParseError::InvalidXRef)));
1874 }
1875
1876 #[test]
1877 fn test_parse_xref_entry() {
1878 let entry1 = XRefTable::parse_xref_entry("0000000000 65535 f ").unwrap();
1879 assert_eq!(entry1.offset, 0);
1880 assert_eq!(entry1.generation, 65535);
1881 assert!(!entry1.in_use);
1882
1883 let entry2 = XRefTable::parse_xref_entry("0000000017 00000 n ").unwrap();
1884 assert_eq!(entry2.offset, 17);
1885 assert_eq!(entry2.generation, 0);
1886 assert!(entry2.in_use);
1887 }
1888
1889 #[test]
1890 fn test_parse_xref_entry_flexible() {
1891 let entry1 = XRefTable::parse_xref_entry("17 0 n").unwrap();
1895 assert_eq!(entry1.offset, 17);
1896 assert_eq!(entry1.generation, 0);
1897 assert!(entry1.in_use);
1898
1899 let entry2 = XRefTable::parse_xref_entry("123 5 f").unwrap();
1901 assert_eq!(entry2.offset, 123);
1902 assert_eq!(entry2.generation, 5);
1903 assert!(!entry2.in_use);
1904
1905 let entry3 = XRefTable::parse_xref_entry("456 n").unwrap();
1907 assert_eq!(entry3.offset, 456);
1908 assert_eq!(entry3.generation, 0);
1909 assert!(entry3.in_use);
1910
1911 let entry4 = XRefTable::parse_xref_entry("789 2").unwrap();
1913 assert_eq!(entry4.offset, 789);
1914 assert_eq!(entry4.generation, 2);
1915 assert!(entry4.in_use);
1916
1917 let entry5 = XRefTable::parse_xref_entry("1000 0n").unwrap();
1919 assert_eq!(entry5.offset, 1000);
1920 assert_eq!(entry5.generation, 0);
1921 assert!(entry5.in_use);
1922
1923 let entry6 = XRefTable::parse_xref_entry("2000 1f").unwrap();
1924 assert_eq!(entry6.offset, 2000);
1925 assert_eq!(entry6.generation, 1);
1926 assert!(!entry6.in_use);
1927
1928 let entry7 = XRefTable::parse_xref_entry("3000\t0\tn").unwrap();
1930 assert_eq!(entry7.offset, 3000);
1931 assert_eq!(entry7.generation, 0);
1932 assert!(entry7.in_use);
1933 }
1934
1935 #[test]
1936 fn test_parse_xref_entry_invalid_flag_fallback() {
1937 let entry = XRefTable::parse_xref_entry("100 0 x").unwrap();
1939 assert_eq!(entry.offset, 100);
1940 assert_eq!(entry.generation, 0);
1941 assert!(entry.in_use); }
1943
1944 #[test]
1945 fn test_parse_xref_entry_malformed() {
1946 let result = XRefTable::parse_xref_entry("");
1948 assert!(result.is_err());
1949
1950 let result = XRefTable::parse_xref_entry("abc 0 n");
1952 assert!(result.is_err());
1953
1954 let result = XRefTable::parse_xref_entry(" ");
1956 assert!(result.is_err());
1957 }
1958
1959 #[test]
1960 fn test_xref_table_new() {
1961 let table = XRefTable::new();
1962 assert!(table.entries.is_empty());
1963 assert!(table.extended_entries.is_empty());
1964 assert!(table.trailer.is_none());
1965 assert_eq!(table.xref_offset, 0);
1966 }
1967
1968 #[test]
1969 fn test_xref_table_default() {
1970 let table = XRefTable::default();
1971 assert!(table.entries.is_empty());
1972 assert!(table.extended_entries.is_empty());
1973 assert!(table.trailer.is_none());
1974 }
1975
1976 #[test]
1977 fn test_xref_entry_struct() {
1978 let entry = XRefEntry {
1979 offset: 12345,
1980 generation: 7,
1981 in_use: true,
1982 };
1983 assert_eq!(entry.offset, 12345);
1984 assert_eq!(entry.generation, 7);
1985 assert!(entry.in_use);
1986 }
1987
1988 #[test]
1989 fn test_xref_entry_equality() {
1990 let entry1 = XRefEntry {
1991 offset: 100,
1992 generation: 0,
1993 in_use: true,
1994 };
1995 let entry2 = XRefEntry {
1996 offset: 100,
1997 generation: 0,
1998 in_use: true,
1999 };
2000 assert_eq!(entry1, entry2);
2001 }
2002
2003 #[test]
2004 fn test_xref_entry_clone() {
2005 let entry = XRefEntry {
2006 offset: 999,
2007 generation: 3,
2008 in_use: false,
2009 };
2010 let cloned = entry;
2011 assert_eq!(cloned.offset, 999);
2012 assert_eq!(cloned.generation, 3);
2013 assert!(!cloned.in_use);
2014 }
2015
2016 #[test]
2017 fn test_xref_entry_ext() {
2018 let ext_entry = XRefEntryExt {
2019 basic: XRefEntry {
2020 offset: 500,
2021 generation: 0,
2022 in_use: true,
2023 },
2024 compressed_info: Some((10, 5)),
2025 };
2026 assert_eq!(ext_entry.basic.offset, 500);
2027 assert_eq!(ext_entry.compressed_info, Some((10, 5)));
2028 }
2029
2030 #[test]
2031 fn test_xref_entry_ext_no_compression() {
2032 let ext_entry = XRefEntryExt {
2033 basic: XRefEntry {
2034 offset: 1000,
2035 generation: 1,
2036 in_use: true,
2037 },
2038 compressed_info: None,
2039 };
2040 assert!(ext_entry.compressed_info.is_none());
2041 }
2042
2043 #[test]
2044 fn test_add_entry() {
2045 let mut table = XRefTable::new();
2046 table.add_entry(
2047 5,
2048 XRefEntry {
2049 offset: 1000,
2050 generation: 0,
2051 in_use: true,
2052 },
2053 );
2054 assert_eq!(table.entries.len(), 1);
2055 assert!(table.entries.contains_key(&5));
2056 }
2057
2058 #[test]
2059 fn test_get_entry() {
2060 let mut table = XRefTable::new();
2061 let entry = XRefEntry {
2062 offset: 2000,
2063 generation: 1,
2064 in_use: true,
2065 };
2066 table.add_entry(10, entry);
2067
2068 let retrieved = table.get_entry(10);
2069 assert!(retrieved.is_some());
2070 assert_eq!(retrieved.unwrap().offset, 2000);
2071
2072 let missing = table.get_entry(999);
2073 assert!(missing.is_none());
2074 }
2075
2076 #[test]
2077 fn test_set_trailer() {
2078 let mut table = XRefTable::new();
2079 let mut trailer = PdfDictionary::new();
2080 trailer.insert("Size".to_string(), PdfObject::Integer(10));
2081
2082 table.set_trailer(trailer.clone());
2083 assert!(table.trailer.is_some());
2084 assert_eq!(
2085 table.trailer().unwrap().get("Size"),
2086 Some(&PdfObject::Integer(10))
2087 );
2088 }
2089
2090 #[test]
2091 fn test_parse_xref_entry_invalid() {
2092 let result = XRefTable::parse_xref_entry("0000000000 65535");
2094 assert!(result.is_ok()); let result = XRefTable::parse_xref_entry("not_a_number 65535 f ");
2098 assert!(result.is_err());
2099
2100 let result = XRefTable::parse_xref_entry("0000000000 65535 x ");
2102 assert!(result.is_ok()); assert!(result.unwrap().in_use); }
2105
2106 #[test]
2107 fn test_parse_xref_entry_various_offsets() {
2108 let entry = XRefTable::parse_xref_entry("0000000001 00000 n ").unwrap();
2110 assert_eq!(entry.offset, 1);
2111
2112 let entry = XRefTable::parse_xref_entry("9999999999 00000 n ").unwrap();
2114 assert_eq!(entry.offset, 9999999999);
2115
2116 let entry = XRefTable::parse_xref_entry("0000000000 65535 f ").unwrap();
2118 assert_eq!(entry.generation, 65535);
2119 }
2120
2121 #[test]
2122 fn test_add_extended_entry() {
2123 let mut table = XRefTable::new();
2124 let ext_entry = XRefEntryExt {
2125 basic: XRefEntry {
2126 offset: 0,
2127 generation: 0,
2128 in_use: true,
2129 },
2130 compressed_info: Some((5, 10)),
2131 };
2132
2133 table.add_extended_entry(15, ext_entry);
2134 assert_eq!(table.extended_entries.len(), 1);
2135 assert!(table.extended_entries.contains_key(&15));
2136 }
2137
2138 #[test]
2139 fn test_get_extended_entry() {
2140 let mut table = XRefTable::new();
2141 let ext_entry = XRefEntryExt {
2142 basic: XRefEntry {
2143 offset: 0,
2144 generation: 0,
2145 in_use: true,
2146 },
2147 compressed_info: Some((20, 3)),
2148 };
2149
2150 table.add_extended_entry(7, ext_entry);
2151
2152 let retrieved = table.get_extended_entry(7);
2153 assert!(retrieved.is_some());
2154 assert_eq!(retrieved.unwrap().compressed_info, Some((20, 3)));
2155 }
2156
2157 #[test]
2158 fn test_xref_offset() {
2159 let mut table = XRefTable::new();
2160 assert_eq!(table.xref_offset(), 0);
2161
2162 table.xref_offset = 12345;
2163 assert_eq!(table.xref_offset(), 12345);
2164 }
2165
2166 #[test]
2167 fn test_find_xref_offset_simple() {
2168 let pdf_data = b"startxref\n12345\n%%EOF";
2169 let cursor = Cursor::new(pdf_data.to_vec());
2170 let mut reader = BufReader::new(cursor);
2171
2172 let offset = XRefTable::find_xref_offset(&mut reader).unwrap();
2173 assert_eq!(offset, 12345);
2174 }
2175
2176 #[test]
2177 fn test_find_xref_offset_with_spaces() {
2178 let pdf_data = b"startxref \n 12345 \n%%EOF";
2179 let cursor = Cursor::new(pdf_data.to_vec());
2180 let mut reader = BufReader::new(cursor);
2181
2182 let offset = XRefTable::find_xref_offset(&mut reader).unwrap();
2183 assert_eq!(offset, 12345);
2184 }
2185
2186 #[test]
2187 fn test_find_xref_offset_missing() {
2188 let pdf_data = b"no startxref here";
2189 let cursor = Cursor::new(pdf_data.to_vec());
2190 let mut reader = BufReader::new(cursor);
2191
2192 let result = XRefTable::find_xref_offset(&mut reader);
2193 assert!(result.is_err());
2194 }
2195
2196 #[test]
2197 fn test_trailer_getter() {
2198 let mut table = XRefTable::new();
2199 assert!(table.trailer().is_none());
2200
2201 let trailer = PdfDictionary::new();
2202 table.set_trailer(trailer);
2203 assert!(table.trailer().is_some());
2204 }
2205
2206 #[test]
2207 fn test_xref_table_clone() {
2208 let mut table = XRefTable::new();
2209 table.add_entry(
2210 1,
2211 XRefEntry {
2212 offset: 100,
2213 generation: 0,
2214 in_use: true,
2215 },
2216 );
2217 table.xref_offset = 5000;
2218
2219 let cloned = table.clone();
2220 assert_eq!(cloned.entries.len(), 1);
2221 assert_eq!(cloned.xref_offset, 5000);
2222 }
2223
2224 #[test]
2225 fn test_parse_obj_header() {
2226 assert_eq!(parse_obj_header_bytes(b"1 0 obj"), Some((1, 0)));
2228 assert_eq!(parse_obj_header_bytes(b"123 5 obj"), Some((123, 5)));
2229 assert_eq!(parse_obj_header_bytes(b" 42 3 obj "), Some((42, 3)));
2230
2231 assert_eq!(parse_obj_header_bytes(b"1 obj"), None);
2233 assert_eq!(parse_obj_header_bytes(b"abc 0 obj"), None);
2234 assert_eq!(parse_obj_header_bytes(b"1 0 object"), None);
2235 assert_eq!(parse_obj_header_bytes(b""), None);
2236 }
2237
2238 #[test]
2239 fn test_xref_recovery_parsing() {
2240 let pdf_content =
2242 b"1 0 obj\n<< /Type /Catalog >>\nendobj\n2 0 obj\n<< /Type /Page >>\nendobj\n";
2243 let mut reader = BufReader::new(Cursor::new(pdf_content));
2244
2245 let table = XRefTable::parse_with_recovery(&mut reader).unwrap();
2246
2247 assert_eq!(table.len(), 2);
2249 assert!(table.get_entry(1).is_some());
2250 assert!(table.get_entry(2).is_some());
2251
2252 assert!(table.get_entry(1).unwrap().in_use);
2254 assert!(table.get_entry(2).unwrap().in_use);
2255 }
2256
2257 #[test]
2258 fn test_xref_recovery_no_objects() {
2259 let pdf_content = b"This is not a PDF file\nNo objects here\n";
2261 let mut reader = BufReader::new(Cursor::new(pdf_content));
2262
2263 let result = XRefTable::parse_with_recovery(&mut reader);
2264 assert!(result.is_err());
2265 }
2266
2267 #[test]
2268 fn test_offset_validation() {
2269 let pdf_data = b"small file";
2270 let mut reader = BufReader::new(Cursor::new(pdf_data));
2271
2272 assert!(XRefTable::validate_offset(&mut reader, 5).is_ok());
2274
2275 assert!(XRefTable::validate_offset(&mut reader, 100).is_err());
2277
2278 assert!(XRefTable::validate_offset(&mut reader, 10).is_err());
2280 }
2281
2282 #[test]
2283 fn test_xref_parse_with_fallback() {
2284 let pdf_content =
2286 b"1 0 obj\n<< /Type /Catalog >>\nendobj\n2 0 obj\n<< /Type /Page >>\nendobj\n";
2287 let mut reader = BufReader::new(Cursor::new(pdf_content));
2288
2289 let result = XRefTable::parse(&mut reader);
2292 assert!(result.is_err());
2293 if let Err(e) = result {
2294 assert!(matches!(e, ParseError::InvalidXRef));
2295 }
2296 }
2297
2298 #[test]
2299 fn test_xref_entry_creation() {
2300 let entry = XRefEntry {
2301 offset: 1234,
2302 generation: 5,
2303 in_use: true,
2304 };
2305
2306 assert_eq!(entry.offset, 1234);
2307 assert_eq!(entry.generation, 5);
2308 assert!(entry.in_use);
2309 }
2310
2311 #[test]
2312 fn test_xref_entry_ext_creation() {
2313 let basic = XRefEntry {
2314 offset: 5000,
2315 generation: 0,
2316 in_use: true,
2317 };
2318
2319 let ext = XRefEntryExt {
2320 basic: basic.clone(),
2321 compressed_info: Some((10, 3)),
2322 };
2323
2324 assert_eq!(ext.basic.offset, 5000);
2325 assert_eq!(ext.compressed_info, Some((10, 3)));
2326 }
2327
2328 #[test]
2329 fn test_xref_table_new_advanced() {
2330 let table = XRefTable::new();
2331 assert_eq!(table.entries.len(), 0);
2332 assert_eq!(table.extended_entries.len(), 0);
2333 assert!(table.trailer.is_none());
2334 assert_eq!(table.xref_offset, 0);
2335 }
2336
2337 #[test]
2338 fn test_xref_table_default_advanced() {
2339 let table = XRefTable::default();
2340 assert_eq!(table.entries.len(), 0);
2341 assert!(table.trailer.is_none());
2342 }
2343
2344 #[test]
2345 fn test_xref_table_add_entry() {
2346 let mut table = XRefTable::new();
2347
2348 let entry1 = XRefEntry {
2349 offset: 100,
2350 generation: 0,
2351 in_use: true,
2352 };
2353 table.add_entry(1, entry1);
2354 let entry2 = XRefEntry {
2355 offset: 200,
2356 generation: 1,
2357 in_use: false,
2358 };
2359 table.add_entry(2, entry2);
2360
2361 assert_eq!(table.len(), 2);
2362
2363 let entry1 = table.get_entry(1).unwrap();
2364 assert_eq!(entry1.offset, 100);
2365 assert_eq!(entry1.generation, 0);
2366 assert!(entry1.in_use);
2367
2368 let entry2 = table.get_entry(2).unwrap();
2369 assert_eq!(entry2.offset, 200);
2370 assert_eq!(entry2.generation, 1);
2371 assert!(!entry2.in_use);
2372 }
2373
2374 #[test]
2375 fn test_xref_table_add_extended_entry() {
2376 let mut table = XRefTable::new();
2377
2378 let basic_entry = XRefEntry {
2379 offset: 0,
2380 generation: 0,
2381 in_use: true,
2382 };
2383
2384 let extended_entry = XRefEntryExt {
2385 basic: basic_entry,
2386 compressed_info: Some((10, 2)),
2387 };
2388
2389 table.add_extended_entry(5, extended_entry);
2390
2391 let ext = table.get_extended_entry(5);
2393 assert!(ext.is_some());
2394 if let Some(ext) = ext {
2395 assert_eq!(ext.compressed_info, Some((10, 2)));
2396 }
2397
2398 assert!(table.is_compressed(5));
2399 }
2400
2401 #[test]
2402 fn test_xref_table_get_nonexistent() {
2403 let table = XRefTable::new();
2404 assert!(table.get_entry(999).is_none());
2405 assert!(table.get_extended_entry(999).is_none());
2406 }
2407
2408 #[test]
2409 fn test_xref_table_update_entry() {
2410 let mut table = XRefTable::new();
2411
2412 let entry1 = XRefEntry {
2414 offset: 100,
2415 generation: 0,
2416 in_use: true,
2417 };
2418 table.add_entry(1, entry1);
2419
2420 let entry2 = XRefEntry {
2422 offset: 200,
2423 generation: 1,
2424 in_use: false,
2425 };
2426 table.add_entry(1, entry2);
2427
2428 let entry = table.get_entry(1).unwrap();
2430 assert_eq!(entry.offset, 200);
2431 assert_eq!(entry.generation, 1);
2432 assert!(!entry.in_use);
2433 }
2434
2435 #[test]
2436 fn test_xref_table_set_trailer() {
2437 let mut table = XRefTable::new();
2438 assert!(table.trailer.is_none());
2439
2440 let mut trailer = PdfDictionary::new();
2441 trailer.insert("Size".to_string(), PdfObject::Integer(10));
2442
2443 table.set_trailer(trailer.clone());
2444 assert!(table.trailer.is_some());
2445 assert_eq!(table.trailer(), Some(&trailer));
2446 }
2447
2448 #[test]
2449 fn test_xref_table_offset() {
2450 let table = XRefTable::new();
2451 assert_eq!(table.xref_offset(), 0);
2452 }
2453
2454 #[test]
2455 fn test_parse_xref_entry_invalid_static() {
2456 let invalid_lines = vec![
2457 "not a valid entry".to_string(),
2458 "12345 abcde n".to_string(), ];
2460
2461 for line in invalid_lines {
2462 let result = XRefTable::parse_xref_entry(&line);
2463 assert!(result.is_err());
2464 }
2465
2466 let result = XRefTable::parse_xref_entry("12345 00000");
2468 assert!(result.is_ok());
2469 let entry = result.unwrap();
2470 assert_eq!(entry.offset, 12345);
2471 assert_eq!(entry.generation, 0);
2472 assert!(entry.in_use); }
2474
2475 #[test]
2476 fn test_xref_entry_operations() {
2477 let mut table = XRefTable::new();
2478
2479 let entry1 = XRefEntry {
2481 offset: 1234,
2482 generation: 5,
2483 in_use: true,
2484 };
2485
2486 let entry2 = XRefEntry {
2487 offset: 5678,
2488 generation: 10,
2489 in_use: false,
2490 };
2491
2492 table.add_entry(1, entry1);
2493 table.add_entry(2, entry2);
2494
2495 assert_eq!(table.len(), 2);
2496
2497 let retrieved1 = table.get_entry(1).unwrap();
2498 assert_eq!(retrieved1.offset, 1234);
2499 assert_eq!(retrieved1.generation, 5);
2500 assert!(retrieved1.in_use);
2501
2502 let retrieved2 = table.get_entry(2).unwrap();
2503 assert_eq!(retrieved2.offset, 5678);
2504 assert_eq!(retrieved2.generation, 10);
2505 assert!(!retrieved2.in_use);
2506 }
2507
2508 #[test]
2509 fn test_parse_xref_with_comments() {
2510 let pdf_content = b"%PDF-1.4\n\
25111 0 obj\n<< /Type /Catalog >>\nendobj\n\
2512xref\n\
2513% This is a comment\n\
25140 2\n\
25150000000000 65535 f \n\
25160000000015 00000 n \n\
2517% Another comment\n\
2518trailer\n\
2519<< /Size 2 /Root 1 0 R >>\n\
2520startxref\n\
252145\n\
2522%%EOF";
2523
2524 let mut reader = BufReader::new(Cursor::new(pdf_content));
2525 reader.seek(SeekFrom::Start(45)).unwrap(); let result = XRefTable::parse(&mut reader);
2528 assert!(result.is_ok());
2529 let table = result.unwrap();
2530 assert_eq!(table.len(), 2);
2531 }
2532
2533 #[test]
2534 fn test_parse_multiple_xref_sections() {
2535 let pdf_content = b"%PDF-1.4\n\
25361 0 obj\n<< /Type /Catalog >>\nendobj\n\
25372 0 obj\n<< /Type /Page >>\nendobj\n\
2538xref\n\
25390 2\n\
25400000000000 65535 f \n\
25410000000015 00000 n \n\
25425 2\n\
25430000000100 00000 n \n\
25440000000200 00000 n \n\
2545trailer\n\
2546<< /Size 7 /Root 1 0 R >>\n\
2547startxref\n\
254878\n\
2549%%EOF";
2550
2551 let mut reader = BufReader::new(Cursor::new(pdf_content));
2552 reader.seek(SeekFrom::Start(78)).unwrap(); let result = XRefTable::parse(&mut reader);
2555 assert!(result.is_ok());
2556 let table = result.unwrap();
2557 assert_eq!(table.len(), 4);
2559 assert!(table.get_entry(0).is_some());
2560 assert!(table.get_entry(1).is_some());
2561 assert!(table.get_entry(5).is_some());
2562 assert!(table.get_entry(6).is_some());
2563 }
2564
2565 #[test]
2566 fn test_parse_xref_with_prev() {
2567 let pdf_content = b"%PDF-1.4\n\
2569% First xref at 15\n\
2570xref\n\
25710 2\n\
25720000000000 65535 f \n\
25730000000100 00000 n \n\
2574trailer\n\
2575<< /Size 2 >>\n\
2576% Second xref at 100\n\
2577xref\n\
25782 1\n\
25790000000200 00000 n \n\
2580trailer\n\
2581<< /Size 3 /Prev 15 >>\n\
2582startxref\n\
2583100\n\
2584%%EOF";
2585
2586 let mut reader = BufReader::new(Cursor::new(pdf_content));
2587 let options = ParseOptions {
2588 lenient_syntax: true,
2589 ..Default::default()
2590 };
2591
2592 let result = XRefTable::parse_with_options(&mut reader, &options);
2593 assert!(result.is_ok() || result.is_err());
2595 }
2596
2597 #[test]
2598 fn test_invalid_xref_format() {
2599 let pdf_content = b"xref\ninvalid content\ntrailer";
2600 let mut reader = BufReader::new(Cursor::new(pdf_content));
2601
2602 let result = XRefTable::parse(&mut reader);
2603 assert!(result.is_err());
2604 }
2605
2606 #[test]
2607 fn test_xref_entry_overflow() {
2608 let mut table = XRefTable::new();
2609
2610 let entry = XRefEntry {
2612 offset: u64::MAX,
2613 generation: u16::MAX,
2614 in_use: true,
2615 };
2616 table.add_entry(u32::MAX, entry);
2617
2618 let entry = table.get_entry(u32::MAX).unwrap();
2619 assert_eq!(entry.offset, u64::MAX);
2620 assert_eq!(entry.generation, u16::MAX);
2621 }
2622
2623 #[test]
2624 fn test_xref_table_operations() {
2625 let mut table = XRefTable::new();
2626
2627 let entry1 = XRefEntry {
2629 offset: 100,
2630 generation: 0,
2631 in_use: true,
2632 };
2633
2634 let entry2 = XRefEntry {
2635 offset: 200,
2636 generation: 0,
2637 in_use: true,
2638 };
2639
2640 table.add_entry(1, entry1);
2641 table.add_entry(2, entry2);
2642
2643 assert_eq!(table.len(), 2);
2644 assert!(table.get_entry(1).is_some());
2645 assert!(table.get_entry(2).is_some());
2646 assert!(table.get_entry(3).is_none());
2647 }
2648
2649 #[test]
2650 fn test_xref_table_merge() {
2651 let mut table1 = XRefTable::new();
2652 let entry1 = XRefEntry {
2653 offset: 100,
2654 generation: 0,
2655 in_use: true,
2656 };
2657 table1.add_entry(1, entry1);
2658 let entry2 = XRefEntry {
2659 offset: 200,
2660 generation: 0,
2661 in_use: true,
2662 };
2663 table1.add_entry(2, entry2);
2664
2665 let mut table2 = XRefTable::new();
2666 let entry3 = XRefEntry {
2667 offset: 250,
2668 generation: 1,
2669 in_use: true,
2670 }; table2.add_entry(2, entry3);
2672 let entry4 = XRefEntry {
2673 offset: 300,
2674 generation: 0,
2675 in_use: true,
2676 }; table2.add_entry(3, entry4);
2678
2679 for i in 2..=3 {
2682 if let Some(entry) = table2.get_entry(i) {
2683 table1.add_entry(
2684 i,
2685 XRefEntry {
2686 offset: entry.offset,
2687 generation: entry.generation,
2688 in_use: entry.in_use,
2689 },
2690 );
2691 }
2692 }
2693
2694 assert_eq!(table1.len(), 3);
2695
2696 let entry2 = table1.get_entry(2).unwrap();
2698 assert_eq!(entry2.offset, 250);
2699 assert_eq!(entry2.generation, 1);
2700
2701 assert!(table1.get_entry(3).is_some());
2703 }
2704
#[test]
fn test_xref_recovery_with_stream() {
    // Input is a single `/ObjStm` object with no xref table, trailer or
    // `startxref` marker.
    let pdf_content = b"1 0 obj\n<< /Type /ObjStm /N 2 /First 10 >>\nstream\n12345678901 0 2 0\nendstream\nendobj\n";
    let mut reader = BufReader::new(Cursor::new(pdf_content));

    // Smoke test: both `Ok` and `Err` are acceptable outcomes, so the only
    // thing verified is that recovery returns instead of panicking.
    // Asserting `is_ok() || is_err()` would be a tautology that can never
    // fail, so the result is discarded explicitly instead.
    let _ = XRefTable::parse_with_recovery(&mut reader);
}
2714
#[test]
fn test_xref_entry_equality_advanced() {
    // Entries compare equal when all fields match and unequal when the
    // offset differs.
    let entry_at = |offset: u64| XRefEntry {
        offset,
        generation: 0,
        in_use: true,
    };

    let (first, twin, elsewhere) = (entry_at(100), entry_at(100), entry_at(200));

    assert_eq!(first, twin);
    assert_ne!(first, elsewhere);
}
2738
#[test]
fn test_parse_options_effect() {
    // The same malformed xref text is parsed twice: once with strict syntax
    // and once with lenient syntax.
    let pdf_content = b"xref 0 1 invalid";
    let mut reader = BufReader::new(Cursor::new(pdf_content));

    // Strict mode must reject the malformed input.
    let strict_options = ParseOptions {
        lenient_syntax: false,
        ..Default::default()
    };
    let result = XRefTable::parse_with_options(&mut reader, &strict_options);
    assert!(result.is_err());

    // Rewind so the lenient attempt sees the input from the beginning.
    reader.seek(SeekFrom::Start(0)).unwrap();
    let lenient_options = ParseOptions {
        lenient_syntax: true,
        ..Default::default()
    };

    // Lenient mode may either recover or still fail; the only requirement is
    // that it returns without panicking. Asserting `is_err() || is_ok()`
    // would be a tautology, so the result is discarded explicitly instead.
    let _ = XRefTable::parse_with_options(&mut reader, &lenient_options);
}
2762
#[test]
fn test_circular_reference_detection() {
    // The trailer's `/Prev` (10) names the same offset as `startxref` (10),
    // so following the `/Prev` chain naively would revisit the same section.
    let pdf_content = b"%PDF-1.4\n\
xref\n\
0 1\n\
0000000000 65535 f \n\
trailer\n\
<< /Size 1 /Prev 10 >>\n\
startxref\n\
10\n\
%%EOF";

    let mut reader = BufReader::new(Cursor::new(pdf_content));

    // What matters is that the call returns at all (no endless loop, no
    // panic); `Ok` and `Err` are both acceptable. Asserting
    // `is_ok() || is_err()` would be a tautology, so the result is discarded
    // explicitly instead.
    let _ = XRefTable::parse_with_incremental_updates(&mut reader);
}
2783
#[test]
fn test_linearized_xref_detection() {
    // A document whose first object is a linearization dictionary, followed
    // by a classic xref table. Byte layout: header (9 bytes) + "1 0 obj\n"
    // (8) + dictionary line (66) + "endobj\n" (7) puts the `xref` keyword at
    // byte 90. Note that `startxref` deliberately says 63, not 90.
    let pdf_content = b"%PDF-1.4\n\
1 0 obj\n\
<< /Linearized 1 /L 1234 /H [100 200] /O 5 /E 500 /N 10 /T 600 >>\n\
endobj\n\
xref\n\
0 2\n\
0000000000 65535 f \n\
0000000009 00000 n \n\
trailer\n\
<< /Size 2 >>\n\
startxref\n\
63\n\
%%EOF";

    let mut reader = BufReader::new(Cursor::new(pdf_content));

    let outcome = XRefTable::find_linearized_xref(&mut reader);
    assert!(outcome.is_ok());

    // The reported position must be the real location of the `xref` keyword.
    let located = outcome.unwrap();
    assert_eq!(
        located, 90,
        "Expected xref at position 90, got {}",
        located
    );
}
2816
#[test]
fn test_xref_stream_parsing() {
    // A PDF 1.5 document whose cross-reference data is an `/XRef` stream
    // object at byte 9 (right after the 9-byte header). The stream holds
    // three 4-byte rows, matching `/W [1 2 1]` and `/Length 12`.
    let pdf_content = b"%PDF-1.5\n\
1 0 obj\n\
<< /Type /XRef /Size 3 /W [1 2 1] /Length 12 >>\n\
stream\n\
\x00\x00\x00\x00\
\x01\x00\x10\x00\
\x01\x00\x20\x00\
endstream\n\
endobj\n\
startxref\n\
9\n\
%%EOF";

    let mut reader = BufReader::new(Cursor::new(pdf_content));
    // Position the reader on the xref stream object.
    reader.seek(SeekFrom::Start(9)).unwrap();

    // Smoke test: the parser may accept or reject this minimal stream; the
    // only requirement is that it returns without panicking. Asserting
    // `is_err() || is_ok()` would be a tautology, so the result is discarded
    // explicitly instead.
    let _ = XRefTable::parse(&mut reader);
}
2842
#[test]
fn test_xref_validation_max_object_exceeds_size() {
    // The table's second subsection ("10 1") defines object 10, while the
    // trailer declares `/Size 5`. Parsing is expected to reject this.
    let pdf_content = b"%PDF-1.4\n\
xref\n\
0 1\n\
0000000000 65535 f \n\
10 1\n\
0000000100 00000 n \n\
trailer\n\
<< /Size 5 /Root 1 0 R >>\n\
startxref\n\
9\n\
%%EOF";

    let mut reader = BufReader::new(Cursor::new(pdf_content));
    // Byte 9 is where the `xref` keyword starts (right after the header).
    reader.seek(SeekFrom::Start(9)).unwrap();

    assert!(XRefTable::parse(&mut reader).is_err());
}
2865
#[test]
fn test_parse_with_options_lenient_vs_strict() {
    // A small xref table parsed under both strict and lenient options; both
    // attempts are expected to succeed.
    let pdf_content = b"%PDF-1.4\n\
xref\n\
0 2\n\
0000000000 65535 f \n\
0000000015 00000 n \n\
trailer\n\
<< /Size 2 >>\n\
startxref\n\
9\n\
%%EOF";

    let mut reader = BufReader::new(Cursor::new(pdf_content));

    // Rewind to the `xref` keyword (byte 9) before each attempt so both
    // parses start from the same position.
    let mut parse_from_xref = |options: &ParseOptions| {
        reader.seek(SeekFrom::Start(9)).unwrap();
        XRefTable::parse_with_options(&mut reader, options)
    };

    let strict_options = ParseOptions {
        lenient_syntax: false,
        recover_from_stream_errors: false,
        ..Default::default()
    };
    let strict_result = parse_from_xref(&strict_options);

    let lenient_options = ParseOptions {
        lenient_syntax: true,
        recover_from_stream_errors: true,
        ..Default::default()
    };
    let lenient_result = parse_from_xref(&lenient_options);

    assert!(strict_result.is_ok());
    assert!(lenient_result.is_ok());
}
2904
#[test]
fn test_xref_entry_with_attached_flag() {
    // Entries whose `n`/`f` flag is glued to the generation number (no
    // separating space) must still parse.
    // Columns: raw text, expected offset, expected generation, expected in-use.
    let cases: [(&str, u64, u16, bool); 2] = [
        ("12345 0n", 12345, 0, true),
        ("54321 1f", 54321, 1, false),
    ];

    for (raw, offset, generation, in_use) in cases {
        let parsed = XRefTable::parse_xref_entry(raw);
        assert!(parsed.is_ok());

        let entry = parsed.unwrap();
        assert_eq!(entry.offset, offset);
        assert_eq!(entry.generation, generation);
        assert_eq!(entry.in_use, in_use);
    }
}
2922
#[test]
fn test_find_xref_offset_edge_cases() {
    use std::io::{BufReader, Cursor};

    // Runs the `startxref` search over an in-memory document.
    let find = |content: &[u8]| {
        let mut reader = BufReader::new(Cursor::new(content));
        XRefTable::find_xref_offset(&mut reader)
    };

    // Stray whitespace around the keyword and the number is tolerated.
    assert_eq!(find(b"garbage\nstartxref \n 123 \n%%EOF").unwrap(), 123);

    // Minimal well-formed tail.
    assert_eq!(find(b"startxref\n999\n%%EOF").unwrap(), 999);

    // Missing `%%EOF`: accepting or rejecting are both fine, the call just
    // must not panic. Asserting `is_ok() || is_err()` would be a tautology,
    // so the result is discarded explicitly instead.
    let _ = find(b"startxref\n456");

    // No `startxref` keyword at all must be an error.
    assert!(find(b"some content\n%%EOF").is_err());
}
2953
#[test]
fn test_xref_subsection_incomplete() {
    // The subsection header "0 5" announces five entries, but only two entry
    // lines follow before `trailer`.
    let pdf_content = b"%PDF-1.4\n\
xref\n\
0 5\n\
0000000000 65535 f \n\
0000000015 00000 n \n\
trailer\n\
<< /Size 5 >>\n\
startxref\n\
9\n\
%%EOF";

    let mut reader = BufReader::new(Cursor::new(pdf_content));
    // Byte 9 is where the `xref` keyword starts (right after the header).
    reader.seek(SeekFrom::Start(9)).unwrap();

    // Smoke test: the parser may reject the short subsection or recover from
    // it; the only requirement is that it returns without panicking.
    // Asserting `is_err() || is_ok()` would be a tautology, so the result is
    // discarded explicitly instead.
    let _ = XRefTable::parse(&mut reader);
}
2976}
2977
2978fn extract_root_from_xref_stream(content: &str) -> Option<u32> {
2980 let lines: Vec<&str> = content.lines().collect();
2985 let mut in_xref_obj = false;
2986
2987 for (i, line) in lines.iter().enumerate() {
2988 if line.contains(" obj")
2990 && lines
2991 .get(i + 1)
2992 .map_or(false, |next| next.contains("/Type /XRef"))
2993 {
2994 in_xref_obj = true;
2995 continue;
2996 }
2997
2998 if in_xref_obj {
3000 if line.contains("endobj") {
3001 in_xref_obj = false;
3002 continue;
3003 }
3004
3005 if let Some(root_pos) = line.find("/Root ") {
3007 let after_root = &line[root_pos + 6..]; if let Some(space_pos) = after_root.find(' ') {
3011 let number_part = &after_root[..space_pos];
3012 if let Ok(root_obj) = number_part.parse::<u32>() {
3013 tracing::debug!("Extracted Root {} from XRef stream", root_obj);
3014 return Some(root_obj);
3015 }
3016 }
3017 }
3018 }
3019 }
3020
3021 None
3022}
3023
3024fn find_catalog_by_content<R: Read + Seek>(
3027 reader: &mut R,
3028 table: &XRefTable,
3029) -> ParseResult<Option<u32>> {
3030 let mut obj_numbers: Vec<u32> = table.entries.keys().copied().collect();
3033 obj_numbers.sort_unstable();
3034
3035 for obj_num in obj_numbers {
3036 let offset = match table.entries.get(&obj_num) {
3037 Some(entry) if entry.in_use => entry.offset,
3038 _ => continue,
3039 };
3040 if let Some(content) = read_object_content(reader, obj_num, offset)? {
3042 if content.contains("/Type /Catalog") {
3043 tracing::debug!(
3044 "Found catalog candidate at object {} (validated structure)",
3045 obj_num
3046 );
3047 return Ok(Some(obj_num));
3048 }
3049 }
3050 }
3051
3052 tracing::debug!("No valid catalog found by content search");
3053 Ok(None)
3054}