1use super::xref_stream;
6use super::xref_types::{XRefEntryInfo, XRefEntryType};
7use super::{ParseError, ParseOptions, ParseResult};
8use crate::parser::reader::PDFLines;
9use std::collections::HashMap;
10use std::io::{BufRead, BufReader, Read, Seek, SeekFrom};
11
/// Returns the index of the first occurrence of `pattern` inside `buffer`.
///
/// An empty `pattern` matches at index 0 (the same convention as `str::find("")`).
/// Returns `None` when `pattern` does not occur or is longer than `buffer`.
fn find_byte_pattern(buffer: &[u8], pattern: &[u8]) -> Option<usize> {
    // `slice::windows` panics when asked for zero-sized windows, so the empty
    // needle has to be answered before reaching it.
    if pattern.is_empty() {
        return Some(0);
    }

    buffer
        .windows(pattern.len())
        .position(|window| window == pattern)
}
27
/// Returns the index of the last occurrence of `pattern` inside `buffer`.
///
/// An empty `pattern` matches at `buffer.len()` (the same convention as
/// `str::rfind("")`). Returns `None` when `pattern` does not occur or is longer
/// than `buffer`.
fn rfind_byte_pattern(buffer: &[u8], pattern: &[u8]) -> Option<usize> {
    // `slice::windows` panics when asked for zero-sized windows, so the empty
    // needle has to be answered before reaching it.
    if pattern.is_empty() {
        return Some(buffer.len());
    }

    buffer
        .windows(pattern.len())
        .rposition(|window| window == pattern)
}
34
/// Parses an object header of the form `<number> <generation> obj` from raw bytes.
///
/// The bytes are decoded lossily as UTF-8 and split on whitespace. The first
/// three tokens must be an object number (`u32`), a generation (`u16`) and the
/// literal `obj`; any further tokens are ignored. Returns `None` otherwise.
fn parse_obj_header_bytes(line_bytes: &[u8]) -> Option<(u32, u16)> {
    let text = String::from_utf8_lossy(line_bytes);
    let mut tokens = text.split_whitespace();

    let number_token = tokens.next()?;
    let generation_token = tokens.next()?;
    if tokens.next()? != "obj" {
        return None;
    }

    let obj_num = number_token.parse::<u32>().ok()?;
    let gen_num = generation_token.parse::<u16>().ok()?;
    Some((obj_num, gen_num))
}
51
/// Reads one line from `reader` into `buf`, accepting `\n`, `\r` or `\r\n` as the
/// line terminator.
///
/// `buf` is cleared first and receives the line content without its terminator,
/// decoded lossily as UTF-8. Returns the number of bytes consumed from `reader`
/// (content plus terminator); `0` means end of input.
///
/// The raw bytes of the line are gathered first and decoded once, so a multi-byte
/// character that straddles two `fill_buf` chunks is not mangled. A `\r` that is
/// the last byte of a chunk is followed by one more `fill_buf` call so that a
/// `\r\n` pair split across chunks is still consumed as a single terminator
/// instead of producing a phantom empty line on the next call.
fn read_pdf_line<R: BufRead>(reader: &mut R, buf: &mut String) -> std::io::Result<usize> {
    buf.clear();
    let mut raw: Vec<u8> = Vec::new();
    let mut total_bytes = 0usize;

    loop {
        let available = reader.fill_buf()?;
        if available.is_empty() {
            // End of input: whatever was gathered so far is the final line.
            break;
        }

        match available.iter().position(|&b| b == b'\r' || b == b'\n') {
            Some(idx) => {
                raw.extend_from_slice(&available[..idx]);

                let mut consume_len = idx + 1;
                let mut lf_may_follow = false;
                if available[idx] == b'\r' {
                    if consume_len < available.len() {
                        if available[consume_len] == b'\n' {
                            consume_len += 1;
                        }
                    } else {
                        // The chunk ends on `\r`; the matching `\n` (if any) is
                        // only visible after the next refill.
                        lf_may_follow = true;
                    }
                }

                reader.consume(consume_len);
                total_bytes += consume_len;

                if lf_may_follow {
                    let next_is_lf = reader.fill_buf()?.first() == Some(&b'\n');
                    if next_is_lf {
                        reader.consume(1);
                        total_bytes += 1;
                    }
                }
                break;
            }
            None => {
                // No terminator in this chunk: take it all and keep reading.
                let len = available.len();
                raw.extend_from_slice(available);
                reader.consume(len);
                total_bytes += len;
            }
        }
    }

    buf.push_str(&String::from_utf8_lossy(&raw));
    Ok(total_bytes)
}
106
/// A single cross-reference entry for one object number.
///
/// `Eq` and `Hash` are derived alongside `PartialEq` because every field is a
/// plain integer or boolean, which lets entries be used in sets and as map keys.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct XRefEntry {
    /// Byte offset of the object for in-use entries. Free entries read from an
    /// XRef stream store the next free object number here; compressed entries
    /// are stored with `0`.
    pub offset: u64,
    /// Generation number of the entry.
    pub generation: u16,
    /// `true` for in-use (`n`) entries, `false` for free (`f`) entries.
    pub in_use: bool,
}
119
120#[derive(Debug, Clone, PartialEq)]
122pub struct XRefEntryExt {
123 pub basic: XRefEntry,
125 pub compressed_info: Option<(u32, u32)>, }
128
129#[derive(Debug, Clone)]
131pub struct XRefTable {
132 entries: HashMap<u32, XRefEntry>,
134 extended_entries: HashMap<u32, XRefEntryExt>,
136 trailer: Option<super::objects::PdfDictionary>,
138 xref_offset: u64,
140}
141
142impl Default for XRefTable {
143 fn default() -> Self {
144 Self::new()
145 }
146}
147
148impl XRefTable {
149 pub fn new() -> Self {
151 Self {
152 entries: HashMap::new(),
153 extended_entries: HashMap::new(),
154 trailer: None,
155 xref_offset: 0,
156 }
157 }
158
159 pub fn entries(&self) -> &HashMap<u32, XRefEntry> {
161 &self.entries
162 }
163
164 pub fn parse<R: Read + Seek>(reader: &mut BufReader<R>) -> ParseResult<Self> {
166 Self::parse_with_options(reader, &super::ParseOptions::default())
167 }
168
169 pub fn parse_with_options<R: Read + Seek>(
171 reader: &mut BufReader<R>,
172 options: &super::ParseOptions,
173 ) -> ParseResult<Self> {
174 match Self::parse_with_incremental_updates_options(reader, options) {
176 Ok(table) => Ok(table),
177 Err(e) => {
178 if options.lenient_syntax {
179 tracing::warn!("Primary XRef parsing failed: {e:?}, attempting recovery");
180
181 reader.seek(SeekFrom::Start(0))?;
183 Self::parse_with_recovery_options(reader, options)
184 } else {
185 Err(e)
186 }
187 }
188 }
189 }
190
191 #[allow(dead_code)]
193 fn parse_with_incremental_updates<R: Read + Seek>(
194 reader: &mut BufReader<R>,
195 ) -> ParseResult<Self> {
196 Self::parse_with_incremental_updates_options(reader, &super::ParseOptions::default())
197 }
198
199 fn parse_with_incremental_updates_options<R: Read + Seek>(
201 reader: &mut BufReader<R>,
202 options: &super::ParseOptions,
203 ) -> ParseResult<Self> {
204 let xref_offset = Self::find_xref_offset(reader)?;
206
207 let mut merged_table = Self::new();
209 let mut current_offset = Some(xref_offset);
210 let mut visited_offsets = std::collections::HashSet::new();
211
212 while let Some(offset) = current_offset {
213 if visited_offsets.contains(&offset) {
215 tracing::debug!(
216 "Circular reference in XRef chain at offset {} (already visited)",
217 offset
218 );
219 break;
220 }
221 visited_offsets.insert(offset);
222
223 reader.seek(SeekFrom::Start(offset))?;
225 let table = Self::parse_primary_with_options(reader, options)?;
226
227 let prev_offset = table
229 .trailer
230 .as_ref()
231 .and_then(|t| t.get("Prev"))
232 .and_then(|obj| obj.as_integer())
233 .map(|i| i as u64);
234
235 if let Some(_prev) = prev_offset {
236 } else {
237 }
238
239 let _regular_count = table.entries.len();
241 let _extended_count = table.extended_entries.len();
242
243 for (obj_num, entry) in table.entries {
244 merged_table.entries.entry(obj_num).or_insert(entry);
245 }
246 for (obj_num, ext_entry) in table.extended_entries {
247 merged_table
248 .extended_entries
249 .entry(obj_num)
250 .or_insert(ext_entry);
251 }
252
253 if merged_table.trailer.is_none() {
255 merged_table.trailer = table.trailer;
256 merged_table.xref_offset = table.xref_offset;
257 }
258
259 current_offset = prev_offset;
260 }
261
262 if options.lenient_syntax || options.collect_warnings {
266 reader.seek(SeekFrom::Start(0))?;
269
270 if let Err(_e) = Self::scan_and_fill_missing_objects(reader, &mut merged_table) {
271 } else {
272 }
273 }
274
275 Ok(merged_table)
276 }
277
278 #[allow(dead_code)]
280 fn parse_primary<R: Read + Seek>(reader: &mut BufReader<R>) -> ParseResult<Self> {
281 Self::parse_primary_with_options(reader, &super::ParseOptions::default())
282 }
283
284 fn parse_primary_with_options<R: Read + Seek>(
290 reader: &mut BufReader<R>,
291 options: &super::ParseOptions,
292 ) -> ParseResult<Self> {
293 let mut table = Self::new();
294
295 let xref_offset = reader.stream_position()?;
299 table.xref_offset = xref_offset;
300
301 let mut line = String::new();
304 let pos = reader.stream_position()?;
305 read_pdf_line(reader, &mut line)?;
306
307 if line.trim() == "xref" {
308 Self::parse_traditional_xref_with_options(reader, &mut table, options)?;
310 } else {
311 tracing::debug!(
312 "Not a traditional xref, checking for xref stream. Line: {:?}",
313 line.trim()
314 );
315
316 reader.seek(SeekFrom::Start(pos))?;
318
319 let mut lexer = super::lexer::Lexer::new_with_options(&mut *reader, options.clone());
321
322 let obj_num = match lexer.next_token()? {
324 super::lexer::Token::Integer(n) => n as u32,
325 _ => return Err(ParseError::InvalidXRef),
326 };
327
328 tracing::debug!("Found object {obj_num} at xref position");
329
330 let _gen_num = match lexer.next_token()? {
331 super::lexer::Token::Integer(n) => n as u16,
332 _ => return Err(ParseError::InvalidXRef),
333 };
334
335 match lexer.next_token()? {
336 super::lexer::Token::Obj => {}
337 _ => return Err(ParseError::InvalidXRef),
338 };
339
340 let obj = super::objects::PdfObject::parse_with_options(&mut lexer, options)?;
342
343 if let Some(stream) = obj.as_stream() {
344 if stream
346 .dict
347 .get("Type")
348 .and_then(|o| o.as_name())
349 .map(|n| n.as_str())
350 == Some("XRef")
351 {
352 tracing::debug!("Parsing XRef stream");
353
354 let decoded_data = match stream.decode(options) {
356 Ok(data) => data,
357 Err(e) => {
358 tracing::debug!(
359 "XRef stream decode failed: {e:?}, attempting raw data fallback"
360 );
361
362 if !stream.data.is_empty() {
365 tracing::debug!(
366 "Using raw stream data ({} bytes) as fallback",
367 stream.data.len()
368 );
369 stream.data.clone()
370 } else {
371 tracing::debug!(
372 "No raw stream data available, triggering recovery mode"
373 );
374 return Err(e);
375 }
376 }
377 };
378
379 let xref_stream_parser = xref_stream::XRefStream::parse(
381 &mut *reader,
382 stream.dict.clone(),
383 decoded_data,
384 options,
385 )?;
386
387 let entries = xref_stream_parser.to_xref_entries()?;
389 tracing::debug!("XRef stream parsed, found {} entries", entries.len());
390
391 for (obj_num, entry) in entries {
393 match entry {
394 xref_stream::XRefEntry::Free {
395 next_free_object,
396 generation,
397 } => {
398 table.entries.insert(
399 obj_num,
400 XRefEntry {
401 offset: next_free_object as u64,
402 generation,
403 in_use: false,
404 },
405 );
406 }
407 xref_stream::XRefEntry::InUse { offset, generation } => {
408 table.entries.insert(
409 obj_num,
410 XRefEntry {
411 offset,
412 generation,
413 in_use: true,
414 },
415 );
416 }
417 xref_stream::XRefEntry::Compressed {
418 stream_object_number,
419 index_within_stream,
420 } => {
421 let ext_entry = XRefEntryExt {
423 basic: XRefEntry {
424 offset: 0,
425 generation: 0,
426 in_use: true,
427 },
428 compressed_info: Some((
429 stream_object_number,
430 index_within_stream,
431 )),
432 };
433 table.extended_entries.insert(obj_num, ext_entry);
434 table.entries.insert(
435 obj_num,
436 XRefEntry {
437 offset: 0,
438 generation: 0,
439 in_use: true,
440 },
441 );
442 }
443 }
444 }
445
446 table.trailer = Some(xref_stream_parser.trailer_dict().clone());
448 } else {
449 return Err(ParseError::InvalidXRef);
450 }
451 } else {
452 return Err(ParseError::InvalidXRef);
453 }
454 }
455
456 Ok(table)
457 }
458
459 #[allow(dead_code)]
461 fn parse_traditional_xref<R: Read + Seek>(
462 reader: &mut BufReader<R>,
463 table: &mut XRefTable,
464 ) -> ParseResult<()> {
465 Self::parse_traditional_xref_with_options(reader, table, &super::ParseOptions::default())
466 }
467
468 fn parse_traditional_xref_with_options<R: Read + Seek>(
470 reader: &mut BufReader<R>,
471 table: &mut XRefTable,
472 options: &super::ParseOptions,
473 ) -> ParseResult<()> {
474 let mut line = String::new();
475 let mut trailer_dict_offset: Option<u64> = None;
476
477 loop {
480 line.clear();
481 let line_start_pos = reader.stream_position()?;
482 read_pdf_line(reader, &mut line)?;
483 let trimmed_line = line.trim();
484
485 if trimmed_line.is_empty() || trimmed_line.starts_with('%') {
487 continue;
488 }
489
490 if trimmed_line == "trailer" {
494 break;
496 }
497 if let Some(dict_pos) = trimmed_line.find("<<") {
498 if trimmed_line.starts_with("trailer") {
499 let trailer_keyword_start =
502 trimmed_line.as_ptr() as usize - line.as_ptr() as usize;
503 trailer_dict_offset =
504 Some(line_start_pos + (trailer_keyword_start + dict_pos) as u64);
505 break;
506 }
507 }
508
509 if trimmed_line.starts_with("<<") {
511 tracing::warn!(" Found trailer dictionary without 'trailer' keyword");
512 trailer_dict_offset = Some(line_start_pos);
514 break;
515 }
516
517 let parts: Vec<&str> = trimmed_line.split_whitespace().collect();
519 if parts.len() != 2 {
520 return Err(ParseError::InvalidXRef);
522 }
523
524 let first_obj_num = parts[0]
525 .parse::<u32>()
526 .map_err(|_| ParseError::InvalidXRef)?;
527 let count = parts[1]
528 .parse::<u32>()
529 .map_err(|_| ParseError::InvalidXRef)?;
530
531 let mut entries_parsed = 0;
534 let mut i = 0;
535 while i < count {
536 line.clear();
537 let bytes_read = read_pdf_line(reader, &mut line)?;
538 let trimmed = line.trim();
539
540 if trimmed.starts_with('%') {
542 continue;
543 }
544
545 if bytes_read == 0 || trimmed == "trailer" {
547 tracing::debug!(
548 "Warning: XRef subsection incomplete - expected {count} entries but found only {entries_parsed}"
549 );
550 if line.trim() == "trailer" {
552 break;
554 }
555 break;
556 }
557
558 match Self::parse_xref_entry(&line) {
559 Ok(entry) => {
560 table.entries.insert(first_obj_num + i, entry);
561 entries_parsed += 1;
562 }
563 Err(_) => {
564 tracing::debug!(
565 "Warning: Invalid XRef entry at position {}: {:?}",
566 i,
567 line.trim()
568 );
569 }
571 }
572 i += 1;
573 }
574 }
576
577 if let Some(offset) = trailer_dict_offset {
581 reader.seek(SeekFrom::Start(offset))?;
582 }
583 let mut lexer = super::lexer::Lexer::new_with_options(reader, options.clone());
584 let trailer_obj = super::objects::PdfObject::parse_with_options(&mut lexer, options)?;
585 table.trailer = trailer_obj.as_dict().cloned();
588
589 if let Some(trailer) = &table.trailer {
591 if let Some(size_obj) = trailer.get("Size") {
592 if let Some(expected_size) = size_obj.as_integer() {
593 if let Some(max_obj_num) = table.entries.keys().max() {
596 let max_expected = (*max_obj_num + 1) as i64;
597 if max_expected > expected_size {
598 tracing::debug!(
599 "Warning: XRef table has object {} but trailer Size is only {}",
600 max_obj_num,
601 expected_size
602 );
603 return Err(ParseError::InvalidXRef);
605 }
606 }
607 }
608 }
609 }
610
611 Ok(())
615 }
616
617 #[allow(dead_code)]
624 fn find_linearized_xref<R: Read + Seek>(reader: &mut BufReader<R>) -> ParseResult<u64> {
625 reader.seek(SeekFrom::Start(0))?;
627 let mut header = String::new();
628 reader.read_line(&mut header)?;
629
630 if !header.starts_with("%PDF-") {
631 return Err(ParseError::InvalidHeader);
632 }
633
634 let mut line = String::new();
636 reader.read_line(&mut line)?;
637
638 let pos = reader.stream_position()?;
641 let mut buffer = vec![0u8; 1024];
642 let bytes_read = reader.read(&mut buffer)?;
643 buffer.truncate(bytes_read);
644
645 tracing::debug!(
649 "Checking for linearized PDF, first 100 bytes: {:?}",
650 String::from_utf8_lossy(&buffer[..buffer.len().min(100)])
651 );
652
653 if find_byte_pattern(&buffer, b"/Linearized").is_some() {
655 if let Some(xref_pos) = find_byte_pattern(&buffer, b"xref") {
661 return Ok(pos + xref_pos as u64);
662 }
663
664 if find_byte_pattern(&buffer, b"/Type/XRef").is_some()
666 || find_byte_pattern(&buffer, b"/Type /XRef").is_some()
667 {
668 if let Some(obj_pos) = find_byte_pattern(&buffer, b" obj") {
671 let search_from = obj_pos + 4;
673 if search_from < buffer.len() {
674 let after_first_obj = &buffer[search_from..];
675 if let Some(next_obj) = find_byte_pattern(after_first_obj, b" obj") {
676 let second_obj_start =
678 pos + (search_from + next_obj).saturating_sub(10) as u64;
679 return Ok(second_obj_start);
680 }
681 }
682 }
683 }
684 }
685
686 Err(ParseError::InvalidXRef)
687 }
688
689 fn find_xref_offset<R: Read + Seek>(reader: &mut BufReader<R>) -> ParseResult<u64> {
691 reader.seek(SeekFrom::End(0))?;
693 let file_size = reader.stream_position()?;
694
695 let read_size = std::cmp::min(1024, file_size);
697 reader.seek(SeekFrom::End(-(read_size as i64)))?;
698
699 let mut buffer = vec![0u8; read_size as usize];
700 reader.read_exact(&mut buffer)?;
701
702 let content = String::from_utf8_lossy(&buffer);
704
705 let debug_content = content.chars().take(200).collect::<String>();
707 tracing::debug!("XRef search in last {read_size} bytes: {debug_content:?}");
708
709 let mut lines = content.pdf_lines();
710
711 while let Some(line) = lines.next() {
713 if line.trim() == "startxref" {
714 if let Some(offset_line) = lines.next() {
716 let offset = offset_line
717 .trim()
718 .parse::<u64>()
719 .map_err(|_| ParseError::InvalidXRef)?;
720 return Ok(offset);
721 }
722 }
723 }
724
725 Err(ParseError::InvalidXRef)
726 }
727
728 fn scan_and_fill_missing_objects<R: Read + Seek>(
730 reader: &mut BufReader<R>,
731 table: &mut Self,
732 ) -> ParseResult<()> {
733 let mut buffer = Vec::new();
735 reader.read_to_end(&mut buffer)?;
736
737 let mut _objects_added = 0;
738
739 let mut pos = 0;
741 while pos < buffer.len() {
742 if let Some(obj_pos) = buffer[pos..].windows(3).position(|w| w == b"obj") {
744 let abs_pos = pos + obj_pos;
745 if abs_pos < 4 {
746 pos += obj_pos + 3;
747 continue;
748 }
749
750 let line_start = buffer[..abs_pos]
752 .iter()
753 .rposition(|&b| b == b'\n' || b == b'\r')
754 .map(|p| p + 1)
755 .unwrap_or(0);
756
757 let line_bytes = &buffer[line_start..abs_pos + 3];
759 let line = String::from_utf8_lossy(line_bytes);
760
761 if let Some((obj_num, gen_num)) = Self::parse_obj_header(line.trim()) {
762 if !table.entries.contains_key(&obj_num)
764 && !table.extended_entries.contains_key(&obj_num)
765 {
766 table.add_entry(
768 obj_num,
769 XRefEntry {
770 offset: line_start as u64,
771 generation: gen_num,
772 in_use: true,
773 },
774 );
775 _objects_added += 1;
776 }
777 }
778
779 pos = abs_pos + 3;
780 } else {
781 break;
782 }
783 }
784
785 Ok(())
786 }
787
788 #[allow(dead_code)]
790 fn parse_with_recovery<R: Read + Seek>(reader: &mut BufReader<R>) -> ParseResult<Self> {
791 Self::parse_with_recovery_options(reader, &super::ParseOptions::default())
792 }
793
794 fn parse_with_recovery_options<R: Read + Seek>(
796 reader: &mut BufReader<R>,
797 _options: &super::ParseOptions,
798 ) -> ParseResult<Self> {
799 let mut table = Self::new();
800
801 let mut buffer = Vec::new();
803 reader.read_to_end(&mut buffer)?;
804
805 tracing::debug!("XRef recovery: scanning {} bytes for objects", buffer.len());
806
807 let content = String::from_utf8_lossy(&buffer);
810
811 let mut xref_root_candidate = None;
812 if let Some(root_match) = extract_root_from_xref_stream(&content) {
813 xref_root_candidate = Some(root_match);
814 tracing::debug!("XRef recovery: Found Root {} in XRef stream", root_match);
815 }
816
817 let mut objects_found = 0;
818 let mut object_streams = Vec::new();
819
820 let mut pos = 0;
823 while pos < buffer.len() {
824 let remaining = &buffer[pos..];
826
827 if let Some(obj_pos) = find_byte_pattern(remaining, b"obj") {
829 let abs_pos = pos + obj_pos;
831 if abs_pos < 4 {
832 pos += obj_pos + 3;
833 continue;
834 }
835
836 let line_start = buffer[..abs_pos]
839 .iter()
840 .rposition(|&b| b == b'\n' || b == b'\r')
841 .map(|p| p + 1)
842 .unwrap_or(0);
843 let line_end = abs_pos + 3; if line_end <= buffer.len() {
847 let line_bytes = &buffer[line_start..line_end];
848
849 if let Some((obj_num, gen_num)) = parse_obj_header_bytes(line_bytes) {
850 let offset = line_start;
851
852 if !table.entries.contains_key(&obj_num) {
854 table.add_entry(
855 obj_num,
856 XRefEntry {
857 offset: offset as u64,
858 generation: gen_num,
859 in_use: true,
860 },
861 );
862 objects_found += 1;
863
864 let obj_end_pos = line_end;
866 if obj_end_pos + 200 < buffer.len() {
868 let search_bytes = &buffer[obj_end_pos..obj_end_pos + 200];
869 if let Some(stream_pos) =
870 search_bytes.windows(6).position(|w| w == b"stream")
871 {
872 let check_bytes =
874 &buffer[obj_end_pos..obj_end_pos + stream_pos];
875 let check_str = String::from_utf8_lossy(check_bytes);
876 if check_str.contains("/Type") && check_str.contains("/ObjStm")
877 {
878 object_streams.push(obj_num);
879 tracing::debug!(
880 "XRef recovery: found object stream at object {obj_num}"
881 );
882 }
883 }
884 }
885 }
886 }
887 }
888
889 pos = abs_pos + 3;
890 } else {
891 break;
892 }
893 }
894
895 tracing::debug!(
896 "XRef recovery: found {} objects and {} object streams",
897 objects_found,
898 object_streams.len()
899 );
900
901 if objects_found == 0 {
902 return Err(ParseError::InvalidXRef);
903 }
904
905 let mut trailer = super::objects::PdfDictionary::new();
910 trailer.insert(
911 "Size".to_string(),
912 super::objects::PdfObject::Integer(table.len() as i64),
913 );
914
915 let mut catalog_candidate = None;
917
918 if let Some(xref_root) = xref_root_candidate {
920 if table.entries.contains_key(&xref_root) {
921 catalog_candidate = Some(xref_root);
922 tracing::debug!("Using Root {} from XRef stream as catalog", xref_root);
923 } else {
924 tracing::debug!(
925 "Warning: XRef Root {} not found in object table, searching manually",
926 xref_root
927 );
928 }
929 }
930
931 if catalog_candidate.is_none() {
933 catalog_candidate = find_catalog_by_content(&table, &buffer);
934 }
935
936 if catalog_candidate.is_none() {
940 for obj_num in [1, 2, 3, 4, 5] {
941 if let Some(entry) = table.entries.get(&obj_num) {
942 if entry.in_use {
943 let offset = entry.offset as usize;
944 if offset < buffer.len() {
945 let obj_pattern = format!("{} 0 obj", obj_num);
947 if let Some(obj_start) =
948 find_byte_pattern(&buffer[offset..], obj_pattern.as_bytes())
949 {
950 let absolute_start = offset + obj_start;
951 if let Some(endobj_pos) =
952 find_byte_pattern(&buffer[absolute_start..], b"endobj")
953 {
954 let absolute_end = absolute_start + endobj_pos;
955 let obj_content_bytes = &buffer[absolute_start..absolute_end];
956 let obj_content = String::from_utf8_lossy(obj_content_bytes);
957
958 if obj_content.contains("/Type/Sig")
960 || obj_content.contains("/Type /Sig")
961 {
962 tracing::debug!("Skipping object {} (Type: Sig)", obj_num);
963 continue;
964 }
965
966 if obj_content.contains("/Type/Catalog")
968 || obj_content.contains("/Type /Catalog")
969 || obj_content.contains("/Pages")
970 {
971 catalog_candidate = Some(obj_num);
972 tracing::debug!("Using fallback catalog candidate: object {} (validated)", obj_num);
973 break;
974 }
975 }
976 }
977 }
978 }
979 }
980 }
981 }
982
983 if catalog_candidate.is_none() && !table.entries.is_empty() {
986 tracing::debug!(
987 "Last resort: Scanning all {} objects for any with /Pages or /Catalog",
988 table.entries.len()
989 );
990
991 let mut obj_numbers: Vec<u32> = table.entries.keys().copied().collect();
993 obj_numbers.sort_unstable();
994
995 for obj_num in obj_numbers {
996 if let Some(entry) = table.entries.get(&obj_num) {
997 if entry.in_use {
998 let offset = entry.offset as usize;
999 if offset < buffer.len() {
1000 let obj_pattern = format!("{} 0 obj", obj_num);
1002 if let Some(obj_start) =
1003 find_byte_pattern(&buffer[offset..], obj_pattern.as_bytes())
1004 {
1005 let absolute_start = offset + obj_start;
1006 if let Some(endobj_pos) =
1007 find_byte_pattern(&buffer[absolute_start..], b"endobj")
1008 {
1009 let absolute_end = absolute_start + endobj_pos;
1010 let obj_content_bytes = &buffer[absolute_start..absolute_end];
1011
1012 let obj_content = String::from_utf8_lossy(obj_content_bytes);
1014
1015 if obj_content.contains("/Type/Sig")
1017 || obj_content.contains("/Type /Sig")
1018 {
1019 continue;
1020 }
1021
1022 if obj_content.contains("/Type/Catalog")
1024 || obj_content.contains("/Type /Catalog")
1025 {
1026 catalog_candidate = Some(obj_num);
1027 tracing::debug!("Last resort: Found catalog at object {} (/Type/Catalog)", obj_num);
1028 break;
1029 } else if obj_content.contains("/Pages") {
1030 catalog_candidate = Some(obj_num);
1031 tracing::debug!(
1032 "Last resort: Found catalog at object {} (has /Pages)",
1033 obj_num
1034 );
1035 break;
1036 }
1037 }
1038 }
1039 }
1040 }
1041 }
1042 }
1043
1044 if catalog_candidate.is_none() {
1049 tracing::debug!("Extreme last resort: Scanning last 100KB for /Type/Catalog");
1050
1051 const SEARCH_WINDOW: usize = 100 * 1024; let search_start = if buffer.len() > SEARCH_WINDOW {
1055 buffer.len() - SEARCH_WINDOW
1056 } else {
1057 0
1058 };
1059 let search_buffer = &buffer[search_start..];
1060
1061 let catalog_pattern = b"/Type/Catalog";
1062 if let Some(catalog_pos) = rfind_byte_pattern(search_buffer, catalog_pattern) {
1063 let absolute_pos = search_start + catalog_pos;
1064 tracing::debug!(
1065 "Extreme last resort: Found /Type/Catalog at position {}",
1066 absolute_pos
1067 );
1068
1069 let local_search_start = if catalog_pos > 200 {
1072 catalog_pos - 200
1073 } else {
1074 0
1075 };
1076 let search_area = &search_buffer[local_search_start..catalog_pos];
1077
1078 if let Some(obj_pattern_pos) = rfind_byte_pattern(search_area, b" 0 obj") {
1080 let before_obj = &search_area[..obj_pattern_pos];
1082
1083 let before_obj_str = String::from_utf8_lossy(before_obj);
1085 let trimmed = before_obj_str.trim_end();
1086
1087 if let Some(digit_start) = trimmed.rfind(|c: char| !c.is_ascii_digit()) {
1088 let num_str = trimmed[digit_start + 1..].trim();
1089 if !num_str.is_empty() {
1090 if let Ok(obj_num) = num_str.parse::<u32>() {
1091 tracing::debug!(
1092 "Extreme last resort: Found /Type/Catalog at object {}",
1093 obj_num
1094 );
1095 catalog_candidate = Some(obj_num);
1096 }
1097 }
1098 } else {
1099 let num_str = trimmed.trim();
1101 if let Ok(obj_num) = num_str.parse::<u32>() {
1102 tracing::debug!(
1103 "Extreme last resort: Found /Type/Catalog at object {}",
1104 obj_num
1105 );
1106 catalog_candidate = Some(obj_num);
1107 }
1108 }
1109 }
1110 } else {
1111 tracing::debug!("Extreme last resort: No /Type/Catalog found in last 100KB");
1112 }
1113
1114 if catalog_candidate.is_none() {
1117 tracing::warn!(" Could not find any catalog object, using first non-signature object as absolute last resort");
1118 for obj_num in table.entries.keys().copied().collect::<Vec<_>>().iter() {
1119 let offset = match table.entries.get(obj_num) {
1120 Some(entry) => entry.offset as usize,
1121 None => continue, };
1123 if offset < buffer.len() {
1124 let obj_pattern = format!("{} 0 obj", obj_num);
1125 if let Some(obj_start) =
1126 find_byte_pattern(&buffer[offset..], obj_pattern.as_bytes())
1127 {
1128 let absolute_start = offset + obj_start;
1129 if let Some(endobj_pos) =
1130 find_byte_pattern(&buffer[absolute_start..], b"endobj")
1131 {
1132 let absolute_end = absolute_start + endobj_pos;
1133 let obj_content_bytes = &buffer[absolute_start..absolute_end];
1134 let obj_content = String::from_utf8_lossy(obj_content_bytes);
1135 if !obj_content.contains("/Type/Sig")
1136 && !obj_content.contains("/Type /Sig")
1137 {
1138 catalog_candidate = Some(*obj_num);
1139 tracing::debug!(
1140 "Using object {} as absolute last resort",
1141 obj_num
1142 );
1143 break;
1144 }
1145 }
1146 }
1147 }
1148 }
1149 }
1150 }
1151 }
1152
1153 if let Some(root_obj) = catalog_candidate {
1154 trailer.insert(
1155 "Root".to_string(),
1156 super::objects::PdfObject::Reference(root_obj, 0),
1157 );
1158 }
1159
1160 table.set_trailer(trailer);
1161
1162 Ok(table)
1163 }
1164
1165 fn parse_obj_header(line: &str) -> Option<(u32, u16)> {
1167 let parts: Vec<&str> = line.split_whitespace().collect();
1168
1169 if parts.len() >= 3 && parts[2] == "obj" {
1170 if let (Ok(obj_num), Ok(gen_num)) = (parts[0].parse::<u32>(), parts[1].parse::<u16>()) {
1171 return Some((obj_num, gen_num));
1172 }
1173 }
1174
1175 None
1176 }
1177
1178 #[allow(dead_code)]
1183 fn validate_offset<R: Read + Seek>(reader: &mut BufReader<R>, offset: u64) -> ParseResult<()> {
1184 let file_size = reader.seek(SeekFrom::End(0))?;
1186
1187 if offset >= file_size {
1188 #[cfg(debug_assertions)]
1189 tracing::warn!(" XRef offset {offset} exceeds file size {file_size}");
1190 return Err(ParseError::InvalidXRef);
1191 }
1192
1193 reader.seek(SeekFrom::Start(offset))?;
1195 let mut peek = [0u8; 20];
1196 let read_bytes = reader.read(&mut peek)?;
1197
1198 if read_bytes == 0 {
1199 #[cfg(debug_assertions)]
1200 tracing::warn!(" XRef offset {offset} points to EOF");
1201 return Err(ParseError::InvalidXRef);
1202 }
1203
1204 let peek_slice = &peek[..read_bytes];
1207 let starts_with_xref = peek_slice.len() >= 4 && &peek_slice[..4] == b"xref";
1208 let starts_with_digit = peek_slice.first().map_or(false, |&b| b.is_ascii_digit());
1209
1210 if !starts_with_xref && !starts_with_digit {
1211 #[cfg(debug_assertions)]
1212 {
1213 let debug_len = std::cmp::min(10, read_bytes);
1214 let debug_content = String::from_utf8_lossy(&peek[..debug_len]);
1215 tracing::debug!(
1216 "Warning: XRef offset {} does not point to valid XRef content: {:?}",
1217 offset,
1218 debug_content
1219 );
1220 }
1221 }
1223
1224 Ok(())
1225 }
1226
1227 fn parse_xref_entry(line: &str) -> ParseResult<XRefEntry> {
1229 let line = line.trim();
1230
1231 if line.len() >= 18 {
1233 if let Ok(entry) = Self::parse_xref_entry_standard(line) {
1234 return Ok(entry);
1235 }
1236 }
1237
1238 Self::parse_xref_entry_flexible(line)
1240 }
1241
1242 fn parse_xref_entry_standard(line: &str) -> ParseResult<XRefEntry> {
1244 if line.len() < 18 {
1247 return Err(ParseError::InvalidXRef);
1248 }
1249
1250 let offset_str = &line[0..10];
1251 let gen_str = &line[11..16];
1252 let flag = line.chars().nth(17);
1253
1254 let offset = offset_str
1255 .trim()
1256 .parse::<u64>()
1257 .map_err(|_| ParseError::InvalidXRef)?;
1258 let generation = gen_str
1259 .trim()
1260 .parse::<u16>()
1261 .map_err(|_| ParseError::InvalidXRef)?;
1262
1263 let in_use = match flag {
1264 Some('n') => true,
1265 Some('f') => false,
1266 _ => return Err(ParseError::InvalidXRef),
1267 };
1268
1269 Ok(XRefEntry {
1270 offset,
1271 generation,
1272 in_use,
1273 })
1274 }
1275
1276 fn parse_xref_entry_flexible(line: &str) -> ParseResult<XRefEntry> {
1278 let parts: Vec<&str> = line.split_whitespace().collect();
1286
1287 if parts.is_empty() {
1288 return Err(ParseError::InvalidXRef);
1289 }
1290
1291 let offset = parts[0]
1293 .parse::<u64>()
1294 .map_err(|_| ParseError::InvalidXRef)?;
1295
1296 let (generation, flag_from_gen) = if parts.len() >= 2 {
1298 let gen_part = parts[1];
1299 if gen_part == "n" || gen_part == "f" {
1301 (0, gen_part.chars().next())
1303 } else if gen_part.ends_with('n') || gen_part.ends_with('f') {
1304 let flag_char = gen_part.chars().last().ok_or(ParseError::InvalidXRef)?;
1306 let gen_str = &gen_part[..gen_part.len() - 1];
1307 if gen_str.is_empty() {
1308 (0, Some(flag_char))
1310 } else {
1311 let gen = gen_str
1312 .parse::<u16>()
1313 .map_err(|_| ParseError::InvalidXRef)?;
1314 (gen, Some(flag_char))
1315 }
1316 } else {
1317 let gen = gen_part
1319 .parse::<u16>()
1320 .map_err(|_| ParseError::InvalidXRef)?;
1321 (gen, None)
1322 }
1323 } else {
1324 (0, None)
1325 };
1326
1327 let in_use = if let Some(flag_char) = flag_from_gen {
1329 match flag_char {
1331 'n' => true,
1332 'f' => false,
1333 _ => true, }
1335 } else if parts.len() >= 3 {
1336 match parts[2].chars().next() {
1338 Some('n') => true,
1339 Some('f') => false,
1340 _ => {
1341 #[cfg(debug_assertions)]
1343 tracing::warn!(" Invalid xref flag '{}', assuming 'n'", parts[2]);
1344 true
1345 }
1346 }
1347 } else {
1348 true
1350 };
1351
1352 Ok(XRefEntry {
1353 offset,
1354 generation,
1355 in_use,
1356 })
1357 }
1358
1359 pub fn get_entry(&self, obj_num: u32) -> Option<&XRefEntry> {
1361 self.entries.get(&obj_num)
1362 }
1363
1364 pub fn get_entry_mut(&mut self, obj_num: u32) -> Option<&mut XRefEntry> {
1366 self.entries.get_mut(&obj_num)
1367 }
1368
1369 pub fn trailer(&self) -> Option<&super::objects::PdfDictionary> {
1371 self.trailer.as_ref()
1372 }
1373
1374 pub fn xref_offset(&self) -> u64 {
1376 self.xref_offset
1377 }
1378
1379 pub fn len(&self) -> usize {
1381 self.entries.len()
1382 }
1383
1384 pub fn is_empty(&self) -> bool {
1386 self.entries.is_empty()
1387 }
1388
1389 pub fn iter(&self) -> impl Iterator<Item = (&u32, &XRefEntry)> {
1391 self.entries.iter()
1392 }
1393
1394 pub fn get_extended_entry(&self, obj_num: u32) -> Option<&XRefEntryExt> {
1396 self.extended_entries.get(&obj_num)
1397 }
1398
1399 pub fn is_compressed(&self, obj_num: u32) -> bool {
1401 self.extended_entries
1402 .get(&obj_num)
1403 .map(|e| e.compressed_info.is_some())
1404 .unwrap_or(false)
1405 }
1406
1407 pub fn add_entry(&mut self, obj_num: u32, entry: XRefEntry) {
1409 self.entries.insert(obj_num, entry);
1410 }
1411
1412 pub fn set_trailer(&mut self, trailer: super::objects::PdfDictionary) {
1414 self.trailer = Some(trailer);
1415 }
1416
1417 pub fn add_extended_entry(&mut self, obj_num: u32, entry: XRefEntryExt) {
1419 self.extended_entries.insert(obj_num, entry);
1420 }
1421}
1422
1423#[derive(Debug, Clone)]
1426pub struct XRefStream {
1427 stream: super::objects::PdfStream,
1429 entries: HashMap<u32, XRefEntry>,
1431 extended_entries: HashMap<u32, XRefEntryExt>,
1433}
1434
1435impl XRefStream {
1436 pub fn parse(stream: super::objects::PdfStream) -> ParseResult<Self> {
1438 let mut xref_stream = Self {
1439 stream,
1440 entries: HashMap::new(),
1441 extended_entries: HashMap::new(),
1442 };
1443
1444 xref_stream.decode_entries()?;
1445 Ok(xref_stream)
1446 }
1447
1448 fn decode_entries(&mut self) -> ParseResult<()> {
1450 let dict = &self.stream.dict;
1452
1453 let size = dict
1455 .get("Size")
1456 .and_then(|obj| obj.as_integer())
1457 .ok_or_else(|| ParseError::MissingKey("Size".to_string()))?;
1458
1459 let index = match dict.get("Index") {
1461 Some(obj) => {
1462 let array = obj.as_array().ok_or_else(|| ParseError::SyntaxError {
1463 position: 0,
1464 message: "Index must be an array".to_string(),
1465 })?;
1466
1467 let mut pairs = Vec::new();
1469 for chunk in array.0.chunks(2) {
1470 if chunk.len() != 2 {
1471 return Err(ParseError::SyntaxError {
1472 position: 0,
1473 message: "Index array must have even number of elements".to_string(),
1474 });
1475 }
1476 let first = chunk[0]
1477 .as_integer()
1478 .ok_or_else(|| ParseError::SyntaxError {
1479 position: 0,
1480 message: "Index values must be integers".to_string(),
1481 })? as u32;
1482 let count = chunk[1]
1483 .as_integer()
1484 .ok_or_else(|| ParseError::SyntaxError {
1485 position: 0,
1486 message: "Index values must be integers".to_string(),
1487 })? as u32;
1488 pairs.push((first, count));
1489 }
1490 pairs
1491 }
1492 None => {
1493 vec![(0, size as u32)]
1495 }
1496 };
1497
1498 let w_array = dict
1500 .get("W")
1501 .and_then(|obj| obj.as_array())
1502 .ok_or_else(|| ParseError::MissingKey("W".to_string()))?;
1503
1504 if w_array.len() != 3 {
1505 return Err(ParseError::SyntaxError {
1506 position: 0,
1507 message: "W array must have exactly 3 elements".to_string(),
1508 });
1509 }
1510
1511 let w: Vec<usize> = w_array
1512 .0
1513 .iter()
1514 .map(|obj| {
1515 obj.as_integer()
1516 .ok_or_else(|| ParseError::SyntaxError {
1517 position: 0,
1518 message: "W values must be integers".to_string(),
1519 })
1520 .map(|i| i as usize)
1521 })
1522 .collect::<ParseResult<Vec<_>>>()?;
1523
1524 let data = self.stream.decode(&ParseOptions::default())?;
1526 let mut offset = 0;
1527
1528 for (first_obj_num, count) in index {
1530 for i in 0..count {
1531 if offset + w[0] + w[1] + w[2] > data.len() {
1532 return Err(ParseError::SyntaxError {
1533 position: 0,
1534 message: "Xref stream data truncated".to_string(),
1535 });
1536 }
1537
1538 let field1 = Self::read_field(&data[offset..], w[0]);
1540 offset += w[0];
1541
1542 let field2 = Self::read_field(&data[offset..], w[1]);
1543 offset += w[1];
1544
1545 let field3 = Self::read_field(&data[offset..], w[2]);
1546 offset += w[2];
1547
1548 let entry_info =
1550 XRefEntryInfo::new(XRefEntryType::from_value(field1), field2, field3);
1551
1552 let entry = match entry_info.entry_type {
1554 XRefEntryType::Free => XRefEntry {
1555 offset: entry_info.field2,
1556 generation: entry_info.field3 as u16,
1557 in_use: false,
1558 },
1559 XRefEntryType::Uncompressed => XRefEntry {
1560 offset: entry_info.field2,
1561 generation: entry_info.field3 as u16,
1562 in_use: true,
1563 },
1564 XRefEntryType::Compressed => {
1565 let ext_entry = XRefEntryExt {
1567 basic: XRefEntry {
1568 offset: 0,
1569 generation: 0,
1570 in_use: true,
1571 },
1572 compressed_info: entry_info.get_compressed_info(),
1573 };
1574 self.extended_entries
1575 .insert(first_obj_num + i, ext_entry.clone());
1576 ext_entry.basic
1577 }
1578 XRefEntryType::Custom(_type_num) => {
1579 #[cfg(debug_assertions)]
1582 tracing::debug!(
1583 "Note: Custom xref entry type {} for object {} (treating as in-use)",
1584 _type_num,
1585 first_obj_num + i
1586 );
1587
1588 let ext_entry = XRefEntryExt {
1590 basic: XRefEntry {
1591 offset: entry_info.field2,
1592 generation: entry_info.field3 as u16,
1593 in_use: entry_info.entry_type.is_in_use(),
1594 },
1595 compressed_info: None,
1596 };
1597 self.extended_entries
1598 .insert(first_obj_num + i, ext_entry.clone());
1599 ext_entry.basic
1600 }
1601 };
1602
1603 self.entries.insert(first_obj_num + i, entry);
1604 }
1605 }
1606
1607 Ok(())
1608 }
1609
1610 fn read_field(data: &[u8], width: usize) -> u64 {
1612 let mut value = 0u64;
1613 for i in 0..width {
1614 if i < data.len() {
1615 value = (value << 8) | (data[i] as u64);
1616 }
1617 }
1618 value
1619 }
1620
1621 pub fn get_entry(&self, obj_num: u32) -> Option<&XRefEntry> {
1623 self.entries.get(&obj_num)
1624 }
1625
1626 pub fn trailer(&self) -> &super::objects::PdfDictionary {
1628 &self.stream.dict
1629 }
1630}
1631
// Unit tests for xref entry parsing, `XRefTable` bookkeeping, startxref
// discovery and the recovery/lenient parsing paths.
#[cfg(test)]
mod tests {
    use super::*;

    use crate::parser::objects::{PdfDictionary, PdfObject};
    use std::io::Cursor;

    // Classic fixed-width entries: one free ("f") and one in-use ("n").
    #[test]
    fn test_parse_xref_entry() {
        let entry1 = XRefTable::parse_xref_entry("0000000000 65535 f ").unwrap();
        assert_eq!(entry1.offset, 0);
        assert_eq!(entry1.generation, 65535);
        assert!(!entry1.in_use);

        let entry2 = XRefTable::parse_xref_entry("0000000017 00000 n ").unwrap();
        assert_eq!(entry2.offset, 17);
        assert_eq!(entry2.generation, 0);
        assert!(entry2.in_use);
    }

    // Non-standard spellings that the parser is expected to accept.
    #[test]
    fn test_parse_xref_entry_flexible() {
        // No zero padding.
        let entry1 = XRefTable::parse_xref_entry("17 0 n").unwrap();
        assert_eq!(entry1.offset, 17);
        assert_eq!(entry1.generation, 0);
        assert!(entry1.in_use);

        // Free entry with a non-zero generation.
        let entry2 = XRefTable::parse_xref_entry("123 5 f").unwrap();
        assert_eq!(entry2.offset, 123);
        assert_eq!(entry2.generation, 5);
        assert!(!entry2.in_use);

        // Generation omitted: expected to come back as 0.
        let entry3 = XRefTable::parse_xref_entry("456 n").unwrap();
        assert_eq!(entry3.offset, 456);
        assert_eq!(entry3.generation, 0);
        assert!(entry3.in_use);

        // Flag omitted: expected to be treated as in use.
        let entry4 = XRefTable::parse_xref_entry("789 2").unwrap();
        assert_eq!(entry4.offset, 789);
        assert_eq!(entry4.generation, 2);
        assert!(entry4.in_use);

        // Flag glued to the generation number.
        let entry5 = XRefTable::parse_xref_entry("1000 0n").unwrap();
        assert_eq!(entry5.offset, 1000);
        assert_eq!(entry5.generation, 0);
        assert!(entry5.in_use);

        let entry6 = XRefTable::parse_xref_entry("2000 1f").unwrap();
        assert_eq!(entry6.offset, 2000);
        assert_eq!(entry6.generation, 1);
        assert!(!entry6.in_use);

        // Tab-separated fields.
        let entry7 = XRefTable::parse_xref_entry("3000\t0\tn").unwrap();
        assert_eq!(entry7.offset, 3000);
        assert_eq!(entry7.generation, 0);
        assert!(entry7.in_use);
    }

    // An unknown flag character is expected to fall back to "in use".
    #[test]
    fn test_parse_xref_entry_invalid_flag_fallback() {
        let entry = XRefTable::parse_xref_entry("100 0 x").unwrap();
        assert_eq!(entry.offset, 100);
        assert_eq!(entry.generation, 0);
        assert!(entry.in_use);
    }

    // Empty, non-numeric and whitespace-only lines must be rejected.
    #[test]
    fn test_parse_xref_entry_malformed() {
        let result = XRefTable::parse_xref_entry("");
        assert!(result.is_err());

        let result = XRefTable::parse_xref_entry("abc 0 n");
        assert!(result.is_err());

        let result = XRefTable::parse_xref_entry(" ");
        assert!(result.is_err());
    }

    // A freshly constructed table is empty and has offset 0.
    #[test]
    fn test_xref_table_new() {
        let table = XRefTable::new();
        assert!(table.entries.is_empty());
        assert!(table.extended_entries.is_empty());
        assert!(table.trailer.is_none());
        assert_eq!(table.xref_offset, 0);
    }

    // `Default` yields an empty table as well.
    #[test]
    fn test_xref_table_default() {
        let table = XRefTable::default();
        assert!(table.entries.is_empty());
        assert!(table.extended_entries.is_empty());
        assert!(table.trailer.is_none());
    }

    // Field access on a hand-built entry.
    #[test]
    fn test_xref_entry_struct() {
        let entry = XRefEntry {
            offset: 12345,
            generation: 7,
            in_use: true,
        };
        assert_eq!(entry.offset, 12345);
        assert_eq!(entry.generation, 7);
        assert!(entry.in_use);
    }

    // Two entries with identical fields compare equal.
    #[test]
    fn test_xref_entry_equality() {
        let entry1 = XRefEntry {
            offset: 100,
            generation: 0,
            in_use: true,
        };
        let entry2 = XRefEntry {
            offset: 100,
            generation: 0,
            in_use: true,
        };
        assert_eq!(entry1, entry2);
    }

    // NOTE(review): `XRefEntry` is `Copy`, so `let cloned = entry;` is a plain
    // copy; `Clone::clone` itself is not called by this test.
    #[test]
    fn test_xref_entry_clone() {
        let entry = XRefEntry {
            offset: 999,
            generation: 3,
            in_use: false,
        };
        let cloned = entry;
        assert_eq!(cloned.offset, 999);
        assert_eq!(cloned.generation, 3);
        assert!(!cloned.in_use);
    }

    // Extended entry carrying compressed-object info.
    #[test]
    fn test_xref_entry_ext() {
        let ext_entry = XRefEntryExt {
            basic: XRefEntry {
                offset: 500,
                generation: 0,
                in_use: true,
            },
            compressed_info: Some((10, 5)),
        };
        assert_eq!(ext_entry.basic.offset, 500);
        assert_eq!(ext_entry.compressed_info, Some((10, 5)));
    }

    // Extended entry without compressed-object info.
    #[test]
    fn test_xref_entry_ext_no_compression() {
        let ext_entry = XRefEntryExt {
            basic: XRefEntry {
                offset: 1000,
                generation: 1,
                in_use: true,
            },
            compressed_info: None,
        };
        assert!(ext_entry.compressed_info.is_none());
    }

    // `add_entry` stores the entry under the given object number.
    #[test]
    fn test_add_entry() {
        let mut table = XRefTable::new();
        table.add_entry(
            5,
            XRefEntry {
                offset: 1000,
                generation: 0,
                in_use: true,
            },
        );
        assert_eq!(table.entries.len(), 1);
        assert!(table.entries.contains_key(&5));
    }

    // `get_entry` returns stored entries and `None` for unknown numbers.
    #[test]
    fn test_get_entry() {
        let mut table = XRefTable::new();
        let entry = XRefEntry {
            offset: 2000,
            generation: 1,
            in_use: true,
        };
        table.add_entry(10, entry);

        let retrieved = table.get_entry(10);
        assert!(retrieved.is_some());
        assert_eq!(retrieved.unwrap().offset, 2000);

        let missing = table.get_entry(999);
        assert!(missing.is_none());
    }

    // `set_trailer` makes the dictionary available through `trailer()`.
    #[test]
    fn test_set_trailer() {
        let mut table = XRefTable::new();
        let mut trailer = PdfDictionary::new();
        trailer.insert("Size".to_string(), PdfObject::Integer(10));

        table.set_trailer(trailer.clone());
        assert!(table.trailer.is_some());
        assert_eq!(
            table.trailer().unwrap().get("Size"),
            Some(&PdfObject::Integer(10))
        );
    }

    // Missing flag is accepted, a non-numeric offset is rejected, and an
    // unknown flag is accepted as "in use".
    #[test]
    fn test_parse_xref_entry_invalid() {
        let result = XRefTable::parse_xref_entry("0000000000 65535");
        assert!(result.is_ok());
        let result = XRefTable::parse_xref_entry("not_a_number 65535 f ");
        assert!(result.is_err());

        let result = XRefTable::parse_xref_entry("0000000000 65535 x ");
        assert!(result.is_ok());
        assert!(result.unwrap().in_use);
    }

    // Smallest non-zero offset, a ten-digit offset and the maximum generation.
    #[test]
    fn test_parse_xref_entry_various_offsets() {
        let entry = XRefTable::parse_xref_entry("0000000001 00000 n ").unwrap();
        assert_eq!(entry.offset, 1);

        let entry = XRefTable::parse_xref_entry("9999999999 00000 n ").unwrap();
        assert_eq!(entry.offset, 9999999999);

        let entry = XRefTable::parse_xref_entry("0000000000 65535 f ").unwrap();
        assert_eq!(entry.generation, 65535);
    }

    // `add_extended_entry` stores the record under the given object number.
    #[test]
    fn test_add_extended_entry() {
        let mut table = XRefTable::new();
        let ext_entry = XRefEntryExt {
            basic: XRefEntry {
                offset: 0,
                generation: 0,
                in_use: true,
            },
            compressed_info: Some((5, 10)),
        };

        table.add_extended_entry(15, ext_entry);
        assert_eq!(table.extended_entries.len(), 1);
        assert!(table.extended_entries.contains_key(&15));
    }

    // `get_extended_entry` returns what `add_extended_entry` stored.
    #[test]
    fn test_get_extended_entry() {
        let mut table = XRefTable::new();
        let ext_entry = XRefEntryExt {
            basic: XRefEntry {
                offset: 0,
                generation: 0,
                in_use: true,
            },
            compressed_info: Some((20, 3)),
        };

        table.add_extended_entry(7, ext_entry);

        let retrieved = table.get_extended_entry(7);
        assert!(retrieved.is_some());
        assert_eq!(retrieved.unwrap().compressed_info, Some((20, 3)));
    }

    // The getter reflects direct writes to the `xref_offset` field.
    #[test]
    fn test_xref_offset() {
        let mut table = XRefTable::new();
        assert_eq!(table.xref_offset(), 0);

        table.xref_offset = 12345;
        assert_eq!(table.xref_offset(), 12345);
    }

    // Minimal `startxref` footer.
    #[test]
    fn test_find_xref_offset_simple() {
        let pdf_data = b"startxref\n12345\n%%EOF";
        let cursor = Cursor::new(pdf_data.to_vec());
        let mut reader = BufReader::new(cursor);

        let offset = XRefTable::find_xref_offset(&mut reader).unwrap();
        assert_eq!(offset, 12345);
    }

    // Extra spaces around the keyword and the number are tolerated.
    #[test]
    fn test_find_xref_offset_with_spaces() {
        let pdf_data = b"startxref \n 12345 \n%%EOF";
        let cursor = Cursor::new(pdf_data.to_vec());
        let mut reader = BufReader::new(cursor);

        let offset = XRefTable::find_xref_offset(&mut reader).unwrap();
        assert_eq!(offset, 12345);
    }

    // Without a `startxref` keyword the lookup must fail.
    #[test]
    fn test_find_xref_offset_missing() {
        let pdf_data = b"no startxref here";
        let cursor = Cursor::new(pdf_data.to_vec());
        let mut reader = BufReader::new(cursor);

        let result = XRefTable::find_xref_offset(&mut reader);
        assert!(result.is_err());
    }

    // `trailer()` is `None` until a trailer has been set.
    #[test]
    fn test_trailer_getter() {
        let mut table = XRefTable::new();
        assert!(table.trailer().is_none());

        let trailer = PdfDictionary::new();
        table.set_trailer(trailer);
        assert!(table.trailer().is_some());
    }

    // Cloning a table copies its entries and offset.
    #[test]
    fn test_xref_table_clone() {
        let mut table = XRefTable::new();
        table.add_entry(
            1,
            XRefEntry {
                offset: 100,
                generation: 0,
                in_use: true,
            },
        );
        table.xref_offset = 5000;

        let cloned = table.clone();
        assert_eq!(cloned.entries.len(), 1);
        assert_eq!(cloned.xref_offset, 5000);
    }

    // Object header recognition: valid headers first, then rejected inputs.
    #[test]
    fn test_parse_obj_header() {
        assert_eq!(XRefTable::parse_obj_header("1 0 obj"), Some((1, 0)));
        assert_eq!(XRefTable::parse_obj_header("123 5 obj"), Some((123, 5)));
        assert_eq!(
            XRefTable::parse_obj_header(" 42 3 obj "),
            Some((42, 3))
        );

        assert_eq!(XRefTable::parse_obj_header("1 obj"), None);
        assert_eq!(XRefTable::parse_obj_header("abc 0 obj"), None);
        assert_eq!(XRefTable::parse_obj_header("1 0 object"), None);
        assert_eq!(XRefTable::parse_obj_header(""), None);
    }

    // Recovery mode rebuilds the table by scanning for object headers.
    #[test]
    fn test_xref_recovery_parsing() {
        let pdf_content =
            b"1 0 obj\n<< /Type /Catalog >>\nendobj\n2 0 obj\n<< /Type /Page >>\nendobj\n";
        let mut reader = BufReader::new(Cursor::new(pdf_content));

        let table = XRefTable::parse_with_recovery(&mut reader).unwrap();

        assert_eq!(table.len(), 2);
        assert!(table.get_entry(1).is_some());
        assert!(table.get_entry(2).is_some());

        assert!(table.get_entry(1).unwrap().in_use);
        assert!(table.get_entry(2).unwrap().in_use);
    }

    // Recovery fails when the data contains no objects at all.
    #[test]
    fn test_xref_recovery_no_objects() {
        let pdf_content = b"This is not a PDF file\nNo objects here\n";
        let mut reader = BufReader::new(Cursor::new(pdf_content));

        let result = XRefTable::parse_with_recovery(&mut reader);
        assert!(result.is_err());
    }

    // The fixture is 10 bytes long: offset 5 is accepted, while offsets 100
    // and 10 (equal to the length) are rejected.
    #[test]
    fn test_offset_validation() {
        let pdf_data = b"small file";
        let mut reader = BufReader::new(Cursor::new(pdf_data));

        assert!(XRefTable::validate_offset(&mut reader, 5).is_ok());

        assert!(XRefTable::validate_offset(&mut reader, 100).is_err());

        assert!(XRefTable::validate_offset(&mut reader, 10).is_err());
    }

    // Plain `parse` on data without any xref information is expected to
    // report `ParseError::InvalidXRef`.
    #[test]
    fn test_xref_parse_with_fallback() {
        let pdf_content =
            b"1 0 obj\n<< /Type /Catalog >>\nendobj\n2 0 obj\n<< /Type /Page >>\nendobj\n";
        let mut reader = BufReader::new(Cursor::new(pdf_content));

        let result = XRefTable::parse(&mut reader);
        assert!(result.is_err());
        if let Err(e) = result {
            assert!(matches!(e, ParseError::InvalidXRef));
        }
    }

    // Field access on a hand-built entry (duplicates `test_xref_entry_struct`
    // with different values).
    #[test]
    fn test_xref_entry_creation() {
        let entry = XRefEntry {
            offset: 1234,
            generation: 5,
            in_use: true,
        };

        assert_eq!(entry.offset, 1234);
        assert_eq!(entry.generation, 5);
        assert!(entry.in_use);
    }

    // Extended entry built from a separately constructed basic entry.
    #[test]
    fn test_xref_entry_ext_creation() {
        let basic = XRefEntry {
            offset: 5000,
            generation: 0,
            in_use: true,
        };

        let ext = XRefEntryExt {
            basic: basic.clone(),
            compressed_info: Some((10, 3)),
        };

        assert_eq!(ext.basic.offset, 5000);
        assert_eq!(ext.compressed_info, Some((10, 3)));
    }

    // Same expectations as `test_xref_table_new`, expressed through `len()`.
    #[test]
    fn test_xref_table_new_advanced() {
        let table = XRefTable::new();
        assert_eq!(table.entries.len(), 0);
        assert_eq!(table.extended_entries.len(), 0);
        assert!(table.trailer.is_none());
        assert_eq!(table.xref_offset, 0);
    }

    // Same expectations as `test_xref_table_default`, expressed through `len()`.
    #[test]
    fn test_xref_table_default_advanced() {
        let table = XRefTable::default();
        assert_eq!(table.entries.len(), 0);
        assert!(table.trailer.is_none());
    }

    // Two distinct entries can be added and read back unchanged.
    #[test]
    fn test_xref_table_add_entry() {
        let mut table = XRefTable::new();

        let entry1 = XRefEntry {
            offset: 100,
            generation: 0,
            in_use: true,
        };
        table.add_entry(1, entry1);
        let entry2 = XRefEntry {
            offset: 200,
            generation: 1,
            in_use: false,
        };
        table.add_entry(2, entry2);

        assert_eq!(table.len(), 2);

        let entry1 = table.get_entry(1).unwrap();
        assert_eq!(entry1.offset, 100);
        assert_eq!(entry1.generation, 0);
        assert!(entry1.in_use);

        let entry2 = table.get_entry(2).unwrap();
        assert_eq!(entry2.offset, 200);
        assert_eq!(entry2.generation, 1);
        assert!(!entry2.in_use);
    }

    // An extended entry with compressed info makes `is_compressed` true.
    #[test]
    fn test_xref_table_add_extended_entry() {
        let mut table = XRefTable::new();

        let basic_entry = XRefEntry {
            offset: 0,
            generation: 0,
            in_use: true,
        };

        let extended_entry = XRefEntryExt {
            basic: basic_entry,
            compressed_info: Some((10, 2)),
        };

        table.add_extended_entry(5, extended_entry);

        let ext = table.get_extended_entry(5);
        assert!(ext.is_some());
        if let Some(ext) = ext {
            assert_eq!(ext.compressed_info, Some((10, 2)));
        }

        assert!(table.is_compressed(5));
    }

    // Lookups on an empty table return `None`.
    #[test]
    fn test_xref_table_get_nonexistent() {
        let table = XRefTable::new();
        assert!(table.get_entry(999).is_none());
        assert!(table.get_extended_entry(999).is_none());
    }

    // Adding a second entry under the same object number replaces the first.
    #[test]
    fn test_xref_table_update_entry() {
        let mut table = XRefTable::new();

        let entry1 = XRefEntry {
            offset: 100,
            generation: 0,
            in_use: true,
        };
        table.add_entry(1, entry1);

        let entry2 = XRefEntry {
            offset: 200,
            generation: 1,
            in_use: false,
        };
        table.add_entry(1, entry2);

        let entry = table.get_entry(1).unwrap();
        assert_eq!(entry.offset, 200);
        assert_eq!(entry.generation, 1);
        assert!(!entry.in_use);
    }

    // The stored trailer compares equal to the dictionary that was set.
    #[test]
    fn test_xref_table_set_trailer() {
        let mut table = XRefTable::new();
        assert!(table.trailer.is_none());

        let mut trailer = PdfDictionary::new();
        trailer.insert("Size".to_string(), PdfObject::Integer(10));

        table.set_trailer(trailer.clone());
        assert!(table.trailer.is_some());
        assert_eq!(table.trailer(), Some(&trailer));
    }

    // A new table reports offset 0 through the getter.
    #[test]
    fn test_xref_table_offset() {
        let table = XRefTable::new();
        assert_eq!(table.xref_offset(), 0);
    }

    // Lines that must be rejected, followed by a flag-less line that must be
    // accepted as an in-use entry.
    #[test]
    fn test_parse_xref_entry_invalid_static() {
        let invalid_lines = vec![
            "not a valid entry".to_string(),
            "12345 abcde n".to_string(),
        ];

        for line in invalid_lines {
            let result = XRefTable::parse_xref_entry(&line);
            assert!(result.is_err());
        }

        let result = XRefTable::parse_xref_entry("12345 00000");
        assert!(result.is_ok());
        let entry = result.unwrap();
        assert_eq!(entry.offset, 12345);
        assert_eq!(entry.generation, 0);
        assert!(entry.in_use);
    }

    // Round trip of one in-use and one free entry.
    #[test]
    fn test_xref_entry_operations() {
        let mut table = XRefTable::new();

        let entry1 = XRefEntry {
            offset: 1234,
            generation: 5,
            in_use: true,
        };

        let entry2 = XRefEntry {
            offset: 5678,
            generation: 10,
            in_use: false,
        };

        table.add_entry(1, entry1);
        table.add_entry(2, entry2);

        assert_eq!(table.len(), 2);

        let retrieved1 = table.get_entry(1).unwrap();
        assert_eq!(retrieved1.offset, 1234);
        assert_eq!(retrieved1.generation, 5);
        assert!(retrieved1.in_use);

        let retrieved2 = table.get_entry(2).unwrap();
        assert_eq!(retrieved2.offset, 5678);
        assert_eq!(retrieved2.generation, 10);
        assert!(!retrieved2.in_use);
    }

    // `%` comment lines inside the xref section must not break parsing.
    #[test]
    fn test_parse_xref_with_comments() {
        let pdf_content = b"%PDF-1.4\n\
1 0 obj\n<< /Type /Catalog >>\nendobj\n\
xref\n\
% This is a comment\n\
0 2\n\
0000000000 65535 f \n\
0000000015 00000 n \n\
% Another comment\n\
trailer\n\
<< /Size 2 /Root 1 0 R >>\n\
startxref\n\
45\n\
%%EOF";

        let mut reader = BufReader::new(Cursor::new(pdf_content));
        reader.seek(SeekFrom::Start(45)).unwrap();
        let result = XRefTable::parse(&mut reader);
        assert!(result.is_ok());
        let table = result.unwrap();
        assert_eq!(table.len(), 2);
    }

    // Two subsections (objects 0-1 and 5-6) in a single xref section.
    #[test]
    fn test_parse_multiple_xref_sections() {
        let pdf_content = b"%PDF-1.4\n\
1 0 obj\n<< /Type /Catalog >>\nendobj\n\
2 0 obj\n<< /Type /Page >>\nendobj\n\
xref\n\
0 2\n\
0000000000 65535 f \n\
0000000015 00000 n \n\
5 2\n\
0000000100 00000 n \n\
0000000200 00000 n \n\
trailer\n\
<< /Size 7 /Root 1 0 R >>\n\
startxref\n\
78\n\
%%EOF";

        let mut reader = BufReader::new(Cursor::new(pdf_content));
        reader.seek(SeekFrom::Start(78)).unwrap();
        let result = XRefTable::parse(&mut reader);
        assert!(result.is_ok());
        let table = result.unwrap();
        assert_eq!(table.len(), 4);
        assert!(table.get_entry(0).is_some());
        assert!(table.get_entry(1).is_some());
        assert!(table.get_entry(5).is_some());
        assert!(table.get_entry(6).is_some());
    }

    // Two xref sections chained through `/Prev`, parsed leniently.
    #[test]
    fn test_parse_xref_with_prev() {
        let pdf_content = b"%PDF-1.4\n\
% First xref at 15\n\
xref\n\
0 2\n\
0000000000 65535 f \n\
0000000100 00000 n \n\
trailer\n\
<< /Size 2 >>\n\
% Second xref at 100\n\
xref\n\
2 1\n\
0000000200 00000 n \n\
trailer\n\
<< /Size 3 /Prev 15 >>\n\
startxref\n\
100\n\
%%EOF";

        let mut reader = BufReader::new(Cursor::new(pdf_content));
        let options = ParseOptions {
            lenient_syntax: true,
            ..Default::default()
        };

        let result = XRefTable::parse_with_options(&mut reader, &options);
        // NOTE(review): this assertion is always true; the test only checks
        // that the call returns without panicking.
        assert!(result.is_ok() || result.is_err());
    }

    // Garbage after the `xref` keyword must produce an error.
    #[test]
    fn test_invalid_xref_format() {
        let pdf_content = b"xref\ninvalid content\ntrailer";
        let mut reader = BufReader::new(Cursor::new(pdf_content));

        let result = XRefTable::parse(&mut reader);
        assert!(result.is_err());
    }

    // Maximum values for object number, offset and generation are stored as-is.
    #[test]
    fn test_xref_entry_overflow() {
        let mut table = XRefTable::new();

        let entry = XRefEntry {
            offset: u64::MAX,
            generation: u16::MAX,
            in_use: true,
        };
        table.add_entry(u32::MAX, entry);

        let entry = table.get_entry(u32::MAX).unwrap();
        assert_eq!(entry.offset, u64::MAX);
        assert_eq!(entry.generation, u16::MAX);
    }

    // Basic add/lookup behaviour, including a miss.
    #[test]
    fn test_xref_table_operations() {
        let mut table = XRefTable::new();

        let entry1 = XRefEntry {
            offset: 100,
            generation: 0,
            in_use: true,
        };

        let entry2 = XRefEntry {
            offset: 200,
            generation: 0,
            in_use: true,
        };

        table.add_entry(1, entry1);
        table.add_entry(2, entry2);

        assert_eq!(table.len(), 2);
        assert!(table.get_entry(1).is_some());
        assert!(table.get_entry(2).is_some());
        assert!(table.get_entry(3).is_none());
    }

    // Manually copies the entries of `table2` into `table1`; the entry for
    // object 2 is overwritten by the newer one.
    #[test]
    fn test_xref_table_merge() {
        let mut table1 = XRefTable::new();
        let entry1 = XRefEntry {
            offset: 100,
            generation: 0,
            in_use: true,
        };
        table1.add_entry(1, entry1);
        let entry2 = XRefEntry {
            offset: 200,
            generation: 0,
            in_use: true,
        };
        table1.add_entry(2, entry2);

        let mut table2 = XRefTable::new();
        let entry3 = XRefEntry {
            offset: 250,
            generation: 1,
            in_use: true,
        };
        table2.add_entry(2, entry3);
        let entry4 = XRefEntry {
            offset: 300,
            generation: 0,
            in_use: true,
        };
        table2.add_entry(3, entry4);

        for i in 2..=3 {
            if let Some(entry) = table2.get_entry(i) {
                table1.add_entry(
                    i,
                    XRefEntry {
                        offset: entry.offset,
                        generation: entry.generation,
                        in_use: entry.in_use,
                    },
                );
            }
        }

        assert_eq!(table1.len(), 3);

        let entry2 = table1.get_entry(2).unwrap();
        assert_eq!(entry2.offset, 250);
        assert_eq!(entry2.generation, 1);

        assert!(table1.get_entry(3).is_some());
    }

    // Recovery on a file whose only object is an object stream.
    #[test]
    fn test_xref_recovery_with_stream() {
        let pdf_content = b"1 0 obj\n<< /Type /ObjStm /N 2 /First 10 >>\nstream\n12345678901 0 2 0\nendstream\nendobj\n";
        let mut reader = BufReader::new(Cursor::new(pdf_content));

        let result = XRefTable::parse_with_recovery(&mut reader);
        // NOTE(review): this assertion is always true; the test only checks
        // that the call returns without panicking.
        assert!(result.is_ok() || result.is_err());
    }

    // Equality and inequality between entries.
    #[test]
    fn test_xref_entry_equality_advanced() {
        let entry1 = XRefEntry {
            offset: 100,
            generation: 0,
            in_use: true,
        };

        let entry2 = XRefEntry {
            offset: 100,
            generation: 0,
            in_use: true,
        };

        let entry3 = XRefEntry {
            offset: 200,
            generation: 0,
            in_use: true,
        };

        assert_eq!(entry1, entry2);
        assert_ne!(entry1, entry3);
    }

    // Strict parsing must reject the malformed input; the lenient outcome is
    // left open.
    #[test]
    fn test_parse_options_effect() {
        let pdf_content = b"xref 0 1 invalid";
        let mut reader = BufReader::new(Cursor::new(pdf_content));

        let strict_options = ParseOptions {
            lenient_syntax: false,
            ..Default::default()
        };
        let result = XRefTable::parse_with_options(&mut reader, &strict_options);
        assert!(result.is_err());

        reader.seek(SeekFrom::Start(0)).unwrap();
        let lenient_options = ParseOptions {
            lenient_syntax: true,
            ..Default::default()
        };
        let result = XRefTable::parse_with_options(&mut reader, &lenient_options);
        // NOTE(review): this assertion is always true; only the absence of a
        // panic is checked for the lenient run.
        assert!(result.is_err() || result.is_ok());
    }

    // The trailer's `/Prev` (10) equals the `startxref` value (10).
    #[test]
    fn test_circular_reference_detection() {
        let pdf_content = b"%PDF-1.4\n\
xref\n\
0 1\n\
0000000000 65535 f \n\
trailer\n\
<< /Size 1 /Prev 10 >>\n\
startxref\n\
10\n\
%%EOF";

        let mut reader = BufReader::new(Cursor::new(pdf_content));

        let result = XRefTable::parse_with_incremental_updates(&mut reader);
        // NOTE(review): this assertion is always true; the test only checks
        // that the call terminates without panicking.
        assert!(result.is_ok() || result.is_err());
    }

    // The `xref` keyword of this fixture starts at byte 90, which is what
    // `find_linearized_xref` is expected to report.
    #[test]
    fn test_linearized_xref_detection() {
        let pdf_content = b"%PDF-1.4\n\
1 0 obj\n\
<< /Linearized 1 /L 1234 /H [100 200] /O 5 /E 500 /N 10 /T 600 >>\n\
endobj\n\
xref\n\
0 2\n\
0000000000 65535 f \n\
0000000009 00000 n \n\
trailer\n\
<< /Size 2 >>\n\
startxref\n\
63\n\
%%EOF";

        let mut reader = BufReader::new(Cursor::new(pdf_content));

        let result = XRefTable::find_linearized_xref(&mut reader);
        assert!(result.is_ok());

        let xref_pos = result.unwrap();
        assert_eq!(
            xref_pos, 90,
            "Expected xref at position 90, got {}",
            xref_pos
        );
    }

    // A cross-reference stream object with three 4-byte records (W = [1 2 1]).
    #[test]
    fn test_xref_stream_parsing() {
        let pdf_content = b"%PDF-1.5\n\
1 0 obj\n\
<< /Type /XRef /Size 3 /W [1 2 1] /Length 12 >>\n\
stream\n\
\x00\x00\x00\x00\
\x01\x00\x10\x00\
\x01\x00\x20\x00\
endstream\n\
endobj\n\
startxref\n\
9\n\
%%EOF";

        let mut reader = BufReader::new(Cursor::new(pdf_content));
        reader.seek(SeekFrom::Start(9)).unwrap();

        let result = XRefTable::parse(&mut reader);
        // NOTE(review): this assertion is always true; the test only checks
        // that the call returns without panicking.
        assert!(result.is_err() || result.is_ok());
    }

    // The table lists object 10 although the trailer declares `/Size 5`.
    #[test]
    fn test_xref_validation_max_object_exceeds_size() {
        let pdf_content = b"%PDF-1.4\n\
xref\n\
0 1\n\
0000000000 65535 f \n\
10 1\n\
0000000100 00000 n \n\
trailer\n\
<< /Size 5 /Root 1 0 R >>\n\
startxref\n\
9\n\
%%EOF";

        let mut reader = BufReader::new(Cursor::new(pdf_content));
        reader.seek(SeekFrom::Start(9)).unwrap();

        let result = XRefTable::parse(&mut reader);
        assert!(result.is_err());
    }

    // A well-formed table must parse under both strict and lenient options.
    #[test]
    fn test_parse_with_options_lenient_vs_strict() {
        let pdf_content = b"%PDF-1.4\n\
xref\n\
0 2\n\
0000000000 65535 f \n\
0000000015 00000 n \n\
trailer\n\
<< /Size 2 >>\n\
startxref\n\
9\n\
%%EOF";

        let mut reader = BufReader::new(Cursor::new(pdf_content));

        let strict_options = ParseOptions {
            lenient_syntax: false,
            recover_from_stream_errors: false,
            ..Default::default()
        };
        reader.seek(SeekFrom::Start(9)).unwrap();
        let strict_result = XRefTable::parse_with_options(&mut reader, &strict_options);

        let lenient_options = ParseOptions {
            lenient_syntax: true,
            recover_from_stream_errors: true,
            ..Default::default()
        };
        reader.seek(SeekFrom::Start(9)).unwrap();
        let lenient_result = XRefTable::parse_with_options(&mut reader, &lenient_options);

        assert!(strict_result.is_ok());
        assert!(lenient_result.is_ok());
    }

    // Flag characters attached directly to the generation number.
    #[test]
    fn test_xref_entry_with_attached_flag() {
        let entry1 = XRefTable::parse_xref_entry("12345 0n");
        assert!(entry1.is_ok());
        let entry1 = entry1.unwrap();
        assert_eq!(entry1.offset, 12345);
        assert_eq!(entry1.generation, 0);
        assert!(entry1.in_use);

        let entry2 = XRefTable::parse_xref_entry("54321 1f");
        assert!(entry2.is_ok());
        let entry2 = entry2.unwrap();
        assert_eq!(entry2.offset, 54321);
        assert_eq!(entry2.generation, 1);
        assert!(!entry2.in_use);
    }

    // Footer variations for `find_xref_offset`.
    #[test]
    fn test_find_xref_offset_edge_cases() {
        use std::io::{BufReader, Cursor};

        // Leading garbage plus padded keyword and number.
        let content = b"garbage\nstartxref \n 123 \n%%EOF";
        let mut reader = BufReader::new(Cursor::new(content));
        let result = XRefTable::find_xref_offset(&mut reader);
        assert_eq!(result.unwrap(), 123);

        // Plain footer.
        let content = b"startxref\n999\n%%EOF";
        let mut reader = BufReader::new(Cursor::new(content));
        let result = XRefTable::find_xref_offset(&mut reader);
        assert_eq!(result.unwrap(), 999);

        // Footer without the `%%EOF` marker.
        let content = b"startxref\n456";
        let mut reader = BufReader::new(Cursor::new(content));
        let result = XRefTable::find_xref_offset(&mut reader);
        // NOTE(review): this assertion is always true; only the absence of a
        // panic is checked for this input.
        assert!(result.is_ok() || result.is_err());

        // `%%EOF` present but no `startxref`.
        let content = b"some content\n%%EOF";
        let mut reader = BufReader::new(Cursor::new(content));
        let result = XRefTable::find_xref_offset(&mut reader);
        assert!(result.is_err());
    }

    // The subsection header announces 5 entries but only 2 follow.
    #[test]
    fn test_xref_subsection_incomplete() {
        let pdf_content = b"%PDF-1.4\n\
xref\n\
0 5\n\
0000000000 65535 f \n\
0000000015 00000 n \n\
trailer\n\
<< /Size 5 >>\n\
startxref\n\
9\n\
%%EOF";

        let mut reader = BufReader::new(Cursor::new(pdf_content));
        reader.seek(SeekFrom::Start(9)).unwrap();

        let result = XRefTable::parse(&mut reader);
        // NOTE(review): this assertion is always true; the test only checks
        // that the call returns without panicking.
        assert!(result.is_err() || result.is_ok());
    }
}
2743
2744fn extract_root_from_xref_stream(content: &str) -> Option<u32> {
2746 let lines: Vec<&str> = content.lines().collect();
2751 let mut in_xref_obj = false;
2752
2753 for (i, line) in lines.iter().enumerate() {
2754 if line.contains(" obj")
2756 && lines
2757 .get(i + 1)
2758 .map_or(false, |next| next.contains("/Type /XRef"))
2759 {
2760 in_xref_obj = true;
2761 continue;
2762 }
2763
2764 if in_xref_obj {
2766 if line.contains("endobj") {
2767 in_xref_obj = false;
2768 continue;
2769 }
2770
2771 if let Some(root_pos) = line.find("/Root ") {
2773 let after_root = &line[root_pos + 6..]; if let Some(space_pos) = after_root.find(' ') {
2777 let number_part = &after_root[..space_pos];
2778 if let Ok(root_obj) = number_part.parse::<u32>() {
2779 tracing::debug!("Extracted Root {} from XRef stream", root_obj);
2780 return Some(root_obj);
2781 }
2782 }
2783 }
2784 }
2785 }
2786
2787 None
2788}
2789
2790fn find_catalog_by_content(table: &XRefTable, buffer: &[u8]) -> Option<u32> {
2793 for (obj_num, entry) in &table.entries {
2794 if entry.in_use {
2795 let offset = entry.offset as usize;
2796 if offset < buffer.len() {
2797 let obj_pattern = format!("{} 0 obj", obj_num);
2799 if let Some(obj_start) =
2800 find_byte_pattern(&buffer[offset..], obj_pattern.as_bytes())
2801 {
2802 let absolute_start = offset + obj_start;
2803
2804 if let Some(endobj_pos) =
2806 find_byte_pattern(&buffer[absolute_start..], b"endobj")
2807 {
2808 let absolute_end = absolute_start + endobj_pos;
2809 let obj_content_bytes = &buffer[absolute_start..absolute_end];
2810 let obj_content = String::from_utf8_lossy(obj_content_bytes);
2811
2812 if obj_content.contains("/Type /Catalog") {
2814 tracing::debug!(
2815 "Found catalog candidate at object {} (validated structure)",
2816 obj_num
2817 );
2818 return Some(*obj_num);
2819 }
2820 }
2821 }
2822 }
2823 }
2824 }
2825
2826 tracing::debug!("No valid catalog found by content search");
2827 None
2828}