use super::xref_stream;
use super::xref_types::{XRefEntryInfo, XRefEntryType};
use super::{ParseError, ParseOptions, ParseResult};
use crate::parser::reader::PDFLines;
use std::collections::HashMap;
use std::io::{BufRead, BufReader, Read, Seek, SeekFrom};
/// Returns the index of the first occurrence of `pattern` inside `buffer`.
///
/// An empty `pattern` matches at index 0 (mirroring `str::find("")`); the
/// explicit check is required because `slice::windows(0)` panics.
pub(crate) fn find_byte_pattern(buffer: &[u8], pattern: &[u8]) -> Option<usize> {
    if pattern.is_empty() {
        return Some(0);
    }
    buffer
        .windows(pattern.len())
        .position(|window| window == pattern)
}
/// Returns the index of the last occurrence of `pattern` inside `buffer`.
///
/// An empty `pattern` matches at `buffer.len()` (mirroring `str::rfind("")`);
/// the explicit check is required because `slice::windows(0)` panics.
fn rfind_byte_pattern(buffer: &[u8], pattern: &[u8]) -> Option<usize> {
    if pattern.is_empty() {
        return Some(buffer.len());
    }
    buffer
        .windows(pattern.len())
        .rposition(|window| window == pattern)
}
/// Interprets `line_bytes` as an `N G obj` header.
///
/// The bytes are decoded lossily and split on whitespace; the first token
/// must parse as a `u32` object number, the second as a `u16` generation and
/// the third must be exactly `obj`. Anything after the third token is ignored.
fn parse_obj_header_bytes(line_bytes: &[u8]) -> Option<(u32, u16)> {
    let text = String::from_utf8_lossy(line_bytes);
    let mut tokens = text.split_whitespace();
    let number_token = tokens.next()?;
    let generation_token = tokens.next()?;
    let keyword = tokens.next()?;
    if keyword != "obj" {
        return None;
    }
    let obj_num = number_token.parse::<u32>().ok()?;
    let gen_num = generation_token.parse::<u16>().ok()?;
    Some((obj_num, gen_num))
}
/// An `N G obj` header located by scanning the raw bytes of a file.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct ObjHeader {
/// Object number parsed from the header line.
obj_num: u32,
/// Generation number parsed from the header line.
generation: u16,
/// Absolute byte offset of the start of the line that contains the header.
offset: u64,
}
/// Collects every `N G obj` header visible in `window`.
///
/// `window_base` is the absolute file offset of `window[0]`. Each header is
/// reported at the absolute offset of the start of its line; `seen` is used to
/// drop headers whose offset was already reported (overlapping windows).
fn scan_window_for_headers(
    window: &[u8],
    window_base: u64,
    out: &mut Vec<ObjHeader>,
    seen: &mut std::collections::BTreeSet<u64>,
) {
    const KEYWORD: &[u8] = b"obj";
    // Shortest possible header prefix is "N G " (four bytes) before the keyword.
    const MIN_PREFIX: usize = 4;

    let mut cursor = 0;
    while cursor < window.len() {
        let Some(rel) = find_byte_pattern(&window[cursor..], KEYWORD) else {
            return;
        };
        let keyword_at = cursor + rel;
        // Always resume the search right after this keyword occurrence.
        cursor = keyword_at + KEYWORD.len();
        if keyword_at < MIN_PREFIX {
            continue;
        }

        // The candidate line runs from just after the previous EOL byte up to
        // and including the keyword.
        let line_start = window[..keyword_at]
            .iter()
            .rposition(|&b| matches!(b, b'\n' | b'\r'))
            .map_or(0, |eol| eol + 1);

        let Some((obj_num, generation)) = parse_obj_header_bytes(&window[line_start..cursor])
        else {
            continue;
        };

        let offset = window_base + line_start as u64;
        if seen.insert(offset) {
            out.push(ObjHeader {
                obj_num,
                generation,
                offset,
            });
        }
    }
}
/// Scans the whole stream for object headers using a 64 KiB read chunk.
///
/// Thin wrapper around `scan_object_headers_chunked`.
fn scan_object_headers<R: Read + Seek>(reader: &mut R) -> ParseResult<Vec<ObjHeader>> {
scan_object_headers_chunked(reader, 64 * 1024)
}
/// Scans the stream from offset 0 for `N G obj` headers, reading `chunk_size`
/// bytes at a time.
///
/// Between chunks the unfinished last line (at most `CARRY_CAP` bytes) is
/// carried over and prepended to the next chunk so that headers straddling a
/// chunk boundary are still seen. The result is sorted by offset.
fn scan_object_headers_chunked<R: Read + Seek>(
reader: &mut R,
chunk_size: usize,
) -> ParseResult<Vec<ObjHeader>> {
// A zero-sized chunk could never make progress; clamp to one byte.
let chunk_size = chunk_size.max(1);
// Upper bound on the bytes carried from one window into the next.
const CARRY_CAP: usize = 1024;
reader.seek(SeekFrom::Start(0))?;
let mut headers: Vec<ObjHeader> = Vec::new();
// Offsets already reported; carried bytes are scanned twice, so this de-duplicates.
let mut seen: std::collections::BTreeSet<u64> = std::collections::BTreeSet::new();
let mut carry: Vec<u8> = Vec::new();
// Absolute file offset of the first byte of the current window.
let mut window_base: u64 = 0; let mut chunk = vec![0u8; chunk_size];
loop {
// Fill the chunk completely unless the reader reports end of input.
let mut filled = 0;
while filled < chunk_size {
let n = reader.read(&mut chunk[filled..])?;
if n == 0 {
break;
}
filled += n;
}
// `eof` is only set when nothing at all was read this round.
let eof = filled == 0;
if eof && carry.is_empty() {
break;
}
// Window = carried tail of the previous window + freshly read bytes.
let mut window = std::mem::take(&mut carry);
window.extend_from_slice(&chunk[..filled]);
scan_window_for_headers(&window, window_base, &mut headers, &mut seen);
if eof {
break;
}
// Carry everything after the last EOL byte, truncated to the last CARRY_CAP bytes.
let last_nl = window.iter().rposition(|&b| b == b'\n' || b == b'\r');
let mut start = last_nl.map(|p| p + 1).unwrap_or(0);
if window.len() - start > CARRY_CAP {
start = window.len() - CARRY_CAP;
}
window_base += start as u64;
carry = window[start..].to_vec();
}
headers.sort_by_key(|h| h.offset);
Ok(headers)
}
/// Reads up to `max` bytes starting at absolute `offset`.
///
/// Keeps reading until `max` bytes were collected or the reader reports end
/// of input; the returned buffer is truncated to the bytes actually read.
pub(crate) fn read_window_at<R: Read + Seek>(
    reader: &mut R,
    offset: u64,
    max: usize,
) -> ParseResult<Vec<u8>> {
    reader.seek(SeekFrom::Start(offset))?;
    let mut window = vec![0u8; max];
    let mut len = 0;
    while len < max {
        match reader.read(&mut window[len..])? {
            0 => break,
            read => len += read,
        }
    }
    window.truncate(len);
    Ok(window)
}
/// Reads the last `max` bytes of the stream (or the whole stream if shorter).
///
/// Returns the absolute offset at which the returned bytes start, together
/// with the bytes themselves.
fn read_tail<R: Read + Seek>(reader: &mut R, max: usize) -> ParseResult<(u64, Vec<u8>)> {
    let total_len = reader.seek(SeekFrom::End(0))?;
    let tail_start = total_len.saturating_sub(max as u64);
    let tail_len = (total_len - tail_start) as usize;
    read_window_at(reader, tail_start, tail_len).map(|bytes| (tail_start, bytes))
}
/// Returns the text of object `obj_num` starting at (or shortly after) `offset`.
///
/// A 64 KiB window is read at `offset`; the content spans from the first
/// `"{obj_num} 0 obj"` occurrence up to (not including) the next `endobj`.
/// Returns `Ok(None)` if either marker is missing from the window.
fn read_object_content<R: Read + Seek>(
    reader: &mut R,
    obj_num: u32,
    offset: u64,
) -> ParseResult<Option<String>> {
    const OBJ_WINDOW: usize = 64 * 1024;

    let window = read_window_at(reader, offset, OBJ_WINDOW)?;
    let header = format!("{obj_num} 0 obj");

    let body = find_byte_pattern(&window, header.as_bytes()).and_then(|start| {
        let from_header = &window[start..];
        find_byte_pattern(from_header, b"endobj").map(|end| &from_header[..end])
    });

    Ok(body.map(|bytes| String::from_utf8_lossy(bytes).into_owned()))
}
fn find_object_offset<R: Read + Seek>(reader: &mut R, obj_num: u32) -> ParseResult<Option<u64>> {
const CHUNK_SIZE: usize = 64 * 1024;
const CARRY_CAP: usize = 1024;
reader.seek(SeekFrom::Start(0))?;
let mut carry: Vec<u8> = Vec::new();
let mut window_base: u64 = 0; let mut chunk = vec![0u8; CHUNK_SIZE];
loop {
let mut filled = 0;
while filled < CHUNK_SIZE {
let n = reader.read(&mut chunk[filled..])?;
if n == 0 {
break;
}
filled += n;
}
let eof = filled == 0;
if eof && carry.is_empty() {
break;
}
let mut window = std::mem::take(&mut carry);
window.extend_from_slice(&chunk[..filled]);
let mut headers = Vec::new();
let mut seen = std::collections::BTreeSet::new();
scan_window_for_headers(&window, window_base, &mut headers, &mut seen);
if let Some(header) = headers.iter().find(|h| h.obj_num == obj_num) {
return Ok(Some(header.offset));
}
if eof {
break;
}
let last_nl = window.iter().rposition(|&b| b == b'\n' || b == b'\r');
let mut start = last_nl.map(|p| p + 1).unwrap_or(0);
if window.len() - start > CARRY_CAP {
start = window.len() - CARRY_CAP;
}
window_base += start as u64;
carry = window[start..].to_vec();
}
Ok(None)
}
/// Finds object `obj_num` by scanning and returns up to `max` bytes from its header.
///
/// Yields the header's absolute offset with the bytes read there, or
/// `Ok(None)` when the object cannot be located.
pub(crate) fn read_object_window<R: Read + Seek>(
    reader: &mut R,
    obj_num: u32,
    max: usize,
) -> ParseResult<Option<(u64, Vec<u8>)>> {
    match find_object_offset(reader, obj_num)? {
        Some(offset) => {
            let bytes = read_window_at(reader, offset, max)?;
            Ok(Some((offset, bytes)))
        }
        None => Ok(None),
    }
}
/// Scans the stream for objects that look like `/Type /Page` dictionaries.
///
/// For every object header found, the first 4 KiB (cut at `endobj` when
/// present) are inspected; objects mentioning `/Type /Page` but not
/// `/Type /Pages` are reported. Each result is `(object number, 0)` — the
/// second component is always written as 0 — sorted and de-duplicated.
pub(crate) fn scan_page_object_refs<R: Read + Seek>(
    reader: &mut R,
) -> ParseResult<Vec<(u32, u16)>> {
    const PROBE: usize = 4 * 1024;
    const PAGE_MARKERS: [&str; 2] = ["/Type /Page", "/Type/Page"];
    const PAGES_MARKERS: [&str; 2] = ["/Type /Pages", "/Type/Pages"];

    let mut pages = Vec::new();
    for header in scan_object_headers(reader)? {
        let window = read_window_at(reader, header.offset, PROBE)?;
        let body_len = find_byte_pattern(&window, b"endobj").unwrap_or(window.len());
        let text = String::from_utf8_lossy(&window[..body_len]);

        let mentions_any = |markers: &[&str]| markers.iter().any(|marker| text.contains(*marker));
        // "/Type /Page" is a prefix of "/Type /Pages", so page-tree nodes must be excluded.
        if mentions_any(&PAGE_MARKERS) && !mentions_any(&PAGES_MARKERS) {
            pages.push((header.obj_num, 0));
        }
    }
    pages.sort_unstable();
    pages.dedup();
    Ok(pages)
}
/// Reads one line terminated by `\n`, `\r` or `\r\n` into `buf`.
///
/// `buf` is cleared first and receives the line without its terminator,
/// decoded lossily as UTF-8. Returns the number of bytes consumed from
/// `reader` (terminator included); `0` means end of input.
///
/// A `\r\n` pair is treated as a single terminator even when the two bytes
/// fall into different buffer refills, and the line is decoded in one piece
/// so a multi-byte character split across refills is not corrupted.
fn read_pdf_line<R: BufRead>(reader: &mut R, buf: &mut String) -> std::io::Result<usize> {
    buf.clear();
    let mut raw: Vec<u8> = Vec::new();
    let mut total_bytes = 0;
    loop {
        let available = reader.fill_buf()?;
        if available.is_empty() {
            break;
        }
        match available.iter().position(|&b| b == b'\r' || b == b'\n') {
            Some(i) => {
                raw.extend_from_slice(&available[..i]);
                let is_cr = available[i] == b'\r';
                let mut consume_len = i + 1;
                // A CR that is the last buffered byte may still be followed by LF.
                let cr_at_buffer_end = is_cr && consume_len == available.len();
                if is_cr && !cr_at_buffer_end && available[consume_len] == b'\n' {
                    consume_len += 1;
                }
                reader.consume(consume_len);
                total_bytes += consume_len;
                if cr_at_buffer_end && reader.fill_buf()?.first() == Some(&b'\n') {
                    reader.consume(1);
                    total_bytes += 1;
                }
                break;
            }
            None => {
                let len = available.len();
                raw.extend_from_slice(available);
                reader.consume(len);
                total_bytes += len;
            }
        }
    }
    buf.push_str(&String::from_utf8_lossy(&raw));
    Ok(total_bytes)
}
/// A single cross-reference entry.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct XRefEntry {
/// Byte offset of the object. For free entries coming from an xref stream
/// this file stores the next-free object number here instead.
pub offset: u64,
/// Generation number of the entry.
pub generation: u16,
/// `true` for in-use (`n`) entries, `false` for free (`f`) entries.
pub in_use: bool,
}
/// A cross-reference entry with optional object-stream (compressed) information.
#[derive(Debug, Clone, PartialEq)]
pub struct XRefEntryExt {
/// The plain entry data.
pub basic: XRefEntry,
/// For compressed objects: `(object stream number, index within that stream)`.
pub compressed_info: Option<(u32, u32)>, }
/// Cross-reference table of a PDF file, keyed by object number.
#[derive(Debug, Clone)]
pub struct XRefTable {
/// Basic entry for every known object.
entries: HashMap<u32, XRefEntry>,
/// Extended entries (e.g. objects stored inside object streams).
extended_entries: HashMap<u32, XRefEntryExt>,
/// Trailer dictionary, once parsed or reconstructed.
trailer: Option<super::objects::PdfDictionary>,
/// Stream position at which the xref section that supplied the trailer was parsed.
xref_offset: u64,
}
/// `Default` yields the same empty table as `XRefTable::new`.
impl Default for XRefTable {
fn default() -> Self {
Self::new()
}
}
impl XRefTable {
/// Creates an empty table with no entries, no trailer and `xref_offset` 0.
pub fn new() -> Self {
Self {
entries: HashMap::new(),
extended_entries: HashMap::new(),
trailer: None,
xref_offset: 0,
}
}
/// Returns the map of all basic entries, keyed by object number.
pub fn entries(&self) -> &HashMap<u32, XRefEntry> {
&self.entries
}
/// Parses the cross-reference data of `reader` using default `ParseOptions`.
pub fn parse<R: Read + Seek>(reader: &mut BufReader<R>) -> ParseResult<Self> {
Self::parse_with_options(reader, &super::ParseOptions::default())
}
/// Parses the cross-reference data of `reader`.
///
/// The regular xref chain is tried first. If that fails and
/// `options.lenient_syntax` is set, the reader is rewound to the start and the
/// table is rebuilt by scanning the file; otherwise the original error is
/// returned unchanged.
pub fn parse_with_options<R: Read + Seek>(
    reader: &mut BufReader<R>,
    options: &super::ParseOptions,
) -> ParseResult<Self> {
    let primary = Self::parse_with_incremental_updates_options(reader, options);
    match primary {
        Err(e) if options.lenient_syntax => {
            tracing::warn!("Primary XRef parsing failed: {e:?}, attempting recovery");
            reader.seek(SeekFrom::Start(0))?;
            Self::parse_with_recovery_options(reader, options)
        }
        outcome => outcome,
    }
}
/// Follows the xref chain (including incremental updates) with default `ParseOptions`.
#[allow(dead_code)]
fn parse_with_incremental_updates<R: Read + Seek>(
reader: &mut BufReader<R>,
) -> ParseResult<Self> {
Self::parse_with_incremental_updates_options(reader, &super::ParseOptions::default())
}
fn parse_with_incremental_updates_options<R: Read + Seek>(
reader: &mut BufReader<R>,
options: &super::ParseOptions,
) -> ParseResult<Self> {
let xref_offset = Self::find_xref_offset(reader)?;
let mut merged_table = Self::new();
let mut current_offset = Some(xref_offset);
let mut visited_offsets = std::collections::HashSet::new();
while let Some(offset) = current_offset {
if visited_offsets.contains(&offset) {
tracing::debug!(
"Circular reference in XRef chain at offset {} (already visited)",
offset
);
break;
}
visited_offsets.insert(offset);
reader.seek(SeekFrom::Start(offset))?;
let table = Self::parse_primary_with_options(reader, options)?;
let prev_offset = table
.trailer
.as_ref()
.and_then(|t| t.get("Prev"))
.and_then(|obj| obj.as_integer())
.map(|i| i as u64);
if let Some(_prev) = prev_offset {
} else {
}
let _regular_count = table.entries.len();
let _extended_count = table.extended_entries.len();
for (obj_num, entry) in table.entries {
merged_table.entries.entry(obj_num).or_insert(entry);
}
for (obj_num, ext_entry) in table.extended_entries {
merged_table
.extended_entries
.entry(obj_num)
.or_insert(ext_entry);
}
if merged_table.trailer.is_none() {
merged_table.trailer = table.trailer;
merged_table.xref_offset = table.xref_offset;
}
current_offset = prev_offset;
}
if options.lenient_syntax || options.collect_warnings {
if let Err(e) = Self::scan_and_fill_missing_objects(reader, &mut merged_table) {
tracing::debug!("scan_and_fill_missing_objects failed (non-fatal): {e}");
}
}
Ok(merged_table)
}
/// Parses the xref section at the reader's current position with default `ParseOptions`.
#[allow(dead_code)]
fn parse_primary<R: Read + Seek>(reader: &mut BufReader<R>) -> ParseResult<Self> {
Self::parse_primary_with_options(reader, &super::ParseOptions::default())
}
/// Parses one xref section located at the reader's current position.
///
/// If the first line is the keyword `xref`, a classic table (plus trailer)
/// is parsed. Otherwise the position must hold an `N G obj` stream object
/// whose dictionary has `/Type /XRef`; its decoded entries are copied into
/// the table and its dictionary becomes the trailer.
///
/// # Errors
/// Returns `ParseError::InvalidXRef` when the position holds neither form,
/// and propagates lexer, object, decode and xref-stream errors.
fn parse_primary_with_options<R: Read + Seek>(
    reader: &mut BufReader<R>,
    options: &super::ParseOptions,
) -> ParseResult<Self> {
    let mut table = Self::new();
    let section_start = reader.stream_position()?;
    table.xref_offset = section_start;

    let mut first_line = String::new();
    read_pdf_line(reader, &mut first_line)?;
    if first_line.trim() == "xref" {
        Self::parse_traditional_xref_with_options(reader, &mut table, options)?;
        return Ok(table);
    }

    tracing::debug!(
        "Not a traditional xref, checking for xref stream. Line: {:?}",
        first_line.trim()
    );
    // Rewind so the lexer sees the object header from its first byte.
    reader.seek(SeekFrom::Start(section_start))?;
    let mut lexer = super::lexer::Lexer::new_with_options(&mut *reader, options.clone());

    let obj_num = match lexer.next_token()? {
        super::lexer::Token::Integer(n) => n as u32,
        _ => return Err(ParseError::InvalidXRef),
    };
    tracing::debug!("Found object {obj_num} at xref position");
    let _gen_num = match lexer.next_token()? {
        super::lexer::Token::Integer(n) => n as u16,
        _ => return Err(ParseError::InvalidXRef),
    };
    if !matches!(lexer.next_token()?, super::lexer::Token::Obj) {
        return Err(ParseError::InvalidXRef);
    }

    let obj = super::objects::PdfObject::parse_with_options(&mut lexer, options)?;
    let Some(stream) = obj.as_stream() else {
        return Err(ParseError::InvalidXRef);
    };
    let declared_type = stream
        .dict
        .get("Type")
        .and_then(|o| o.as_name())
        .map(|n| n.as_str());
    if declared_type != Some("XRef") {
        return Err(ParseError::InvalidXRef);
    }

    tracing::debug!("Parsing XRef stream");
    let decoded_data = match stream.decode(options) {
        Ok(data) => data,
        Err(e) => {
            tracing::warn!("XRef stream decode failed: {e:?}, triggering recovery mode");
            return Err(e);
        }
    };
    let xref_stream_parser = xref_stream::XRefStream::parse(
        &mut *reader,
        stream.dict.clone(),
        decoded_data,
        options,
    )?;
    let entries = xref_stream_parser.to_xref_entries()?;
    tracing::debug!("XRef stream parsed, found {} entries", entries.len());

    for (obj_num, entry) in entries {
        let basic = match entry {
            xref_stream::XRefEntry::Free {
                next_free_object,
                generation,
            } => XRefEntry {
                offset: next_free_object as u64,
                generation,
                in_use: false,
            },
            xref_stream::XRefEntry::InUse { offset, generation } => XRefEntry {
                offset,
                generation,
                in_use: true,
            },
            xref_stream::XRefEntry::Compressed {
                stream_object_number,
                index_within_stream,
            } => {
                // Compressed objects have no file offset of their own; the
                // location is kept in the extended entry.
                let placeholder = XRefEntry {
                    offset: 0,
                    generation: 0,
                    in_use: true,
                };
                table.extended_entries.insert(
                    obj_num,
                    XRefEntryExt {
                        basic: placeholder,
                        compressed_info: Some((stream_object_number, index_within_stream)),
                    },
                );
                placeholder
            }
        };
        table.entries.insert(obj_num, basic);
    }

    table.trailer = Some(xref_stream_parser.trailer_dict().clone());
    Ok(table)
}
/// Parses a classic xref table body and trailer into `table` with default `ParseOptions`.
#[allow(dead_code)]
fn parse_traditional_xref<R: Read + Seek>(
reader: &mut BufReader<R>,
table: &mut XRefTable,
) -> ParseResult<()> {
Self::parse_traditional_xref_with_options(reader, table, &super::ParseOptions::default())
}
/// Parses the body of a classic `xref` table and the trailer that follows it.
///
/// The reader must be positioned just after the `xref` keyword line. Each
/// subsection header (`first count`) is followed by up to `count` entry
/// lines; comment lines are skipped and unparsable entry lines are logged and
/// still consume an object number. The trailer dictionary is accepted after
/// a `trailer` line, on the same line as `trailer`, or as a bare `<<` line.
///
/// # Errors
/// Returns `ParseError::InvalidXRef` when the input ends before a trailer is
/// found, a subsection header is malformed, an object number overflows `u32`,
/// or the table holds an object number not covered by the trailer `/Size`.
fn parse_traditional_xref_with_options<R: Read + Seek>(
    reader: &mut BufReader<R>,
    table: &mut XRefTable,
    options: &super::ParseOptions,
) -> ParseResult<()> {
    let mut line = String::new();
    let mut trailer_dict_offset: Option<u64> = None;

    'sections: loop {
        let line_start_pos = reader.stream_position()?;
        // Zero bytes consumed means end of input: without this check an
        // xref table lacking a trailer would spin forever on empty lines.
        if read_pdf_line(reader, &mut line)? == 0 {
            return Err(ParseError::InvalidXRef);
        }
        let trimmed_line = line.trim();
        if trimmed_line.is_empty() || trimmed_line.starts_with('%') {
            continue;
        }
        if trimmed_line == "trailer" {
            break;
        }
        if trimmed_line.starts_with("trailer") {
            if let Some(dict_pos) = trimmed_line.find("<<") {
                // Dictionary shares the line with the keyword: remember where
                // `<<` starts so the lexer can be rewound to it.
                let leading_ws = line.len() - line.trim_start().len();
                trailer_dict_offset = Some(line_start_pos + (leading_ws + dict_pos) as u64);
                break;
            }
        }
        if trimmed_line.starts_with("<<") {
            tracing::warn!(" Found trailer dictionary without 'trailer' keyword");
            trailer_dict_offset = Some(line_start_pos);
            break;
        }

        let parts: Vec<&str> = trimmed_line.split_whitespace().collect();
        if parts.len() != 2 {
            return Err(ParseError::InvalidXRef);
        }
        let first_obj_num = parts[0]
            .parse::<u32>()
            .map_err(|_| ParseError::InvalidXRef)?;
        let count = parts[1]
            .parse::<u32>()
            .map_err(|_| ParseError::InvalidXRef)?;

        let mut entries_parsed = 0u32;
        let mut i = 0u32;
        while i < count {
            let bytes_read = read_pdf_line(reader, &mut line)?;
            let trimmed = line.trim();
            if trimmed.starts_with('%') {
                continue;
            }
            if bytes_read == 0 || trimmed == "trailer" {
                tracing::debug!(
                    "Warning: XRef subsection incomplete - expected {count} entries but found only {entries_parsed}"
                );
                if trimmed == "trailer" {
                    // The keyword line is already consumed, so the reader
                    // sits right before the trailer dictionary.
                    break 'sections;
                }
                break;
            }
            match Self::parse_xref_entry(&line) {
                Ok(entry) => {
                    let obj_num = first_obj_num
                        .checked_add(i)
                        .ok_or(ParseError::InvalidXRef)?;
                    table.entries.insert(obj_num, entry);
                    entries_parsed += 1;
                }
                Err(_) => {
                    tracing::debug!(
                        "Warning: Invalid XRef entry at position {}: {:?}",
                        i,
                        line.trim()
                    );
                }
            }
            i += 1;
        }
    }

    if let Some(offset) = trailer_dict_offset {
        reader.seek(SeekFrom::Start(offset))?;
    }
    let mut lexer = super::lexer::Lexer::new_with_options(reader, options.clone());
    let trailer_obj = super::objects::PdfObject::parse_with_options(&mut lexer, options)?;
    table.trailer = trailer_obj.as_dict().cloned();

    // Every object number in the table must be below the trailer's /Size.
    let expected_size = table
        .trailer
        .as_ref()
        .and_then(|trailer| trailer.get("Size"))
        .and_then(|size_obj| size_obj.as_integer());
    let max_obj_num = table.entries.keys().max().copied();
    if let (Some(expected_size), Some(max_obj_num)) = (expected_size, max_obj_num) {
        // Widen before adding one so `u32::MAX` cannot overflow.
        let max_expected = i64::from(max_obj_num) + 1;
        if max_expected > expected_size {
            tracing::debug!(
                "Warning: XRef table has object {} but trailer Size is only {}",
                max_obj_num,
                expected_size
            );
            return Err(ParseError::InvalidXRef);
        }
    }
    Ok(())
}
#[allow(dead_code)]
fn find_linearized_xref<R: Read + Seek>(reader: &mut BufReader<R>) -> ParseResult<u64> {
reader.seek(SeekFrom::Start(0))?;
let mut header = String::new();
reader.read_line(&mut header)?;
if !header.starts_with("%PDF-") {
return Err(ParseError::InvalidHeader);
}
let mut line = String::new();
reader.read_line(&mut line)?;
let pos = reader.stream_position()?;
let mut buffer = vec![0u8; 1024];
let bytes_read = reader.read(&mut buffer)?;
buffer.truncate(bytes_read);
tracing::debug!(
"Checking for linearized PDF, first 100 bytes: {:?}",
String::from_utf8_lossy(&buffer[..buffer.len().min(100)])
);
if find_byte_pattern(&buffer, b"/Linearized").is_some() {
if let Some(xref_pos) = find_byte_pattern(&buffer, b"xref") {
return Ok(pos + xref_pos as u64);
}
if find_byte_pattern(&buffer, b"/Type/XRef").is_some()
|| find_byte_pattern(&buffer, b"/Type /XRef").is_some()
{
if let Some(obj_pos) = find_byte_pattern(&buffer, b" obj") {
let search_from = obj_pos + 4;
if search_from < buffer.len() {
let after_first_obj = &buffer[search_from..];
if let Some(next_obj) = find_byte_pattern(after_first_obj, b" obj") {
let second_obj_start =
pos + (search_from + next_obj).saturating_sub(10) as u64;
return Ok(second_obj_start);
}
}
}
}
}
Err(ParseError::InvalidXRef)
}
/// Reads the offset that follows the last `startxref` keyword near the end of the file.
///
/// Only the final 1024 bytes (or the whole file if shorter) are inspected.
/// When several `startxref` lines carry a parsable offset, the last one wins.
///
/// # Errors
/// `ParseError::InvalidXRef` if no `startxref` line followed by a number is found.
fn find_xref_offset<R: Read + Seek>(reader: &mut BufReader<R>) -> ParseResult<u64> {
    let file_size = reader.seek(SeekFrom::End(0))?;
    let read_size = file_size.min(1024);
    reader.seek(SeekFrom::End(-(read_size as i64)))?;

    let mut tail = vec![0u8; read_size as usize];
    reader.read_exact(&mut tail)?;
    let content = String::from_utf8_lossy(&tail);
    let debug_content: String = content.chars().take(200).collect();
    tracing::debug!("XRef search in last {read_size} bytes: {debug_content:?}");

    let mut lines = content.pdf_lines();
    let mut last_offset = None;
    loop {
        let Some(line) = lines.next() else {
            break;
        };
        if line.trim() != "startxref" {
            continue;
        }
        // The offset lives on the line right after the keyword.
        let candidate = lines
            .next()
            .and_then(|next| next.trim().parse::<u64>().ok());
        if let Some(offset) = candidate {
            last_offset = Some(offset);
        }
    }
    last_offset.ok_or(ParseError::InvalidXRef)
}
fn scan_and_fill_missing_objects<R: Read + Seek>(
reader: &mut BufReader<R>,
table: &mut Self,
) -> ParseResult<()> {
for header in scan_object_headers(reader)? {
if !table.entries.contains_key(&header.obj_num)
&& !table.extended_entries.contains_key(&header.obj_num)
{
table.add_entry(
header.obj_num,
XRefEntry {
offset: header.offset,
generation: header.generation,
in_use: true,
},
);
}
}
Ok(())
}
/// Rebuilds the table by scanning the file, using default `ParseOptions`.
#[allow(dead_code)]
fn parse_with_recovery<R: Read + Seek>(reader: &mut BufReader<R>) -> ParseResult<Self> {
Self::parse_with_recovery_options(reader, &super::ParseOptions::default())
}
/// Rebuilds a cross-reference table by scanning the file for object headers.
///
/// Every `N G obj` header found becomes an in-use entry (the first
/// occurrence of an object number wins). A trailer is synthesised with
/// `/Size` set to the number of entries and, when a catalog can be guessed,
/// `/Root`. Catalog candidates are tried in this order:
///
/// 1. the `/Root` extracted from the tail of the file, if that object exists;
/// 2. `find_catalog_by_content`;
/// 3. objects 1–5 that mention `/Type /Catalog` or `/Pages`;
/// 4. any object mentioning `/Type /Catalog`, then `/Pages`;
/// 5. the object header preceding the last `/Type/Catalog` in the final 100 KiB;
/// 6. the first readable object that is not a signature.
///
/// # Errors
/// `ParseError::InvalidXRef` if no object header is found; I/O errors are propagated.
fn parse_with_recovery_options<R: Read + Seek>(
    reader: &mut BufReader<R>,
    _options: &super::ParseOptions,
) -> ParseResult<Self> {
    const ROOT_TAIL: usize = 256 * 1024;
    const CATALOG_TAIL: usize = 100 * 1024;

    let mut table = Self::new();
    let headers = scan_object_headers(reader)?;
    for h in &headers {
        if !table.entries.contains_key(&h.obj_num) {
            table.add_entry(
                h.obj_num,
                XRefEntry {
                    offset: h.offset,
                    generation: h.generation,
                    in_use: true,
                },
            );
        }
    }
    if table.entries.is_empty() {
        return Err(ParseError::InvalidXRef);
    }
    tracing::debug!("XRef recovery: found {} objects", table.len());

    let (_, root_tail) = read_tail(reader, ROOT_TAIL)?;
    let root_tail_str = String::from_utf8_lossy(&root_tail);
    let xref_root_candidate = extract_root_from_xref_stream(&root_tail_str);

    let mut trailer = super::objects::PdfDictionary::new();
    trailer.insert(
        "Size".to_string(),
        super::objects::PdfObject::Integer(table.len() as i64),
    );

    let mut catalog_candidate = None;
    if let Some(xref_root) = xref_root_candidate {
        if table.entries.contains_key(&xref_root) {
            catalog_candidate = Some(xref_root);
            tracing::debug!("Using Root {} from XRef stream as catalog", xref_root);
        } else {
            tracing::debug!(
                "Warning: XRef Root {} not found in object table, searching manually",
                xref_root
            );
        }
    }

    if catalog_candidate.is_none() {
        catalog_candidate = find_catalog_by_content(reader, &table)?;
    }

    if catalog_candidate.is_none() {
        // Catalogs are very often among the first few objects of a file.
        for obj_num in [1, 2, 3, 4, 5] {
            let offset = match table.entries.get(&obj_num) {
                Some(entry) if entry.in_use => entry.offset,
                _ => continue,
            };
            if let Some(content) = read_object_content(reader, obj_num, offset)? {
                if content.contains("/Type/Sig") || content.contains("/Type /Sig") {
                    tracing::debug!("Skipping object {} (Type: Sig)", obj_num);
                    continue;
                }
                if content.contains("/Type/Catalog")
                    || content.contains("/Type /Catalog")
                    || content.contains("/Pages")
                {
                    catalog_candidate = Some(obj_num);
                    tracing::debug!(
                        "Using fallback catalog candidate: object {} (validated)",
                        obj_num
                    );
                    break;
                }
            }
        }
    }

    if catalog_candidate.is_none() && !table.entries.is_empty() {
        tracing::debug!(
            "Last resort: Scanning all {} objects for any with /Pages or /Catalog",
            table.entries.len()
        );
        let mut obj_numbers: Vec<u32> = table.entries.keys().copied().collect();
        obj_numbers.sort_unstable();
        for obj_num in obj_numbers {
            let offset = match table.entries.get(&obj_num) {
                Some(entry) if entry.in_use => entry.offset,
                _ => continue,
            };
            if let Some(content) = read_object_content(reader, obj_num, offset)? {
                if content.contains("/Type/Sig") || content.contains("/Type /Sig") {
                    continue;
                }
                if content.contains("/Type/Catalog") || content.contains("/Type /Catalog") {
                    catalog_candidate = Some(obj_num);
                    tracing::debug!(
                        "Last resort: Found catalog at object {} (/Type/Catalog)",
                        obj_num
                    );
                    break;
                } else if content.contains("/Pages") {
                    catalog_candidate = Some(obj_num);
                    tracing::debug!(
                        "Last resort: Found catalog at object {} (has /Pages)",
                        obj_num
                    );
                    break;
                }
            }
        }

        if catalog_candidate.is_none() {
            tracing::debug!("Extreme last resort: Scanning last 100KB for /Type/Catalog");
            let (_, search_buffer) = read_tail(reader, CATALOG_TAIL)?;
            if let Some(catalog_pos) = rfind_byte_pattern(&search_buffer, b"/Type/Catalog") {
                // Look for the owning "N 0 obj" header in the 200 bytes before the match.
                let local_search_start = catalog_pos.saturating_sub(200);
                let search_area = &search_buffer[local_search_start..catalog_pos];
                if let Some(obj_pattern_pos) = rfind_byte_pattern(search_area, b" 0 obj") {
                    let before_obj_str =
                        String::from_utf8_lossy(&search_area[..obj_pattern_pos]);
                    let trimmed = before_obj_str.trim_end();
                    // Take the trailing run of ASCII digits. Splitting on chars
                    // (rather than slicing at `index + 1`) stays on char
                    // boundaries even when the preceding char is multi-byte,
                    // e.g. U+FFFD produced by the lossy decoding above.
                    let num_str = trimmed
                        .rsplit(|c: char| !c.is_ascii_digit())
                        .next()
                        .unwrap_or("");
                    if let Ok(obj_num) = num_str.parse::<u32>() {
                        tracing::debug!(
                            "Extreme last resort: Found /Type/Catalog at object {}",
                            obj_num
                        );
                        catalog_candidate = Some(obj_num);
                    }
                }
            } else {
                tracing::debug!("Extreme last resort: No /Type/Catalog found in last 100KB");
            }
        }

        if catalog_candidate.is_none() {
            tracing::warn!(" Could not find any catalog object, using first non-signature object as absolute last resort");
            let mut obj_numbers: Vec<u32> = table.entries.keys().copied().collect();
            obj_numbers.sort_unstable();
            for obj_num in obj_numbers {
                let offset = match table.entries.get(&obj_num) {
                    Some(entry) => entry.offset,
                    None => continue,
                };
                if let Some(content) = read_object_content(reader, obj_num, offset)? {
                    if !content.contains("/Type/Sig") && !content.contains("/Type /Sig") {
                        catalog_candidate = Some(obj_num);
                        tracing::debug!("Using object {} as absolute last resort", obj_num);
                        break;
                    }
                }
            }
        }
    }

    if let Some(root_obj) = catalog_candidate {
        trailer.insert(
            "Root".to_string(),
            super::objects::PdfObject::Reference(root_obj, 0),
        );
    }
    table.set_trailer(trailer);
    Ok(table)
}
/// Sanity-checks that `offset` lies inside the file and has readable content.
///
/// Fails with `ParseError::InvalidXRef` when `offset` is at or beyond the end
/// of the file, or when nothing can be read there. Content that starts with
/// neither `xref` nor an ASCII digit is only logged (debug builds) — it is
/// not treated as an error.
#[allow(dead_code)]
fn validate_offset<R: Read + Seek>(reader: &mut BufReader<R>, offset: u64) -> ParseResult<()> {
    let file_size = reader.seek(SeekFrom::End(0))?;
    if offset >= file_size {
        #[cfg(debug_assertions)]
        tracing::warn!(" XRef offset {offset} exceeds file size {file_size}");
        return Err(ParseError::InvalidXRef);
    }

    reader.seek(SeekFrom::Start(offset))?;
    let mut peek = [0u8; 20];
    let read_bytes = reader.read(&mut peek)?;
    if read_bytes == 0 {
        #[cfg(debug_assertions)]
        tracing::warn!(" XRef offset {offset} points to EOF");
        return Err(ParseError::InvalidXRef);
    }

    let head = &peek[..read_bytes];
    let looks_like_table = head.starts_with(b"xref");
    let looks_like_object = matches!(head.first(), Some(b) if b.is_ascii_digit());
    if !(looks_like_table || looks_like_object) {
        #[cfg(debug_assertions)]
        {
            let debug_len = read_bytes.min(10);
            let debug_content = String::from_utf8_lossy(&peek[..debug_len]);
            tracing::debug!(
                "Warning: XRef offset {} does not point to valid XRef content: {:?}",
                offset,
                debug_content
            );
        }
    }
    Ok(())
}
/// Parses one xref entry line.
///
/// Lines of at least 18 bytes (after trimming) are first tried in the
/// fixed-width layout; on failure, or for shorter lines, the
/// whitespace-separated fallback parser is used.
fn parse_xref_entry(line: &str) -> ParseResult<XRefEntry> {
    let trimmed = line.trim();
    let fixed_width = if trimmed.len() >= 18 {
        Self::parse_xref_entry_standard(trimmed).ok()
    } else {
        None
    };
    match fixed_width {
        Some(entry) => Ok(entry),
        None => Self::parse_xref_entry_flexible(trimmed),
    }
}
/// Parses an entry in the fixed-width layout `nnnnnnnnnn ggggg f`.
///
/// Bytes 0–9 hold the offset, bytes 11–15 the generation and byte 17 the
/// flag (`n` = in use, `f` = free).
///
/// The ranges are taken with `str::get`, so a line containing multi-byte
/// characters (for example U+FFFD from lossy decoding) yields
/// `ParseError::InvalidXRef` instead of panicking on a non-boundary slice.
fn parse_xref_entry_standard(line: &str) -> ParseResult<XRefEntry> {
    if line.len() < 18 {
        return Err(ParseError::InvalidXRef);
    }
    let offset_str = line.get(0..10).ok_or(ParseError::InvalidXRef)?;
    let gen_str = line.get(11..16).ok_or(ParseError::InvalidXRef)?;
    // The layout is byte-oriented, so the flag is read by byte index as well.
    let flag = line.as_bytes()[17];

    let offset = offset_str
        .trim()
        .parse::<u64>()
        .map_err(|_| ParseError::InvalidXRef)?;
    let generation = gen_str
        .trim()
        .parse::<u16>()
        .map_err(|_| ParseError::InvalidXRef)?;
    let in_use = match flag {
        b'n' => true,
        b'f' => false,
        _ => return Err(ParseError::InvalidXRef),
    };
    Ok(XRefEntry {
        offset,
        generation,
        in_use,
    })
}
/// Parses an xref entry from whitespace-separated tokens, tolerating sloppy layouts.
///
/// * token 1 — offset (required, `u64`);
/// * token 2 — generation (`u16`), optionally with the flag glued on
///   (`00000n`) or consisting of the flag alone (`n` / `f`); defaults to 0;
/// * token 3 — flag, consulted only when token 2 carried none; its first
///   character decides, and anything other than `n` / `f` counts as in-use.
///
/// A missing flag means in-use.
fn parse_xref_entry_flexible(line: &str) -> ParseResult<XRefEntry> {
    let mut tokens = line.split_whitespace();

    let offset = tokens
        .next()
        .ok_or(ParseError::InvalidXRef)?
        .parse::<u64>()
        .map_err(|_| ParseError::InvalidXRef)?;

    let (generation, fused_flag) = match tokens.next() {
        None => (0, None),
        Some(token) => {
            // Split a trailing flag character off the generation digits.
            let (digits, flag) = if let Some(rest) = token.strip_suffix('n') {
                (rest, Some(true))
            } else if let Some(rest) = token.strip_suffix('f') {
                (rest, Some(false))
            } else {
                (token, None)
            };
            let generation = if digits.is_empty() {
                0
            } else {
                digits
                    .parse::<u16>()
                    .map_err(|_| ParseError::InvalidXRef)?
            };
            (generation, flag)
        }
    };

    let in_use = match (fused_flag, tokens.next()) {
        (Some(flag), _) => flag,
        (None, Some(flag_token)) => match flag_token.chars().next() {
            Some('n') => true,
            Some('f') => false,
            _ => {
                #[cfg(debug_assertions)]
                tracing::warn!(" Invalid xref flag '{}', assuming 'n'", flag_token);
                true
            }
        },
        (None, None) => true,
    };

    Ok(XRefEntry {
        offset,
        generation,
        in_use,
    })
}
/// Returns the basic entry recorded for `obj_num`, if any.
pub fn get_entry(&self, obj_num: u32) -> Option<&XRefEntry> {
self.entries.get(&obj_num)
}
/// Returns a mutable reference to the basic entry recorded for `obj_num`, if any.
pub fn get_entry_mut(&mut self, obj_num: u32) -> Option<&mut XRefEntry> {
self.entries.get_mut(&obj_num)
}
/// Returns the trailer dictionary, or `None` if none has been set.
pub fn trailer(&self) -> Option<&super::objects::PdfDictionary> {
self.trailer.as_ref()
}
/// Returns the stored xref offset (0 for a freshly created table).
pub fn xref_offset(&self) -> u64 {
self.xref_offset
}
/// Returns the number of basic entries in the table.
pub fn len(&self) -> usize {
self.entries.len()
}
/// Returns `true` when the table holds no basic entries.
pub fn is_empty(&self) -> bool {
self.entries.is_empty()
}
/// Iterates over `(object number, entry)` pairs in the map's (unspecified) order.
pub fn iter(&self) -> impl Iterator<Item = (&u32, &XRefEntry)> {
self.entries.iter()
}
/// Returns the extended entry recorded for `obj_num`, if any.
pub fn get_extended_entry(&self, obj_num: u32) -> Option<&XRefEntryExt> {
self.extended_entries.get(&obj_num)
}
/// Returns `true` when `obj_num` has an extended entry carrying compressed
/// (object stream) information.
pub fn is_compressed(&self, obj_num: u32) -> bool {
    matches!(
        self.extended_entries.get(&obj_num),
        Some(ext) if ext.compressed_info.is_some()
    )
}
/// Inserts `entry` for `obj_num`, replacing any basic entry already stored for it.
pub fn add_entry(&mut self, obj_num: u32, entry: XRefEntry) {
self.entries.insert(obj_num, entry);
}
/// Sets (or replaces) the trailer dictionary.
pub fn set_trailer(&mut self, trailer: super::objects::PdfDictionary) {
self.trailer = Some(trailer);
}
/// Inserts `entry` as the extended entry for `obj_num`, replacing any previous one.
pub fn add_extended_entry(&mut self, obj_num: u32, entry: XRefEntryExt) {
self.extended_entries.insert(obj_num, entry);
}
}
/// A cross-reference stream together with the entries decoded from it.
#[derive(Debug, Clone)]
pub struct XRefStream {
/// The underlying stream object; its dictionary is exposed by `trailer()`.
stream: super::objects::PdfStream,
/// Basic entry for every object described by the stream.
entries: HashMap<u32, XRefEntry>,
/// Extended entries for compressed and custom-type records.
extended_entries: HashMap<u32, XRefEntryExt>,
}
impl XRefStream {
/// Wraps `stream` and decodes all of its cross-reference entries.
///
/// # Errors
/// Propagates any error raised while decoding the entries.
pub fn parse(stream: super::objects::PdfStream) -> ParseResult<Self> {
    let mut parsed = Self {
        stream,
        entries: HashMap::new(),
        extended_entries: HashMap::new(),
    };
    parsed.decode_entries().map(|()| parsed)
}
fn decode_entries(&mut self) -> ParseResult<()> {
let dict = &self.stream.dict;
let size = dict
.get("Size")
.and_then(|obj| obj.as_integer())
.ok_or_else(|| ParseError::MissingKey("Size".to_string()))?;
let index = match dict.get("Index") {
Some(obj) => {
let array = obj.as_array().ok_or_else(|| ParseError::SyntaxError {
position: 0,
message: "Index must be an array".to_string(),
})?;
let mut pairs = Vec::new();
for chunk in array.0.chunks(2) {
if chunk.len() != 2 {
return Err(ParseError::SyntaxError {
position: 0,
message: "Index array must have even number of elements".to_string(),
});
}
let first = chunk[0]
.as_integer()
.ok_or_else(|| ParseError::SyntaxError {
position: 0,
message: "Index values must be integers".to_string(),
})? as u32;
let count = chunk[1]
.as_integer()
.ok_or_else(|| ParseError::SyntaxError {
position: 0,
message: "Index values must be integers".to_string(),
})? as u32;
pairs.push((first, count));
}
pairs
}
None => {
vec![(0, size as u32)]
}
};
let w_array = dict
.get("W")
.and_then(|obj| obj.as_array())
.ok_or_else(|| ParseError::MissingKey("W".to_string()))?;
if w_array.len() != 3 {
return Err(ParseError::SyntaxError {
position: 0,
message: "W array must have exactly 3 elements".to_string(),
});
}
let w: Vec<usize> = w_array
.0
.iter()
.map(|obj| {
obj.as_integer()
.ok_or_else(|| ParseError::SyntaxError {
position: 0,
message: "W values must be integers".to_string(),
})
.map(|i| i as usize)
})
.collect::<ParseResult<Vec<_>>>()?;
let data = self.stream.decode(&ParseOptions::default())?;
let mut offset = 0;
for (first_obj_num, count) in index {
for i in 0..count {
if offset + w[0] + w[1] + w[2] > data.len() {
return Err(ParseError::SyntaxError {
position: 0,
message: "Xref stream data truncated".to_string(),
});
}
let field1 = Self::read_field(&data[offset..], w[0]);
offset += w[0];
let field2 = Self::read_field(&data[offset..], w[1]);
offset += w[1];
let field3 = Self::read_field(&data[offset..], w[2]);
offset += w[2];
let entry_info =
XRefEntryInfo::new(XRefEntryType::from_value(field1), field2, field3);
let entry = match entry_info.entry_type {
XRefEntryType::Free => XRefEntry {
offset: entry_info.field2,
generation: entry_info.field3 as u16,
in_use: false,
},
XRefEntryType::Uncompressed => XRefEntry {
offset: entry_info.field2,
generation: entry_info.field3 as u16,
in_use: true,
},
XRefEntryType::Compressed => {
let ext_entry = XRefEntryExt {
basic: XRefEntry {
offset: 0,
generation: 0,
in_use: true,
},
compressed_info: entry_info.get_compressed_info(),
};
self.extended_entries
.insert(first_obj_num + i, ext_entry.clone());
ext_entry.basic
}
XRefEntryType::Custom(_type_num) => {
#[cfg(debug_assertions)]
tracing::debug!(
"Note: Custom xref entry type {} for object {} (treating as in-use)",
_type_num,
first_obj_num + i
);
let ext_entry = XRefEntryExt {
basic: XRefEntry {
offset: entry_info.field2,
generation: entry_info.field3 as u16,
in_use: entry_info.entry_type.is_in_use(),
},
compressed_info: None,
};
self.extended_entries
.insert(first_obj_num + i, ext_entry.clone());
ext_entry.basic
}
};
self.entries.insert(first_obj_num + i, entry);
}
}
Ok(())
}
/// Reads a big-endian unsigned integer from the first `width` bytes of `data`.
///
/// Fewer bytes are used when `data` is shorter than `width`; a width of 0
/// yields 0. With more than eight bytes, the oldest bits are shifted out.
fn read_field(data: &[u8], width: usize) -> u64 {
    data.iter()
        .take(width)
        .fold(0u64, |value, &byte| (value << 8) | u64::from(byte))
}
/// Returns the decoded entry for `obj_num`, if the stream describes that object.
pub fn get_entry(&self, obj_num: u32) -> Option<&XRefEntry> {
self.entries.get(&obj_num)
}
/// Returns the stream's dictionary, which serves as the trailer.
pub fn trailer(&self) -> &super::objects::PdfDictionary {
&self.stream.dict
}
}
// Unit tests for the object-header scanner, the recovery helpers and the
// `XRefTable` / `XRefEntry` API used in this file.
#[cfg(test)]
mod tests {
use super::*;
use crate::parser::objects::{PdfDictionary, PdfObject};
use std::io::Cursor;
// Three well-formed objects, each starting right after a newline, must be reported
// in file order with the byte offset recorded just before each header was appended.
#[test]
fn test_scan_object_headers_finds_simple_headers() {
let mut buf = Vec::new();
buf.extend_from_slice(b"%PDF-1.7\n");
let off1 = buf.len() as u64;
buf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog >>\nendobj\n");
let off2 = buf.len() as u64;
buf.extend_from_slice(b"2 0 obj\n<< /Type /Pages >>\nendobj\n");
let off10 = buf.len() as u64;
buf.extend_from_slice(b"10 0 obj\n<< /Length 0 >>\nendobj\n");
let mut cursor = Cursor::new(buf);
let headers = scan_object_headers(&mut cursor).unwrap();
assert_eq!(
headers,
vec![
ObjHeader {
obj_num: 1,
generation: 0,
offset: off1
},
ObjHeader {
obj_num: 2,
generation: 0,
offset: off2
},
ObjHeader {
obj_num: 10,
generation: 0,
offset: off10
},
]
);
}
// Scanning with small chunk sizes must produce exactly the same headers as scanning the
// whole buffer as a single chunk. Each header is preceded by 0..=6 spaces and a newline so
// that line starts land at varying positions relative to the chunk boundaries.
#[test]
fn test_scan_object_headers_chunk_invariant_across_boundaries() {
let mut buf = Vec::new();
buf.extend_from_slice(b"%PDF-1.7\n");
let mut expected: Vec<(u32, u64)> = Vec::new();
for i in 1..=50u32 {
for _ in 0..(i as usize % 7) {
buf.push(b' ');
}
buf.push(b'\n');
expected.push((i, buf.len() as u64));
buf.extend_from_slice(format!("{i} 0 obj\n<< /N {i} >>\nendobj\n").as_bytes());
}
// Reference result: the chunk size equals the buffer length, i.e. one chunk.
let reference =
scan_object_headers_chunked(&mut Cursor::new(buf.clone()), buf.len().max(1)).unwrap();
let got: Vec<(u32, u64)> = reference.iter().map(|h| (h.obj_num, h.offset)).collect();
assert_eq!(
got, expected,
"reference scan disagrees with hand-computed offsets"
);
for cs in [1usize, 2, 3, 7, 13, 16, 64, 256] {
let chunked = scan_object_headers_chunked(&mut Cursor::new(buf.clone()), cs).unwrap();
assert_eq!(chunked, reference, "scan mismatch at chunk_size={cs}");
}
}
// `endobj` contains the byte pattern `obj` but must not yield extra headers;
// only `7 0 obj` is expected.
#[test]
fn test_scan_object_headers_ignores_endobj_keyword() {
let mut buf = Vec::new();
buf.extend_from_slice(b"%PDF\n");
let off = buf.len() as u64;
buf.extend_from_slice(b"7 0 obj\n<< >>\nendobj\nendobj\n");
let headers = scan_object_headers(&mut Cursor::new(buf)).unwrap();
assert_eq!(
headers,
vec![ObjHeader {
obj_num: 7,
generation: 0,
offset: off
}]
);
}
// Long newline-free runs (2000 bytes of `7` with a space at every fifth position) must not
// make chunked scans diverge from the single-chunk reference. Object 5 follows a newline
// and must be found at `off_a`; `7 0 obj` is appended directly after the second filler run
// (no newline in between) and must not be reported.
#[test]
fn test_scan_object_headers_carry_truncation_no_newline_run() {
let mut buf = Vec::new();
buf.extend_from_slice(b"%PDF-1.7\n");
let filler: Vec<u8> = (0..2000u32)
.map(|i| if i % 5 == 0 { b' ' } else { b'7' })
.collect();
buf.extend_from_slice(&filler);
buf.push(b'\n');
let off_a = buf.len() as u64;
buf.extend_from_slice(b"5 0 obj\n<< >>\nendobj\n");
buf.extend_from_slice(&filler);
buf.extend_from_slice(b"7 0 obj\n<< >>\nendobj\n");
let reference =
scan_object_headers_chunked(&mut Cursor::new(buf.clone()), buf.len().max(1)).unwrap();
for cs in [16usize, 64, 256] {
let chunked = scan_object_headers_chunked(&mut Cursor::new(buf.clone()), cs).unwrap();
assert_eq!(
chunked, reference,
"carry-truncation mismatch at chunk_size={cs}"
);
}
assert!(reference
.iter()
.any(|h| h.obj_num == 5 && h.offset == off_a));
assert!(!reference.iter().any(|h| h.obj_num == 7));
}
// Empty input yields no headers and no error.
#[test]
fn test_scan_object_headers_empty_input() {
let headers = scan_object_headers(&mut Cursor::new(Vec::new())).unwrap();
assert!(headers.is_empty());
}
// The scanner must never pass a buffer larger than the requested chunk size (4096) to
// `read`, while still finding all 2000 headers of a fixture larger than 8192 bytes.
#[test]
fn test_scan_object_headers_reads_in_bounded_chunks() {
// Reader wrapper that remembers the largest buffer length ever handed to `read`.
struct MaxReadReader<R> {
inner: R,
max_read: usize,
}
impl<R: Read> Read for MaxReadReader<R> {
fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
self.max_read = self.max_read.max(buf.len());
self.inner.read(buf)
}
}
impl<R: Seek> Seek for MaxReadReader<R> {
fn seek(&mut self, p: SeekFrom) -> std::io::Result<u64> {
self.inner.seek(p)
}
}
let mut buf = Vec::new();
buf.extend_from_slice(b"%PDF\n");
for i in 1..=2000u32 {
buf.extend_from_slice(format!("{i} 0 obj\n<< >>\nendobj\n").as_bytes());
}
let total = buf.len();
assert!(
total > 8192,
"fixture must exceed the chunk size to be meaningful"
);
let mut r = MaxReadReader {
inner: Cursor::new(buf),
max_read: 0,
};
let headers = scan_object_headers_chunked(&mut r, 4096).unwrap();
assert_eq!(headers.len(), 2000);
assert_eq!(headers[0].obj_num, 1);
assert_eq!(headers[1999].obj_num, 2000);
assert!(
r.max_read <= 4096,
"scanner requested {} bytes in a single read (chunk=4096, file={total}); not bounded",
r.max_read
);
}
// `read_object_window` must locate object 1 at its real offset and return a window that
// contains its dictionary, without issuing a read larger than 64 KiB even though the
// fixture carries a 200 KiB stream body.
#[test]
fn test_read_object_window_bounded_and_correct() {
// Reader wrapper that remembers the largest buffer length ever handed to `read`.
struct MaxReadReader<R> {
inner: R,
max_read: usize,
}
impl<R: Read> Read for MaxReadReader<R> {
fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
self.max_read = self.max_read.max(buf.len());
self.inner.read(buf)
}
}
impl<R: Seek> Seek for MaxReadReader<R> {
fn seek(&mut self, p: SeekFrom) -> std::io::Result<u64> {
self.inner.seek(p)
}
}
let mut buf = Vec::new();
buf.extend_from_slice(b"%PDF-1.7\n");
let cat_off = buf.len() as u64;
buf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
buf.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n");
buf.extend_from_slice(b"3 0 obj\n<< /Type /Page >>\nstream\n");
buf.extend(std::iter::repeat(b'x').take(200 * 1024));
buf.extend_from_slice(b"\nendstream\nendobj\n");
let total = buf.len();
assert!(
total > 64 * 1024,
"fixture must exceed the scan chunk size to be meaningful"
);
let mut r = MaxReadReader {
inner: Cursor::new(buf),
max_read: 0,
};
let (offset, window) = read_object_window(&mut r, 1, 64 * 1024)
.unwrap()
.expect("object 1 must be locatable by bounded scan");
assert_eq!(offset, cat_off, "header offset must be the real line start");
assert!(
find_byte_pattern(&window, b"/Type /Catalog").is_some(),
"window must contain the catalog dict"
);
assert!(
find_byte_pattern(&window, b"/Pages 2 0 R").is_some(),
"window must contain the /Pages reference"
);
assert!(
r.max_read <= 64 * 1024,
"locate requested {} bytes in a single read (file={total}); not bounded",
r.max_read
);
}
// Names written without a separating space (`/Type/Page`) must be recognised as pages,
// while `/Type/Pages` and `/Type/Catalog` must not be returned.
#[test]
fn test_scan_page_object_refs_matches_compact_type_page() {
let mut buf = Vec::new();
buf.extend_from_slice(b"%PDF-1.7\n");
buf.extend_from_slice(b"1 0 obj\n<< /Type/Catalog /Pages 2 0 R >>\nendobj\n");
buf.extend_from_slice(b"2 0 obj\n<< /Type/Pages /Kids [3 0 R] /Count 1 >>\nendobj\n");
buf.extend_from_slice(b"3 0 obj\n<< /Type/Page /Parent 2 0 R >>\nendobj\n");
let mut r = Cursor::new(buf);
let pages = scan_page_object_refs(&mut r).unwrap();
assert_eq!(
pages,
vec![(3, 0)],
"compact /Type/Page must be detected and compact /Type/Pages excluded"
);
}
// Objects 1 and 3 are absent from the table and must be added at their scanned offsets;
// object 2 already has an entry and must be left untouched.
#[test]
fn test_scan_and_fill_adds_missing_preserves_present() {
let mut buf = Vec::new();
buf.extend_from_slice(b"%PDF-1.7\n");
let off1 = buf.len() as u64;
buf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog >>\nendobj\n");
buf.extend_from_slice(b"2 0 obj\n<< /Type /Pages >>\nendobj\n");
let off3 = buf.len() as u64;
buf.extend_from_slice(b"3 0 obj\n<< >>\nendobj\n");
let mut table = XRefTable::new();
// Pre-existing entry with an offset that does not match the file, so an overwrite
// by the scan would be detectable.
table.add_entry(
2,
XRefEntry {
offset: 99999,
generation: 0,
in_use: true,
},
);
let mut reader = BufReader::new(Cursor::new(buf));
XRefTable::scan_and_fill_missing_objects(&mut reader, &mut table).unwrap();
assert_eq!(table.get_entry(1).map(|e| e.offset), Some(off1));
assert_eq!(table.get_entry(3).map(|e| e.offset), Some(off3));
assert_eq!(table.get_entry(2).map(|e| e.offset), Some(99999));
}
// Recovery on a file without any xref section must index all three objects at their
// real offsets and report the catalog (object 1) as the trailer's `Root`.
#[test]
fn test_recovery_finds_objects_and_catalog_root() {
let mut buf = Vec::new();
buf.extend_from_slice(b"%PDF-1.7\n");
let off1 = buf.len() as u64;
buf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
let off2 = buf.len() as u64;
buf.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n");
let off3 = buf.len() as u64;
buf.extend_from_slice(b"3 0 obj\n<< /Type /Page /Parent 2 0 R >>\nendobj\n");
let mut reader = BufReader::new(Cursor::new(buf));
let table =
XRefTable::parse_with_recovery_options(&mut reader, &ParseOptions::default()).unwrap();
assert_eq!(table.get_entry(1).map(|e| e.offset), Some(off1));
assert_eq!(table.get_entry(2).map(|e| e.offset), Some(off2));
assert_eq!(table.get_entry(3).map(|e| e.offset), Some(off3));
let root = table.trailer().and_then(|t| t.get("Root")).cloned();
assert_eq!(root, Some(PdfObject::Reference(1, 0)));
}
// An object of `/Type /XRef` names `/Root 5 0 R`; recovery must report that reference.
// NOTE(review): object 5 is also the only `/Type /Catalog` in the fixture, so this
// assertion alone cannot tell the xref-stream path apart from a catalog content search.
#[test]
fn test_recovery_uses_root_from_xref_stream() {
let mut buf = Vec::new();
buf.extend_from_slice(b"%PDF-1.7\n");
buf.extend_from_slice(b"5 0 obj\n<< /Type /Catalog /Pages 6 0 R >>\nendobj\n");
buf.extend_from_slice(b"6 0 obj\n<< /Type /Pages /Count 0 >>\nendobj\n");
buf.extend_from_slice(
b"9 0 obj\n<< /Type /XRef /Root 5 0 R /Size 10 >>\nstream\n....\nendstream\nendobj\n",
);
let mut reader = BufReader::new(Cursor::new(buf));
let table =
XRefTable::parse_with_recovery_options(&mut reader, &ParseOptions::default()).unwrap();
let root = table.trailer().and_then(|t| t.get("Root")).cloned();
assert_eq!(root, Some(PdfObject::Reference(5, 0)));
}
// Input without any object header must fail with `ParseError::InvalidXRef`.
#[test]
fn test_recovery_empty_when_no_objects() {
let mut reader = BufReader::new(Cursor::new(b"%PDF-1.7\nnothing useful here\n".to_vec()));
let result = XRefTable::parse_with_recovery_options(&mut reader, &ParseOptions::default());
assert!(matches!(result, Err(ParseError::InvalidXRef)));
}
// Zero-padded entries: one free (`f`) and one in-use (`n`).
#[test]
fn test_parse_xref_entry() {
let entry1 = XRefTable::parse_xref_entry("0000000000 65535 f ").unwrap();
assert_eq!(entry1.offset, 0);
assert_eq!(entry1.generation, 65535);
assert!(!entry1.in_use);
let entry2 = XRefTable::parse_xref_entry("0000000017 00000 n ").unwrap();
assert_eq!(entry2.offset, 17);
assert_eq!(entry2.generation, 0);
assert!(entry2.in_use);
}
// Lenient forms that must still parse: no zero padding, missing generation, missing flag,
// flag glued to the generation, and tab separators.
#[test]
fn test_parse_xref_entry_flexible() {
let entry1 = XRefTable::parse_xref_entry("17 0 n").unwrap();
assert_eq!(entry1.offset, 17);
assert_eq!(entry1.generation, 0);
assert!(entry1.in_use);
let entry2 = XRefTable::parse_xref_entry("123 5 f").unwrap();
assert_eq!(entry2.offset, 123);
assert_eq!(entry2.generation, 5);
assert!(!entry2.in_use);
let entry3 = XRefTable::parse_xref_entry("456 n").unwrap();
assert_eq!(entry3.offset, 456);
assert_eq!(entry3.generation, 0);
assert!(entry3.in_use);
let entry4 = XRefTable::parse_xref_entry("789 2").unwrap();
assert_eq!(entry4.offset, 789);
assert_eq!(entry4.generation, 2);
assert!(entry4.in_use);
let entry5 = XRefTable::parse_xref_entry("1000 0n").unwrap();
assert_eq!(entry5.offset, 1000);
assert_eq!(entry5.generation, 0);
assert!(entry5.in_use);
let entry6 = XRefTable::parse_xref_entry("2000 1f").unwrap();
assert_eq!(entry6.offset, 2000);
assert_eq!(entry6.generation, 1);
assert!(!entry6.in_use);
let entry7 = XRefTable::parse_xref_entry("3000\t0\tn").unwrap();
assert_eq!(entry7.offset, 3000);
assert_eq!(entry7.generation, 0);
assert!(entry7.in_use);
}
// An unrecognised flag (`x`) is accepted and the entry is treated as in use.
#[test]
fn test_parse_xref_entry_invalid_flag_fallback() {
let entry = XRefTable::parse_xref_entry("100 0 x").unwrap();
assert_eq!(entry.offset, 100);
assert_eq!(entry.generation, 0);
assert!(entry.in_use); }
// Empty, non-numeric-offset and whitespace-only lines must be rejected.
#[test]
fn test_parse_xref_entry_malformed() {
let result = XRefTable::parse_xref_entry("");
assert!(result.is_err());
let result = XRefTable::parse_xref_entry("abc 0 n");
assert!(result.is_err());
let result = XRefTable::parse_xref_entry(" ");
assert!(result.is_err());
}
// A freshly constructed table has no entries, no trailer and an xref offset of 0.
#[test]
fn test_xref_table_new() {
let table = XRefTable::new();
assert!(table.entries.is_empty());
assert!(table.extended_entries.is_empty());
assert!(table.trailer.is_none());
assert_eq!(table.xref_offset, 0);
}
// `Default` produces the same empty state.
#[test]
fn test_xref_table_default() {
let table = XRefTable::default();
assert!(table.entries.is_empty());
assert!(table.extended_entries.is_empty());
assert!(table.trailer.is_none());
}
// Field values given to the struct literal are readable back unchanged.
#[test]
fn test_xref_entry_struct() {
let entry = XRefEntry {
offset: 12345,
generation: 7,
in_use: true,
};
assert_eq!(entry.offset, 12345);
assert_eq!(entry.generation, 7);
assert!(entry.in_use);
}
// Two entries with identical fields compare equal.
#[test]
fn test_xref_entry_equality() {
let entry1 = XRefEntry {
offset: 100,
generation: 0,
in_use: true,
};
let entry2 = XRefEntry {
offset: 100,
generation: 0,
in_use: true,
};
assert_eq!(entry1, entry2);
}
// NOTE(review): despite the name, `cloned` is bound by value here; `.clone()` is not called.
#[test]
fn test_xref_entry_clone() {
let entry = XRefEntry {
offset: 999,
generation: 3,
in_use: false,
};
let cloned = entry;
assert_eq!(cloned.offset, 999);
assert_eq!(cloned.generation, 3);
assert!(!cloned.in_use);
}
// An extended entry keeps both its basic part and its `compressed_info` pair.
#[test]
fn test_xref_entry_ext() {
let ext_entry = XRefEntryExt {
basic: XRefEntry {
offset: 500,
generation: 0,
in_use: true,
},
compressed_info: Some((10, 5)),
};
assert_eq!(ext_entry.basic.offset, 500);
assert_eq!(ext_entry.compressed_info, Some((10, 5)));
}
// An extended entry may carry no compression info at all.
#[test]
fn test_xref_entry_ext_no_compression() {
let ext_entry = XRefEntryExt {
basic: XRefEntry {
offset: 1000,
generation: 1,
in_use: true,
},
compressed_info: None,
};
assert!(ext_entry.compressed_info.is_none());
}
// `add_entry` stores the entry under the given object number.
#[test]
fn test_add_entry() {
let mut table = XRefTable::new();
table.add_entry(
5,
XRefEntry {
offset: 1000,
generation: 0,
in_use: true,
},
);
assert_eq!(table.entries.len(), 1);
assert!(table.entries.contains_key(&5));
}
// `get_entry` returns the stored entry and `None` for unknown object numbers.
#[test]
fn test_get_entry() {
let mut table = XRefTable::new();
let entry = XRefEntry {
offset: 2000,
generation: 1,
in_use: true,
};
table.add_entry(10, entry);
let retrieved = table.get_entry(10);
assert!(retrieved.is_some());
assert_eq!(retrieved.unwrap().offset, 2000);
let missing = table.get_entry(999);
assert!(missing.is_none());
}
// `set_trailer` makes the dictionary available through `trailer()`.
#[test]
fn test_set_trailer() {
let mut table = XRefTable::new();
let mut trailer = PdfDictionary::new();
trailer.insert("Size".to_string(), PdfObject::Integer(10));
table.set_trailer(trailer.clone());
assert!(table.trailer.is_some());
assert_eq!(
table.trailer().unwrap().get("Size"),
Some(&PdfObject::Integer(10))
);
}
// A two-field entry without flag is accepted, a non-numeric offset is rejected, and an
// unknown flag is accepted with the entry treated as in use.
#[test]
fn test_parse_xref_entry_invalid() {
let result = XRefTable::parse_xref_entry("0000000000 65535");
assert!(result.is_ok());
let result = XRefTable::parse_xref_entry("not_a_number 65535 f ");
assert!(result.is_err());
let result = XRefTable::parse_xref_entry("0000000000 65535 x ");
assert!(result.is_ok()); assert!(result.unwrap().in_use); }
// Smallest non-zero offset, the largest ten-digit offset and the maximum generation.
#[test]
fn test_parse_xref_entry_various_offsets() {
let entry = XRefTable::parse_xref_entry("0000000001 00000 n ").unwrap();
assert_eq!(entry.offset, 1);
let entry = XRefTable::parse_xref_entry("9999999999 00000 n ").unwrap();
assert_eq!(entry.offset, 9999999999);
let entry = XRefTable::parse_xref_entry("0000000000 65535 f ").unwrap();
assert_eq!(entry.generation, 65535);
}
// `add_extended_entry` stores the entry under the given object number.
#[test]
fn test_add_extended_entry() {
let mut table = XRefTable::new();
let ext_entry = XRefEntryExt {
basic: XRefEntry {
offset: 0,
generation: 0,
in_use: true,
},
compressed_info: Some((5, 10)),
};
table.add_extended_entry(15, ext_entry);
assert_eq!(table.extended_entries.len(), 1);
assert!(table.extended_entries.contains_key(&15));
}
// `get_extended_entry` returns what `add_extended_entry` stored.
#[test]
fn test_get_extended_entry() {
let mut table = XRefTable::new();
let ext_entry = XRefEntryExt {
basic: XRefEntry {
offset: 0,
generation: 0,
in_use: true,
},
compressed_info: Some((20, 3)),
};
table.add_extended_entry(7, ext_entry);
let retrieved = table.get_extended_entry(7);
assert!(retrieved.is_some());
assert_eq!(retrieved.unwrap().compressed_info, Some((20, 3)));
}
// `xref_offset()` mirrors the `xref_offset` field.
#[test]
fn test_xref_offset() {
let mut table = XRefTable::new();
assert_eq!(table.xref_offset(), 0);
table.xref_offset = 12345;
assert_eq!(table.xref_offset(), 12345);
}
// Minimal `startxref` / number / `%%EOF` tail.
#[test]
fn test_find_xref_offset_simple() {
let pdf_data = b"startxref\n12345\n%%EOF";
let cursor = Cursor::new(pdf_data.to_vec());
let mut reader = BufReader::new(cursor);
let offset = XRefTable::find_xref_offset(&mut reader).unwrap();
assert_eq!(offset, 12345);
}
// Extra blanks around the keyword and the number are tolerated.
#[test]
fn test_find_xref_offset_with_spaces() {
let pdf_data = b"startxref \n 12345 \n%%EOF";
let cursor = Cursor::new(pdf_data.to_vec());
let mut reader = BufReader::new(cursor);
let offset = XRefTable::find_xref_offset(&mut reader).unwrap();
assert_eq!(offset, 12345);
}
// Without a `startxref` keyword the lookup must fail.
#[test]
fn test_find_xref_offset_missing() {
let pdf_data = b"no startxref here";
let cursor = Cursor::new(pdf_data.to_vec());
let mut reader = BufReader::new(cursor);
let result = XRefTable::find_xref_offset(&mut reader);
assert!(result.is_err());
}
// `trailer()` is `None` until a trailer has been set.
#[test]
fn test_trailer_getter() {
let mut table = XRefTable::new();
assert!(table.trailer().is_none());
let trailer = PdfDictionary::new();
table.set_trailer(trailer);
assert!(table.trailer().is_some());
}
// Cloning a table copies its entries and its xref offset.
#[test]
fn test_xref_table_clone() {
let mut table = XRefTable::new();
table.add_entry(
1,
XRefEntry {
offset: 100,
generation: 0,
in_use: true,
},
);
table.xref_offset = 5000;
let cloned = table.clone();
assert_eq!(cloned.entries.len(), 1);
assert_eq!(cloned.xref_offset, 5000);
}
// Accepted and rejected shapes of an `N G obj` header line.
#[test]
fn test_parse_obj_header() {
assert_eq!(parse_obj_header_bytes(b"1 0 obj"), Some((1, 0)));
assert_eq!(parse_obj_header_bytes(b"123 5 obj"), Some((123, 5)));
assert_eq!(parse_obj_header_bytes(b" 42 3 obj "), Some((42, 3)));
assert_eq!(parse_obj_header_bytes(b"1 obj"), None);
assert_eq!(parse_obj_header_bytes(b"abc 0 obj"), None);
assert_eq!(parse_obj_header_bytes(b"1 0 object"), None);
assert_eq!(parse_obj_header_bytes(b""), None);
}
// Recovery over two bare objects must produce two in-use entries.
#[test]
fn test_xref_recovery_parsing() {
let pdf_content =
b"1 0 obj\n<< /Type /Catalog >>\nendobj\n2 0 obj\n<< /Type /Page >>\nendobj\n";
let mut reader = BufReader::new(Cursor::new(pdf_content));
let table = XRefTable::parse_with_recovery(&mut reader).unwrap();
assert_eq!(table.len(), 2);
assert!(table.get_entry(1).is_some());
assert!(table.get_entry(2).is_some());
assert!(table.get_entry(1).unwrap().in_use);
assert!(table.get_entry(2).unwrap().in_use);
}
// Recovery over text without object headers must fail.
#[test]
fn test_xref_recovery_no_objects() {
let pdf_content = b"This is not a PDF file\nNo objects here\n";
let mut reader = BufReader::new(Cursor::new(pdf_content));
let result = XRefTable::parse_with_recovery(&mut reader);
assert!(result.is_err());
}
// The fixture is 10 bytes long: offset 5 is accepted, while 100 and 10 are rejected.
#[test]
fn test_offset_validation() {
let pdf_data = b"small file";
let mut reader = BufReader::new(Cursor::new(pdf_data));
assert!(XRefTable::validate_offset(&mut reader, 5).is_ok());
assert!(XRefTable::validate_offset(&mut reader, 100).is_err());
assert!(XRefTable::validate_offset(&mut reader, 10).is_err());
}
// `parse` on input that has objects but no xref data must fail with `InvalidXRef`.
#[test]
fn test_xref_parse_with_fallback() {
let pdf_content =
b"1 0 obj\n<< /Type /Catalog >>\nendobj\n2 0 obj\n<< /Type /Page >>\nendobj\n";
let mut reader = BufReader::new(Cursor::new(pdf_content));
let result = XRefTable::parse(&mut reader);
assert!(result.is_err());
if let Err(e) = result {
assert!(matches!(e, ParseError::InvalidXRef));
}
}
// Plain struct-literal construction of an `XRefEntry`.
#[test]
fn test_xref_entry_creation() {
let entry = XRefEntry {
offset: 1234,
generation: 5,
in_use: true,
};
assert_eq!(entry.offset, 1234);
assert_eq!(entry.generation, 5);
assert!(entry.in_use);
}
// Plain struct-literal construction of an `XRefEntryExt` from a cloned basic entry.
#[test]
fn test_xref_entry_ext_creation() {
let basic = XRefEntry {
offset: 5000,
generation: 0,
in_use: true,
};
let ext = XRefEntryExt {
basic: basic.clone(),
compressed_info: Some((10, 3)),
};
assert_eq!(ext.basic.offset, 5000);
assert_eq!(ext.compressed_info, Some((10, 3)));
}
// Same empty-state checks as `test_xref_table_new`, expressed through `len()`.
#[test]
fn test_xref_table_new_advanced() {
let table = XRefTable::new();
assert_eq!(table.entries.len(), 0);
assert_eq!(table.extended_entries.len(), 0);
assert!(table.trailer.is_none());
assert_eq!(table.xref_offset, 0);
}
// `Default` yields no entries and no trailer.
#[test]
fn test_xref_table_default_advanced() {
let table = XRefTable::default();
assert_eq!(table.entries.len(), 0);
assert!(table.trailer.is_none());
}
// Two entries under different object numbers are stored and retrieved independently.
#[test]
fn test_xref_table_add_entry() {
let mut table = XRefTable::new();
let entry1 = XRefEntry {
offset: 100,
generation: 0,
in_use: true,
};
table.add_entry(1, entry1);
let entry2 = XRefEntry {
offset: 200,
generation: 1,
in_use: false,
};
table.add_entry(2, entry2);
assert_eq!(table.len(), 2);
let entry1 = table.get_entry(1).unwrap();
assert_eq!(entry1.offset, 100);
assert_eq!(entry1.generation, 0);
assert!(entry1.in_use);
let entry2 = table.get_entry(2).unwrap();
assert_eq!(entry2.offset, 200);
assert_eq!(entry2.generation, 1);
assert!(!entry2.in_use);
}
// An extended entry with compression info makes `is_compressed` report true.
#[test]
fn test_xref_table_add_extended_entry() {
let mut table = XRefTable::new();
let basic_entry = XRefEntry {
offset: 0,
generation: 0,
in_use: true,
};
let extended_entry = XRefEntryExt {
basic: basic_entry,
compressed_info: Some((10, 2)),
};
table.add_extended_entry(5, extended_entry);
let ext = table.get_extended_entry(5);
assert!(ext.is_some());
if let Some(ext) = ext {
assert_eq!(ext.compressed_info, Some((10, 2)));
}
assert!(table.is_compressed(5));
}
// Lookups on an empty table return `None` for both entry kinds.
#[test]
fn test_xref_table_get_nonexistent() {
let table = XRefTable::new();
assert!(table.get_entry(999).is_none());
assert!(table.get_extended_entry(999).is_none());
}
// Adding a second entry under the same object number replaces the first.
#[test]
fn test_xref_table_update_entry() {
let mut table = XRefTable::new();
let entry1 = XRefEntry {
offset: 100,
generation: 0,
in_use: true,
};
table.add_entry(1, entry1);
let entry2 = XRefEntry {
offset: 200,
generation: 1,
in_use: false,
};
table.add_entry(1, entry2);
let entry = table.get_entry(1).unwrap();
assert_eq!(entry.offset, 200);
assert_eq!(entry.generation, 1);
assert!(!entry.in_use);
}
// The dictionary passed to `set_trailer` is returned unchanged by `trailer()`.
#[test]
fn test_xref_table_set_trailer() {
let mut table = XRefTable::new();
assert!(table.trailer.is_none());
let mut trailer = PdfDictionary::new();
trailer.insert("Size".to_string(), PdfObject::Integer(10));
table.set_trailer(trailer.clone());
assert!(table.trailer.is_some());
assert_eq!(table.trailer(), Some(&trailer));
}
// A new table reports an xref offset of 0.
#[test]
fn test_xref_table_offset() {
let table = XRefTable::new();
assert_eq!(table.xref_offset(), 0);
}
// Free text and a non-numeric generation are rejected; an entry with only offset and
// generation is accepted and treated as in use.
#[test]
fn test_parse_xref_entry_invalid_static() {
let invalid_lines = vec![
"not a valid entry".to_string(),
"12345 abcde n".to_string(), ];
for line in invalid_lines {
let result = XRefTable::parse_xref_entry(&line);
assert!(result.is_err());
}
let result = XRefTable::parse_xref_entry("12345 00000");
assert!(result.is_ok());
let entry = result.unwrap();
assert_eq!(entry.offset, 12345);
assert_eq!(entry.generation, 0);
assert!(entry.in_use); }
// Round trip of one in-use and one free entry through `add_entry` / `get_entry`.
#[test]
fn test_xref_entry_operations() {
let mut table = XRefTable::new();
let entry1 = XRefEntry {
offset: 1234,
generation: 5,
in_use: true,
};
let entry2 = XRefEntry {
offset: 5678,
generation: 10,
in_use: false,
};
table.add_entry(1, entry1);
table.add_entry(2, entry2);
assert_eq!(table.len(), 2);
let retrieved1 = table.get_entry(1).unwrap();
assert_eq!(retrieved1.offset, 1234);
assert_eq!(retrieved1.generation, 5);
assert!(retrieved1.in_use);
let retrieved2 = table.get_entry(2).unwrap();
assert_eq!(retrieved2.offset, 5678);
assert_eq!(retrieved2.generation, 10);
assert!(!retrieved2.in_use);
}
// Comment lines inside the xref section must be skipped. Byte 45 of the fixture is where
// the `xref` keyword begins; two entries are expected.
#[test]
fn test_parse_xref_with_comments() {
let pdf_content = b"%PDF-1.4\n\
1 0 obj\n<< /Type /Catalog >>\nendobj\n\
xref\n\
% This is a comment\n\
0 2\n\
0000000000 65535 f \n\
0000000015 00000 n \n\
% Another comment\n\
trailer\n\
<< /Size 2 /Root 1 0 R >>\n\
startxref\n\
45\n\
%%EOF";
let mut reader = BufReader::new(Cursor::new(pdf_content));
reader.seek(SeekFrom::Start(45)).unwrap();
let result = XRefTable::parse(&mut reader);
assert!(result.is_ok());
let table = result.unwrap();
assert_eq!(table.len(), 2);
}
// Two subsections (`0 2` and `5 2`) in one xref section must yield entries 0, 1, 5 and 6.
// Byte 78 of the fixture is where the `xref` keyword begins.
#[test]
fn test_parse_multiple_xref_sections() {
let pdf_content = b"%PDF-1.4\n\
1 0 obj\n<< /Type /Catalog >>\nendobj\n\
2 0 obj\n<< /Type /Page >>\nendobj\n\
xref\n\
0 2\n\
0000000000 65535 f \n\
0000000015 00000 n \n\
5 2\n\
0000000100 00000 n \n\
0000000200 00000 n \n\
trailer\n\
<< /Size 7 /Root 1 0 R >>\n\
startxref\n\
78\n\
%%EOF";
let mut reader = BufReader::new(Cursor::new(pdf_content));
reader.seek(SeekFrom::Start(78)).unwrap();
let result = XRefTable::parse(&mut reader);
assert!(result.is_ok());
let table = result.unwrap();
assert_eq!(table.len(), 4);
assert!(table.get_entry(0).is_some());
assert!(table.get_entry(1).is_some());
assert!(table.get_entry(5).is_some());
assert!(table.get_entry(6).is_some());
}
// Feeds a trailer with `/Prev` to the lenient parser.
// NOTE(review): the final assertion is a tautology, so this only checks that the call
// returns without panicking.
#[test]
fn test_parse_xref_with_prev() {
let pdf_content = b"%PDF-1.4\n\
% First xref at 15\n\
xref\n\
0 2\n\
0000000000 65535 f \n\
0000000100 00000 n \n\
trailer\n\
<< /Size 2 >>\n\
% Second xref at 100\n\
xref\n\
2 1\n\
0000000200 00000 n \n\
trailer\n\
<< /Size 3 /Prev 15 >>\n\
startxref\n\
100\n\
%%EOF";
let mut reader = BufReader::new(Cursor::new(pdf_content));
let options = ParseOptions {
lenient_syntax: true,
..Default::default()
};
let result = XRefTable::parse_with_options(&mut reader, &options);
assert!(result.is_ok() || result.is_err());
}
// Garbage after the `xref` keyword must be rejected.
#[test]
fn test_invalid_xref_format() {
let pdf_content = b"xref\ninvalid content\ntrailer";
let mut reader = BufReader::new(Cursor::new(pdf_content));
let result = XRefTable::parse(&mut reader);
assert!(result.is_err());
}
// Maximum values for object number, offset and generation are stored unchanged.
#[test]
fn test_xref_entry_overflow() {
let mut table = XRefTable::new();
let entry = XRefEntry {
offset: u64::MAX,
generation: u16::MAX,
in_use: true,
};
table.add_entry(u32::MAX, entry);
let entry = table.get_entry(u32::MAX).unwrap();
assert_eq!(entry.offset, u64::MAX);
assert_eq!(entry.generation, u16::MAX);
}
// Basic presence and absence checks after inserting two entries.
#[test]
fn test_xref_table_operations() {
let mut table = XRefTable::new();
let entry1 = XRefEntry {
offset: 100,
generation: 0,
in_use: true,
};
let entry2 = XRefEntry {
offset: 200,
generation: 0,
in_use: true,
};
table.add_entry(1, entry1);
table.add_entry(2, entry2);
assert_eq!(table.len(), 2);
assert!(table.get_entry(1).is_some());
assert!(table.get_entry(2).is_some());
assert!(table.get_entry(3).is_none());
}
// Emulates a merge by re-adding `table2`'s entries to `table1`: the later entry for
// object 2 replaces the earlier one and object 3 is added.
#[test]
fn test_xref_table_merge() {
let mut table1 = XRefTable::new();
let entry1 = XRefEntry {
offset: 100,
generation: 0,
in_use: true,
};
table1.add_entry(1, entry1);
let entry2 = XRefEntry {
offset: 200,
generation: 0,
in_use: true,
};
table1.add_entry(2, entry2);
let mut table2 = XRefTable::new();
let entry3 = XRefEntry {
offset: 250,
generation: 1,
in_use: true,
}; table2.add_entry(2, entry3);
let entry4 = XRefEntry {
offset: 300,
generation: 0,
in_use: true,
}; table2.add_entry(3, entry4);
for i in 2..=3 {
if let Some(entry) = table2.get_entry(i) {
table1.add_entry(
i,
XRefEntry {
offset: entry.offset,
generation: entry.generation,
in_use: entry.in_use,
},
);
}
}
assert_eq!(table1.len(), 3);
let entry2 = table1.get_entry(2).unwrap();
assert_eq!(entry2.offset, 250);
assert_eq!(entry2.generation, 1);
assert!(table1.get_entry(3).is_some());
}
// Runs recovery over a single `/Type /ObjStm` object.
// NOTE(review): the assertion is a tautology, so this only checks that recovery returns
// without panicking.
#[test]
fn test_xref_recovery_with_stream() {
let pdf_content = b"1 0 obj\n<< /Type /ObjStm /N 2 /First 10 >>\nstream\n12345678901 0 2 0\nendstream\nendobj\n";
let mut reader = BufReader::new(Cursor::new(pdf_content));
let result = XRefTable::parse_with_recovery(&mut reader);
assert!(result.is_ok() || result.is_err());
}
// Equality holds for identical fields and fails when the offset differs.
#[test]
fn test_xref_entry_equality_advanced() {
let entry1 = XRefEntry {
offset: 100,
generation: 0,
in_use: true,
};
let entry2 = XRefEntry {
offset: 100,
generation: 0,
in_use: true,
};
let entry3 = XRefEntry {
offset: 200,
generation: 0,
in_use: true,
};
assert_eq!(entry1, entry2);
assert_ne!(entry1, entry3);
}
// Strict parsing of a malformed one-line xref must fail.
// NOTE(review): the lenient half ends in a tautological assertion and only checks that the
// call returns without panicking.
#[test]
fn test_parse_options_effect() {
let pdf_content = b"xref 0 1 invalid";
let mut reader = BufReader::new(Cursor::new(pdf_content));
let strict_options = ParseOptions {
lenient_syntax: false,
..Default::default()
};
let result = XRefTable::parse_with_options(&mut reader, &strict_options);
assert!(result.is_err());
reader.seek(SeekFrom::Start(0)).unwrap();
let lenient_options = ParseOptions {
lenient_syntax: true,
..Default::default()
};
let result = XRefTable::parse_with_options(&mut reader, &lenient_options);
assert!(result.is_err() || result.is_ok());
}
// The trailer's `/Prev` (10) equals the `startxref` value (10).
// NOTE(review): the assertion is a tautology, so this only checks that the call returns
// (i.e. terminates) without panicking.
#[test]
fn test_circular_reference_detection() {
let pdf_content = b"%PDF-1.4\n\
xref\n\
0 1\n\
0000000000 65535 f \n\
trailer\n\
<< /Size 1 /Prev 10 >>\n\
startxref\n\
10\n\
%%EOF";
let mut reader = BufReader::new(Cursor::new(pdf_content));
let result = XRefTable::parse_with_incremental_updates(&mut reader);
assert!(result.is_ok() || result.is_err());
}
// The fixture's `startxref` value is 63, but the `xref` keyword begins at byte 90 (after the
// linearization dictionary object); 90 is the expected result.
#[test]
fn test_linearized_xref_detection() {
let pdf_content = b"%PDF-1.4\n\
1 0 obj\n\
<< /Linearized 1 /L 1234 /H [100 200] /O 5 /E 500 /N 10 /T 600 >>\n\
endobj\n\
xref\n\
0 2\n\
0000000000 65535 f \n\
0000000009 00000 n \n\
trailer\n\
<< /Size 2 >>\n\
startxref\n\
63\n\
%%EOF";
let mut reader = BufReader::new(Cursor::new(pdf_content));
let result = XRefTable::find_linearized_xref(&mut reader);
assert!(result.is_ok());
let xref_pos = result.unwrap();
assert_eq!(
xref_pos, 90,
"Expected xref at position 90, got {}",
xref_pos
);
}
// Points the parser at an object of `/Type /XRef` with `/W [1 2 1]` and 12 bytes of data.
// NOTE(review): the assertion is a tautology, so this only checks that the call returns
// without panicking.
#[test]
fn test_xref_stream_parsing() {
let pdf_content = b"%PDF-1.5\n\
1 0 obj\n\
<< /Type /XRef /Size 3 /W [1 2 1] /Length 12 >>\n\
stream\n\
\x00\x00\x00\x00\
\x01\x00\x10\x00\
\x01\x00\x20\x00\
endstream\n\
endobj\n\
startxref\n\
9\n\
%%EOF";
let mut reader = BufReader::new(Cursor::new(pdf_content));
reader.seek(SeekFrom::Start(9)).unwrap();
let result = XRefTable::parse(&mut reader);
assert!(result.is_err() || result.is_ok());
}
// A subsection starting at object 10 while the trailer declares `/Size 5` must be rejected.
#[test]
fn test_xref_validation_max_object_exceeds_size() {
let pdf_content = b"%PDF-1.4\n\
xref\n\
0 1\n\
0000000000 65535 f \n\
10 1\n\
0000000100 00000 n \n\
trailer\n\
<< /Size 5 /Root 1 0 R >>\n\
startxref\n\
9\n\
%%EOF";
let mut reader = BufReader::new(Cursor::new(pdf_content));
reader.seek(SeekFrom::Start(9)).unwrap();
let result = XRefTable::parse(&mut reader);
assert!(result.is_err());
}
// A well-formed table must parse successfully under both strict and lenient options.
#[test]
fn test_parse_with_options_lenient_vs_strict() {
let pdf_content = b"%PDF-1.4\n\
xref\n\
0 2\n\
0000000000 65535 f \n\
0000000015 00000 n \n\
trailer\n\
<< /Size 2 >>\n\
startxref\n\
9\n\
%%EOF";
let mut reader = BufReader::new(Cursor::new(pdf_content));
let strict_options = ParseOptions {
lenient_syntax: false,
recover_from_stream_errors: false,
..Default::default()
};
reader.seek(SeekFrom::Start(9)).unwrap();
let strict_result = XRefTable::parse_with_options(&mut reader, &strict_options);
let lenient_options = ParseOptions {
lenient_syntax: true,
recover_from_stream_errors: true,
..Default::default()
};
reader.seek(SeekFrom::Start(9)).unwrap();
let lenient_result = XRefTable::parse_with_options(&mut reader, &lenient_options);
assert!(strict_result.is_ok());
assert!(lenient_result.is_ok());
}
// The in-use/free flag may be glued to the generation number (`0n`, `1f`).
#[test]
fn test_xref_entry_with_attached_flag() {
let entry1 = XRefTable::parse_xref_entry("12345 0n");
assert!(entry1.is_ok());
let entry1 = entry1.unwrap();
assert_eq!(entry1.offset, 12345);
assert_eq!(entry1.generation, 0);
assert!(entry1.in_use);
let entry2 = XRefTable::parse_xref_entry("54321 1f");
assert!(entry2.is_ok());
let entry2 = entry2.unwrap();
assert_eq!(entry2.offset, 54321);
assert_eq!(entry2.generation, 1);
assert!(!entry2.in_use);
}
// `find_xref_offset` with leading garbage, a plain tail, a tail without `%%EOF`, and input
// without `startxref`.
// NOTE(review): the missing-`%%EOF` case ends in a tautological assertion and only checks
// that the call returns without panicking.
#[test]
fn test_find_xref_offset_edge_cases() {
use std::io::{BufReader, Cursor};
let content = b"garbage\nstartxref \n 123 \n%%EOF";
let mut reader = BufReader::new(Cursor::new(content));
let result = XRefTable::find_xref_offset(&mut reader);
assert_eq!(result.unwrap(), 123);
let content = b"startxref\n999\n%%EOF";
let mut reader = BufReader::new(Cursor::new(content));
let result = XRefTable::find_xref_offset(&mut reader);
assert_eq!(result.unwrap(), 999);
let content = b"startxref\n456";
let mut reader = BufReader::new(Cursor::new(content));
let result = XRefTable::find_xref_offset(&mut reader);
assert!(result.is_ok() || result.is_err());
let content = b"some content\n%%EOF";
let mut reader = BufReader::new(Cursor::new(content));
let result = XRefTable::find_xref_offset(&mut reader);
assert!(result.is_err());
}
// The subsection header announces 5 entries but only 2 are present.
// NOTE(review): the assertion is a tautology, so this only checks that the call returns
// without panicking.
#[test]
fn test_xref_subsection_incomplete() {
let pdf_content = b"%PDF-1.4\n\
xref\n\
0 5\n\
0000000000 65535 f \n\
0000000015 00000 n \n\
trailer\n\
<< /Size 5 >>\n\
startxref\n\
9\n\
%%EOF";
let mut reader = BufReader::new(Cursor::new(pdf_content));
reader.seek(SeekFrom::Start(9)).unwrap();
let result = XRefTable::parse(&mut reader);
assert!(result.is_err() || result.is_ok());
}
}
/// Extracts the object number of the `/Root` reference from an XRef stream dictionary
/// found in `content`.
///
/// The text is scanned line by line. An object counts as an XRef stream when a line
/// containing ` obj` is immediately followed by a line that declares the type `/XRef`.
/// From there until the next line containing `endobj`, the first `/Root <n>` whose `<n>`
/// parses as a `u32` is returned. Returns `None` when no such reference is found.
///
/// PDF names are self-delimiting and any white-space may separate tokens, so both
/// `/Type /XRef` and the compact `/Type/XRef` are recognised, and `/Root` may be separated
/// from the object number by any run of white-space (not only a single space).
fn extract_root_from_xref_stream(content: &str) -> Option<u32> {
    // True when `line` holds a `/Type` key directly followed (optionally after white-space)
    // by the name `/XRef`.
    fn declares_xref_type(line: &str) -> bool {
        line.match_indices("/Type")
            .any(|(pos, key)| line[pos + key.len()..].trim_start().starts_with("/XRef"))
    }

    // Object number of the first `/Root <n>` on `line` whose `<n>` parses as `u32`.
    fn root_object_number(line: &str) -> Option<u32> {
        line.match_indices("/Root").find_map(|(pos, key)| {
            let rest = &line[pos + key.len()..];
            // Require white-space after the key so longer names (e.g. `/RootFoo`) are not
            // mistaken for `/Root`.
            if !rest.starts_with(char::is_whitespace) {
                return None;
            }
            rest.split_whitespace().next()?.parse::<u32>().ok()
        })
    }

    let lines: Vec<&str> = content.lines().collect();
    let mut in_xref_obj = false;
    for (i, line) in lines.iter().enumerate() {
        // Object header whose next line declares an XRef stream: start looking for `/Root`
        // on the following lines.
        if line.contains(" obj")
            && lines
                .get(i + 1)
                .map_or(false, |next| declares_xref_type(next))
        {
            in_xref_obj = true;
            continue;
        }
        if !in_xref_obj {
            continue;
        }
        // Leaving the XRef object: stop looking until another one starts.
        if line.contains("endobj") {
            in_xref_obj = false;
            continue;
        }
        if let Some(root_obj) = root_object_number(line) {
            tracing::debug!("Extracted Root {} from XRef stream", root_obj);
            return Some(root_obj);
        }
    }
    None
}
/// Searches the in-use objects of `table` for one whose content declares the catalog type.
///
/// Object numbers are visited in ascending order and the first match wins. For each in-use
/// entry the object's content is obtained through `read_object_content` at the entry's
/// offset; entries that are not in use, and objects for which no content is returned, are
/// skipped.
///
/// PDF names are self-delimiting, so both `/Type /Catalog` and the compact `/Type/Catalog`
/// spelling are accepted.
///
/// # Errors
///
/// Propagates any error returned by `read_object_content`.
fn find_catalog_by_content<R: Read + Seek>(
    reader: &mut R,
    table: &XRefTable,
) -> ParseResult<Option<u32>> {
    // Sort the object numbers so candidates are probed in a stable, ascending order.
    let mut obj_numbers: Vec<u32> = table.entries.keys().copied().collect();
    obj_numbers.sort_unstable();

    for obj_num in obj_numbers {
        let offset = match table.entries.get(&obj_num) {
            Some(entry) if entry.in_use => entry.offset,
            _ => continue,
        };
        let Some(content) = read_object_content(reader, obj_num, offset)? else {
            continue;
        };
        if content.contains("/Type /Catalog") || content.contains("/Type/Catalog") {
            tracing::debug!(
                "Found catalog candidate at object {} (validated structure)",
                obj_num
            );
            return Ok(Some(obj_num));
        }
    }
    tracing::debug!("No valid catalog found by content search");
    Ok(None)
}