use crate::error::{Error, Result};
use crate::object::Object;
use crate::parser::parse_object;
use crate::xref::{CrossRefTable, XRefEntry};
use lazy_static::lazy_static;
use std::collections::HashMap;
use std::io::{BufRead, BufReader, Read, Seek, SeekFrom};
// Regexes shared by the recovery scan. `lazy_static!` builds each one the first
// time it is dereferenced; the `unwrap()` calls only guard the literal patterns.
lazy_static! {
// Indirect-object header: digits, whitespace, digits, whitespace, `obj`.
// `reconstruct_xref` reads capture group 1 as the object number and
// capture group 2 as the generation number.
static ref RE_OBJ_PATTERN: regex::bytes::Regex = regex::bytes::Regex::new(r"(\d+)\s+(\d+)\s+obj").unwrap();
// The `trailer` keyword followed, after optional whitespace, by the `<<`
// that opens a dictionary. Used by `find_trailer`.
static ref RE_TRAILER: regex::bytes::Regex = regex::bytes::Regex::new(r"trailer\s*<<").unwrap();
}
/// Rebuilds a cross-reference table by scanning the whole file for
/// `<number> <generation> obj` headers, then locates a trailer for it.
///
/// The reader is rewound to offset 0 and read completely into memory. Every
/// match of `RE_OBJ_PATTERN` is recorded as an uncompressed entry at the byte
/// offset of the header, unless the first non-whitespace byte after `obj`
/// cannot begin a PDF object (which indicates a false positive, e.g. a match
/// inside stream data). A header followed only by whitespace up to the end of
/// the file is accepted without validation.
///
/// # Errors
///
/// * I/O errors from seeking or reading `reader`.
/// * `Error::InvalidPdf` when no object header was accepted.
/// * Any error returned by `find_trailer`.
pub fn reconstruct_xref<R: Read + Seek>(reader: &mut R) -> Result<(CrossRefTable, Object)> {
    log::info!("Reconstructing xref table by scanning file...");
    reader.seek(SeekFrom::Start(0))?;
    let mut contents = Vec::new();
    reader.read_to_end(&mut contents)?;
    log::debug!("File size: {} bytes", contents.len());

    let mut xref = CrossRefTable::new();
    let mut objects_found = 0;

    for capture in RE_OBJ_PATTERN.captures_iter(&contents) {
        // Group 0 is the whole header; groups 1 and 2 are mandatory in the
        // pattern, so the fallback arm is purely defensive.
        let (header, obj_digits, gen_digits) =
            match (capture.get(0), capture.get(1), capture.get(2)) {
                (Some(header), Some(number), Some(generation)) => {
                    (header, number.as_bytes(), generation.as_bytes())
                }
                _ => continue,
            };

        // Parsing fails when the digit run does not fit the target integer type.
        let obj_num: u32 = match parse_decimal(obj_digits) {
            Some(n) => n,
            None => {
                log::warn!("Failed to parse object number at offset {}", header.start());
                continue;
            }
        };
        let gen_num: u16 = match parse_decimal(gen_digits) {
            Some(n) => n,
            None => {
                log::warn!("Failed to parse generation number at offset {}", header.start());
                continue;
            }
        };

        let offset = header.start() as u64;

        // Peek at the first non-whitespace byte after the `obj` keyword.
        let first_body_byte = contents[header.end()..]
            .iter()
            .copied()
            .find(|byte| !byte.is_ascii_whitespace());

        if let Some(next_byte) = first_body_byte {
            if !is_plausible_object_start(next_byte) {
                log::debug!(
                    "Skipping false positive object header at offset {} (next byte: 0x{:02x} '{}')",
                    offset,
                    next_byte,
                    if next_byte.is_ascii_graphic() {
                        next_byte as char
                    } else {
                        '?'
                    }
                );
                continue;
            }
            log::debug!("Validated object {} gen {} at offset {}", obj_num, gen_num, offset);
        }

        xref.add_entry(obj_num, XRefEntry::uncompressed(offset, gen_num));
        objects_found += 1;
    }

    log::info!("Reconstructed xref with {} objects", objects_found);
    if objects_found == 0 {
        return Err(Error::InvalidPdf("No objects found during xref reconstruction".to_string()));
    }

    let trailer = find_trailer(&contents, reader, &xref)?;
    Ok((xref, trailer))
}

/// Parses a run of ASCII digits into an integer, returning `None` on invalid
/// UTF-8 or when the value does not fit in `T`.
fn parse_decimal<T: std::str::FromStr>(digits: &[u8]) -> Option<T> {
    std::str::from_utf8(digits).ok()?.parse().ok()
}

/// Returns `true` when `byte` can be the first byte of a PDF object body:
/// dictionary/stream or hex string (`<`), array (`[`), literal string (`(`),
/// name (`/`), `true`/`false`/`null`, or a number (digit, sign, or leading
/// decimal point).
fn is_plausible_object_start(byte: u8) -> bool {
    matches!(
        byte,
        b'<' | b'[' | b'(' | b'/' | b't' | b'f' | b'n' | b'-' | b'+' | b'.'
    ) || byte.is_ascii_digit()
}
/// Finds the trailer dictionary to pair with a reconstructed xref table.
///
/// Every `trailer <<` occurrence in `contents` is parsed. Selection order:
///
/// 1. the last parsed trailer whose dictionary contains a `Root` key;
/// 2. otherwise a minimal trailer synthesized by `reconstruct_minimal_trailer`;
/// 3. otherwise, if that synthesis fails, the last trailer that parsed but had
///    no `Root` (kept so callers still receive what the file provided).
///
/// # Errors
///
/// Returns the error from `reconstruct_minimal_trailer` when no trailer could
/// be parsed at all and no catalog object was found.
fn find_trailer<R: Read + Seek>(
    contents: &[u8],
    reader: &mut R,
    xref: &CrossRefTable,
) -> Result<Object> {
    log::debug!("Searching for trailer dictionary...");

    let mut rooted: Option<Object> = None;
    let mut rootless: Option<Object> = None;

    for mat in RE_TRAILER.find_iter(contents) {
        let trailer_start = mat.start();
        log::debug!("Found trailer keyword at offset {}", trailer_start);

        // The pattern ends with `<<`, so the dictionary opens exactly two
        // bytes before the end of the match. Starting there avoids relying on
        // the parser to skip whitespace after the keyword.
        let dict_start = mat.end() - 2;
        match parse_object(&contents[dict_start..]) {
            Ok((_, obj)) => {
                let has_root = obj
                    .as_dict()
                    .map_or(false, |dict| dict.contains_key("Root"));
                if has_root {
                    rooted = Some(obj);
                } else {
                    log::debug!("Trailer at offset {} has no Root entry", trailer_start);
                    rootless = Some(obj);
                }
            }
            Err(e) => {
                log::warn!("Failed to parse trailer dictionary at offset {}: {}", trailer_start, e);
            }
        }
    }

    if let Some(trailer) = rooted {
        log::info!("Successfully parsed trailer dictionary (using last valid occurrence)");
        return Ok(trailer);
    }

    log::info!("Reconstructing minimal trailer dictionary...");
    match reconstruct_minimal_trailer(reader, xref) {
        Ok(trailer) => Ok(trailer),
        Err(err) => match rootless {
            Some(trailer) => {
                log::warn!("Minimal trailer reconstruction failed ({}); using trailer without Root", err);
                Ok(trailer)
            }
            None => Err(err),
        },
    }
}
fn reconstruct_minimal_trailer<R: Read + Seek>(
reader: &mut R,
xref: &CrossRefTable,
) -> Result<Object> {
log::debug!("Scanning objects to find catalog...");
let mut catalog_ref = None;
for (idx, obj_num) in xref.all_object_numbers().enumerate() {
if idx >= 100 {
break;
}
if let Some(entry) = xref.get(obj_num) {
if !entry.in_use {
continue;
}
match load_object_at_offset(reader, entry.offset) {
Ok(obj) => {
if is_catalog(&obj) {
log::info!("Found catalog: object {} gen {}", obj_num, entry.generation);
catalog_ref = Some((obj_num, entry.generation));
break;
}
},
Err(e) => {
log::debug!(
"Failed to load object {} at offset {}: {}",
obj_num,
entry.offset,
e
);
continue;
},
}
}
}
if catalog_ref.is_none() {
return Err(Error::InvalidPdf("Could not find catalog in reconstructed xref".to_string()));
}
let (cat_num, cat_gen) = catalog_ref.expect("catalog_ref validated above");
let mut trailer_dict = HashMap::new();
trailer_dict.insert(
"Root".to_string(),
Object::Reference(crate::object::ObjectRef::new(cat_num, cat_gen)),
);
trailer_dict.insert("Size".to_string(), Object::Integer(xref.len() as i64));
Ok(Object::Dictionary(trailer_dict))
}
/// Reads and parses the indirect object whose header starts at `offset`.
///
/// Data is read line by line (split on `\n`) until a line containing `endobj`
/// has been consumed or the input ends. The collected bytes are then
/// tokenized as `<number> <generation> obj` followed by the object body,
/// and only the body is returned.
///
/// # Errors
///
/// * `Error::Io` / converted I/O errors from seeking or reading.
/// * `Error::InvalidPdf("Object too large")` when more than 1 MiB is read
///   before `endobj` is seen.
/// * `Error::ParseError` when the header tokens or the body cannot be parsed.
fn load_object_at_offset<R: Read + Seek>(reader: &mut R, offset: u64) -> Result<Object> {
    use crate::lexer::token;

    const MAX_OBJECT_SIZE: usize = 1024 * 1024;
    const END_MARKER: &[u8] = b"endobj";

    reader.seek(SeekFrom::Start(offset))?;

    // Cap the reader one byte past the limit so a single newline-free run
    // (e.g. binary stream data) cannot be buffered without bound before the
    // size check below fires.
    let mut limited = BufReader::new(reader).take(MAX_OBJECT_SIZE as u64 + 1);
    let mut content = Vec::new();

    loop {
        // Earlier bytes were already searched; rescan only the new line plus
        // enough overlap to catch a marker split across two reads.
        let scan_from = content.len().saturating_sub(END_MARKER.len() - 1);

        match limited.read_until(b'\n', &mut content) {
            Ok(0) => break,
            Ok(_) => {
                if content.len() > MAX_OBJECT_SIZE {
                    return Err(Error::InvalidPdf("Object too large".to_string()));
                }
                let saw_end = content[scan_from..]
                    .windows(END_MARKER.len())
                    .any(|window| window == END_MARKER);
                if saw_end {
                    break;
                }
            }
            Err(e) => return Err(Error::Io(e)),
        }
    }

    // Skip the three header tokens, then parse the object body itself.
    let input = &content[..];
    let (rest, _) = token(input).map_err(|e| Error::ParseError {
        offset: 0,
        reason: format!("failed to parse object number: {}", e),
    })?;
    let (rest, _) = token(rest).map_err(|e| Error::ParseError {
        offset: 0,
        reason: format!("failed to parse generation: {}", e),
    })?;
    let (rest, _) = token(rest).map_err(|e| Error::ParseError {
        offset: 0,
        reason: format!("failed to parse 'obj' keyword: {}", e),
    })?;
    let (_, obj) = parse_object(rest).map_err(|e| Error::ParseError {
        offset: 0,
        reason: format!("failed to parse object: {}", e),
    })?;
    Ok(obj)
}
/// Reports whether `obj` is a dictionary whose `Type` entry is the name
/// `Catalog`. Any other shape (not a dictionary, no `Type`, `Type` not a
/// name) yields `false`.
fn is_catalog(obj: &Object) -> bool {
    obj.as_dict()
        .and_then(|dict| dict.get("Type"))
        .and_then(|type_obj| type_obj.as_name())
        .map_or(false, |type_name| type_name == "Catalog")
}
pub fn search_nearby_for_object<R: Read + Seek>(
reader: &mut R,
obj_id: u32,
approx_offset: u64,
) -> Result<Object> {
log::debug!("Searching for object {} near offset {}", obj_id, approx_offset);
let search_range = 1024u64;
let start = approx_offset.saturating_sub(search_range);
let end = approx_offset + search_range;
reader.seek(SeekFrom::Start(start))?;
let mut buffer = vec![0u8; (end - start) as usize];
let bytes_read = reader.read(&mut buffer)?;
let buffer = &buffer[..bytes_read];
let pattern = format!(r"{} \d+ obj", obj_id);
let re = match regex::bytes::Regex::new(&pattern) {
Ok(r) => r,
Err(_) => return Err(Error::ObjectNotFound(obj_id, 0)),
};
if let Some(mat) = re.find(buffer) {
let obj_offset = start + mat.start() as u64;
log::debug!(
"Found object {} at offset {} (expected {})",
obj_id,
obj_offset,
approx_offset
);
return load_object_at_offset(reader, obj_offset);
}
Err(Error::ObjectNotFound(obj_id, 0))
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// A well-formed two-object file: both objects must be indexed and the
    /// trailer found in the file must expose `Root`.
    #[test]
    fn test_reconstruct_simple_pdf() {
        let pdf_data = b"%PDF-1.4\n\
            1 0 obj\n\
            << /Type /Catalog /Pages 2 0 R >>\n\
            endobj\n\
            2 0 obj\n\
            << /Type /Pages /Count 0 /Kids [] >>\n\
            endobj\n\
            trailer\n\
            << /Root 1 0 R /Size 3 >>\n\
            startxref\n\
            0\n\
            %%EOF";
        let mut cursor = Cursor::new(pdf_data);
        let result = reconstruct_xref(&mut cursor);
        assert!(result.is_ok());
        let (xref, trailer) = result.unwrap();
        assert!(xref.contains(1));
        assert!(xref.contains(2));
        if let Some(dict) = trailer.as_dict() {
            assert!(dict.contains_key("Root"));
        } else {
            panic!("Trailer is not a dictionary");
        }
    }

    /// `is_catalog` accepts a `/Type /Catalog` dictionary and rejects a
    /// non-dictionary object.
    #[test]
    fn test_is_catalog() {
        let mut dict = HashMap::new();
        dict.insert("Type".to_string(), Object::Name("Catalog".to_string()));
        let catalog = Object::Dictionary(dict);
        assert!(is_catalog(&catalog));
        let not_catalog = Object::Integer(42);
        assert!(!is_catalog(&not_catalog));
    }

    /// A file without any object header must be reported as an error.
    #[test]
    fn test_reconstruct_no_objects() {
        let pdf_data = b"%PDF-1.4\n\
            This is not a valid PDF with objects\n\
            %%EOF";
        let mut cursor = Cursor::new(pdf_data);
        let result = reconstruct_xref(&mut cursor);
        assert!(result.is_err());
    }
}