use std::collections::HashMap;
use crate::object::{ObjRef, PdfDict, PdfObject};
/// One row of the classic cross-reference table, stored in
/// `ParsedDocument::xref` under its object number.
#[derive(Debug, Clone)]
struct XRefEntry {
/// First numeric field of the row; used as the byte position in the file
/// buffer at which parsing of the object starts.
offset: u64,
/// Second numeric field of the row (generation number).
// Kept so the table is stored in full; the code visible in this file never reads it.
#[allow(dead_code)]
gen: u16,
/// `true` when the row's status flag is `n`; any other flag is treated as a free entry.
in_use: bool,
}
/// An in-memory PDF whose cross-reference table and trailer have been read.
///
/// Objects are not parsed up front; each lookup re-parses the object from
/// `data` at the offset recorded in `xref`.
pub struct ParsedDocument {
/// The complete file contents.
data: Vec<u8>,
/// Cross-reference entries keyed by object number.
xref: HashMap<u32, XRefEntry>,
/// The trailer dictionary; `Root`, `Info` and `Size` are read from it.
pub trailer: PdfDict,
}
/// Errors reported while reading a PDF file.
#[derive(Debug)]
pub enum ParseError {
/// The buffer does not begin with `%PDF-`.
InvalidSignature,
/// The `startxref` pointer or the cross-reference table could not be read.
MalformedXRef,
/// An object is syntactically or structurally invalid; the payload describes the problem.
MalformedObject(String),
/// The input ended while more bytes were required.
UnexpectedEof,
/// The file uses a construct this parser does not handle; the payload names it.
UnsupportedFeature(String),
}
impl std::fmt::Display for ParseError {
    /// Writes a short, human-readable description of the error.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Each variant has a fixed message; the payload-carrying variants
        // append their detail string to it.
        let (message, detail): (&str, Option<&str>) = match self {
            ParseError::InvalidSignature => ("Not a valid PDF file", None),
            ParseError::MalformedXRef => ("Malformed cross-reference table", None),
            ParseError::MalformedObject(what) => ("Malformed object: ", Some(what.as_str())),
            ParseError::UnexpectedEof => ("Unexpected end of file", None),
            ParseError::UnsupportedFeature(what) => {
                ("Unsupported feature: ", Some(what.as_str()))
            }
        };
        f.write_str(message)?;
        match detail {
            Some(text) => f.write_str(text),
            None => Ok(()),
        }
    }
}
impl ParsedDocument {
/// Reads the cross-reference table and trailer of a PDF held in memory.
///
/// The buffer must begin with `%PDF-`. The `startxref` pointer is located
/// with `find_startxref` and the table it points at is read with
/// `parse_xref`. Individual objects are parsed later, on demand.
///
/// # Errors
/// `ParseError::InvalidSignature` when the signature is missing; otherwise
/// whatever `find_startxref` or `parse_xref` reports.
pub fn parse(data: Vec<u8>) -> Result<Self, ParseError> {
    let has_signature = data.starts_with(b"%PDF-");
    if !has_signature {
        return Err(ParseError::InvalidSignature);
    }
    let xref_start = find_startxref(&data)?;
    parse_xref(&data, xref_start).map(|(xref, trailer)| Self { data, xref, trailer })
}
/// Returns the number of objects the document declares.
///
/// Uses the trailer's integer `/Size` when it is present and representable
/// as `usize`. Falls back to the number of cross-reference entries when
/// `/Size` is absent, not an integer, negative, or too large.
pub fn object_count(&self) -> usize {
    let declared: Option<usize> = match self.trailer.get("Size") {
        // A checked conversion keeps a negative /Size from wrapping around
        // to an enormous count.
        Some(PdfObject::Integer(n)) => <usize as std::convert::TryFrom<i64>>::try_from(*n).ok(),
        _ => None,
    };
    declared.unwrap_or_else(|| self.xref.len())
}
/// Parses and returns the object with the given reference.
///
/// The lookup uses only `r.id`. An entry that is not in use yields
/// `PdfObject::Null`.
///
/// # Errors
/// `ParseError::MalformedObject` when the object number has no
/// cross-reference entry, or any error raised while parsing the object.
pub fn get_object(&self, r: ObjRef) -> Result<PdfObject, ParseError> {
    match self.xref.get(&r.id) {
        None => Err(ParseError::MalformedObject(format!("Object {} not found", r.id))),
        Some(entry) if !entry.in_use => Ok(PdfObject::Null),
        Some(entry) => {
            let mut parser = ObjectParser::new(&self.data, entry.offset as usize);
            parser.skip_obj_header()?;
            parser.parse_object()
        }
    }
}
/// Returns the trailer's `/Root` entry when it is an indirect reference.
pub fn root_ref(&self) -> Option<ObjRef> {
    if let Some(PdfObject::Reference(target)) = self.trailer.get("Root") {
        Some(*target)
    } else {
        None
    }
}
/// Returns the trailer's `/Info` entry when it is an indirect reference.
pub fn info_ref(&self) -> Option<ObjRef> {
    if let Some(PdfObject::Reference(target)) = self.trailer.get("Info") {
        Some(*target)
    } else {
        None
    }
}
/// Lists the references of all page objects in page-tree order.
///
/// Starts at the catalog named by the trailer's `/Root`, follows its
/// `/Pages` reference and walks the tree with `collect_pages`.
///
/// # Errors
/// `ParseError::MalformedObject` when `/Root` or `/Pages` is missing or is
/// not a reference, plus any error raised while reading the objects.
pub fn page_refs(&self) -> Result<Vec<ObjRef>, ParseError> {
    let catalog_ref = match self.root_ref() {
        Some(found) => found,
        None => return Err(ParseError::MalformedObject("No /Root in trailer".into())),
    };
    let catalog = expect_dict(self.get_object(catalog_ref)?)?;
    let tree_root = if let Some(PdfObject::Reference(found)) = catalog.get("Pages") {
        *found
    } else {
        return Err(ParseError::MalformedObject("No /Pages in catalog".into()));
    };
    let mut pages = Vec::new();
    self.collect_pages(tree_root, &mut pages)?;
    Ok(pages)
}
fn collect_pages(&self, node_ref: ObjRef, refs: &mut Vec<ObjRef>) -> Result<(), ParseError> {
let obj = self.get_object(node_ref)?;
let dict = expect_dict(obj)?;
match dict.get("Type") {
Some(PdfObject::Name(t)) if t == "Pages" => {
if let Some(PdfObject::Array(kids)) = dict.get("Kids") {
let kid_refs: Vec<ObjRef> = kids.iter().filter_map(|k| {
if let PdfObject::Reference(r) = k { Some(*r) } else { None }
}).collect();
for kid in kid_refs {
self.collect_pages(kid, refs)?;
}
}
}
Some(PdfObject::Name(t)) if t == "Page" => {
refs.push(node_ref);
}
_ => {}
}
Ok(())
}
/// Extracts the text of every page, with pages separated by a newline.
///
/// # Errors
/// Propagates the first error from `page_refs` or `extract_page_text`.
pub fn extract_text(&self) -> Result<String, ParseError> {
    let mut per_page: Vec<String> = Vec::new();
    for page in self.page_refs()? {
        per_page.push(self.extract_page_text(page)?);
    }
    Ok(per_page.join("\n"))
}
/// Extracts the text of one page from its `/Contents`.
///
/// `/Contents` may be a single stream reference or an array of them; array
/// items that are not references are skipped and the per-stream texts are
/// concatenated without a separator. Any other `/Contents` value yields an
/// empty string.
fn extract_page_text(&self, page_ref: ObjRef) -> Result<String, ParseError> {
    let page = expect_dict(self.get_object(page_ref)?)?;
    match page.get("Contents") {
        Some(PdfObject::Reference(single)) => self.extract_stream_text(*single),
        Some(PdfObject::Array(parts)) => {
            let mut joined = String::new();
            for part in parts {
                if let PdfObject::Reference(stream_ref) = part {
                    joined.push_str(&self.extract_stream_text(*stream_ref)?);
                }
            }
            Ok(joined)
        }
        _ => Ok(String::new()),
    }
}
/// Decodes the content stream at `stream_ref` and extracts its text.
///
/// The stream bytes are interpreted as UTF-8, with invalid sequences
/// replaced, before being handed to `extract_text_from_content`.
///
/// # Errors
/// `ParseError::MalformedObject` when the object number has no
/// cross-reference entry, plus any error raised while parsing the stream.
fn extract_stream_text(&self, stream_ref: ObjRef) -> Result<String, ParseError> {
    let start = match self.xref.get(&stream_ref.id) {
        Some(entry) => entry.offset as usize,
        None => {
            return Err(ParseError::MalformedObject(format!(
                "Stream {} not found",
                stream_ref.id
            )))
        }
    };
    let mut parser = ObjectParser::new(&self.data, start);
    parser.skip_obj_header()?;
    let raw = parser.parse_stream_bytes()?;
    Ok(extract_text_from_content(&String::from_utf8_lossy(&raw)))
}
}
/// Pulls the shown text out of a decoded content stream.
///
/// This is a line-oriented approximation, not a full content-stream
/// interpreter. Only lines between a line that is exactly `BT` and one that
/// is exactly `ET` are inspected, and a line is used only when it ends in
/// `Tj`, `'` or `TJ`. For `Tj`/`'` the whole rest of the line must be one
/// literal string. Each recognised operator contributes its text followed by
/// a space, each `ET` contributes a newline, and the result is trimmed.
///
/// Lines are split on LF, CR or CRLF, since all three are valid PDF
/// end-of-line markers.
fn extract_text_from_content(content: &str) -> String {
    let mut result = String::new();
    let mut in_text = false;
    // Splitting on either character turns CRLF into an extra empty line,
    // which matches no operator and is therefore harmless.
    let lines = content
        .split(|c: char| c == '\n' || c == '\r')
        .map(str::trim);
    for line in lines {
        match line {
            "BT" => in_text = true,
            "ET" => {
                in_text = false;
                result.push('\n');
            }
            _ if in_text => {
                if line.ends_with("Tj") || line.ends_with('\'') {
                    let operand = line
                        .trim_end_matches("Tj")
                        .trim_end_matches('\'')
                        .trim();
                    if let Some(text) = extract_literal_string(operand) {
                        result.push_str(&text);
                        result.push(' ');
                    }
                } else if line.ends_with("TJ") {
                    let inner = line.trim_end_matches("TJ").trim();
                    let inner = inner.trim_start_matches('[').trim_end_matches(']');
                    for part in split_tj_array(inner) {
                        result.push_str(&part);
                    }
                    result.push(' ');
                }
            }
            _ => {}
        }
    }
    result.trim().to_string()
}
fn extract_literal_string(s: &str) -> Option<String> {
let s = s.trim();
if s.starts_with('(') && s.ends_with(')') {
let inner = &s[1..s.len() - 1];
Some(unescape_pdf_string(inner))
} else {
None
}
}
/// Collects the literal strings inside the body of a `TJ` array.
///
/// Everything outside parentheses (kerning numbers, hex strings) is
/// skipped. Inside a string, a backslash escapes the next character and
/// balanced unescaped parentheses are kept as part of the string. A string
/// missing its closing parenthesis runs to the end of the input. Each
/// string is passed through `unescape_pdf_string`.
fn split_tj_array(s: &str) -> Vec<String> {
    let chars: Vec<char> = s.chars().collect();
    let mut results = Vec::new();
    let mut i = 0;
    while i < chars.len() {
        if chars[i] != '(' {
            i += 1;
            continue;
        }
        let start = i + 1;
        let mut depth = 1usize;
        i = start;
        while i < chars.len() {
            match chars[i] {
                // Skip the escaped character so `\(` and `\)` do not affect nesting.
                '\\' => i += 1,
                '(' => depth += 1,
                ')' => {
                    depth -= 1;
                    if depth == 0 {
                        break;
                    }
                }
                _ => {}
            }
            i += 1;
        }
        // A trailing backslash can push `i` one past the end; clamp before slicing.
        let end = i.min(chars.len());
        let raw: String = chars[start..end].iter().collect();
        results.push(unescape_pdf_string(&raw));
        i = end + 1;
    }
    results
}
/// Resolves the backslash escapes of a PDF literal string body.
///
/// Handles `\n`, `\r`, `\t`, `\b`, `\f`, `\(`, `\)`, `\\` and one- to
/// three-digit octal codes. An octal value above 255 keeps only its low
/// eight bits. A backslash directly before an end-of-line (LF, CR or CRLF)
/// is a line continuation and produces nothing. A backslash before any
/// other character is dropped and the character kept; a lone trailing
/// backslash is dropped.
fn unescape_pdf_string(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    let mut chars = s.chars().peekable();
    while let Some(c) = chars.next() {
        if c != '\\' {
            out.push(c);
            continue;
        }
        match chars.next() {
            Some('n') => out.push('\n'),
            Some('r') => out.push('\r'),
            Some('t') => out.push('\t'),
            Some('b') => out.push('\u{8}'),
            Some('f') => out.push('\u{c}'),
            // Line continuation: swallow the end-of-line marker entirely.
            Some('\n') => {}
            Some('\r') => {
                if chars.peek() == Some(&'\n') {
                    chars.next();
                }
            }
            Some(first @ '0'..='7') => {
                let mut value = first as u32 - '0' as u32;
                for _ in 0..2 {
                    match chars.peek().and_then(|d| d.to_digit(8)) {
                        Some(digit) => {
                            value = value * 8 + digit;
                            chars.next();
                        }
                        None => break,
                    }
                }
                // High-order overflow is ignored rather than dropping the character.
                out.push((value & 0xFF) as u8 as char);
            }
            // Covers `\(`, `\)`, `\\` and any unrecognised escape such as `\8`.
            Some(other) => out.push(other),
            None => {}
        }
    }
    out
}
/// Finds the byte offset announced by the last `startxref` keyword.
///
/// Only the final 1024 bytes of `data` are searched. The keyword must be
/// followed by PDF whitespace (NUL, tab, LF, form feed, CR or space) and a
/// run of ASCII digits that ends at whitespace or at the end of the data.
/// Whatever follows the number is ignored, so bare-CR line endings and
/// binary bytes after `%%EOF` do not affect the result.
///
/// # Errors
/// `ParseError::MalformedXRef` when the keyword is missing or is not
/// followed by a valid number.
fn find_startxref(data: &[u8]) -> Result<u64, ParseError> {
    const NEEDLE: &[u8] = b"startxref";
    const TAIL_WINDOW: usize = 1024;
    const WHITESPACE: &[u8] = b"\0\t\n\x0C\r ";

    let search_from = data.len().saturating_sub(TAIL_WINDOW);
    // Scan backwards so the last keyword in the file wins.
    let keyword_at = (search_from..data.len())
        .rev()
        .find(|&i| data[i..].starts_with(NEEDLE))
        .ok_or(ParseError::MalformedXRef)?;

    let after = &data[keyword_at + NEEDLE.len()..];
    let number_start = after
        .iter()
        .position(|b| !WHITESPACE.contains(b))
        .ok_or(ParseError::MalformedXRef)?;
    let rest = &after[number_start..];
    let digit_count = rest.iter().take_while(|b| b.is_ascii_digit()).count();

    // Reject things like `123abc`: the number must stand on its own.
    if let Some(next) = rest.get(digit_count) {
        if !WHITESPACE.contains(next) {
            return Err(ParseError::MalformedXRef);
        }
    }

    std::str::from_utf8(&rest[..digit_count])
        .ok()
        .and_then(|digits| digits.parse::<u64>().ok())
        .ok_or(ParseError::MalformedXRef)
}
/// Reads the classic cross-reference table at `offset` and the trailer
/// dictionary that follows it.
///
/// The section must start with a line reading `xref`, followed by any
/// number of subsections (`first count` header, then `count` entry lines)
/// and the `trailer` keyword. Lines may end in LF, CR or CRLF and blank
/// lines are ignored. Only the bytes between `xref` and `trailer` need to
/// be valid UTF-8. Entry fields that are missing or unparsable default to
/// offset 0, generation 0 and "free". Only this one table is read; a
/// `/Prev` entry in the trailer is not followed.
///
/// # Errors
/// * `ParseError::MalformedXRef`: `offset` is out of range, the `trailer`
///   keyword is missing, the table is not valid UTF-8, a subsection header
///   is unparsable, or an object number overflows `u32`.
/// * `ParseError::UnsupportedFeature`: the section does not start with `xref`.
/// * `ParseError::UnexpectedEof`: a subsection announces more entries than
///   are present.
/// * Any error from parsing the trailer dictionary.
fn parse_xref(data: &[u8], offset: u64) -> Result<(HashMap<u32, XRefEntry>, PdfDict), ParseError> {
    const TRAILER: &[u8] = b"trailer";

    let offset = <usize as std::convert::TryFrom<u64>>::try_from(offset)
        .map_err(|_| ParseError::MalformedXRef)?;
    if offset >= data.len() {
        return Err(ParseError::MalformedXRef);
    }
    let tail = &data[offset..];

    // The first line decides whether this is a classic table at all.
    let first_line_end = tail
        .iter()
        .position(|&b| b == b'\n' || b == b'\r')
        .unwrap_or(tail.len());
    if String::from_utf8_lossy(&tail[..first_line_end]).trim() != "xref" {
        return Err(ParseError::UnsupportedFeature("Cross-reference streams (PDF 1.5+)".into()));
    }

    let trailer_pos = tail
        .windows(TRAILER.len())
        .position(|window| window == TRAILER)
        .ok_or(ParseError::MalformedXRef)?;

    // Decode only the table itself; the trailer dictionary may hold binary strings.
    let table = tail
        .get(first_line_end..trailer_pos)
        .and_then(|bytes| std::str::from_utf8(bytes).ok())
        .ok_or(ParseError::MalformedXRef)?;
    let mut lines = table
        .split(|c: char| c == '\n' || c == '\r')
        .map(str::trim)
        .filter(|line| !line.is_empty());

    let mut xref = HashMap::new();
    while let Some(header) = lines.next() {
        let mut parts = header.split_whitespace();
        let start: u32 = parts
            .next()
            .and_then(|s| s.parse().ok())
            .ok_or(ParseError::MalformedXRef)?;
        let count: u32 = parts
            .next()
            .and_then(|s| s.parse().ok())
            .ok_or(ParseError::MalformedXRef)?;
        for i in 0..count {
            let entry_line = lines.next().ok_or(ParseError::UnexpectedEof)?;
            let mut fields = entry_line.split_whitespace();
            let entry_offset: u64 = fields.next().and_then(|s| s.parse().ok()).unwrap_or(0);
            let gen: u16 = fields.next().and_then(|s| s.parse().ok()).unwrap_or(0);
            let status = fields.next().unwrap_or("f");
            let object_number = start.checked_add(i).ok_or(ParseError::MalformedXRef)?;
            xref.insert(object_number, XRefEntry {
                offset: entry_offset,
                gen,
                in_use: status == "n",
            });
        }
    }

    let mut parser = ObjectParser::new(data, offset + trailer_pos + TRAILER.len());
    parser.skip_whitespace();
    let trailer = expect_dict(parser.parse_object()?)?;
    Ok((xref, trailer))
}
/// Unwraps a dictionary object.
///
/// # Errors
/// `ParseError::MalformedObject` naming the value actually found when `obj`
/// is not a `PdfObject::Dictionary`.
fn expect_dict(obj: PdfObject) -> Result<PdfDict, ParseError> {
    match obj {
        PdfObject::Dictionary(dict) => Ok(dict),
        unexpected => {
            let message = format!("Expected dictionary, got {:?}", unexpected);
            Err(ParseError::MalformedObject(message))
        }
    }
}
/// Cursor-based parser for PDF objects over a borrowed byte buffer.
struct ObjectParser<'a> {
/// The buffer being parsed.
data: &'a [u8],
/// Index of the next byte to read; it can move past `data.len()` because
/// `advance` increments it even at end of input.
pos: usize,
}
impl<'a> ObjectParser<'a> {
fn new(data: &'a [u8], pos: usize) -> Self {
Self { data, pos }
}
/// Returns the byte under the cursor without consuming it, or `None` at or
/// beyond the end of the buffer.
fn peek(&self) -> Option<u8> {
    if self.pos < self.data.len() {
        Some(self.data[self.pos])
    } else {
        None
    }
}
/// Returns the byte under the cursor and moves the cursor forward by one.
///
/// The cursor moves even when there is no byte left, in which case `None`
/// is returned.
fn advance(&mut self) -> Option<u8> {
    let current = self.peek();
    self.pos += 1;
    current
}
/// Moves the cursor past whitespace and comments.
///
/// Whitespace is the PDF set: NUL, tab, LF, form feed, CR and space. A
/// comment starts at `%` and runs up to, but not including, the next LF or
/// CR; that end-of-line byte is then skipped as ordinary whitespace.
fn skip_whitespace(&mut self) {
    while let Some(b) = self.peek() {
        match b {
            b'%' => {
                while let Some(c) = self.peek() {
                    if c == b'\n' || c == b'\r' {
                        break;
                    }
                    self.advance();
                }
            }
            0 | b'\t' | b'\n' | 0x0C | b'\r' | b' ' => {
                self.advance();
            }
            _ => break,
        }
    }
}
/// Moves the cursor past the `N G obj` header of an indirect object.
///
/// Skips leading whitespace, then scans forward to the first occurrence of
/// the bytes `obj` and positions the cursor after it, skipping any
/// whitespace that follows. When `obj` never appears the cursor ends at
/// the end of the buffer. The numbers in front of `obj` are not validated
/// and this method always returns `Ok`.
fn skip_obj_header(&mut self) -> Result<(), ParseError> {
    self.skip_whitespace();
    let remaining = self.data.get(self.pos..).unwrap_or(&[]);
    let consumed = match remaining.windows(3).position(|window| window == b"obj") {
        Some(found_at) => found_at + 3,
        None => remaining.len(),
    };
    self.pos += consumed;
    self.skip_whitespace();
    Ok(())
}
/// Parses the object starting at the cursor, after skipping whitespace.
///
/// The first byte selects the kind: `<<` dictionary or stream, `<` hex
/// string, `(` literal string, `/` name, `[` array, `true`/`false`/`null`
/// keywords, and a sign, dot or digit for a number or indirect reference.
///
/// # Errors
/// `ParseError::UnexpectedEof` at end of input, and
/// `ParseError::MalformedObject` for an unknown leading byte or a
/// misspelled keyword, plus any error from the kind-specific parser.
fn parse_object(&mut self) -> Result<PdfObject, ParseError> {
    self.skip_whitespace();
    let first = self.peek().ok_or(ParseError::UnexpectedEof)?;

    // Keywords share one shape: expected spelling, resulting value, complaint.
    let keyword: Option<(&[u8], PdfObject, &str)> = match first {
        b't' => Some((&b"true"[..], PdfObject::Boolean(true), "Expected 'true'")),
        b'f' => Some((&b"false"[..], PdfObject::Boolean(false), "Expected 'false'")),
        b'n' => Some((&b"null"[..], PdfObject::Null, "Expected 'null'")),
        _ => None,
    };
    if let Some((spelling, value, complaint)) = keyword {
        return if self.data[self.pos..].starts_with(spelling) {
            self.pos += spelling.len();
            Ok(value)
        } else {
            Err(ParseError::MalformedObject(complaint.into()))
        };
    }

    match first {
        b'<' if self.data.get(self.pos + 1) == Some(&b'<') => self.parse_dict_or_stream(),
        b'<' => self.parse_hex_string(),
        b'(' => self.parse_literal_string(),
        b'/' => self.parse_name(),
        b'[' => self.parse_array(),
        b'-' | b'+' | b'.' | b'0'..=b'9' => self.parse_number_or_ref(),
        other => Err(ParseError::MalformedObject(format!(
            "Unexpected byte 0x{:02X} at offset {}",
            other,
            self.pos
        ))),
    }
}
/// Parses a name object and returns it without the leading slash.
///
/// The byte under the cursor is consumed unconditionally as the slash. The
/// name then runs until whitespace or one of `<>[]()/%`. A `#` introduces a
/// two-digit hexadecimal escape for a single byte. Every byte is mapped to
/// the `char` with the same code point.
///
/// # Errors
/// `ParseError::UnexpectedEof` when the input ends inside a `#` escape and
/// `ParseError::MalformedObject` when the escape is not valid hexadecimal.
fn parse_name(&mut self) -> Result<PdfObject, ParseError> {
    const DELIMITERS: &[u8] = b"<>[]()/%";
    self.advance();
    let mut name = String::new();
    loop {
        let b = match self.peek() {
            Some(b) if !(b.is_ascii_whitespace() || DELIMITERS.contains(&b)) => b,
            _ => break,
        };
        self.advance();
        if b != b'#' {
            name.push(b as char);
            continue;
        }
        let high = self.advance().ok_or(ParseError::UnexpectedEof)?;
        let low = self.advance().ok_or(ParseError::UnexpectedEof)?;
        let digits: String = [high as char, low as char].iter().collect();
        let decoded = u8::from_str_radix(&digits, 16)
            .map_err(|_| ParseError::MalformedObject("Bad hex in name".into()))?;
        name.push(decoded as char);
    }
    Ok(PdfObject::Name(name))
}
/// Parses a parenthesised literal string into its raw bytes.
///
/// The byte under the cursor is consumed as the opening parenthesis.
/// Balanced unescaped parentheses are kept as part of the string. The
/// escapes `\n`, `\r`, `\t`, `\b`, `\f`, `\(`, `\)`, `\\` and one- to
/// three-digit octal codes are resolved; an octal value above 255 keeps
/// only its low eight bits. A backslash directly before an end-of-line (LF,
/// CR or CRLF) is a line continuation and produces nothing. A backslash
/// before any other byte is dropped and the byte kept.
///
/// # Errors
/// `ParseError::UnexpectedEof` when the input ends before the closing
/// parenthesis.
fn parse_literal_string(&mut self) -> Result<PdfObject, ParseError> {
    self.advance();
    let mut bytes = Vec::new();
    let mut depth = 1i32;
    loop {
        let b = self.advance().ok_or(ParseError::UnexpectedEof)?;
        match b {
            b'(' => {
                depth += 1;
                bytes.push(b);
            }
            b')' => {
                depth -= 1;
                if depth == 0 {
                    break;
                }
                bytes.push(b);
            }
            b'\\' => {
                let escaped = self.advance().ok_or(ParseError::UnexpectedEof)?;
                match escaped {
                    b'n' => bytes.push(b'\n'),
                    b'r' => bytes.push(b'\r'),
                    b't' => bytes.push(b'\t'),
                    b'b' => bytes.push(0x08),
                    b'f' => bytes.push(0x0C),
                    // Line continuation: swallow the end-of-line marker.
                    b'\n' => {}
                    b'\r' => {
                        if self.peek() == Some(b'\n') {
                            self.advance();
                        }
                    }
                    b'0'..=b'7' => {
                        let mut value = u32::from(escaped - b'0');
                        for _ in 0..2 {
                            match self.peek() {
                                Some(digit @ b'0'..=b'7') => {
                                    value = value * 8 + u32::from(digit - b'0');
                                    self.advance();
                                }
                                _ => break,
                            }
                        }
                        // High-order overflow is ignored.
                        bytes.push((value & 0xFF) as u8);
                    }
                    // Covers `\(`, `\)`, `\\` and unrecognised escapes such as `\8`.
                    other => bytes.push(other),
                }
            }
            _ => bytes.push(b),
        }
    }
    Ok(PdfObject::StringLiteral(bytes))
}
/// Parses a `<...>` hexadecimal string into bytes.
///
/// The byte under the cursor is consumed as the opening `<`. Whitespace
/// between digits is ignored and an odd number of digits is padded with a
/// trailing `0`.
///
/// # Errors
/// `ParseError::UnexpectedEof` when the input ends before `>`, and
/// `ParseError::MalformedObject` when the string contains anything other
/// than hexadecimal digits and whitespace.
fn parse_hex_string(&mut self) -> Result<PdfObject, ParseError> {
    self.advance();
    let mut raw: Vec<u8> = Vec::new();
    loop {
        let b = self.advance().ok_or(ParseError::UnexpectedEof)?;
        if b == b'>' {
            break;
        }
        if !b.is_ascii_whitespace() {
            raw.push(b);
        }
    }
    // Working on bytes avoids slicing a `String` in the middle of a
    // multi-byte character when the input contains non-ASCII bytes.
    let mut nibbles: Vec<u8> = Vec::with_capacity(raw.len() + 1);
    for b in raw {
        let value = (b as char)
            .to_digit(16)
            .ok_or_else(|| ParseError::MalformedObject("Bad hex string".into()))?;
        nibbles.push(value as u8);
    }
    if nibbles.len() % 2 != 0 {
        nibbles.push(0);
    }
    let bytes: Vec<u8> = nibbles
        .chunks(2)
        .map(|pair| (pair[0] << 4) | pair[1])
        .collect();
    Ok(PdfObject::HexString(bytes))
}
/// Parses a `[...]` array of objects.
///
/// The byte under the cursor is consumed as the opening bracket; elements
/// are parsed with `parse_object` until a closing `]` is found.
///
/// # Errors
/// Propagates any element's parse error, including
/// `ParseError::UnexpectedEof` when the array is never closed.
fn parse_array(&mut self) -> Result<PdfObject, ParseError> {
    self.advance();
    let mut elements = Vec::new();
    self.skip_whitespace();
    while self.peek() != Some(b']') {
        elements.push(self.parse_object()?);
        self.skip_whitespace();
    }
    self.advance();
    Ok(PdfObject::Array(elements))
}
/// Parses a `<<...>>` dictionary and, when the `stream` keyword follows,
/// the stream data attached to it.
///
/// The two bytes under the cursor are consumed as `<<`. Every key must be
/// a name starting with `/`. For a stream, one optional CR and one optional
/// LF after the keyword are skipped and exactly `/Length` bytes are taken
/// as data. `/Length` must be a direct integer; an indirect reference is
/// not resolved here.
///
/// # Errors
/// * `ParseError::MalformedObject`: a key is not a name, `/Length` is
///   missing or not a direct integer, or `/Length` is negative.
/// * `ParseError::UnexpectedEof`: the input ends inside the dictionary, or
///   `/Length` reaches past the end of the data.
/// * Any error from parsing a key or value.
fn parse_dict_or_stream(&mut self) -> Result<PdfObject, ParseError> {
    self.pos += 2;
    let mut dict = PdfDict::new();
    loop {
        self.skip_whitespace();
        let closes = self
            .data
            .get(self.pos..)
            .map_or(false, |rest| rest.starts_with(b">>"));
        if closes {
            self.pos += 2;
            break;
        }
        // Check for the slash here: `parse_name` would swallow whatever
        // byte comes first and treat the rest as a name.
        match self.peek() {
            Some(b'/') => {}
            Some(_) => {
                return Err(ParseError::MalformedObject("Expected name in dict key".into()))
            }
            None => return Err(ParseError::UnexpectedEof),
        }
        let key = match self.parse_name()? {
            PdfObject::Name(n) => n,
            _ => return Err(ParseError::MalformedObject("Expected name in dict key".into())),
        };
        self.skip_whitespace();
        let value = self.parse_object()?;
        dict.set(key, value);
    }
    self.skip_whitespace();
    let at_stream = self
        .data
        .get(self.pos..)
        .map_or(false, |rest| rest.starts_with(b"stream"));
    if !at_stream {
        return Ok(PdfObject::Dictionary(dict));
    }
    self.pos += 6;
    if self.peek() == Some(b'\r') {
        self.advance();
    }
    if self.peek() == Some(b'\n') {
        self.advance();
    }
    let declared = match dict.get("Length") {
        Some(PdfObject::Integer(n)) => *n,
        _ => return Err(ParseError::MalformedObject("Stream missing /Length".into())),
    };
    let length = <usize as std::convert::TryFrom<i64>>::try_from(declared).map_err(|_| {
        ParseError::MalformedObject(format!("Invalid stream /Length: {}", declared))
    })?;
    // Checked slicing: a truncated file or bogus /Length must not panic.
    let end = self.pos.checked_add(length).ok_or(ParseError::UnexpectedEof)?;
    let stream_data = self
        .data
        .get(self.pos..end)
        .ok_or(ParseError::UnexpectedEof)?
        .to_vec();
    self.pos = end;
    let stream = crate::object::PdfStream { dict, data: stream_data };
    Ok(PdfObject::Stream(stream))
}
/// Parses an integer, a real number, or an indirect reference `N G R`.
///
/// A token containing a `.` is a real. Otherwise the parser looks ahead for
/// `<digits> R`; when that follows, the result is a reference, and when it
/// does not, the cursor is rewound to just after the first number and an
/// integer is returned.
///
/// # Errors
/// `ParseError::MalformedObject` when the token cannot be parsed as the
/// selected numeric type.
fn parse_number_or_ref(&mut self) -> Result<PdfObject, ParseError> {
let start = self.pos;
let mut is_real = false;
// Optional leading sign.
if self.peek() == Some(b'-') || self.peek() == Some(b'+') {
self.advance();
}
// Consume digits and dots; any dot marks the token as a real.
while let Some(b) = self.peek() {
if b == b'.' { is_real = true; self.advance(); }
else if b.is_ascii_digit() { self.advance(); }
else { break; }
}
let num_str = std::str::from_utf8(&self.data[start..self.pos])
.map_err(|_| ParseError::MalformedObject("Bad number".into()))?;
if !is_real {
// Remember where the first number ended so the lookahead can be undone.
let saved_pos = self.pos;
self.skip_whitespace();
if self.peek().map(|b| b.is_ascii_digit()).unwrap_or(false) {
let gen_start = self.pos;
while self.peek().map(|b| b.is_ascii_digit()).unwrap_or(false) {
self.advance();
}
let gen_str = std::str::from_utf8(&self.data[gen_start..self.pos]).unwrap_or("0");
self.skip_whitespace();
if self.peek() == Some(b'R') {
self.advance();
// An unparsable object or generation number falls back to 0.
let id: u32 = num_str.parse().unwrap_or(0);
let gen: u16 = gen_str.parse().unwrap_or(0);
return Ok(PdfObject::Reference(ObjRef { id, gen }));
}
// Two numbers in a row without `R`: only the first belongs to this object.
self.pos = saved_pos;
} else {
self.pos = saved_pos;
}
let n: i64 = num_str.parse()
.map_err(|_| ParseError::MalformedObject(format!("Bad integer: {}", num_str)))?;
Ok(PdfObject::Integer(n))
} else {
let f: f64 = num_str.parse()
.map_err(|_| ParseError::MalformedObject(format!("Bad real: {}", num_str)))?;
Ok(PdfObject::Real(f))
}
}
/// Parses the stream object at the cursor and returns its decoded bytes.
///
/// The data is inflated only when `/Filter` is the single name
/// `FlateDecode`; with any other or no filter the stored bytes are
/// returned unchanged.
///
/// # Errors
/// `ParseError::MalformedObject` when the object is not a stream or
/// decompression fails, plus any error from `parse_object`.
fn parse_stream_bytes(&mut self) -> Result<Vec<u8>, ParseError> {
    let stream = match self.parse_object()? {
        PdfObject::Stream(found) => found,
        _ => return Err(ParseError::MalformedObject("Expected stream".into())),
    };
    let is_flate = match stream.dict.get("Filter") {
        Some(PdfObject::Name(filter)) => filter == "FlateDecode",
        _ => false,
    };
    if is_flate {
        decompress_zlib(&stream.data).map_err(ParseError::MalformedObject)
    } else {
        Ok(stream.data)
    }
}
}
/// Inflates zlib-wrapped data, returning the decoder's error message on failure.
fn decompress_zlib(data: &[u8]) -> Result<Vec<u8>, String> {
    use flate2::read::ZlibDecoder;
    use std::io::Read;
    let mut inflated = Vec::new();
    match ZlibDecoder::new(data).read_to_end(&mut inflated) {
        Ok(_) => Ok(inflated),
        Err(io_err) => Err(io_err.to_string()),
    }
}