use std::collections::HashMap;
use crate::error::{Error, Result};
use crate::objects::{PdfDict, PdfObject, PdfRef, PdfStream};
use crate::parser::{Lexer, Parser, XRefEntry, XRefParser, XRefTable};
/// Largest `/N` (declared object count) accepted for an object stream.
///
/// `parse_compressed_object` rejects streams that declare more objects than this before it
/// allocates storage for the stream's offset table.
const MAX_OBJECT_STREAM_OBJECTS: usize = 1_000_000;
/// A PDF document loaded from an in-memory byte buffer.
///
/// Objects are parsed on demand through [`LoadedDocument::resolve`] and kept in an internal
/// cache keyed by their reference.
pub struct LoadedDocument {
/// The raw bytes handed to `load`; in-use objects are parsed from offsets into this buffer.
data: Vec<u8>,
/// Cross-reference table produced by `XRefParser::parse_all` during `load`.
xref: XRefTable,
/// Objects that have already been resolved, keyed by the reference used to resolve them.
cache: HashMap<PdfRef, PdfObject>,
/// Version text that follows the `%PDF-` marker in the header (for example `"1.7"`).
version: String,
}
impl LoadedDocument {
pub fn load(data: Vec<u8>) -> Result<Self> {
let version = Self::parse_header(&data)?;
let xref_parser = XRefParser::new(&data);
let xref = xref_parser.parse_all()?;
Ok(LoadedDocument {
data,
xref,
cache: HashMap::new(),
version,
})
}
fn parse_header(data: &[u8]) -> Result<String> {
let header_data = &data[..std::cmp::min(data.len(), 1024)];
let pdf_marker = b"%PDF-";
let pos = header_data
.windows(pdf_marker.len())
.position(|w| w == pdf_marker)
.ok_or_else(|| Error::InvalidStructure("Missing PDF header".to_string()))?;
let version_start = pos + pdf_marker.len();
let mut version_end = version_start;
while version_end < header_data.len()
&& (header_data[version_end].is_ascii_digit() || header_data[version_end] == b'.')
{
version_end += 1;
}
let version = std::str::from_utf8(&header_data[version_start..version_end])
.map_err(|_| Error::InvalidStructure("Invalid PDF version encoding".to_string()))?
.to_string();
Ok(version)
}
/// Returns the version text read from the file header (for example `"1.7"`).
pub fn version(&self) -> &str {
    self.version.as_str()
}
/// Returns the length reported by the cross-reference table.
pub fn object_count(&self) -> usize {
self.xref.len()
}
/// Returns the trailer dictionary held by the cross-reference table, if it has one.
pub fn trailer(&self) -> Option<&PdfDict> {
self.xref.trailer()
}
/// Returns the reference to the document catalog as reported by the cross-reference table,
/// or `None` when the table does not provide one.
pub fn catalog_ref(&self) -> Option<PdfRef> {
self.xref.catalog_ref()
}
/// Returns the reference to the document information dictionary as reported by the
/// cross-reference table, or `None` when the table does not provide one.
pub fn info_ref(&self) -> Option<PdfRef> {
self.xref.info_ref()
}
pub fn resolve(&mut self, reference: PdfRef) -> Result<&PdfObject> {
if self.cache.contains_key(&reference) {
return Ok(self.cache.get(&reference).unwrap());
}
let entry = self.xref.get(reference.object_number()).ok_or_else(|| {
Error::MissingObject(format!("Object {} not found", reference.object_number()))
})?;
let obj = match entry {
XRefEntry::InUse { offset, generation } => {
if *generation != reference.generation() {
return Err(Error::MissingObject(format!(
"Object {} generation mismatch: expected {}, got {}",
reference.object_number(),
generation,
reference.generation()
)));
}
let offset = usize::try_from(*offset).map_err(|_| Error::Parse {
message: "Object offset out of range".to_string(),
position: self.data.len(),
})?;
self.parse_object_at(offset)?
}
XRefEntry::Compressed { stream_obj, index } => {
self.parse_compressed_object(*stream_obj, *index)?
}
XRefEntry::Free { .. } => {
return Err(Error::MissingObject(format!(
"Object {} is free",
reference.object_number()
)));
}
};
self.cache.insert(reference, obj);
Ok(self.cache.get(&reference).unwrap())
}
fn parse_object_at(&self, offset: usize) -> Result<PdfObject> {
use crate::parser::lexer::Token;
if offset >= self.data.len() {
return Err(Error::Parse {
message: "Object offset beyond end of file".to_string(),
position: offset,
});
}
let mut lexer = Lexer::new(&self.data[offset..]);
let obj_token = lexer.next_token()?;
if !matches!(obj_token, Token::Integer(_)) {
return Err(Error::Parse {
message: format!("Expected object number, got {:?}", obj_token),
position: offset,
});
}
let gen_token = lexer.next_token()?;
if !matches!(gen_token, Token::Integer(_)) {
return Err(Error::Parse {
message: format!("Expected generation number, got {:?}", gen_token),
position: offset,
});
}
let obj_keyword = lexer.next_token()?;
if obj_keyword != Token::Obj {
return Err(Error::Parse {
message: format!("Expected 'obj' keyword, got {:?}", obj_keyword),
position: offset,
});
}
let mut parser = Parser::new(&mut lexer);
parser.parse_object()
}
/// Extracts the object at position `index` from the object stream numbered `stream_obj`.
///
/// The container is resolved, checked to be a stream of type `ObjStm`, and decoded. Its
/// header (the first `/First` bytes) is read as `/N` pairs of object number and offset;
/// the offset of the requested pair, relative to `/First`, locates the object to parse.
///
/// # Errors
///
/// Returns `Error::InvalidStructure` when the container is itself listed as a compressed
/// object, is not an `ObjStm` stream, has missing or out-of-range `/N` or `/First`, declares
/// more than `MAX_OBJECT_STREAM_OBJECTS` objects, or has a malformed header or an offset
/// outside the decoded data. Returns `Error::MissingObject` when `index` is not below the
/// number of header pairs. Errors from resolving, decoding and parsing are propagated.
fn parse_compressed_object(&mut self, stream_obj: u32, index: u32) -> Result<PdfObject> {
    let stream_ref = PdfRef::new(stream_obj);
    // Stream objects must not be stored inside object streams, so a container whose own
    // cross-reference entry is `Compressed` is invalid. Rejecting it here also stops a crafted
    // table (a stream "compressed" in itself, or two streams inside each other) from driving
    // `resolve` and this function into unbounded mutual recursion.
    if matches!(
        self.xref.get(stream_ref.object_number()),
        Some(XRefEntry::Compressed { .. })
    ) {
        return Err(Error::InvalidStructure(format!(
            "Object stream {} is itself stored in an object stream",
            stream_obj
        )));
    }
    let stream = match self.resolve(stream_ref)? {
        PdfObject::Stream(s) => s.clone(),
        _ => {
            return Err(Error::InvalidStructure(format!(
                "Object {} is not a stream",
                stream_obj
            )))
        }
    };
    if stream.dict().get_type() != Some("ObjStm") {
        return Err(Error::InvalidStructure(format!(
            "Object {} is not an object stream",
            stream_obj
        )));
    }
    let n: usize = stream
        .dict()
        .get_integer("N")
        .ok_or_else(|| Error::InvalidStructure("Object stream missing N".to_string()))?
        .try_into()
        .map_err(|_| Error::InvalidStructure("Object stream N out of range".to_string()))?;
    if n > MAX_OBJECT_STREAM_OBJECTS {
        return Err(Error::InvalidStructure(
            "Object stream too large".to_string(),
        ));
    }
    let first: usize = stream
        .dict()
        .get_integer("First")
        .ok_or_else(|| Error::InvalidStructure("Object stream missing First".to_string()))?
        .try_into()
        .map_err(|_| Error::InvalidStructure("Object stream First out of range".to_string()))?;
    let data = stream.decode()?;
    if first > data.len() {
        return Err(Error::InvalidStructure(
            "Object stream First beyond data length".to_string(),
        ));
    }
    let mut header_lexer = Lexer::new(&data[..first]);
    let mut header_parser = Parser::new(&mut header_lexer);
    // `N` comes from the file. A header of `first` bytes cannot hold more than `first`
    // pairs, so cap the up-front allocation by what the header could actually contain.
    let mut obj_offsets: Vec<(u32, usize)> = Vec::with_capacity(n.min(first));
    for _ in 0..n {
        let obj_num = header_parser.parse_object()?.as_integer().ok_or_else(|| {
            Error::InvalidStructure("Invalid object stream header".to_string())
        })?;
        let obj_num = u32::try_from(obj_num).map_err(|_| {
            Error::InvalidStructure("Object stream object number out of range".to_string())
        })?;
        let obj_offset = header_parser.parse_object()?.as_integer().ok_or_else(|| {
            Error::InvalidStructure("Invalid object stream header".to_string())
        })?;
        let obj_offset = usize::try_from(obj_offset).map_err(|_| {
            Error::InvalidStructure("Object stream offset out of range".to_string())
        })?;
        obj_offsets.push((obj_num, obj_offset));
    }
    let index = usize::try_from(index)
        .map_err(|_| Error::MissingObject("Object index out of range".to_string()))?;
    let obj_offset = match obj_offsets.get(index) {
        Some(&(_, offset)) => offset,
        None => {
            return Err(Error::MissingObject(format!(
                "Object index {} out of range in object stream {}",
                index, stream_obj
            )));
        }
    };
    // Header offsets are relative to the first object, i.e. to `/First`.
    let abs_offset = first
        .checked_add(obj_offset)
        .ok_or_else(|| Error::InvalidStructure("Object stream offset overflow".to_string()))?;
    if abs_offset > data.len() {
        return Err(Error::InvalidStructure(
            "Object stream offset beyond data length".to_string(),
        ));
    }
    let mut lexer = Lexer::new(&data[abs_offset..]);
    let mut parser = Parser::new(&mut lexer);
    parser.parse_object()
}
pub fn resolve_dict(&mut self, reference: PdfRef) -> Result<PdfDict> {
let obj = self.resolve(reference)?.clone();
match obj {
PdfObject::Dict(d) => Ok(d),
_ => Err(Error::InvalidObjectType {
expected: "dictionary".to_string(),
actual: obj.type_name().to_string(),
}),
}
}
pub fn resolve_stream(&mut self, reference: PdfRef) -> Result<PdfStream> {
let obj = self.resolve(reference)?.clone();
match obj {
PdfObject::Stream(s) => Ok(s),
_ => Err(Error::InvalidObjectType {
expected: "stream".to_string(),
actual: obj.type_name().to_string(),
}),
}
}
pub fn catalog(&mut self) -> Result<PdfDict> {
let catalog_ref = self.catalog_ref().ok_or_else(|| {
Error::InvalidStructure("Missing catalog reference in trailer".to_string())
})?;
self.resolve_dict(catalog_ref)
}
pub fn pages_ref(&mut self) -> Result<PdfRef> {
let catalog = self.catalog()?;
catalog
.get_ref("Pages")
.ok_or_else(|| Error::InvalidStructure("Catalog missing Pages".to_string()))
}
pub fn page_count(&mut self) -> Result<usize> {
let pages_ref = self.pages_ref()?;
let pages = self.resolve_dict(pages_ref)?;
let count = pages
.get_integer("Count")
.ok_or_else(|| Error::InvalidStructure("Pages missing Count".to_string()))?;
usize_from_i64(count, "Pages Count")
}
/// Returns the dictionary of the page at zero-based position `index`.
///
/// The search starts at the root of the page tree with a running page counter of zero.
///
/// # Errors
///
/// Propagates errors from locating the page tree root and from the tree search.
pub fn page(&mut self, index: usize) -> Result<PdfDict> {
    let root = self.pages_ref()?;
    let mut pages_seen = 0;
    self.find_page(root, index, &mut pages_seen)
}
fn find_page(
&mut self,
node_ref: PdfRef,
target_index: usize,
current_index: &mut usize,
) -> Result<PdfDict> {
let node = self.resolve_dict(node_ref)?;
match node.get_type() {
Some("Pages") => {
let kids = node
.get_array("Kids")
.ok_or_else(|| Error::InvalidStructure("Pages missing Kids".to_string()))?
.clone();
for i in 0..kids.len() {
let kid_ref = kids
.get_reference(i)
.ok_or_else(|| Error::InvalidStructure("Invalid Kids entry".to_string()))?;
let kid = self.resolve_dict(kid_ref)?;
match kid.get_type() {
Some("Page") => {
if *current_index == target_index {
return Ok(kid);
}
*current_index += 1;
}
Some("Pages") => {
let count = match kid.get_integer("Count") {
Some(value) => usize_from_i64(value, "Pages Count")?,
None => 0,
};
let next_index = current_index.checked_add(count).ok_or_else(|| {
Error::InvalidStructure("Page count overflow".to_string())
})?;
if target_index < next_index {
return self.find_page(kid_ref, target_index, current_index);
}
*current_index = next_index;
}
_ => {
return Err(Error::InvalidStructure(
"Invalid page tree node".to_string(),
))
}
}
}
Err(Error::MissingObject(format!(
"Page {} not found",
target_index
)))
}
Some("Page") => {
if *current_index == target_index {
Ok(node)
} else {
*current_index += 1;
Err(Error::MissingObject(format!(
"Page {} not found",
target_index
)))
}
}
_ => Err(Error::InvalidStructure(
"Invalid page tree node".to_string(),
)),
}
}
/// Returns the document information dictionary, or `None` when the document has no
/// information reference.
///
/// # Errors
///
/// Propagates errors from resolving the information reference as a dictionary.
pub fn info(&mut self) -> Result<Option<PdfDict>> {
    match self.info_ref() {
        Some(info_ref) => self.resolve_dict(info_ref).map(Some),
        None => Ok(None),
    }
}
/// Returns the decoded `Title` entry of the information dictionary.
///
/// Yields `None` when there is no information dictionary, or when `Title` is absent or is
/// not a string object.
///
/// # Errors
///
/// Propagates errors from loading the information dictionary.
pub fn title(&mut self) -> Result<Option<String>> {
    let info = match self.info()? {
        Some(dict) => dict,
        None => return Ok(None),
    };
    Ok(match info.get("Title") {
        Some(PdfObject::String(text)) => Some(text.decode_text()),
        _ => None,
    })
}
/// Returns the decoded `Author` entry of the information dictionary.
///
/// Yields `None` when there is no information dictionary, or when `Author` is absent or is
/// not a string object.
///
/// # Errors
///
/// Propagates errors from loading the information dictionary.
pub fn author(&mut self) -> Result<Option<String>> {
    let info = match self.info()? {
        Some(dict) => dict,
        None => return Ok(None),
    };
    Ok(match info.get("Author") {
        Some(PdfObject::String(text)) => Some(text.decode_text()),
        _ => None,
    })
}
/// Returns the raw bytes the document was loaded from.
pub fn data(&self) -> &[u8] {
    self.data.as_slice()
}
/// Returns a shared reference to the document's cross-reference table.
pub fn xref(&self) -> &XRefTable {
&self.xref
}
}
fn usize_from_i64(value: i64, context: &str) -> Result<usize> {
if value < 0 {
return Err(Error::InvalidStructure(format!(
"{} must be non-negative",
context
)));
}
usize::try_from(value).map_err(|_| Error::InvalidStructure(format!("{} out of range", context)))
}
// Unit tests for header parsing, for loading documents rendered by `crate::api::Document`,
// and for object-stream bounds checking.
#[cfg(test)]
mod tests {
use super::*;
use crate::objects::PdfName;
// A header followed by a binary comment line still yields just the version digits.
#[test]
fn test_parse_header() {
let data = b"%PDF-1.7\n%\xE2\xE3\xCF\xD3\n";
let version = LoadedDocument::parse_header(data).unwrap();
assert_eq!(version, "1.7");
}
// A different version number is read back unchanged.
#[test]
fn test_parse_header_1_4() {
let data = b"%PDF-1.4\n";
let version = LoadedDocument::parse_header(data).unwrap();
assert_eq!(version, "1.4");
}
// Input without the `%PDF-` marker is rejected.
#[test]
fn test_parse_header_invalid() {
let data = b"Not a PDF file";
let result = LoadedDocument::parse_header(data);
assert!(result.is_err());
}
// Round trip: render a one-page document, load it, and check version, page count and
// the type of the first page dictionary.
#[test]
fn test_load_generated_pdf() {
use crate::api::Document;
let mut doc = Document::new();
doc.title("Test Document");
doc.author("Test Author");
doc.text_at("Hello, World!", [72.0, 700.0]);
let pdf_data = doc.render().unwrap();
let mut loaded = LoadedDocument::load(pdf_data).unwrap();
assert_eq!(loaded.version(), "1.7");
assert!(loaded.object_count() > 0);
let count = loaded.page_count().unwrap();
assert_eq!(count, 1);
let page = loaded.page(0).unwrap();
assert_eq!(page.get_type(), Some("Page"));
}
// Round trip with three pages: every index below the page count resolves to a `Page`.
#[test]
fn test_load_multi_page_pdf() {
use crate::api::Document;
let mut doc = Document::new();
doc.text_at("Page 1", [72.0, 700.0]);
doc.start_new_page();
doc.text_at("Page 2", [72.0, 700.0]);
doc.start_new_page();
doc.text_at("Page 3", [72.0, 700.0]);
let pdf_data = doc.render().unwrap();
let mut loaded = LoadedDocument::load(pdf_data).unwrap();
let count = loaded.page_count().unwrap();
assert_eq!(count, 3);
for i in 0..3 {
let page = loaded.page(i).unwrap();
assert_eq!(page.get_type(), Some("Page"));
}
}
// An object stream whose `First` (10) exceeds its 3-byte payload must produce an error.
#[test]
fn test_object_stream_first_out_of_bounds() {
let mut dict = PdfDict::new();
dict.set("Type", PdfObject::Name(PdfName::new("ObjStm")));
dict.set("N", PdfObject::Integer(1));
dict.set("First", PdfObject::Integer(10));
let stream = PdfStream::new(dict, b"0 0".to_vec());
let mut doc = LoadedDocument {
data: Vec::new(),
xref: XRefTable::new(),
cache: HashMap::new(),
version: "1.7".to_string(),
};
// Seed the cache directly so the stream resolves without any xref entry or file data.
doc.cache.insert(PdfRef::new(1), PdfObject::Stream(stream));
let result = doc.parse_compressed_object(1, 0);
assert!(result.is_err());
}
}