use crate::catalog::{NodeId, OutlineTreeArean, PageTreeArean, decode_catalog_data, PageNode};
use crate::constants::pdf_key::{START_XREF, XREF};
use crate::constants::{
AUTHOR, CREATION_DATE, CREATOR, INFO, MOD_DATE, PREV, PRODUCER, ROOT, TITLE,
};
use crate::convert_glyph_from_dict;
use crate::date::Date;
use crate::encoding::PreDefinedEncoding;
use crate::error::PDFError::{
InvalidPDFDocument, ObjectAttrMiss, PDFParseError, XrefTableNotFound,
};
use crate::error::Result;
use crate::objects::{Dictionary, ObjRefTuple, PDFNumber, PDFObject, XEntry};
use crate::parser::{parse, parse_text_xref, parse_with_offset};
use crate::pstr::convert_glyph_text;
use crate::sequence::{FileSequence, Sequence};
use crate::tokenizer::Tokenizer;
use crate::utils::{count_leading_line_endings, line_ending, literal_to_u64, xrefs_search};
use crate::vpdf::PDFVersion;
use std::path::PathBuf;
use std::str::FromStr;
/// Descriptive metadata of a document, built by `PDFDescribe::new` from the
/// dictionary that the trailer's `INFO` reference points to.
///
/// Every field is optional: it is `None` when the lookup for its key yields
/// nothing, and the two date fields are also `None` when the stored text
/// cannot be parsed by `Date::from_str`.
pub struct PDFDescribe {
/// Text found under the `PRODUCER` key.
producer: Option<String>,
/// Text found under the `CREATOR` key.
creator: Option<String>,
/// Date parsed from the text found under the `CREATION_DATE` key.
creation_date: Option<Date>,
/// Text found under the `AUTHOR` key.
author: Option<String>,
/// Text found under the `TITLE` key.
title: Option<String>,
/// Date parsed from the text found under the `MOD_DATE` key.
mod_date: Option<Date>,
}
/// A parsed PDF document: its merged cross-reference entries, the decoded
/// catalog trees and a tokenizer that is kept around so that objects can be
/// read on demand (see `PDFDocument::read_object`).
pub struct PDFDocument {
/// Cross-reference entries merged from every xref table reachable from the
/// table that `startxref` points to.
xrefs: Vec<XEntry>,
/// Version parsed from the `%PDF-` header at the start of the input.
version: PDFVersion,
/// Tokenizer wrapping the underlying sequence; re-positioned with `seek`
/// whenever an object is read.
tokenizer: Tokenizer,
/// Page tree produced by `decode_catalog_data`.
page_tree_arena: PageTreeArean,
/// Outline tree produced by `decode_catalog_data`, if it returned one.
outline_tree_arean: Option<OutlineTreeArean>,
/// Metadata decoded from the trailer's `INFO` object, if the trailer
/// references one and it resolves to a dictionary.
describe: Option<PDFDescribe>,
}
impl PDFDocument {
    /// Opens the file at `path` and parses it as a PDF document.
    ///
    /// # Errors
    /// Fails when the file cannot be opened or when [`PDFDocument::new`]
    /// rejects its content.
    pub fn open(path: PathBuf) -> Result<PDFDocument> {
        let sequence = FileSequence::new(std::fs::File::open(path)?);
        Self::new(sequence)
    }

    /// Parses a PDF document out of an arbitrary byte `sequence`.
    ///
    /// The steps are: read the header version, locate the xref table through
    /// the `startxref` keyword, merge all chained xref tables, decode the
    /// catalog, and finally decode the optional Info dictionary.
    ///
    /// # Errors
    /// Propagates every parsing / I/O error of the steps above and returns
    /// `ObjectAttrMiss` when no trailer supplied a catalog reference.
    pub fn new(mut sequence: impl Sequence + 'static) -> Result<PDFDocument> {
        let version = parse_version(&mut sequence)?;
        let xref_offset = cal_xref_table_offset(&mut sequence)?;

        let mut tokenizer = Tokenizer::new(sequence);
        tokenizer.seek(xref_offset)?;
        let (xrefs, catalog, info) = merge_xref_table(&mut tokenizer)?;

        // The catalog is mandatory: without it there is no page tree.
        let catalog = catalog.ok_or(ObjectAttrMiss("Trailer can't found catalog attr."))?;
        let (page_tree_arena, outline_tree_arean) =
            decode_catalog_data(&mut tokenizer, catalog, &xrefs)?;

        // The Info object is optional; anything that is not an indirect
        // object wrapping a dictionary is ignored.
        let describe = match info {
            Some(info_ref) => {
                let entry = xrefs_search(&xrefs, info_ref)?;
                match parse_with_offset(&mut tokenizer, entry.value)? {
                    PDFObject::IndirectObject(_, _, value) => match *value {
                        PDFObject::Dict(dict) => Some(PDFDescribe::new(dict)),
                        _ => None,
                    },
                    _ => None,
                }
            }
            None => None,
        };

        Ok(PDFDocument {
            xrefs,
            version,
            tokenizer,
            page_tree_arena,
            outline_tree_arean,
            describe,
        })
    }

    /// Returns all merged cross-reference entries.
    pub fn get_xref_slice(&self) -> &[XEntry] {
        self.xrefs.as_slice()
    }

    /// Returns the index of the first cross-reference entry accepted by
    /// `visit`, or `None` when no entry matches.
    pub fn find_xref_index<F>(&self, visit: F) -> Option<usize>
    where
        F: Fn(&XEntry) -> bool,
    {
        self.xrefs.iter().position(visit)
    }

    /// Returns the version parsed from the document header.
    pub fn get_version(&self) -> &PDFVersion {
        &self.version
    }

    /// Reads the object described by the cross-reference entry at `index`.
    ///
    /// Returns `Ok(None)` when `index` is out of range or the entry is
    /// marked as freed.
    ///
    /// # Errors
    /// Fails when seeking to the entry's offset or parsing the object fails.
    pub fn read_object(&mut self, index: usize) -> Result<Option<PDFObject>> {
        let offset = match self.xrefs.get(index) {
            Some(entry) if !entry.is_freed() => entry.get_value(),
            _ => return Ok(None),
        };
        self.tokenizer.seek(offset)?;
        Ok(Some(parse(&mut self.tokenizer)?))
    }

    /// Reads the object whose object number and generation number equal the
    /// two components of `tuple`.
    ///
    /// Returns `Ok(None)` when no cross-reference entry matches; otherwise
    /// behaves like [`PDFDocument::read_object`] for the first match.
    pub fn read_object_with_ref(&mut self, tuple: ObjRefTuple) -> Result<Option<PDFObject>> {
        let found =
            self.find_xref_index(|entry| entry.obj_num == tuple.0 && entry.gen_num == tuple.1);
        match found {
            Some(index) => self.read_object(index),
            None => Ok(None),
        }
    }

    /// Returns the page count reported by the page tree.
    pub fn get_page_num(&self) -> usize {
        self.page_tree_arena.get_page_num()
    }

    /// Returns the ids of the leaf pages held by the page tree.
    pub fn get_page_ids(&self) -> Vec<NodeId> {
        self.page_tree_arena.get_leaf_page_ids()
    }

    /// Looks up the page node stored under `node_id` in the page tree.
    pub fn get_page(&self, node_id: NodeId) -> Option<&PageNode> {
        self.page_tree_arena.get_page_node(node_id)
    }
}
/// Reads the `%PDF-x.y` header from the current position of `sequence` and
/// converts the three version bytes that follow the marker into a
/// `PDFVersion`.
///
/// # Errors
/// Returns `InvalidPDFDocument` when fewer than eight bytes could be read or
/// the data does not begin with `%PDF-`; conversion failures of the version
/// text are propagated.
fn parse_version(sequence: &mut impl Sequence) -> Result<PDFVersion> {
    const MARKER: &[u8] = b"%PDF-";
    // Marker plus a three byte version such as "1.7".
    const HEADER_LEN: usize = 8;

    let mut header = [0u8; 1024];
    let read = sequence.read(&mut header)?;
    if read < HEADER_LEN || !header.starts_with(MARKER) {
        return Err(InvalidPDFDocument);
    }

    let digits = header[MARKER.len()..HEADER_LEN].to_vec();
    let version = String::from_utf8(digits)?;
    Ok(version.try_into()?)
}
/// Merges the chain of text cross-reference tables that starts at the
/// tokenizer's current position and is linked through the trailers' `PREV`
/// entries.
///
/// The chain is walked from the newest table to the oldest one, so data seen
/// first always takes precedence:
/// * an entry from an older table is only added when no entry with the same
///   object number has been collected yet;
/// * the `ROOT` and `INFO` references are taken from the newest trailer that
///   defines them and are never overwritten by an older trailer.
///
/// Returns the merged entries together with the catalog (`ROOT`) and `INFO`
/// references, each `None` when no trailer supplied it.
///
/// # Errors
/// * `XrefTableNotFound` when a table does not start with the `XREF` keyword.
/// * `PDFParseError` when a trailer is not a dictionary or when the `PREV`
///   chain points back to an offset that was already followed.
/// * Any error raised by the tokenizer or the parsers.
fn merge_xref_table(
    tokenizer: &mut Tokenizer,
) -> Result<(Vec<XEntry>, Option<(u32, u16)>, Option<(u32, u16)>)> {
    let mut xrefs = Vec::<XEntry>::new();
    let mut info = None;
    let mut catalog = None;
    // Object numbers already present in `xrefs`; replaces a linear scan per
    // entry so merging stays linear in the number of entries.
    let mut known_objects = std::collections::HashSet::new();
    // `PREV` offsets already followed; guards against circular chains that
    // would otherwise loop forever.
    let mut followed_offsets = std::collections::HashSet::new();

    loop {
        let is_xref = tokenizer.check_next_token0(false, |token| token.key_was(XREF))?;
        if !is_xref {
            return Err(XrefTableNotFound);
        }

        let entries = parse_text_xref(tokenizer)?;
        if xrefs.is_empty() {
            // Nothing collected so far: the whole table is taken as-is.
            xrefs.extend(entries);
            known_objects.extend(xrefs.iter().map(|entry| entry.obj_num));
        } else {
            for entry in entries {
                if known_objects.insert(entry.obj_num) {
                    xrefs.push(entry);
                }
            }
        }

        let dictionary = match parse(tokenizer)? {
            PDFObject::Dict(dictionary) => dictionary,
            _ => return Err(PDFParseError("Xref table broken.")),
        };

        // Newest trailer wins: keep a reference once it has been found.
        if catalog.is_none() {
            if let Some(PDFObject::ObjectRef(obj_num, gen_num)) = dictionary.get(ROOT) {
                catalog = Some((*obj_num, *gen_num));
            }
        }
        if info.is_none() {
            if let Some(PDFObject::ObjectRef(obj_num, gen_num)) = dictionary.get(INFO) {
                info = Some((*obj_num, *gen_num));
            }
        }

        if let Some(PDFObject::Number(PDFNumber::Unsigned(prev))) = dictionary.get(PREV) {
            if !followed_offsets.insert(*prev) {
                return Err(PDFParseError("Xref table Prev chain is circular."));
            }
            tokenizer.seek(*prev)?;
            continue;
        }
        return Ok((xrefs, catalog, info));
    }
}
/// Determines the byte offset of the newest cross-reference table.
///
/// Reads up to the last 1024 bytes of `sequence`, locates the last
/// occurrence of the `START_XREF` keyword in them, skips the line endings
/// that follow it and converts the bytes up to the next line ending with
/// `literal_to_u64`.
///
/// The keyword is matched as a contiguous byte string; bytes that merely
/// contain its letters in order (spread over other data) are not accepted.
///
/// # Errors
/// Returns `InvalidPDFDocument` when the keyword is missing from the
/// inspected tail, when no line ending terminates the offset, or when the
/// offset text is empty. I/O errors of `sequence` are propagated.
fn cal_xref_table_offset(sequence: &mut impl Sequence) -> Result<u64> {
    let size = sequence.size()?;
    let pos = size.saturating_sub(1024);
    let mut buf = [0u8; 1024];
    sequence.seek(pos)?;
    let n = sequence.read(&mut buf)?;
    let tail = &buf[..n];

    // Last contiguous occurrence of the keyword inside the tail.
    let keyword = START_XREF.as_bytes();
    let keyword_at = tail
        .windows(keyword.len())
        .rposition(|window| window == keyword)
        .ok_or(InvalidPDFDocument)?;

    // The offset literal starts after the keyword and its trailing line
    // endings ...
    let after_keyword = keyword_at + keyword.len();
    let skipped = count_leading_line_endings(&tail[after_keyword..]);
    let start = after_keyword + (skipped as usize);

    // ... and runs up to (excluding) the next line ending, which must exist
    // and must leave at least one byte for the literal.
    let end = match (start..n).find(|&i| line_ending(tail[i])) {
        Some(end) if end > start => end,
        _ => return Err(InvalidPDFDocument),
    };

    Ok(literal_to_u64(&tail[start..end]))
}
impl PDFDescribe {
pub(crate) fn new(dictionary: Dictionary) -> PDFDescribe {
let encoding = PreDefinedEncoding::PDFDoc;
let producer = convert_glyph_from_dict!(dictionary, PRODUCER, &encoding);
let creator = convert_glyph_from_dict!(dictionary, CREATOR, &encoding);
let creation_date =
convert_glyph_from_dict!(dictionary, CREATION_DATE, &encoding).map_or(None, |text| {
match Date::from_str(text.as_str()) {
Ok(date) => Some(date),
Err(_) => None,
}
});
let mod_date =
convert_glyph_from_dict!(dictionary, MOD_DATE, &encoding).map_or(None, |text| {
match Date::from_str(text.as_str()) {
Ok(date) => Some(date),
Err(_) => None,
}
});
let author = convert_glyph_from_dict!(dictionary, AUTHOR, &encoding);
let title = convert_glyph_from_dict!(dictionary, TITLE, &encoding);
PDFDescribe {
producer,
creator,
creation_date,
author,
title,
mod_date,
}
}
}