use crate::constants::{CONTENTS, COUNT, FIRST, KIDS, LAST, NEXT, OUTLINES, PAGES, PREV, TITLE, TYPE};
use crate::encoding::PreDefinedEncoding;
use crate::error::PDFError::{ObjectAttrMiss, PDFParseError, XrefEntryNotFound};
use crate::error::Result;
use crate::objects::{Dictionary, ObjRefTuple, PDFNumber, PDFObject, XEntry};
use crate::parser::parse_with_offset;
use crate::pstr::convert_glyph_text;
use crate::tokenizer::Tokenizer;
use crate::utils::xrefs_search;
use std::collections::HashMap;
use std::fmt::{Display, Formatter};
/// Packs an object number and a generation number into a single 64-bit node id.
///
/// The object number is widened to `u64` and shifted left by 16 bits; the
/// generation number is widened to `u64` and OR'ed into the low bits.
macro_rules! mixture_node_id {
    ($obj_num:expr, $gen_num:expr) => {
        (($obj_num as u64) << 16) | ($gen_num as u64)
    };
}
/// Splits a packed node id back into its `(object number, generation number)` pair.
///
/// The bits above the low 16 become the `u32` object number and the low 16 bits
/// become the `u16` generation number. The argument expression is expanded twice,
/// so it should be free of side effects.
macro_rules! extract_node_id {
    ($node_id:expr) => {
        (($node_id >> 16) as u32, ($node_id & 0xFFFF) as u16)
    };
}
/// Identifier of a tree node, packed from an indirect object reference.
///
/// Built by `mixture_node_id!` (object number shifted left by 16 bits, OR'ed with
/// the generation number) and unpacked again by `extract_node_id!`.
pub type NodeId = u64;
/// Arena that stores every node of the page tree in one map keyed by [`NodeId`].
pub struct PageTreeArean {
/// Id of the node the catalog's pages entry refers to.
root_id: NodeId,
/// All page-tree nodes collected while the tree was built.
nodes: HashMap<NodeId, PageNode>,
}
/// One node of the page tree: either an intermediate page-tree node or a leaf page.
pub struct PageNode {
/// Packed object reference of this node.
node_id: NodeId,
/// The dictionary parsed from the node's indirect object.
attrs: Dictionary,
/// Count attribute read from an intermediate node; leaf pages are stored with 0.
count: u64,
/// Ids of the direct children; `None` for leaf pages and for intermediate
/// nodes whose count is 0.
kids: Option<Vec<NodeId>>,
/// Id of the node this one was reached from; `None` for the root.
parent_id: Option<NodeId>,
}
/// Arena that stores every node of the document outline in one map keyed by [`NodeId`].
pub(crate) struct OutlineTreeArean {
/// Id of the node the catalog's outlines entry refers to.
root_id: NodeId,
/// All outline nodes collected while the tree was built.
nodes: HashMap<NodeId, OutlineNode>,
}
/// One node of the document outline, linked to its neighbours by [`NodeId`].
pub struct OutlineNode {
/// Numeric count attribute of the node, or 0 when it is absent or not a number.
count: i64,
/// Title text decoded with the PDFDoc encoding, when the node has a string title.
title: Option<String>,
/// Id taken from the node's "previous" reference, if any.
prev_id: Option<NodeId>,
/// Id taken from the node's "next" reference, if any.
next_id: Option<NodeId>,
/// Id taken from the node's "first" reference, if any.
first_id: Option<NodeId>,
/// Id taken from the node's "last" reference, if any.
last_id: Option<NodeId>,
/// Id passed as `parent_id` when the node was built; `None` for the outline root.
parent_id: Option<NodeId>
}
/// Parses the document catalog and builds the trees it points to.
///
/// The catalog object is located through `xrefs` and parsed with `tokenizer`.
/// Its pages reference is mandatory and yields the page tree; its outlines
/// reference is optional and, when present, yields the outline tree.
///
/// # Errors
///
/// Returns `ObjectAttrMiss` when the catalog is not an indirect object, is not
/// a dictionary, or has no pages reference. Errors raised while looking up,
/// parsing or building the referenced trees are propagated unchanged.
pub(crate) fn decode_catalog_data(
    tokenizer: &mut Tokenizer,
    catalog: (u32, u16),
    xrefs: &[XEntry],
) -> Result<(PageTreeArean, Option<OutlineTreeArean>)> {
    let catalog_entry = xrefs_search(xrefs, catalog)?;
    let catalog_obj = parse_with_offset(tokenizer, catalog_entry.value)?;

    let dict = match catalog_obj {
        PDFObject::IndirectObject(_, _, value) => match value.to_dict() {
            Some(dict) => dict,
            None => return Err(ObjectAttrMiss("Catalog attribute not found or not a dict.")),
        },
        _ => return Err(ObjectAttrMiss("PDF catalog not found.")),
    };

    // The page tree is required: without a pages reference there is nothing to build.
    let (pages_obj, pages_gen) = match dict.get(PAGES) {
        Some(PDFObject::ObjectRef(obj_num, gen_num)) => (*obj_num, *gen_num),
        _ => return Err(ObjectAttrMiss("Catalog attribute not contain pages attr.")),
    };
    let mut page_nodes = HashMap::new();
    build_page_tree(tokenizer, xrefs, (pages_obj, pages_gen), None, &mut page_nodes)?;
    let page_tree = PageTreeArean::new(mixture_node_id!(pages_obj, pages_gen), page_nodes);

    // The outline is optional; anything other than an object reference is ignored.
    let outline_tree = match dict.get(OUTLINES) {
        Some(PDFObject::ObjectRef(obj_num, gen_num)) => {
            let (outline_obj, outline_gen) = (*obj_num, *gen_num);
            let mut outline_nodes = HashMap::<NodeId, OutlineNode>::new();
            build_outline_tree(
                tokenizer,
                xrefs,
                outline_obj,
                outline_gen,
                None,
                &mut outline_nodes,
            )?;
            Some(OutlineTreeArean::new(
                mixture_node_id!(outline_obj, outline_gen),
                outline_nodes,
            ))
        }
        _ => None,
    };

    Ok((page_tree, outline_tree))
}
/// Loads the page-tree node at `obj_ref`, and recursively all of its kids, into `nodes`.
///
/// A node whose type is not the page-tree type is stored as a leaf page with a
/// count of 0 and no kids. A page-tree node must carry a numeric count; when
/// that count is greater than zero its kids array must exist and contain only
/// object references, each of which is loaded with this node as its parent.
///
/// Every node is inserted into `nodes` *before* its kids are visited, so the
/// chain of ancestors can be walked through `nodes`. A kid that refers back to
/// one of its own ancestors is rejected instead of recursing without bound,
/// which would otherwise overflow the stack on a malformed file.
///
/// # Errors
///
/// * `XrefEntryNotFound` when the parsed object is not an indirect object.
/// * `PDFParseError` when the node is not a dictionary, the count or kids
///   entries are missing or malformed, or the tree contains a cycle.
/// * Any error from the cross-reference lookup or the parser is propagated.
fn build_page_tree(
    tokenizer: &mut Tokenizer,
    xrefs: &[XEntry],
    obj_ref: (u32, u16),
    parent_id: Option<NodeId>,
    nodes: &mut HashMap<NodeId, PageNode>,
) -> Result<()> {
    let node_id = mixture_node_id!(obj_ref.0, obj_ref.1);

    // Cycle guard: all ancestors are already registered in `nodes`, so follow
    // the parent links upwards and refuse to descend into one of them again.
    let mut ancestor = parent_id;
    while let Some(ancestor_id) = ancestor {
        if ancestor_id == node_id {
            return Err(PDFParseError("Page tree contains a cycle"));
        }
        ancestor = nodes.get(&ancestor_id).and_then(|node| node.parent_id);
    }

    let entry = xrefs_search(xrefs, obj_ref)?;
    let obj = match parse_with_offset(tokenizer, entry.value)? {
        PDFObject::IndirectObject(_, _, value) => *value,
        _ => return Err(XrefEntryNotFound(obj_ref.0, obj_ref.1)),
    };
    let attrs = match obj {
        PDFObject::Dict(dict) => dict,
        _ => return Err(PDFParseError("Page attributes is not a dict")),
    };

    // Anything that is not a page-tree node is recorded as a leaf page.
    if !attrs.named_value_was(TYPE, PAGES) {
        nodes.insert(node_id, PageNode::new(node_id, attrs, None, 0, parent_id));
        return Ok(());
    }

    let count = match attrs.get_u64_num(COUNT) {
        Some(count) => count,
        _ => return Err(PDFParseError("Page count not exist or not a number")),
    };

    // Collect the kid references first so `attrs` can be moved into the node
    // before the recursion starts.
    let mut kid_refs: Vec<(u32, u16)> = Vec::new();
    let mut kids = None;
    if count > 0 {
        let arr = match attrs.get_array_value(KIDS) {
            Some(kids) => kids,
            _ => return Err(PDFParseError("Page kids not exist or not an array")),
        };
        kid_refs.reserve(arr.len());
        for kid in arr {
            if let PDFObject::ObjectRef(obj_num, gen_num) = kid {
                kid_refs.push((*obj_num, *gen_num));
            } else {
                return Err(PDFParseError(
                    "Page kids not exist or not an object reference",
                ));
            }
        }
        let children: Vec<NodeId> = kid_refs
            .iter()
            .map(|&(obj_num, gen_num)| mixture_node_id!(obj_num, gen_num))
            .collect();
        kids = Some(children);
    }

    nodes.insert(node_id, PageNode::new(node_id, attrs, kids, count, parent_id));

    for kid_ref in kid_refs {
        build_page_tree(tokenizer, xrefs, kid_ref, Some(node_id), nodes)?;
    }
    Ok(())
}
fn build_outline_tree(
tokenizer: &mut Tokenizer,
xrefs: &[XEntry],
obj_num: u32,
gen_num: u16,
parent_id: Option<NodeId>,
map: &mut HashMap<NodeId, OutlineNode>,
) -> Result<()> {
let entry = xrefs_search(xrefs, (obj_num, gen_num))?;
let object = parse_with_offset(tokenizer, entry.value)?;
let (_, _, attrs) = match object.as_indirect_object() {
Some((obj_num, gen_num, obj)) => match obj.as_dict() {
Some(dict) => (obj_num, gen_num, dict),
_ => return Err(PDFParseError("Outline attribute except a dict.")),
},
_ => return Err(PDFParseError("Outline object is not an indirect object")),
};
let mut title = None;
let mut prev_id = None;
let mut next_id = None;
let mut first_id = None;
let mut last_id = None;
let node_id = mixture_node_id!(obj_num, gen_num);
if let Some(PDFObject::ObjectRef(obj_num, gen_num)) = attrs.get(PREV) {
prev_id = Some(mixture_node_id!(*obj_num, *gen_num));
}
if let Some(PDFObject::ObjectRef(obj_num, gen_num)) = attrs.get(FIRST) {
first_id = Some(mixture_node_id!(*obj_num, *gen_num));
build_outline_tree(tokenizer, xrefs, *obj_num, *gen_num, Some(node_id), map)?;
}
if let Some(PDFObject::ObjectRef(obj_num, gen_num)) = attrs.get(LAST) {
last_id = Some(mixture_node_id!(*obj_num, *gen_num));
}
if let Some(PDFObject::ObjectRef(obj_num, gen_num)) = attrs.get(NEXT) {
next_id = Some(mixture_node_id!(*obj_num, *gen_num));
build_outline_tree(tokenizer, xrefs, *obj_num, *gen_num, Some(node_id), map)?;
}
if let Some(PDFObject::String(pstr)) = attrs.get(TITLE){
title = Some(convert_glyph_text(pstr, &PreDefinedEncoding::PDFDoc));
}
let count = match attrs.get(COUNT) {
Some(PDFObject::Number(PDFNumber::Signed(value))) => *value,
Some(PDFObject::Number(PDFNumber::Unsigned(value))) => *value as i64,
_ => 0i64
};
let outline_node = OutlineNode {
count,
title,
prev_id,
next_id,
first_id,
last_id,
parent_id,
};
map.insert(node_id, outline_node);
Ok(())
}
impl PageTreeArean {
pub(crate) fn new(root_id: NodeId, nodes: HashMap<NodeId, PageNode>) -> Self {
Self { nodes, root_id }
}
pub fn get_root_node(&self) -> Option<&PageNode> {
self.nodes.get(&self.root_id)
}
pub(crate) fn get_page_num(&self) -> usize {
self.nodes.values().filter(|node| node.count == 0).count()
}
pub(crate) fn get_leaf_page_ids(&self) -> Vec<NodeId> {
let root_id = self.root_id;
let mut page_node_ids = Vec::new();
self.fetch_kid_page(&mut page_node_ids, root_id);
page_node_ids
}
pub(crate) fn get_page_node(&self, node_id: NodeId) -> Option<&PageNode> {
self.nodes.get(&node_id)
}
fn fetch_kid_page(&self, page_node_ids: &mut Vec<NodeId>, node_id: NodeId) {
if let Some(page_node) = self.nodes.get(&node_id) {
if page_node.count == 0 {
page_node_ids.push(node_id);
} else if let Some(kids) = page_node.kids.as_ref() {
for kid_id in kids {
self.fetch_kid_page(page_node_ids, *kid_id);
}
}
}
}
}
fn fmt_page_node(
page_tree_arean: &PageTreeArean,
node_id: &NodeId,
f: &mut Formatter<'_>,
indent: usize,
is_last: bool,
) -> std::fmt::Result {
if let Some(page_node) = page_tree_arean.nodes.get(node_id) {
let (obj_num, gen_num) = extract_node_id!(node_id);
let prefix = if indent == 0 {
String::new()
} else {
format!("{}{}", "│ ".repeat(indent - 1), if is_last { "└── " } else { "├── " })
};
writeln!(f, "{}Page:[{},{}]", prefix, obj_num, gen_num)?;
writeln!(f, "{}├── Count:{}",
"│ ".repeat(indent),
page_node.kids.as_ref().map_or(0, |k| k.len())
)?;
writeln!(f, "{}└── Kids", "│ ".repeat(indent))?;
if let Some(kids) = page_node.kids.as_ref() {
let total = kids.len();
for (i, kid_id) in kids.iter().enumerate() {
let is_kid_last = (i == total - 1);
fmt_page_node(page_tree_arean, kid_id, f, indent + 1, is_kid_last)?;
}
}
}
Ok(())
}
impl Display for PageTreeArean {
    /// Renders the whole page tree, starting at the root node with no indentation.
    ///
    /// The work is delegated to `fmt_page_node`, which already looks the node up
    /// and writes nothing when the id is missing, so no separate existence check
    /// is needed here. The root id is passed as a plain `&NodeId`; looking it up
    /// through a reference to that reference would not satisfy the key's
    /// `Borrow` bound on `HashMap::get`.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        fmt_page_node(self, &self.root_id, f, 0, false)
    }
}
impl OutlineTreeArean {
pub(crate) fn new(root_id: NodeId, nodes: HashMap<NodeId, OutlineNode>) -> Self {
Self { root_id, nodes }
}
}
impl PageNode {
pub(crate) fn new(node_id: NodeId, attrs: Dictionary, kids: Option<Vec<NodeId>>, count: u64, parent_id: Option<NodeId>) -> Self {
Self { node_id, attrs, kids, count, parent_id }
}
pub fn get_page_obj_ref(&self) -> ObjRefTuple {
extract_node_id!(self.node_id)
}
pub fn get_parent_obj_ref(&self) -> Option<ObjRefTuple> {
self.parent_id.map(|id| extract_node_id!(id))
}
pub fn get_page_id(&self) -> NodeId {
self.node_id
}
pub fn get_parent_id(&self) -> Option<NodeId> {
self.parent_id
}
pub fn get_attrs(&self) -> &Dictionary {
&self.attrs
}
pub fn get_attr(&self, key: &str) -> Option<&PDFObject> {
self.attrs.get(key)
}
pub fn get_kids(&self) -> &Option<Vec<NodeId>> {
&self.kids
}
pub fn get_count(&self) -> u64 {
self.count
}
pub(crate) fn get_contents(&self)->Vec<ObjRefTuple> {
match self.attrs.get(CONTENTS) {
Some(PDFObject::ObjectRef(obj_num, gen_num)) => vec![(*obj_num, *gen_num)],
Some(PDFObject::Array(arr)) => arr.iter().filter_map(|obj| {
if let PDFObject::ObjectRef(obj_num, gen_num) = obj {
Some((*obj_num, *gen_num))
} else {
None
}
}).collect(),
_ => vec![]
}
}
}