use super::encodings::Encoding;
use super::{Bookmark, Dictionary, Object, ObjectId};
use crate::encryption::crypt_filters::*;
use crate::encryption::{self, EncryptionState, PasswordAlgorithm};
use crate::xobject::PdfImage;
use crate::xref::{Xref, XrefType};
use crate::{Error, ObjectStream, Result, Stream};
use log::debug;
use std::cmp::max;
use std::collections::{BTreeMap, HashMap, HashSet};
use std::io::Write;
use std::str;
use std::sync::Arc;
#[derive(Debug, Clone)]
pub struct Document {
pub version: String,
pub binary_mark: Vec<u8>,
pub trailer: Dictionary,
pub reference_table: Xref,
pub objects: BTreeMap<ObjectId, Object>,
pub max_id: u32,
pub max_bookmark_id: u32,
pub bookmarks: Vec<u32>,
pub bookmark_table: HashMap<u32, Bookmark>,
pub xref_start: usize,
pub encryption_state: Option<EncryptionState>,
}
impl Document {
pub fn new() -> Self {
Self {
version: "1.4".to_string(),
binary_mark: vec![0xBB, 0xAD, 0xC0, 0xDE],
trailer: Dictionary::new(),
reference_table: Xref::new(0, XrefType::CrossReferenceStream),
objects: BTreeMap::new(),
max_id: 0,
max_bookmark_id: 0,
bookmarks: Vec::new(),
bookmark_table: HashMap::new(),
xref_start: 0,
encryption_state: None,
}
}
pub fn new_from_prev(prev: &Document) -> Self {
let mut new_trailer = prev.trailer.clone();
new_trailer.set("Prev", Object::Integer(prev.xref_start as i64));
Self {
version: "1.4".to_string(),
binary_mark: vec![0xBB, 0xAD, 0xC0, 0xDE],
trailer: new_trailer,
reference_table: Xref::new(0, prev.reference_table.cross_reference_type),
objects: BTreeMap::new(),
max_id: prev.max_id,
max_bookmark_id: prev.max_bookmark_id,
bookmarks: Vec::new(),
bookmark_table: HashMap::new(),
xref_start: 0,
encryption_state: None,
}
}
const DEREF_LIMIT: usize = 128;
fn recursive_fix_pages(&mut self, bookmarks: &[u32], first: bool) -> ObjectId {
if !bookmarks.is_empty() {
for id in bookmarks {
let (children, mut page) = match self.bookmark_table.get(id) {
Some(n) => (n.children.clone(), n.page),
None => return (0, 0),
};
if 0 == page.0 && !children.is_empty() {
let objectid = self.recursive_fix_pages(&children[..], false);
let bookmark = self.bookmark_table.get_mut(id).unwrap();
bookmark.page = objectid;
page = objectid;
}
if !first && 0 != page.0 {
return page;
}
if first && !children.is_empty() {
self.recursive_fix_pages(&children[..], first);
}
}
}
(0, 0)
}
pub fn adjust_zero_pages(&mut self) {
self.recursive_fix_pages(&self.bookmarks.clone(), true);
}
pub fn dereference<'a>(&'a self, mut object: &'a Object) -> Result<(Option<ObjectId>, &'a Object)> {
let mut nb_deref = 0;
let mut id = None;
while let Ok(ref_id) = object.as_reference() {
id = Some(ref_id);
object = self.objects.get(&ref_id).ok_or(Error::ObjectNotFound(ref_id))?;
nb_deref += 1;
if nb_deref > Self::DEREF_LIMIT {
return Err(Error::ReferenceLimit);
}
}
Ok((id, object))
}
pub fn get_object(&self, id: ObjectId) -> Result<&Object> {
let object = self.objects.get(&id).ok_or(Error::ObjectNotFound(id))?;
self.dereference(object).map(|(_, object)| object)
}
pub fn has_object(&self, id: ObjectId) -> bool {
self.objects.contains_key(&id)
}
pub fn get_object_mut(&mut self, id: ObjectId) -> Result<&mut Object> {
let object = self.objects.get(&id).ok_or(Error::ObjectNotFound(id))?;
let (ref_id, _obj) = self.dereference(object)?;
Ok(self.objects.get_mut(&ref_id.unwrap_or(id)).unwrap())
}
pub fn get_object_page(&self, id: ObjectId) -> Result<ObjectId> {
for (_, object_id) in self.get_pages() {
let page = self.get_object(object_id)?.as_dict()?;
let annots = page.get(b"Annots")?.as_array()?;
let mut objects_ids = annots.iter().map(Object::as_reference);
let contains = objects_ids.any(|object_id| Some(id) == object_id.ok());
if contains {
return Ok(object_id);
}
}
Err(Error::PageNumberNotFound(0))
}
pub fn get_dictionary(&self, id: ObjectId) -> Result<&Dictionary> {
self.get_object(id).and_then(Object::as_dict)
}
pub fn get_dictionary_mut(&mut self, id: ObjectId) -> Result<&mut Dictionary> {
self.get_object_mut(id).and_then(Object::as_dict_mut)
}
pub fn get_dict_in_dict<'a>(&'a self, node: &'a Dictionary, key: &[u8]) -> Result<&'a Dictionary> {
match node.get(key)? {
Object::Reference(object_id) => self.get_dictionary(*object_id),
Object::Dictionary(dic) => Ok(dic),
obj => Err(Error::ObjectType {
expected: "Dictionary",
found: obj.enum_variant(),
}),
}
}
pub fn traverse_objects<A: Fn(&mut Object)>(&mut self, action: A) -> Vec<ObjectId> {
fn traverse_array<A: Fn(&mut Object)>(array: &mut [Object], action: &A, refs: &mut Vec<ObjectId>) {
for item in array.iter_mut() {
traverse_object(item, action, refs);
}
}
fn traverse_dictionary<A: Fn(&mut Object)>(dict: &mut Dictionary, action: &A, refs: &mut Vec<ObjectId>) {
for (_, v) in dict.iter_mut() {
traverse_object(v, action, refs);
}
}
fn traverse_object<A: Fn(&mut Object)>(object: &mut Object, action: &A, refs: &mut Vec<ObjectId>) {
action(object);
match object {
Object::Array(array) => traverse_array(array, action, refs),
Object::Dictionary(dict) => traverse_dictionary(dict, action, refs),
Object::Stream(stream) => traverse_dictionary(&mut stream.dict, action, refs),
Object::Reference(id) => {
if !refs.contains(id) {
refs.push(*id);
}
}
_ => {}
}
}
let mut refs = vec![];
traverse_dictionary(&mut self.trailer, &action, &mut refs);
let mut index = 0;
while index < refs.len() {
if let Some(object) = self.objects.get_mut(&refs[index]) {
traverse_object(object, &action, &mut refs);
}
index += 1;
}
refs
}
pub fn get_encrypted(&self) -> Result<&Dictionary> {
self.trailer
.get(b"Encrypt")
.and_then(Object::as_reference)
.and_then(|id| self.get_dictionary(id))
}
pub fn is_encrypted(&self) -> bool {
self.get_encrypted().is_ok()
}
pub fn was_encrypted(&self) -> bool {
self.encryption_state.is_some()
}
pub fn authenticate_raw_owner_password<P>(&self, password: P) -> Result<()>
where
P: AsRef<[u8]>,
{
if !self.is_encrypted() {
return Err(Error::NotEncrypted);
}
let password = password.as_ref();
let algorithm = PasswordAlgorithm::try_from(self)?;
algorithm.authenticate_owner_password(self, password)?;
Ok(())
}
pub fn authenticate_raw_user_password<P>(&self, password: P) -> Result<()>
where
P: AsRef<[u8]>,
{
if !self.is_encrypted() {
return Err(Error::NotEncrypted);
}
let password = password.as_ref();
let algorithm = PasswordAlgorithm::try_from(self)?;
algorithm.authenticate_user_password(self, password)?;
Ok(())
}
pub fn authenticate_raw_password<P>(&self, password: P) -> Result<()>
where
P: AsRef<[u8]>,
{
if !self.is_encrypted() {
return Err(Error::NotEncrypted);
}
let password = password.as_ref();
let algorithm = PasswordAlgorithm::try_from(self)?;
algorithm
.authenticate_owner_password(self, password)
.or(algorithm.authenticate_user_password(self, password))?;
Ok(())
}
pub fn authenticate_owner_password(&self, password: &str) -> Result<()> {
if !self.is_encrypted() {
return Err(Error::NotEncrypted);
}
let algorithm = PasswordAlgorithm::try_from(self)?;
let password = algorithm.sanitize_password(password)?;
algorithm.authenticate_owner_password(self, &password)?;
Ok(())
}
pub fn authenticate_user_password(&self, password: &str) -> Result<()> {
if !self.is_encrypted() {
return Err(Error::NotEncrypted);
}
let algorithm = PasswordAlgorithm::try_from(self)?;
let password = algorithm.sanitize_password(password)?;
algorithm.authenticate_user_password(self, &password)?;
Ok(())
}
pub fn authenticate_password(&self, password: &str) -> Result<()> {
if !self.is_encrypted() {
return Err(Error::NotEncrypted);
}
let algorithm = PasswordAlgorithm::try_from(self)?;
let password = algorithm.sanitize_password(password)?;
algorithm
.authenticate_owner_password(self, &password)
.or(algorithm.authenticate_user_password(self, &password))?;
Ok(())
}
pub fn get_crypt_filters(&self) -> BTreeMap<Vec<u8>, Arc<dyn CryptFilter>> {
let mut crypt_filters = BTreeMap::new();
if let Ok(filters) = self
.get_encrypted()
.and_then(|dict| dict.get(b"CF"))
.and_then(|object| object.as_dict())
{
for (name, filter) in filters {
let Ok(filter) = filter.as_dict() else {
continue;
};
if filter.get(b"Type").is_ok() && !filter.has_type(b"CryptFilter") {
continue;
}
let cfm = filter.get(b"CFM").and_then(|object| object.as_name()).ok();
let crypt_filter: Arc<dyn CryptFilter> = match cfm {
Some(b"V2") => Arc::new(Rc4CryptFilter),
Some(b"AESV2") => Arc::new(Aes128CryptFilter),
Some(b"AESV3") => Arc::new(Aes256CryptFilter),
Some(b"Identity") | None => Arc::new(IdentityCryptFilter),
_ => continue,
};
crypt_filters.insert(name.to_vec(), crypt_filter);
}
}
crypt_filters
}
pub fn encrypt(&mut self, state: &EncryptionState) -> Result<()> {
if self.is_encrypted() {
return Err(Error::AlreadyEncrypted);
}
let encrypted = state.encode()?;
for (&id, obj) in self.objects.iter_mut() {
encryption::encrypt_object(state, id, obj)?;
}
let object_id = self.add_object(encrypted);
self.trailer.set(b"Encrypt", Object::Reference(object_id));
self.encryption_state = None;
Ok(())
}
pub fn decrypt(&mut self, password: &str) -> Result<()> {
if !self.is_encrypted() {
return Err(Error::NotEncrypted);
}
let algorithm = PasswordAlgorithm::try_from(&*self)?;
let password = algorithm.sanitize_password(password)?;
self.decrypt_raw(&password)
}
pub fn decrypt_raw<P>(&mut self, password: P) -> Result<()>
where
P: AsRef<[u8]>,
{
if !self.is_encrypted() {
return Err(Error::NotEncrypted);
}
self.authenticate_raw_password(&password)?;
let encryption_obj_id = self.trailer.get(b"Encrypt").and_then(Object::as_reference)?;
let state = EncryptionState::decode(&*self, password)?;
for (&id, obj) in self.objects.iter_mut() {
if id == encryption_obj_id {
continue;
}
encryption::decrypt_object(&state, id, obj)?;
}
let mut object_streams = vec![];
for (_, object) in self.objects.iter_mut() {
let Ok(ref mut stream) = object.as_stream_mut() else {
continue;
};
if !stream.dict.has_type(b"ObjStm") {
continue;
}
let Some(obj_stream) = ObjectStream::new(stream).ok() else {
continue;
};
object_streams.extend(obj_stream.objects);
}
for (id, entry) in object_streams {
self.objects.entry(id).or_insert(entry);
}
let object_id = self.trailer.remove(b"Encrypt").unwrap().as_reference()?;
self.objects.remove(&object_id);
self.encryption_state = Some(state);
Ok(())
}
pub fn catalog(&self) -> Result<&Dictionary> {
self.trailer
.get(b"Root")
.and_then(Object::as_reference)
.and_then(|id| self.get_dictionary(id))
}
pub fn catalog_mut(&mut self) -> Result<&mut Dictionary> {
self.trailer
.get(b"Root")
.and_then(Object::as_reference)
.and_then(move |id| self.get_dictionary_mut(id))
}
pub fn get_pages(&self) -> BTreeMap<u32, ObjectId> {
self.page_iter().enumerate().map(|(i, p)| ((i + 1) as u32, p)).collect()
}
pub fn page_iter(&self) -> impl Iterator<Item = ObjectId> + '_ {
PageTreeIter::new(self)
}
pub fn get_page_contents(&self, page_id: ObjectId) -> Vec<ObjectId> {
let mut streams = vec![];
if let Ok(page) = self.get_dictionary(page_id) {
let mut nb_deref = 0;
if let Ok(mut contents) = page.get(b"Contents") {
loop {
match contents {
Object::Reference(id) => match self.objects.get(id) {
None | Some(Object::Stream(_)) => {
streams.push(*id);
}
Some(o) => {
nb_deref += 1;
if nb_deref < Self::DEREF_LIMIT {
contents = o;
continue;
}
}
},
Object::Array(arr) => {
for content in arr {
if let Ok(id) = content.as_reference() {
streams.push(id)
}
}
}
_ => {}
}
break;
}
}
}
streams
}
pub fn add_page_contents(&mut self, page_id: ObjectId, content: Vec<u8>) -> Result<()> {
let page = self.get_dictionary(page_id)?;
let mut current_content_list: Vec<Object> = match page.get(b"Contents") {
Ok(Object::Reference(id)) => {
vec![Object::Reference(*id)]
}
Ok(Object::Array(arr)) => arr.clone(),
_ => vec![],
};
let content_object_id = self.add_object(Object::Stream(Stream::new(Dictionary::new(), content)));
current_content_list.push(Object::Reference(content_object_id));
let page_mut = self.get_object_mut(page_id).and_then(Object::as_dict_mut)?;
page_mut.set("Contents", current_content_list);
Ok(())
}
pub fn get_page_content(&self, page_id: ObjectId) -> Result<Vec<u8>> {
let mut content = Vec::new();
let content_streams = self.get_page_contents(page_id);
for object_id in content_streams {
if let Ok(content_stream) = self.get_object(object_id).and_then(Object::as_stream) {
match content_stream.decompressed_content() {
Ok(data) => content.write_all(&data)?,
Err(_) => content.write_all(&content_stream.content)?,
};
}
}
Ok(content)
}
pub fn get_page_resources(&self, page_id: ObjectId) -> Result<(Option<&Dictionary>, Vec<ObjectId>)> {
fn collect_resources(
page_node: &Dictionary, resource_ids: &mut Vec<ObjectId>, doc: &Document,
already_seen: &mut HashSet<ObjectId>,
) -> Result<()> {
if let Ok(resource_id) = page_node.get(b"Resources").and_then(Object::as_reference) {
resource_ids.push(resource_id);
}
if let Ok(parent_id) = page_node.get(b"Parent").and_then(Object::as_reference) {
if already_seen.contains(&parent_id) {
return Err(Error::ReferenceCycle(parent_id));
}
already_seen.insert(parent_id);
let parent_dict = doc.get_dictionary(parent_id)?;
collect_resources(parent_dict, resource_ids, doc, already_seen)?;
}
Ok(())
}
let mut resource_dict = None;
let mut resource_ids = Vec::new();
if let Ok(page) = self.get_dictionary(page_id) {
resource_dict = page.get(b"Resources").and_then(Object::as_dict).ok();
collect_resources(page, &mut resource_ids, self, &mut HashSet::new())?;
}
Ok((resource_dict, resource_ids))
}
pub fn get_page_fonts(&self, page_id: ObjectId) -> Result<BTreeMap<Vec<u8>, &Dictionary>> {
fn collect_fonts_from_resources<'a>(
resources: &'a Dictionary, fonts: &mut BTreeMap<Vec<u8>, &'a Dictionary>, doc: &'a Document,
) {
if let Ok(font) = resources.get(b"Font") {
let font_dict = match font {
Object::Reference(id) => doc.get_object(*id).and_then(Object::as_dict).ok(),
Object::Dictionary(dict) => Some(dict),
_ => None,
};
if let Some(font_dict) = font_dict {
for (name, value) in font_dict.iter() {
let font = match value {
Object::Reference(id) => doc.get_dictionary(*id).ok(),
Object::Dictionary(dict) => Some(dict),
_ => None,
};
if !fonts.contains_key(name) {
font.map(|font| fonts.insert(name.clone(), font));
}
}
}
}
}
let mut fonts = BTreeMap::new();
let (resource_dict, resource_ids) = self.get_page_resources(page_id)?;
if let Some(resources) = resource_dict {
collect_fonts_from_resources(resources, &mut fonts, self);
}
for resource_id in resource_ids {
if let Ok(resources) = self.get_dictionary(resource_id) {
collect_fonts_from_resources(resources, &mut fonts, self);
}
}
Ok(fonts)
}
pub fn get_page_annotations(&self, page_id: ObjectId) -> Result<Vec<&Dictionary>> {
let mut annotations = vec![];
if let Ok(page) = self.get_dictionary(page_id) {
match page.get(b"Annots") {
Ok(Object::Reference(id)) => self
.get_object(*id)
.and_then(Object::as_array)?
.iter()
.flat_map(Object::as_reference)
.flat_map(|id| self.get_dictionary(id))
.for_each(|a| annotations.push(a)),
Ok(Object::Array(a)) => a
.iter()
.flat_map(Object::as_reference)
.flat_map(|id| self.get_dictionary(id))
.for_each(|a| annotations.push(a)),
_ => {}
}
}
Ok(annotations)
}
pub fn get_page_images(&'_ self, page_id: ObjectId) -> Result<Vec<PdfImage<'_>>> {
let mut images = vec![];
if let Ok(page) = self.get_dictionary(page_id) {
let resources = self.get_dict_in_dict(page, b"Resources")?;
let xobject = match self.get_dict_in_dict(resources, b"XObject") {
Ok(xobject) => xobject,
Err(err) => match err {
Error::DictKey(_) => return Ok(Vec::default()),
_ => Err(err)?,
},
};
for (_, xvalue) in xobject.iter() {
let id = xvalue.as_reference()?;
let xvalue = self.get_object(id)?;
let xvalue = xvalue.as_stream()?;
let dict = &xvalue.dict;
if dict.get(b"Subtype")?.as_name()? != b"Image" {
continue;
}
let width = dict.get(b"Width")?.as_i64()?;
let height = dict.get(b"Height")?.as_i64()?;
let color_space = match dict.get(b"ColorSpace") {
Ok(cs) => match cs {
Object::Array(array) => Some(String::from_utf8_lossy(array[0].as_name()?).to_string()),
Object::Name(name) => Some(String::from_utf8_lossy(name).to_string()),
_ => None,
},
Err(_) => None,
};
let bits_per_component = match dict.get(b"BitsPerComponent") {
Ok(bpc) => Some(bpc.as_i64()?),
Err(_) => None,
};
let mut filters = vec![];
if let Ok(filter) = dict.get(b"Filter") {
match filter {
Object::Array(array) => {
for obj in array.iter() {
let name = obj.as_name()?;
filters.push(String::from_utf8_lossy(name).to_string());
}
}
Object::Name(name) => {
filters.push(String::from_utf8_lossy(name).to_string());
}
_ => {}
}
};
images.push(PdfImage {
id,
width,
height,
color_space,
bits_per_component,
filters: Some(filters),
content: &xvalue.content,
origin_dict: &xvalue.dict,
});
}
}
Ok(images)
}
pub fn decode_text(encoding: &Encoding, bytes: &[u8]) -> Result<String> {
debug!("Decoding text with {encoding:#?}");
encoding.bytes_to_string(bytes)
}
pub fn encode_text(encoding: &Encoding, text: &str) -> Vec<u8> {
encoding.string_to_bytes(text)
}
}
impl Default for Document {
fn default() -> Self {
Self::new()
}
}
struct PageTreeIter<'a> {
doc: &'a Document,
stack: Vec<&'a [Object]>,
kids: Option<&'a [Object]>,
iter_limit: usize,
}
impl<'a> PageTreeIter<'a> {
const PAGE_TREE_DEPTH_LIMIT: usize = 256;
fn new(doc: &'a Document) -> Self {
if let Ok(page_tree_id) = doc
.catalog()
.and_then(|cat| cat.get(b"Pages"))
.and_then(Object::as_reference)
{
Self {
doc,
kids: Self::kids(doc, page_tree_id),
stack: Vec::with_capacity(32),
iter_limit: doc.objects.len(),
}
} else {
Self {
doc,
kids: None,
stack: Vec::new(),
iter_limit: doc.objects.len(),
}
}
}
fn kids(doc: &Document, page_tree_id: ObjectId) -> Option<&[Object]> {
doc.get_dictionary(page_tree_id)
.and_then(|page_tree| page_tree.get_deref(b"Kids", doc))
.and_then(Object::as_array)
.map(|k| k.as_slice())
.ok()
}
}
impl Iterator for PageTreeIter<'_> {
type Item = ObjectId;
fn next(&mut self) -> Option<Self::Item> {
loop {
while let Some((kid, new_kids)) = self.kids.and_then(|k| k.split_first()) {
if self.iter_limit == 0 {
return None;
}
self.iter_limit -= 1;
self.kids = Some(new_kids);
if let Ok(kid_id) = kid.as_reference() {
if let Ok(type_name) = self.doc.get_dictionary(kid_id).and_then(Dictionary::get_type) {
match type_name {
b"Page" => {
return Some(kid_id);
}
b"Pages" => {
if self.stack.len() < Self::PAGE_TREE_DEPTH_LIMIT {
let kids = self.kids.unwrap();
if !kids.is_empty() {
self.stack.push(kids);
}
self.kids = Self::kids(self.doc, kid_id);
}
}
_ => {}
}
}
}
}
if let kids @ Some(_) = self.stack.pop() {
self.kids = kids;
} else {
return None;
}
}
}
fn size_hint(&self) -> (usize, Option<usize>) {
let kids = self.kids.unwrap_or(&[]);
let nb_pages: usize = kids
.iter()
.chain(self.stack.iter().flat_map(|k| k.iter()))
.map(|kid| {
if let Ok(dict) = kid.as_reference().and_then(|id| self.doc.get_dictionary(id)) {
if let Ok(b"Pages") = dict.get_type() {
let count = dict.get_deref(b"Count", self.doc).and_then(Object::as_i64).unwrap_or(0);
max(0, count) as usize
} else {
1
}
} else {
1
}
})
.sum();
(nb_pages, Some(nb_pages))
}
}
impl std::iter::FusedIterator for PageTreeIter<'_> {}