Skip to main content

hayro_syntax/
data.rs

1use crate::object::ObjectIdentifier;
2use crate::object::Stream;
3use crate::reader::ReaderContext;
4use crate::sync::HashMap;
5use crate::sync::{Arc, Mutex, MutexExt};
6use crate::util::SegmentList;
7use alloc::borrow::Cow;
8use alloc::vec::Vec;
9use core::fmt::{Debug, Formatter};
10
/// Shared, cheaply clonable handle to the raw bytes of a PDF file.
///
/// The bytes live behind an `Arc`, so cloning a `PdfData` clones the handle
/// rather than the underlying buffer. Any backing storage that can be viewed
/// as a byte slice may be used.
#[derive(Clone)]
pub struct PdfData {
    // Without the `std` feature no `Send + Sync` bounds are placed on the
    // backing storage.
    #[cfg(not(feature = "std"))]
    inner: Arc<dyn AsRef<[u8]>>,
    // With the `std` feature the backing storage must be `Send + Sync`.
    #[cfg(feature = "std")]
    inner: Arc<dyn AsRef<[u8]> + Send + Sync>,
}
19
20impl Debug for PdfData {
21    fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result {
22        write!(f, "PdfData {{ ... }}")
23    }
24}
25
26impl AsRef<[u8]> for PdfData {
27    fn as_ref(&self) -> &[u8] {
28        (*self.inner).as_ref()
29    }
30}
31
/// Wrap already shared data without copying it (`std` builds).
///
/// The `Arc<T>` is unsized into the trait-object handle stored in `PdfData`.
#[cfg(feature = "std")]
impl<T> From<Arc<T>> for PdfData
where
    T: AsRef<[u8]> + Send + Sync + 'static,
{
    fn from(data: Arc<T>) -> Self {
        PdfData { inner: data }
    }
}
38
39#[cfg(not(feature = "std"))]
40impl<T: AsRef<[u8]> + 'static> From<Arc<T>> for PdfData {
41    fn from(data: Arc<T>) -> Self {
42        Self { inner: data }
43    }
44}
45
46impl From<Vec<u8>> for PdfData {
47    fn from(data: Vec<u8>) -> Self {
48        Self {
49            inner: Arc::new(data),
50        }
51    }
52}
53
54/// A structure for storing the data of the PDF.
55// To explain further: This crate uses a zero-parse approach, meaning that objects like
56// dictionaries or arrays always store the underlying data and parse objects lazily as needed,
57// instead of allocating the data and storing it in an owned way. However, the problem is that
58// not all data is readily available in the original data of the PDF: Objects can also be
59// stored in an object streams, in which case we first need to decode the stream before we can
60// access the data.
61//
62// The purpose of `Data` is to allow us to access the original data as well as maybe decoded data
63// by faking the same lifetime, so that we don't run into lifetime issues when dealing with
64// PDF objects that actually stem from different data sources.
65pub(crate) struct Data {
66    data: PdfData,
67    // 32 segments are more than enough as we can't have more objects than this.
68    decoded: SegmentList<Option<Vec<u8>>, 32>,
69    map: Mutex<HashMap<ObjectIdentifier, usize>>,
70}
71
72impl Debug for Data {
73    fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result {
74        write!(f, "Data {{ ... }}")
75    }
76}
77
78impl Data {
79    /// Create a new `Data` structure.
80    pub(crate) fn new(data: PdfData) -> Self {
81        Self {
82            data,
83            decoded: SegmentList::new(),
84            map: Mutex::new(HashMap::new()),
85        }
86    }
87
88    /// Get access to the original data of the PDF.
89    pub(crate) fn get(&self) -> &PdfData {
90        &self.data
91    }
92
93    /// Get access to the data of a decoded object stream.
94    pub(crate) fn get_with(&self, id: ObjectIdentifier, ctx: &ReaderContext<'_>) -> Option<&[u8]> {
95        if let Some(&idx) = self.map.get().get(&id) {
96            self.decoded.get(idx)?.as_deref()
97        } else {
98            // Block scope to keep the lock short-lived.
99            let idx = {
100                let mut locked = self.map.get();
101                let idx = locked.len();
102                locked.insert(id, idx);
103                idx
104            };
105            self.decoded
106                .get_or_init(idx, || {
107                    let stream = ctx.xref().get_with::<Stream<'_>>(id, ctx)?;
108                    stream.decoded().ok().map(Cow::into_owned)
109                })
110                .as_deref()
111        }
112    }
113}