Skip to main content

pdf_syntax/
data.rs

1use crate::object::ObjectIdentifier;
2use crate::object::Stream;
3use crate::reader::ReaderContext;
4use crate::sync::HashMap;
5use crate::sync::{Arc, Mutex, MutexExt};
6use crate::util::SegmentList;
7use alloc::vec::Vec;
8use core::fmt::{Debug, Formatter};
9
/// A container for the bytes of a PDF file.
///
/// The bytes are kept behind an `Arc` of a type-erased `AsRef<[u8]>` value, so the
/// derived `Clone` clones the `Arc` handle instead of the buffer it points to.
#[derive(Clone)]
pub struct PdfData {
    // Without the `std` feature, the erased buffer carries no thread-safety bounds.
    #[cfg(not(feature = "std"))]
    inner: Arc<dyn AsRef<[u8]>>,
    // With the `std` feature, the erased buffer must additionally be `Send + Sync`.
    #[cfg(feature = "std")]
    inner: Arc<dyn AsRef<[u8]> + Send + Sync>,
}
18
19impl Debug for PdfData {
20    fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result {
21        write!(f, "PdfData {{ ... }}")
22    }
23}
24
25impl AsRef<[u8]> for PdfData {
26    fn as_ref(&self) -> &[u8] {
27        (*self.inner).as_ref()
28    }
29}
30
#[cfg(feature = "std")]
impl<T> From<Arc<T>> for PdfData
where
    T: AsRef<[u8]> + Send + Sync + 'static,
{
    /// Wraps an already shared buffer; the `Arc` is moved in, its contents are not copied.
    fn from(shared: Arc<T>) -> Self {
        // Erase the concrete buffer type behind the trait object stored in `PdfData`.
        let inner: Arc<dyn AsRef<[u8]> + Send + Sync> = shared;
        Self { inner }
    }
}
37
38#[cfg(not(feature = "std"))]
39impl<T: AsRef<[u8]> + 'static> From<Arc<T>> for PdfData {
40    fn from(data: Arc<T>) -> Self {
41        Self { inner: data }
42    }
43}
44
45impl From<Vec<u8>> for PdfData {
46    fn from(data: Vec<u8>) -> Self {
47        Self {
48            inner: Arc::new(data),
49        }
50    }
51}
52
53/// A structure for storing the data of the PDF.
54// To explain further: This crate uses a zero-parse approach, meaning that objects like
55// dictionaries or arrays always store the underlying data and parse objects lazily as needed,
56// instead of allocating the data and storing it in an owned way. However, the problem is that
57// not all data is readily available in the original data of the PDF: Objects can also be
58// stored in an object streams, in which case we first need to decode the stream before we can
59// access the data.
60//
61// The purpose of `Data` is to allow us to access the original data as well as maybe decoded data
62// by faking the same lifetime, so that we don't run into lifetime issues when dealing with
63// PDF objects that actually stem from different data sources.
64pub(crate) struct Data {
65    data: PdfData,
66    // 32 segments are more than enough as we can't have more objects than this.
67    decoded: SegmentList<Option<Vec<u8>>, 32>,
68    map: Mutex<HashMap<ObjectIdentifier, usize>>,
69}
70
71impl Debug for Data {
72    fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result {
73        write!(f, "Data {{ ... }}")
74    }
75}
76
77impl Data {
78    /// Create a new `Data` structure.
79    pub(crate) fn new(data: PdfData) -> Self {
80        Self {
81            data,
82            decoded: SegmentList::new(),
83            map: Mutex::new(HashMap::new()),
84        }
85    }
86
87    /// Get access to the original data of the PDF.
88    pub(crate) fn get(&self) -> &PdfData {
89        &self.data
90    }
91
92    /// Get access to the data of a decoded object stream.
93    pub(crate) fn get_with(&self, id: ObjectIdentifier, ctx: &ReaderContext<'_>) -> Option<&[u8]> {
94        if let Some(&idx) = self.map.get().get(&id) {
95            self.decoded.get(idx)?.as_deref()
96        } else {
97            // Block scope to keep the lock short-lived.
98            let idx = {
99                let mut locked = self.map.get();
100                let idx = locked.len();
101                locked.insert(id, idx);
102                idx
103            };
104            self.decoded
105                .get_or_init(idx, || {
106                    let stream = ctx.xref().get_with::<Stream<'_>>(id, ctx)?;
107                    stream.decoded().ok()
108                })
109                .as_deref()
110        }
111    }
112}