Skip to main content

pdf_syntax/
pdf.rs

1//! The starting point for reading PDF files.
2
3use crate::PdfData;
4use crate::object::Object;
5use crate::page::Pages;
6use crate::page::cached::CachedPages;
7use crate::reader::Reader;
8use crate::sync::Arc;
9use crate::xref::{XRef, XRefError, fallback, root_xref};
10
11pub use crate::crypto::DecryptionError;
12use crate::metadata::Metadata;
13
/// A PDF file.
///
/// Created through [`Pdf::new`] or one of the `Pdf::new_with_*` constructors.
pub struct Pdf {
    /// Cross-reference table of the document; a clone of this `Arc` is handed
    /// to `pages` when the document is loaded.
    xref: Arc<XRef>,
    /// Version read from the `%PDF-` header, or `PdfVersion::Pdf10` when no
    /// version could be determined from the header.
    header_version: PdfVersion,
    /// Pages resolved from `xref` at load time.
    pages: CachedPages,
    /// The underlying file data the document was loaded from.
    data: PdfData,
}
21
/// Maximum number of xref entries (indirect objects) allowed in a single PDF.
///
/// PDFs exceeding this limit are rejected with [`LoadPdfError::TooLarge`] to
/// prevent unbounded memory growth. Corpus data shows legitimate documents
/// rarely exceed 50 K objects; 500 K is a safe, generous upper bound. (#497)
///
/// The check runs right after the xref has been loaded and is a fixed,
/// built-in bound: it applies regardless of the [`PdfLoadLimits`] in use.
pub const MAX_OBJECTS: usize = 500_000;
28
/// Maximum number of pages allowed in a single PDF.
///
/// Documents whose resolved page tree contains more pages than this are
/// rejected with [`LoadPdfError::TooLarge`]. The comparison happens after the
/// page tree has been resolved; page-tree traversal itself applies its own,
/// larger cap, so this constant is a rejection threshold rather than the
/// traversal bound. (#497)
pub const MAX_PAGES: usize = 50_000;
34
/// Parser-internal limits applied while loading a PDF.
///
/// The default preserves the historical parser behavior (no caps). Callers
/// that need stricter limits can set individual caps before loading.
///
/// Every limit is stored as a `u32` in which `u32::MAX` is reserved as the
/// "no cap" sentinel; the crate-internal `*_limit` accessors translate that
/// sentinel into `None`.
#[derive(Debug, Clone, Copy)]
pub struct PdfLoadLimits {
    /// Maximum page-tree/object traversal depth.
    ///
    /// `u32::MAX` is the "no cap" sentinel (same pattern as `max_image_pixels`).
    max_object_depth: u32,
    /// Maximum decoded image pixel count.
    ///
    /// `u32::MAX` is the "no cap" sentinel.
    max_image_pixels: u32,
    /// Maximum decoded stream size in bytes.
    ///
    /// `u32::MAX` is the "no cap" sentinel (same pattern as `max_image_pixels`).
    /// When set below `u32::MAX`, `Stream::decoded()` / `Stream::decoded_image()`
    /// return `Err(DecodeFailure::StreamTooLarge { .. })` if the decoded payload
    /// exceeds this threshold. The raw (compressed) bytes are not checked;
    /// only the fully-decoded output is.
    ///
    /// Stored as `u32` (max ~4 GB) to keep `PdfLoadLimits` at 12 bytes, which
    /// prevents it from inflating `Array<'a>` and therefore `Object<'a>`.
    /// Caller-supplied `u64` values are clamped to `u32::MAX - 1` on conversion.
    max_stream_bytes: u32,
}

impl Default for PdfLoadLimits {
    fn default() -> Self {
        // `u32::MAX` is the "no cap" sentinel for all three limits. A derived
        // `Default` would produce 0 everywhere, which would reject every
        // image / stream and silently break rendering.
        Self {
            max_object_depth: Self::NO_CAP,
            max_image_pixels: Self::NO_CAP,
            max_stream_bytes: Self::NO_CAP,
        }
    }
}

impl PdfLoadLimits {
    /// Stored value that means "no cap" for every limit.
    const NO_CAP: u32 = u32::MAX;

    /// Translate a stored limit into `None` when it holds the sentinel.
    fn cap(stored: u32) -> Option<u32> {
        if stored == Self::NO_CAP {
            None
        } else {
            Some(stored)
        }
    }

    /// Create a limit set with no caller overrides.
    pub fn new() -> Self {
        Self::default()
    }

    /// Set the maximum page-tree/object traversal depth.
    ///
    /// Passing `u32::MAX` stores the sentinel and therefore disables the cap.
    pub fn max_object_depth(mut self, depth: u32) -> Self {
        self.max_object_depth = depth;
        self
    }

    /// Set the maximum decoded image pixel count.
    ///
    /// Values of `u32::MAX` or above are stored as the sentinel and therefore
    /// disable the cap.
    pub fn max_image_pixels(mut self, pixels: u64) -> Self {
        self.max_image_pixels = u32::try_from(pixels).unwrap_or(Self::NO_CAP);
        self
    }

    /// Set the maximum decoded stream size in bytes.
    ///
    /// Values of `u32::MAX` or above are clamped to `u32::MAX - 1` (≈ 4 GB − 1)
    /// due to internal storage as `u32`. For the intended use-case
    /// (decompression-bomb protection at 32 MB–1 GB), the 4 GB ceiling is more
    /// than sufficient.
    ///
    /// Any call to [`Stream::decoded`] or [`Stream::decoded_image`] that
    /// produces more bytes than `max_bytes` returns
    /// `Err(DecodeFailure::StreamTooLarge { observed, limit })`.
    pub fn max_stream_bytes(mut self, max_bytes: u64) -> Self {
        // Largest value that is still a real cap rather than the sentinel.
        const LARGEST_CAP: u32 = u32::MAX - 1;
        // Clamp *before* converting: a request of exactly `u32::MAX` bytes
        // converts successfully and would otherwise be stored as the "no cap"
        // sentinel, silently disabling the limit the caller asked for.
        let clamped = max_bytes.min(u64::from(LARGEST_CAP));
        self.max_stream_bytes = u32::try_from(clamped).unwrap_or(LARGEST_CAP);
        self
    }

    /// Traversal depth cap, or `None` when unlimited.
    pub(crate) fn object_depth_limit(self) -> Option<u32> {
        Self::cap(self.max_object_depth)
    }

    /// Image pixel cap, or `None` when unlimited.
    pub(crate) fn image_pixel_limit(self) -> Option<u32> {
        Self::cap(self.max_image_pixels)
    }

    /// Decoded stream size cap in bytes, or `None` when unlimited.
    pub(crate) fn stream_byte_limit(self) -> Option<u64> {
        Self::cap(self.max_stream_bytes).map(u64::from)
    }
}
129
/// An error that occurred while loading a PDF file.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum LoadPdfError {
    /// An error occurred while processing an encrypted document.
    Decryption(DecryptionError),
    /// The PDF was invalid or could not be parsed due to some other unknown reason.
    Invalid,
    /// The PDF exceeds a built-in size limit ([`MAX_OBJECTS`] or [`MAX_PAGES`]).
    ///
    /// The first field is the number of xref objects; the second is the page
    /// count. The object limit is checked first, before any pages are
    /// resolved, so when it is the one that triggered, the page count is
    /// reported as `0`. (#497)
    TooLarge(usize, usize),
}
143
144#[allow(clippy::len_without_is_empty)]
145impl Pdf {
146    /// Try to read the given PDF file.
147    ///
148    /// Returns `Err` if it was unable to read it.
149    pub fn new(data: impl Into<PdfData>) -> Result<Self, LoadPdfError> {
150        Self::new_with_password(data, "")
151    }
152
153    /// Try to read the given PDF file with parser load limits.
154    ///
155    /// Returns `Err` if it was unable to read it.
156    pub fn new_with_limits(
157        data: impl Into<PdfData>,
158        limits: PdfLoadLimits,
159    ) -> Result<Self, LoadPdfError> {
160        Self::new_with_password_and_limits(data, "", limits)
161    }
162
163    /// Try to read the given PDF file with a password.
164    ///
165    /// Returns `Err` if it was unable to read it or if the password is incorrect.
166    pub fn new_with_password(
167        data: impl Into<PdfData>,
168        password: &str,
169    ) -> Result<Self, LoadPdfError> {
170        Self::new_with_password_and_limits(data, password, PdfLoadLimits::default())
171    }
172
173    /// Try to read the given PDF file with a password and parser load limits.
174    ///
175    /// Returns `Err` if it was unable to read it or if the password is incorrect.
176    pub fn new_with_password_and_limits(
177        data: impl Into<PdfData>,
178        password: &str,
179        limits: PdfLoadLimits,
180    ) -> Result<Self, LoadPdfError> {
181        let data = data.into();
182        let password = password.as_bytes();
183        let version = find_version(data.as_ref()).unwrap_or(PdfVersion::Pdf10);
184        let xref = match root_xref(data.clone(), password, limits) {
185            Ok(x) => x,
186            Err(e) => match e {
187                XRefError::Unknown => {
188                    fallback(data.clone(), password, limits).ok_or(LoadPdfError::Invalid)?
189                }
190                XRefError::Encryption(e) => return Err(LoadPdfError::Decryption(e)),
191            },
192        };
193        let xref = Arc::new(xref);
194
195        // Reject documents whose xref table exceeds the object limit.
196        // This fires before we decode any object data, so the cost is minimal.
197        // The limit prevents unbounded memory growth on adversarially large PDFs. (#497)
198        let object_count = xref.len();
199        if object_count > MAX_OBJECTS {
200            return Err(LoadPdfError::TooLarge(object_count, 0));
201        }
202
203        let pages = CachedPages::new(xref.clone()).ok_or(LoadPdfError::Invalid)?;
204
205        // Reject documents whose page tree resolves to more pages than allowed.
206        // resolve_pages already caps traversal at MAX_PAGE_COUNT (100 K); checking
207        // against our stricter MAX_PAGES (50 K) here gives a clean error instead
208        // of silently truncating. (#497)
209        let page_count = pages.get().len();
210        if page_count > MAX_PAGES {
211            return Err(LoadPdfError::TooLarge(object_count, page_count));
212        }
213
214        Ok(Self {
215            xref,
216            header_version: version,
217            pages,
218            data,
219        })
220    }
221
222    /// Return the number of objects present in the PDF file.
223    pub fn len(&self) -> usize {
224        self.xref.len()
225    }
226
227    /// Return an iterator over all objects defined in the PDF file.
228    pub fn objects(&self) -> impl IntoIterator<Item = Object<'_>> {
229        self.xref.objects()
230    }
231
232    /// Return the version of the PDF file.
233    pub fn version(&self) -> PdfVersion {
234        self.xref
235            .trailer_data()
236            .version
237            .unwrap_or(self.header_version)
238    }
239
240    /// Return the underlying data of the PDF file.
241    pub fn data(&self) -> &PdfData {
242        &self.data
243    }
244
245    /// Return the pages of the PDF file.
246    pub fn pages(&self) -> &Pages<'_> {
247        self.pages.get()
248    }
249
250    /// Return the xref of the PDF file.
251    pub fn xref(&self) -> &XRef {
252        &self.xref
253    }
254
255    /// Return the metadata in the document information dictionary of the document.
256    pub fn metadata(&self) -> &Metadata {
257        self.xref.metadata()
258    }
259}
260
261fn find_version(data: &[u8]) -> Option<PdfVersion> {
262    let data = &data[..data.len().min(2000)];
263    let mut r = Reader::new(data);
264
265    while r.forward_tag(b"%PDF-").is_none() {
266        r.read_byte()?;
267    }
268
269    PdfVersion::from_bytes(r.tail()?)
270}
271
/// The version of a PDF document.
///
/// Variants are declared oldest-first, so the derived ordering follows the
/// order of the versions.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum PdfVersion {
    /// PDF 1.0.
    Pdf10,
    /// PDF 1.1.
    Pdf11,
    /// PDF 1.2.
    Pdf12,
    /// PDF 1.3.
    Pdf13,
    /// PDF 1.4.
    Pdf14,
    /// PDF 1.5.
    Pdf15,
    /// PDF 1.6.
    Pdf16,
    /// PDF 1.7.
    Pdf17,
    /// PDF 2.0.
    Pdf20,
}

impl PdfVersion {
    /// Parse a version from the first three bytes of `bytes` (e.g. `1.7`).
    ///
    /// Anything after the third byte is ignored. Returns `None` when fewer
    /// than three bytes are available or the prefix is not a known version.
    pub(crate) fn from_bytes(bytes: &[u8]) -> Option<Self> {
        let parsed = match bytes.get(..3)? {
            [b'1', b'.', b'0'] => Self::Pdf10,
            [b'1', b'.', b'1'] => Self::Pdf11,
            [b'1', b'.', b'2'] => Self::Pdf12,
            [b'1', b'.', b'3'] => Self::Pdf13,
            [b'1', b'.', b'4'] => Self::Pdf14,
            [b'1', b'.', b'5'] => Self::Pdf15,
            [b'1', b'.', b'6'] => Self::Pdf16,
            [b'1', b'.', b'7'] => Self::Pdf17,
            [b'2', b'.', b'0'] => Self::Pdf20,
            _ => return None,
        };
        Some(parsed)
    }
}
311
#[cfg(test)]
mod tests {
    use crate::pdf::{Pdf, PdfVersion};

    /// Loading an empty buffer must not panic; the result itself is discarded.
    /// NOTE(review): presumably a regression test for issue #49 — confirm.
    #[test]
    fn issue_49() {
        let _ = Pdf::new(Vec::new());
    }

    /// Checks the version reported for `alphatrans.pdf` from the external
    /// test corpus (ignored by default because the corpus must be downloaded).
    /// NOTE(review): the name suggests this file carries its version only in
    /// the header — confirm against the corpus file.
    #[test]
    #[ignore = "requires hayro-tests corpus"]
    fn pdf_version_header() {
        let data = std::fs::read("../hayro-tests/downloads/pdfjs/alphatrans.pdf").unwrap();
        let pdf = Pdf::new(data).unwrap();

        assert_eq!(pdf.version(), PdfVersion::Pdf17);
    }

    /// Checks the version reported for `2163.pdf` from the external test
    /// corpus (ignored by default because the corpus must be downloaded).
    /// NOTE(review): the name suggests the version comes from the document
    /// catalog rather than the header — confirm against the corpus file.
    #[test]
    #[ignore = "requires hayro-tests corpus"]
    fn pdf_version_catalog() {
        let data = std::fs::read("../hayro-tests/downloads/pdfbox/2163.pdf").unwrap();
        let pdf = Pdf::new(data).unwrap();

        assert_eq!(pdf.version(), PdfVersion::Pdf14);
    }
}