Skip to main content

pdf_syntax/
pdf.rs

1//! The starting point for reading PDF files.
2
3use crate::PdfData;
4use crate::object::Object;
5use crate::page::Pages;
6use crate::page::cached::CachedPages;
7use crate::reader::Reader;
8use crate::sync::Arc;
9use crate::xref::{XRef, XRefError, fallback, root_xref};
10
11pub use crate::crypto::DecryptionError;
12use crate::metadata::Metadata;
13
/// Structural recovery that occurred while loading a [`Pdf`].
///
/// Recovery is always attempted automatically; these flags let a caller learn
/// that it happened, so a repaired document is distinguishable from a clean
/// one. They never change the (always-on) recovery behaviour.
///
/// The derived [`Default`] value has every flag set to `false`, i.e. it
/// describes a document that needed no recovery.
///
/// Marked `#[non_exhaustive]` so future recovery categories (e.g. an
/// object-stream rebuild flag) can be added without a breaking change.
/// Downstream crates read fields by name; they must not construct or
/// exhaustively destructure this struct.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
#[non_exhaustive]
pub struct LoadRecovery {
    /// The cross-reference table was invalid and rebuilt by scanning the file
    /// for objects. Object recovery may be incomplete.
    pub xref_rebuilt: bool,
    /// The page tree was invalid and pages were recovered by a brute-force
    /// scan. Page order may differ from the source.
    pub page_tree_rebuilt: bool,
}
34
/// A PDF file.
///
/// Built by [`Pdf::new`] and its `new_with_*` variants, which resolve the
/// cross-reference table and the page tree up front.
pub struct Pdf {
    // Cross-reference table; a clone of this handle is also given to `pages`.
    xref: Arc<XRef>,
    // Version found in the `%PDF-` header (PDF 1.0 if none was found). Used by
    // `version()` only when the trailer data carries no version of its own.
    header_version: PdfVersion,
    // Page tree resolved from `xref` at load time.
    pages: CachedPages,
    // The input data the document was loaded from.
    data: PdfData,
    // Which structural repairs were needed while loading.
    recovery: LoadRecovery,
}
43
/// Maximum number of xref entries (indirect objects) allowed in a single PDF.
///
/// PDFs exceeding this limit are rejected with [`LoadPdfError::TooLarge`] to
/// prevent unbounded memory growth. Corpus data shows legitimate documents
/// rarely exceed 50 K objects; 500 K is a safe, generous upper bound. (#497)
///
/// The comparison is strict: a document with exactly `MAX_OBJECTS` entries is
/// still accepted.
pub const MAX_OBJECTS: usize = 500_000;
50
/// Maximum number of pages allowed in a single PDF.
///
/// Documents whose resolved page tree contains more than this many pages are
/// rejected with [`LoadPdfError::TooLarge`]; a document with exactly
/// `MAX_PAGES` pages is still accepted. (#497)
///
/// The check runs after the page tree has been resolved. NOTE(review): the
/// loader's own comment says traversal itself is bounded by a separate, larger
/// internal cap — confirm against the page-tree resolver.
pub const MAX_PAGES: usize = 50_000;
56
/// Parser-internal limits applied while loading a PDF.
///
/// The default preserves the historical parser behavior (no caps). Callers
/// that need stricter limits can set individual caps before loading.
///
/// Every cap is stored as a `u32` in which `u32::MAX` is reserved as the
/// "no cap" sentinel.
#[derive(Debug, Clone, Copy)]
pub struct PdfLoadLimits {
    /// Maximum page-tree/object traversal depth.
    ///
    /// `u32::MAX` is the "no cap" sentinel (same pattern as `max_image_pixels`).
    max_object_depth: u32,
    /// Maximum decoded image pixel count.
    ///
    /// `u32::MAX` is the "no cap" sentinel.
    max_image_pixels: u32,
    /// Maximum decoded stream size in bytes.
    ///
    /// `u32::MAX` is the "no cap" sentinel (same pattern as `max_image_pixels`).
    /// When set below `u32::MAX`, `Stream::decoded()` / `Stream::decoded_image()`
    /// return `Err(DecodeFailure::StreamTooLarge { .. })` if the decoded payload
    /// exceeds this threshold. The raw (compressed) bytes are not checked;
    /// only the fully-decoded output is.
    ///
    /// Stored as `u32` (max ~4 GB) to keep `PdfLoadLimits` at 12 bytes, which
    /// prevents it from inflating `Array<'a>` and therefore `Object<'a>`.
    /// Caller-supplied `u64` values are clamped to `u32::MAX - 1` on conversion,
    /// so a caller-supplied limit can never collide with the sentinel.
    max_stream_bytes: u32,
}

impl Default for PdfLoadLimits {
    /// Return a limit set in which every cap is disabled.
    fn default() -> Self {
        // A derived `Default` would produce 0 everywhere, which would reject
        // every image / stream and silently break rendering, so each field is
        // set to the "no cap" sentinel explicitly.
        Self {
            max_object_depth: Self::NO_CAP,
            max_image_pixels: Self::NO_CAP,
            max_stream_bytes: Self::NO_CAP,
        }
    }
}

impl PdfLoadLimits {
    /// Field value meaning "this limit is disabled".
    const NO_CAP: u32 = u32::MAX;

    /// Translate a stored field value into an optional cap (`None` = no cap).
    fn cap(raw: u32) -> Option<u32> {
        (raw != Self::NO_CAP).then_some(raw)
    }

    /// Create a limit set with no caller overrides.
    pub fn new() -> Self {
        Self::default()
    }

    /// Set the maximum page-tree/object traversal depth.
    ///
    /// Passing `u32::MAX` stores the sentinel and therefore disables the cap.
    pub fn max_object_depth(mut self, depth: u32) -> Self {
        self.max_object_depth = depth;
        self
    }

    /// Set the maximum decoded image pixel count.
    ///
    /// Values of `u32::MAX` or more saturate to the "no cap" sentinel, i.e.
    /// they disable the cap instead of installing a limit.
    pub fn max_image_pixels(mut self, pixels: u64) -> Self {
        self.max_image_pixels = u32::try_from(pixels).unwrap_or(Self::NO_CAP);
        self
    }

    /// Set the maximum decoded stream size in bytes.
    ///
    /// Values of `u32::MAX` or more are clamped to `u32::MAX - 1` (≈ 4 GB − 1)
    /// due to internal storage as `u32`. For the intended use-case
    /// (decompression-bomb protection at 32 MB–1 GB), the 4 GB ceiling is more
    /// than sufficient. Calling this method always installs a limit; use
    /// [`PdfLoadLimits::new`] to obtain a limit set without one.
    ///
    /// Any call to [`Stream::decoded`] or [`Stream::decoded_image`] that
    /// produces more bytes than `max_bytes` returns
    /// `Err(DecodeFailure::StreamTooLarge { observed, limit })`.
    pub fn max_stream_bytes(mut self, max_bytes: u64) -> Self {
        // Clamp *after* the conversion: a request for exactly `u32::MAX` bytes
        // converts successfully and would otherwise be stored as the sentinel,
        // silently turning an explicit limit into "no limit".
        self.max_stream_bytes = u32::try_from(max_bytes)
            .unwrap_or(Self::NO_CAP)
            .min(Self::NO_CAP - 1);
        self
    }

    /// Configured traversal depth cap, or `None` when uncapped.
    pub(crate) fn object_depth_limit(self) -> Option<u32> {
        Self::cap(self.max_object_depth)
    }

    /// Configured image pixel cap, or `None` when uncapped.
    pub(crate) fn image_pixel_limit(self) -> Option<u32> {
        Self::cap(self.max_image_pixels)
    }

    /// Configured decoded-stream byte cap, or `None` when uncapped.
    pub(crate) fn stream_byte_limit(self) -> Option<u64> {
        Self::cap(self.max_stream_bytes).map(u64::from)
    }
}
151
/// An error that occurred while loading a PDF file.
///
/// Returned by [`Pdf::new`] and the other `Pdf::new_with_*` constructors.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum LoadPdfError {
    /// An error occurred while processing an encrypted document.
    Decryption(DecryptionError),
    /// The PDF was invalid or could not be parsed due to some other unknown reason.
    Invalid,
    /// The PDF exceeds a configured size limit (object count or page count).
    ///
    /// The first field is the number of xref objects; the second is the page
    /// count. Either or both may have triggered the limit. (#497)
    ///
    /// When the object limit is what tripped, the loader returns before the
    /// page tree is resolved and reports the page count as `0`.
    TooLarge(usize, usize),
}
165
166#[allow(clippy::len_without_is_empty)]
167impl Pdf {
168    /// Try to read the given PDF file.
169    ///
170    /// Returns `Err` if it was unable to read it.
171    pub fn new(data: impl Into<PdfData>) -> Result<Self, LoadPdfError> {
172        Self::new_with_password(data, "")
173    }
174
175    /// Try to read the given PDF file with parser load limits.
176    ///
177    /// Returns `Err` if it was unable to read it.
178    pub fn new_with_limits(
179        data: impl Into<PdfData>,
180        limits: PdfLoadLimits,
181    ) -> Result<Self, LoadPdfError> {
182        Self::new_with_password_and_limits(data, "", limits)
183    }
184
185    /// Try to read the given PDF file with a password.
186    ///
187    /// Returns `Err` if it was unable to read it or if the password is incorrect.
188    pub fn new_with_password(
189        data: impl Into<PdfData>,
190        password: &str,
191    ) -> Result<Self, LoadPdfError> {
192        Self::new_with_password_and_limits(data, password, PdfLoadLimits::default())
193    }
194
195    /// Try to read the given PDF file with a password and parser load limits.
196    ///
197    /// Returns `Err` if it was unable to read it or if the password is incorrect.
198    pub fn new_with_password_and_limits(
199        data: impl Into<PdfData>,
200        password: &str,
201        limits: PdfLoadLimits,
202    ) -> Result<Self, LoadPdfError> {
203        let data = data.into();
204        let password = password.as_bytes();
205        let version = find_version(data.as_ref()).unwrap_or(PdfVersion::Pdf10);
206        let mut xref_rebuilt = false;
207        let xref = match root_xref(data.clone(), password, limits) {
208            Ok(x) => x,
209            Err(e) => match e {
210                XRefError::Unknown => {
211                    // The xref table was invalid; rebuild it by scanning objects.
212                    xref_rebuilt = true;
213                    fallback(data.clone(), password, limits).ok_or(LoadPdfError::Invalid)?
214                }
215                XRefError::Encryption(e) => return Err(LoadPdfError::Decryption(e)),
216            },
217        };
218        let xref = Arc::new(xref);
219
220        // Reject documents whose xref table exceeds the object limit.
221        // This fires before we decode any object data, so the cost is minimal.
222        // The limit prevents unbounded memory growth on adversarially large PDFs. (#497)
223        let object_count = xref.len();
224        if object_count > MAX_OBJECTS {
225            return Err(LoadPdfError::TooLarge(object_count, 0));
226        }
227
228        let pages = CachedPages::new(xref.clone()).ok_or(LoadPdfError::Invalid)?;
229
230        // Reject documents whose page tree resolves to more pages than allowed.
231        // resolve_pages already caps traversal at MAX_PAGE_COUNT (100 K); checking
232        // against our stricter MAX_PAGES (50 K) here gives a clean error instead
233        // of silently truncating. (#497)
234        let page_count = pages.get().len();
235        if page_count > MAX_PAGES {
236            return Err(LoadPdfError::TooLarge(object_count, page_count));
237        }
238
239        let recovery = LoadRecovery {
240            xref_rebuilt,
241            page_tree_rebuilt: pages.page_tree_rebuilt(),
242        };
243
244        Ok(Self {
245            xref,
246            header_version: version,
247            pages,
248            data,
249            recovery,
250        })
251    }
252
253    /// Structural recovery applied while loading this document (xref / page-tree
254    /// rebuild). All-`false` for a document that parsed cleanly.
255    pub fn load_recovery(&self) -> LoadRecovery {
256        self.recovery
257    }
258
259    /// Return the number of objects present in the PDF file.
260    pub fn len(&self) -> usize {
261        self.xref.len()
262    }
263
264    /// Return an iterator over all objects defined in the PDF file.
265    pub fn objects(&self) -> impl IntoIterator<Item = Object<'_>> {
266        self.xref.objects()
267    }
268
269    /// Return the version of the PDF file.
270    pub fn version(&self) -> PdfVersion {
271        self.xref
272            .trailer_data()
273            .version
274            .unwrap_or(self.header_version)
275    }
276
277    /// Return the underlying data of the PDF file.
278    pub fn data(&self) -> &PdfData {
279        &self.data
280    }
281
282    /// Return the pages of the PDF file.
283    pub fn pages(&self) -> &Pages<'_> {
284        self.pages.get()
285    }
286
287    /// Return the xref of the PDF file.
288    pub fn xref(&self) -> &XRef {
289        &self.xref
290    }
291
292    /// Return the metadata in the document information dictionary of the document.
293    pub fn metadata(&self) -> &Metadata {
294        self.xref.metadata()
295    }
296}
297
298fn find_version(data: &[u8]) -> Option<PdfVersion> {
299    let data = &data[..data.len().min(2000)];
300    let mut r = Reader::new(data);
301
302    while r.forward_tag(b"%PDF-").is_none() {
303        r.read_byte()?;
304    }
305
306    PdfVersion::from_bytes(r.tail()?)
307}
308
/// The version of a PDF document.
///
/// Variants are declared in ascending order, so the derived ordering compares
/// versions from oldest to newest.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum PdfVersion {
    /// PDF 1.0.
    Pdf10,
    /// PDF 1.1.
    Pdf11,
    /// PDF 1.2.
    Pdf12,
    /// PDF 1.3.
    Pdf13,
    /// PDF 1.4.
    Pdf14,
    /// PDF 1.5.
    Pdf15,
    /// PDF 1.6.
    Pdf16,
    /// PDF 1.7.
    Pdf17,
    /// PDF 2.0.
    Pdf20,
}

impl PdfVersion {
    /// Parse a version from the first three bytes of `bytes` (e.g. `1.7`).
    ///
    /// Anything after the third byte is ignored. Returns `None` when fewer
    /// than three bytes are available or the prefix is not a known version.
    pub(crate) fn from_bytes(bytes: &[u8]) -> Option<Self> {
        let version = match bytes {
            [b'1', b'.', minor, ..] => match minor {
                b'0' => Self::Pdf10,
                b'1' => Self::Pdf11,
                b'2' => Self::Pdf12,
                b'3' => Self::Pdf13,
                b'4' => Self::Pdf14,
                b'5' => Self::Pdf15,
                b'6' => Self::Pdf16,
                b'7' => Self::Pdf17,
                _ => return None,
            },
            [b'2', b'.', b'0', ..] => Self::Pdf20,
            _ => return None,
        };

        Some(version)
    }
}
348
#[cfg(test)]
mod tests {
    use crate::pdf::{Pdf, PdfVersion};

    // Loading an empty buffer: the result is deliberately discarded, so this
    // test can only fail if the loader panics. Presumably a regression test
    // for issue #49 — confirm against the issue tracker.
    #[test]
    fn issue_49() {
        let _ = Pdf::new(Vec::new());
    }

    // Expects `Pdf::version` to report 1.7 for this corpus file. Judging by
    // the test name the version comes from the file header — TODO confirm.
    #[test]
    #[ignore = "requires hayro-tests corpus"]
    fn pdf_version_header() {
        let data = std::fs::read("../hayro-tests/downloads/pdfjs/alphatrans.pdf").unwrap();
        let pdf = Pdf::new(data).unwrap();

        assert_eq!(pdf.version(), PdfVersion::Pdf17);
    }

    // Expects `Pdf::version` to report 1.4 for this corpus file. Judging by
    // the test name the version comes from the document catalog rather than
    // the header — TODO confirm.
    #[test]
    #[ignore = "requires hayro-tests corpus"]
    fn pdf_version_catalog() {
        let data = std::fs::read("../hayro-tests/downloads/pdfbox/2163.pdf").unwrap();
        let pdf = Pdf::new(data).unwrap();

        assert_eq!(pdf.version(), PdfVersion::Pdf14);
    }
}