pdf_syntax/pdf.rs
1//! The starting point for reading PDF files.
2
3use crate::PdfData;
4use crate::object::Object;
5use crate::page::Pages;
6use crate::page::cached::CachedPages;
7use crate::reader::Reader;
8use crate::sync::Arc;
9use crate::xref::{XRef, XRefError, fallback, root_xref};
10
11pub use crate::crypto::DecryptionError;
12use crate::metadata::Metadata;
13
/// Structural recovery that occurred while loading a [`Pdf`].
///
/// Recovery is always attempted automatically; these flags let a caller learn
/// that it happened, so a repaired document is distinguishable from a clean
/// one. They never change the (always-on) recovery behaviour.
///
/// The derived `Default` value has every flag set to `false`, i.e. it
/// describes a load in which no recovery took place.
///
/// Marked `#[non_exhaustive]` so future recovery categories (e.g. an
/// object-stream rebuild flag) can be added without a breaking change.
/// Downstream crates read fields by name; they must not construct or
/// exhaustively destructure this struct.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
#[non_exhaustive]
pub struct LoadRecovery {
    /// The cross-reference table was invalid and rebuilt by scanning the file
    /// for objects. Object recovery may be incomplete.
    pub xref_rebuilt: bool,
    /// The page tree was invalid and pages were recovered by a brute-force
    /// scan. Page order may differ from the source.
    pub page_tree_rebuilt: bool,
}
34
/// A PDF file.
///
/// Created through [`Pdf::new`] and its `new_with_*` variants.
pub struct Pdf {
    // Cross-reference table; a clone of this `Arc` is handed to `pages`.
    xref: Arc<XRef>,
    // Version parsed from the `%PDF-` header (`Pdf10` if none was found).
    // Only used when the xref's trailer data does not carry a version.
    header_version: PdfVersion,
    // Pages built from `xref` at load time.
    pages: CachedPages,
    // The data the document was loaded from.
    data: PdfData,
    // Which structural repairs were needed while loading.
    recovery: LoadRecovery,
}
43
/// Maximum number of xref entries (indirect objects) allowed in a single PDF.
///
/// PDFs exceeding this limit are rejected with [`LoadPdfError::TooLarge`] to
/// prevent unbounded memory growth. Corpus data shows legitimate documents
/// rarely exceed 50 K objects; 500 K is a safe, generous upper bound. (#497)
///
/// The comparison is strict: a document with exactly `MAX_OBJECTS` entries
/// is still accepted.
pub const MAX_OBJECTS: usize = 500_000;

/// Maximum number of pages allowed in a single PDF.
///
/// Once the page tree has been resolved, documents whose page count exceeds
/// this value are rejected with [`LoadPdfError::TooLarge`]. The comparison is
/// strict: a document with exactly `MAX_PAGES` pages is still accepted. (#497)
pub const MAX_PAGES: usize = 50_000;
56
/// Parser-internal limits applied while loading a PDF.
///
/// The default preserves the historical parser behavior (no caps). Callers
/// that need stricter limits can set individual caps before loading.
///
/// Every limit is stored as a `u32` in which `u32::MAX` is reserved as the
/// "no cap" sentinel; the crate-internal `*_limit` accessors translate that
/// sentinel into `None`.
#[derive(Debug, Clone, Copy)]
pub struct PdfLoadLimits {
    /// `u32::MAX` is the "no cap" sentinel (same pattern as `max_image_pixels`).
    max_object_depth: u32,
    /// `u32::MAX` is the "no cap" sentinel.
    max_image_pixels: u32,
    /// Maximum decoded stream size in bytes.
    ///
    /// `u32::MAX` is the "no cap" sentinel (same pattern as `max_image_pixels`).
    /// When set below `u32::MAX`, `Stream::decoded()` / `Stream::decoded_image()`
    /// return `Err(DecodeFailure::StreamTooLarge { .. })` if the decoded payload
    /// exceeds this threshold. The raw (compressed) bytes are not checked;
    /// only the fully-decoded output is.
    ///
    /// Stored as `u32` (max ~4 GB) to keep `PdfLoadLimits` at 12 bytes, which
    /// prevents it from inflating `Array<'a>` and therefore `Object<'a>`.
    /// Caller-supplied `u64` values are clamped to `u32::MAX - 1` on conversion.
    max_stream_bytes: u32,
}

impl Default for PdfLoadLimits {
    fn default() -> Self {
        // `u32::MAX` is the "no cap" sentinel for all three limits. A raw
        // `Default::default()` would produce 0 everywhere, which would reject
        // every image / stream and silently break rendering.
        Self {
            max_object_depth: u32::MAX,
            max_image_pixels: u32::MAX,
            max_stream_bytes: u32::MAX,
        }
    }
}

impl PdfLoadLimits {
    /// Create a limit set with no caller overrides.
    pub fn new() -> Self {
        Self::default()
    }

    /// Set the maximum page-tree/object traversal depth.
    ///
    /// Passing `u32::MAX` selects the "no cap" sentinel, i.e. removes the limit.
    pub fn max_object_depth(mut self, depth: u32) -> Self {
        self.max_object_depth = depth;
        self
    }

    /// Set the maximum decoded image pixel count.
    ///
    /// The value is stored as a `u32`. Anything that does not fit (and
    /// `u32::MAX` itself) ends up as the "no cap" sentinel, so such values
    /// remove the limit rather than tighten it.
    pub fn max_image_pixels(mut self, pixels: u64) -> Self {
        self.max_image_pixels = u32::try_from(pixels).unwrap_or(u32::MAX);
        self
    }

    /// Set the maximum decoded stream size in bytes.
    ///
    /// Values of `u32::MAX` and above are clamped to `u32::MAX - 1`
    /// (≈ 4 GB − 1) due to internal storage as `u32`. For the intended
    /// use-case (decompression-bomb protection at 32 MB–1 GB), the 4 GB
    /// ceiling is more than sufficient. Calling this method therefore always
    /// installs a real cap; it can never select the "no limit" sentinel.
    ///
    /// Any call to [`Stream::decoded`] or [`Stream::decoded_image`] that
    /// produces more bytes than `max_bytes` returns
    /// `Err(DecodeFailure::StreamTooLarge { observed, limit })`.
    pub fn max_stream_bytes(mut self, max_bytes: u64) -> Self {
        // `u32::MAX` is reserved for "no limit". The clamp has to cover the
        // value that converts cleanly to exactly `u32::MAX` as well as the
        // ones that overflow, otherwise that single input would silently
        // disable the cap the caller asked for.
        const LARGEST_CAP: u32 = u32::MAX - 1;
        self.max_stream_bytes =
            u32::try_from(max_bytes).map_or(LARGEST_CAP, |bytes| bytes.min(LARGEST_CAP));
        self
    }

    /// Traversal depth cap, or `None` when the sentinel (no cap) is stored.
    pub(crate) fn object_depth_limit(self) -> Option<u32> {
        (self.max_object_depth != u32::MAX).then_some(self.max_object_depth)
    }

    /// Image pixel cap, or `None` when the sentinel (no cap) is stored.
    pub(crate) fn image_pixel_limit(self) -> Option<u32> {
        (self.max_image_pixels != u32::MAX).then_some(self.max_image_pixels)
    }

    /// Decoded stream size cap in bytes, or `None` when the sentinel (no cap)
    /// is stored.
    pub(crate) fn stream_byte_limit(self) -> Option<u64> {
        (self.max_stream_bytes != u32::MAX).then_some(u64::from(self.max_stream_bytes))
    }
}
151
/// An error that occurred while loading a PDF file.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum LoadPdfError {
    /// An error occurred while processing an encrypted document.
    Decryption(DecryptionError),
    /// The PDF was invalid or could not be parsed due to some other unknown reason.
    Invalid,
    /// The PDF exceeds a built-in size limit ([`MAX_OBJECTS`] or [`MAX_PAGES`]).
    ///
    /// The first field is the number of xref objects; the second is the page
    /// count. The object limit is checked before the page tree is resolved,
    /// so when it is the object count that tripped the limit the page count
    /// is reported as `0`. (#497)
    TooLarge(usize, usize),
}
165
166#[allow(clippy::len_without_is_empty)]
167impl Pdf {
168 /// Try to read the given PDF file.
169 ///
170 /// Returns `Err` if it was unable to read it.
171 pub fn new(data: impl Into<PdfData>) -> Result<Self, LoadPdfError> {
172 Self::new_with_password(data, "")
173 }
174
175 /// Try to read the given PDF file with parser load limits.
176 ///
177 /// Returns `Err` if it was unable to read it.
178 pub fn new_with_limits(
179 data: impl Into<PdfData>,
180 limits: PdfLoadLimits,
181 ) -> Result<Self, LoadPdfError> {
182 Self::new_with_password_and_limits(data, "", limits)
183 }
184
185 /// Try to read the given PDF file with a password.
186 ///
187 /// Returns `Err` if it was unable to read it or if the password is incorrect.
188 pub fn new_with_password(
189 data: impl Into<PdfData>,
190 password: &str,
191 ) -> Result<Self, LoadPdfError> {
192 Self::new_with_password_and_limits(data, password, PdfLoadLimits::default())
193 }
194
195 /// Try to read the given PDF file with a password and parser load limits.
196 ///
197 /// Returns `Err` if it was unable to read it or if the password is incorrect.
198 pub fn new_with_password_and_limits(
199 data: impl Into<PdfData>,
200 password: &str,
201 limits: PdfLoadLimits,
202 ) -> Result<Self, LoadPdfError> {
203 let data = data.into();
204 let password = password.as_bytes();
205 let version = find_version(data.as_ref()).unwrap_or(PdfVersion::Pdf10);
206 let mut xref_rebuilt = false;
207 let xref = match root_xref(data.clone(), password, limits) {
208 Ok(x) => x,
209 Err(e) => match e {
210 XRefError::Unknown => {
211 // The xref table was invalid; rebuild it by scanning objects.
212 xref_rebuilt = true;
213 fallback(data.clone(), password, limits).ok_or(LoadPdfError::Invalid)?
214 }
215 XRefError::Encryption(e) => return Err(LoadPdfError::Decryption(e)),
216 },
217 };
218 let xref = Arc::new(xref);
219
220 // Reject documents whose xref table exceeds the object limit.
221 // This fires before we decode any object data, so the cost is minimal.
222 // The limit prevents unbounded memory growth on adversarially large PDFs. (#497)
223 let object_count = xref.len();
224 if object_count > MAX_OBJECTS {
225 return Err(LoadPdfError::TooLarge(object_count, 0));
226 }
227
228 let pages = CachedPages::new(xref.clone()).ok_or(LoadPdfError::Invalid)?;
229
230 // Reject documents whose page tree resolves to more pages than allowed.
231 // resolve_pages already caps traversal at MAX_PAGE_COUNT (100 K); checking
232 // against our stricter MAX_PAGES (50 K) here gives a clean error instead
233 // of silently truncating. (#497)
234 let page_count = pages.get().len();
235 if page_count > MAX_PAGES {
236 return Err(LoadPdfError::TooLarge(object_count, page_count));
237 }
238
239 let recovery = LoadRecovery {
240 xref_rebuilt,
241 page_tree_rebuilt: pages.page_tree_rebuilt(),
242 };
243
244 Ok(Self {
245 xref,
246 header_version: version,
247 pages,
248 data,
249 recovery,
250 })
251 }
252
253 /// Structural recovery applied while loading this document (xref / page-tree
254 /// rebuild). All-`false` for a document that parsed cleanly.
255 pub fn load_recovery(&self) -> LoadRecovery {
256 self.recovery
257 }
258
259 /// Return the number of objects present in the PDF file.
260 pub fn len(&self) -> usize {
261 self.xref.len()
262 }
263
264 /// Return an iterator over all objects defined in the PDF file.
265 pub fn objects(&self) -> impl IntoIterator<Item = Object<'_>> {
266 self.xref.objects()
267 }
268
269 /// Return the version of the PDF file.
270 pub fn version(&self) -> PdfVersion {
271 self.xref
272 .trailer_data()
273 .version
274 .unwrap_or(self.header_version)
275 }
276
277 /// Return the underlying data of the PDF file.
278 pub fn data(&self) -> &PdfData {
279 &self.data
280 }
281
282 /// Return the pages of the PDF file.
283 pub fn pages(&self) -> &Pages<'_> {
284 self.pages.get()
285 }
286
287 /// Return the xref of the PDF file.
288 pub fn xref(&self) -> &XRef {
289 &self.xref
290 }
291
292 /// Return the metadata in the document information dictionary of the document.
293 pub fn metadata(&self) -> &Metadata {
294 self.xref.metadata()
295 }
296}
297
298fn find_version(data: &[u8]) -> Option<PdfVersion> {
299 let data = &data[..data.len().min(2000)];
300 let mut r = Reader::new(data);
301
302 while r.forward_tag(b"%PDF-").is_none() {
303 r.read_byte()?;
304 }
305
306 PdfVersion::from_bytes(r.tail()?)
307}
308
/// The version of a PDF document.
///
/// Variants are declared in ascending order, so the derived `Ord` compares
/// versions chronologically (e.g. `Pdf14 < Pdf20`).
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum PdfVersion {
    /// PDF 1.0.
    Pdf10,
    /// PDF 1.1.
    Pdf11,
    /// PDF 1.2.
    Pdf12,
    /// PDF 1.3.
    Pdf13,
    /// PDF 1.4.
    Pdf14,
    /// PDF 1.5.
    Pdf15,
    /// PDF 1.6.
    Pdf16,
    /// PDF 1.7.
    Pdf17,
    /// PDF 2.0.
    Pdf20,
}

impl PdfVersion {
    /// Parse a version from the first three bytes of `bytes` (e.g. `1.7`).
    ///
    /// Anything after the third byte is ignored. Returns `None` if fewer
    /// than three bytes are available or the prefix is not one of the known
    /// versions `1.0`–`1.7` or `2.0`.
    pub(crate) fn from_bytes(bytes: &[u8]) -> Option<Self> {
        match bytes {
            // All 1.x versions share the `1.` prefix; dispatch on the minor digit.
            [b'1', b'.', minor, ..] => match *minor {
                b'0' => Some(Self::Pdf10),
                b'1' => Some(Self::Pdf11),
                b'2' => Some(Self::Pdf12),
                b'3' => Some(Self::Pdf13),
                b'4' => Some(Self::Pdf14),
                b'5' => Some(Self::Pdf15),
                b'6' => Some(Self::Pdf16),
                b'7' => Some(Self::Pdf17),
                _ => None,
            },
            [b'2', b'.', b'0', ..] => Some(Self::Pdf20),
            // Too short, or an unknown major/minor combination.
            _ => None,
        }
    }
}
348
#[cfg(test)]
mod tests {
    use crate::pdf::{Pdf, PdfVersion};

    /// Loading an empty buffer must not panic; the result itself is
    /// deliberately discarded.
    #[test]
    fn issue_49() {
        let _ = Pdf::new(Vec::new());
    }

    /// Corpus file expected to report version 1.7.
    /// NOTE(review): going by the test name, the version presumably comes
    /// from the file header here — confirm against the fixture.
    #[test]
    #[ignore = "requires hayro-tests corpus"]
    fn pdf_version_header() {
        let data = std::fs::read("../hayro-tests/downloads/pdfjs/alphatrans.pdf").unwrap();
        let pdf = Pdf::new(data).unwrap();

        assert_eq!(pdf.version(), PdfVersion::Pdf17);
    }

    /// Corpus file expected to report version 1.4.
    /// NOTE(review): going by the test name, the version presumably comes
    /// from the document catalog rather than the header — confirm against
    /// the fixture.
    #[test]
    #[ignore = "requires hayro-tests corpus"]
    fn pdf_version_catalog() {
        let data = std::fs::read("../hayro-tests/downloads/pdfbox/2163.pdf").unwrap();
        let pdf = Pdf::new(data).unwrap();

        assert_eq!(pdf.version(), PdfVersion::Pdf14);
    }
}