pdf_syntax/pdf.rs
1//! The starting point for reading PDF files.
2
3use crate::PdfData;
4use crate::object::Object;
5use crate::page::Pages;
6use crate::page::cached::CachedPages;
7use crate::reader::Reader;
8use crate::sync::Arc;
9use crate::xref::{XRef, XRefError, fallback, root_xref};
10
11pub use crate::crypto::DecryptionError;
12use crate::metadata::Metadata;
13
/// A PDF file.
///
/// Holds the parsed cross-reference table, the resolved page cache and the
/// raw bytes the document was loaded from.
// NOTE(review): field order determines drop order; `pages` is built from a
// clone of `xref`, so keep the declaration order unless that coupling is
// confirmed to be irrelevant.
pub struct Pdf {
    /// Shared cross-reference table; a clone of this handle is given to the
    /// page cache at load time.
    xref: Arc<XRef>,
    /// Version found in the `%PDF-` header (falls back to PDF 1.0 when no
    /// header version could be read). Used by `version()` when the trailer
    /// data carries no version of its own.
    header_version: PdfVersion,
    /// Page cache constructed from the xref while loading.
    pages: CachedPages,
    /// The underlying document bytes.
    data: PdfData,
}
21
/// Upper bound on the number of xref entries (indirect objects) a single PDF
/// may declare.
///
/// Loading fails with [`LoadPdfError::TooLarge`] when the cross-reference
/// table holds more entries than this, which keeps memory use bounded on
/// adversarially large inputs. Corpus data shows legitimate documents rarely
/// exceed 50 K objects, so 500 K is a safe, generous ceiling. (#497)
pub const MAX_OBJECTS: usize = 500_000;
28
/// Upper bound on the number of pages a single PDF may contain.
///
/// After the page tree has been resolved, the loader compares the resulting
/// page count against this constant and rejects the document with
/// [`LoadPdfError::TooLarge`] when the count is larger. (#497)
pub const MAX_PAGES: usize = 50_000;
34
/// Parser-internal limits applied while loading a PDF.
///
/// The default preserves the historical parser behavior (no caps). Callers
/// that need stricter limits can set individual caps before loading.
///
/// Each limit is stored as a `u32` in which `u32::MAX` is the "no cap"
/// sentinel; the `*_limit` accessors translate that sentinel into `None`.
#[derive(Debug, Clone, Copy)]
pub struct PdfLoadLimits {
    /// Maximum page-tree/object traversal depth.
    ///
    /// `u32::MAX` is the "no cap" sentinel (same pattern as `max_image_pixels`).
    max_object_depth: u32,
    /// Maximum decoded image pixel count.
    ///
    /// `u32::MAX` is the "no cap" sentinel.
    max_image_pixels: u32,
    /// Maximum decoded stream size in bytes.
    ///
    /// `u32::MAX` is the "no cap" sentinel (same pattern as `max_image_pixels`).
    /// When set below `u32::MAX`, `Stream::decoded()` / `Stream::decoded_image()`
    /// return `Err(DecodeFailure::StreamTooLarge { .. })` if the decoded payload
    /// exceeds this threshold. The raw (compressed) bytes are not checked;
    /// only the fully-decoded output is.
    ///
    /// Stored as `u32` (max ~4 GB) to keep `PdfLoadLimits` at 12 bytes, which
    /// prevents it from inflating `Array<'a>` and therefore `Object<'a>`.
    /// Caller-supplied `u64` values are clamped to `u32::MAX - 1` on conversion.
    max_stream_bytes: u32,
}

impl Default for PdfLoadLimits {
    /// Build a limit set with every cap disabled.
    fn default() -> Self {
        // `u32::MAX` is the "no cap" sentinel for all three limits. A derived
        // `Default` would produce 0 everywhere, which would reject every
        // image / stream and silently break rendering.
        Self {
            max_object_depth: Self::NO_CAP,
            max_image_pixels: Self::NO_CAP,
            max_stream_bytes: Self::NO_CAP,
        }
    }
}

impl PdfLoadLimits {
    /// Stored value meaning "this limit is disabled".
    const NO_CAP: u32 = u32::MAX;

    /// Largest stored value that is still treated as a real limit.
    const LARGEST_EXPLICIT_CAP: u32 = u32::MAX - 1;

    /// Create a limit set with no caller overrides.
    pub fn new() -> Self {
        Self::default()
    }

    /// Set the maximum page-tree/object traversal depth.
    ///
    /// Passing `u32::MAX` leaves the depth uncapped, because that value is the
    /// "no cap" sentinel.
    pub fn max_object_depth(mut self, depth: u32) -> Self {
        self.max_object_depth = depth;
        self
    }

    /// Set the maximum decoded image pixel count.
    ///
    /// Values of `u32::MAX` and above cannot be told apart from the "no cap"
    /// sentinel in the `u32` storage, so they disable the cap.
    pub fn max_image_pixels(mut self, pixels: u64) -> Self {
        self.max_image_pixels = u32::try_from(pixels).unwrap_or(Self::NO_CAP);
        self
    }

    /// Set the maximum decoded stream size in bytes.
    ///
    /// Values of `u32::MAX` and above are clamped to `u32::MAX - 1`
    /// (≈ 4 GB − 1) due to internal storage as `u32`, so an explicit limit
    /// never turns into "no limit". For the intended use-case
    /// (decompression-bomb protection at 32 MB–1 GB), the 4 GB ceiling is more
    /// than sufficient.
    ///
    /// Any call to [`Stream::decoded`] or [`Stream::decoded_image`] that
    /// produces more bytes than `max_bytes` returns
    /// `Err(DecodeFailure::StreamTooLarge { observed, limit })`.
    pub fn max_stream_bytes(mut self, max_bytes: u64) -> Self {
        // Clamp *after* the conversion as well: `u32::MAX` itself converts
        // successfully but is the "no limit" sentinel, so without the `min`
        // a request for exactly `u32::MAX` bytes would remove the cap.
        self.max_stream_bytes = u32::try_from(max_bytes)
            .map_or(Self::LARGEST_EXPLICIT_CAP, |bytes| {
                bytes.min(Self::LARGEST_EXPLICIT_CAP)
            });
        self
    }

    /// Traversal depth cap, or `None` when the depth is uncapped.
    pub(crate) fn object_depth_limit(self) -> Option<u32> {
        (self.max_object_depth != Self::NO_CAP).then_some(self.max_object_depth)
    }

    /// Image pixel cap, or `None` when the pixel count is uncapped.
    pub(crate) fn image_pixel_limit(self) -> Option<u32> {
        (self.max_image_pixels != Self::NO_CAP).then_some(self.max_image_pixels)
    }

    /// Decoded stream size cap in bytes, or `None` when it is uncapped.
    pub(crate) fn stream_byte_limit(self) -> Option<u64> {
        (self.max_stream_bytes != Self::NO_CAP).then_some(u64::from(self.max_stream_bytes))
    }
}
129
130/// An error that occurred while loading a PDF file.
131#[derive(Debug, Copy, Clone, PartialEq, Eq)]
132pub enum LoadPdfError {
133 /// An error occurred while processing an encrypted document.
134 Decryption(DecryptionError),
135 /// The PDF was invalid or could not be parsed due to some other unknown reason.
136 Invalid,
137 /// The PDF exceeds a configured size limit (object count or page count).
138 ///
139 /// The first field is the number of xref objects; the second is the page
140 /// count. Either or both may have triggered the limit. (#497)
141 TooLarge(usize, usize),
142}
143
144#[allow(clippy::len_without_is_empty)]
145impl Pdf {
146 /// Try to read the given PDF file.
147 ///
148 /// Returns `Err` if it was unable to read it.
149 pub fn new(data: impl Into<PdfData>) -> Result<Self, LoadPdfError> {
150 Self::new_with_password(data, "")
151 }
152
153 /// Try to read the given PDF file with parser load limits.
154 ///
155 /// Returns `Err` if it was unable to read it.
156 pub fn new_with_limits(
157 data: impl Into<PdfData>,
158 limits: PdfLoadLimits,
159 ) -> Result<Self, LoadPdfError> {
160 Self::new_with_password_and_limits(data, "", limits)
161 }
162
163 /// Try to read the given PDF file with a password.
164 ///
165 /// Returns `Err` if it was unable to read it or if the password is incorrect.
166 pub fn new_with_password(
167 data: impl Into<PdfData>,
168 password: &str,
169 ) -> Result<Self, LoadPdfError> {
170 Self::new_with_password_and_limits(data, password, PdfLoadLimits::default())
171 }
172
173 /// Try to read the given PDF file with a password and parser load limits.
174 ///
175 /// Returns `Err` if it was unable to read it or if the password is incorrect.
176 pub fn new_with_password_and_limits(
177 data: impl Into<PdfData>,
178 password: &str,
179 limits: PdfLoadLimits,
180 ) -> Result<Self, LoadPdfError> {
181 let data = data.into();
182 let password = password.as_bytes();
183 let version = find_version(data.as_ref()).unwrap_or(PdfVersion::Pdf10);
184 let xref = match root_xref(data.clone(), password, limits) {
185 Ok(x) => x,
186 Err(e) => match e {
187 XRefError::Unknown => {
188 fallback(data.clone(), password, limits).ok_or(LoadPdfError::Invalid)?
189 }
190 XRefError::Encryption(e) => return Err(LoadPdfError::Decryption(e)),
191 },
192 };
193 let xref = Arc::new(xref);
194
195 // Reject documents whose xref table exceeds the object limit.
196 // This fires before we decode any object data, so the cost is minimal.
197 // The limit prevents unbounded memory growth on adversarially large PDFs. (#497)
198 let object_count = xref.len();
199 if object_count > MAX_OBJECTS {
200 return Err(LoadPdfError::TooLarge(object_count, 0));
201 }
202
203 let pages = CachedPages::new(xref.clone()).ok_or(LoadPdfError::Invalid)?;
204
205 // Reject documents whose page tree resolves to more pages than allowed.
206 // resolve_pages already caps traversal at MAX_PAGE_COUNT (100 K); checking
207 // against our stricter MAX_PAGES (50 K) here gives a clean error instead
208 // of silently truncating. (#497)
209 let page_count = pages.get().len();
210 if page_count > MAX_PAGES {
211 return Err(LoadPdfError::TooLarge(object_count, page_count));
212 }
213
214 Ok(Self {
215 xref,
216 header_version: version,
217 pages,
218 data,
219 })
220 }
221
222 /// Return the number of objects present in the PDF file.
223 pub fn len(&self) -> usize {
224 self.xref.len()
225 }
226
227 /// Return an iterator over all objects defined in the PDF file.
228 pub fn objects(&self) -> impl IntoIterator<Item = Object<'_>> {
229 self.xref.objects()
230 }
231
232 /// Return the version of the PDF file.
233 pub fn version(&self) -> PdfVersion {
234 self.xref
235 .trailer_data()
236 .version
237 .unwrap_or(self.header_version)
238 }
239
240 /// Return the underlying data of the PDF file.
241 pub fn data(&self) -> &PdfData {
242 &self.data
243 }
244
245 /// Return the pages of the PDF file.
246 pub fn pages(&self) -> &Pages<'_> {
247 self.pages.get()
248 }
249
250 /// Return the xref of the PDF file.
251 pub fn xref(&self) -> &XRef {
252 &self.xref
253 }
254
255 /// Return the metadata in the document information dictionary of the document.
256 pub fn metadata(&self) -> &Metadata {
257 self.xref.metadata()
258 }
259}
260
261fn find_version(data: &[u8]) -> Option<PdfVersion> {
262 let data = &data[..data.len().min(2000)];
263 let mut r = Reader::new(data);
264
265 while r.forward_tag(b"%PDF-").is_none() {
266 r.read_byte()?;
267 }
268
269 PdfVersion::from_bytes(r.tail()?)
270}
271
/// The version of a PDF document.
///
/// Variants are declared from oldest to newest, so the derived ordering
/// compares versions chronologically.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum PdfVersion {
    /// PDF 1.0.
    Pdf10,
    /// PDF 1.1.
    Pdf11,
    /// PDF 1.2.
    Pdf12,
    /// PDF 1.3.
    Pdf13,
    /// PDF 1.4.
    Pdf14,
    /// PDF 1.5.
    Pdf15,
    /// PDF 1.6.
    Pdf16,
    /// PDF 1.7.
    Pdf17,
    /// PDF 2.0.
    Pdf20,
}

impl PdfVersion {
    /// Parse a version from the first three bytes of `bytes`
    /// (`<major>.<minor>`); anything after them is ignored.
    ///
    /// Returns `None` when fewer than three bytes are available or the
    /// prefix is not one of the known versions.
    pub(crate) fn from_bytes(bytes: &[u8]) -> Option<Self> {
        let version = match bytes {
            [b'1', b'.', minor, ..] => match *minor {
                b'0' => Self::Pdf10,
                b'1' => Self::Pdf11,
                b'2' => Self::Pdf12,
                b'3' => Self::Pdf13,
                b'4' => Self::Pdf14,
                b'5' => Self::Pdf15,
                b'6' => Self::Pdf16,
                b'7' => Self::Pdf17,
                _ => return None,
            },
            [b'2', b'.', b'0', ..] => Self::Pdf20,
            _ => return None,
        };

        Some(version)
    }
}
311
#[cfg(test)]
mod tests {
    use crate::pdf::{Pdf, PdfVersion};

    /// Read the file at `path` and load it as a PDF, panicking on any failure.
    fn load_corpus_pdf(path: &str) -> Pdf {
        let bytes = std::fs::read(path).unwrap();
        Pdf::new(bytes).unwrap()
    }

    /// Loading an empty buffer must not panic; the result itself is irrelevant.
    #[test]
    fn issue_49() {
        let _ = Pdf::new(Vec::new());
    }

    /// This corpus file is expected to report PDF 1.7.
    #[test]
    #[ignore = "requires hayro-tests corpus"]
    fn pdf_version_header() {
        let doc = load_corpus_pdf("../hayro-tests/downloads/pdfjs/alphatrans.pdf");

        assert_eq!(doc.version(), PdfVersion::Pdf17);
    }

    /// This corpus file is expected to report PDF 1.4.
    #[test]
    #[ignore = "requires hayro-tests corpus"]
    fn pdf_version_catalog() {
        let doc = load_corpus_pdf("../hayro-tests/downloads/pdfbox/2163.pdf");

        assert_eq!(doc.version(), PdfVersion::Pdf14);
    }
}