// pdf_syntax/pdf.rs

//! The starting point for reading PDF files.
2
use crate::PdfData;
use crate::metadata::Metadata;
use crate::object::Object;
use crate::page::Pages;
use crate::page::cached::CachedPages;
use crate::reader::Reader;
use crate::sync::Arc;
use crate::xref::{XRef, XRefError, fallback, root_xref};

pub use crate::crypto::DecryptionError;
13
14/// A PDF file.
15pub struct Pdf {
16    xref: Arc<XRef>,
17    header_version: PdfVersion,
18    pages: CachedPages,
19    data: PdfData,
20}
21
/// Maximum number of xref entries (indirect objects) allowed in a single PDF.
///
/// PDFs exceeding this limit are rejected with [`LoadPdfError::TooLarge`] to
/// prevent unbounded memory growth. Corpus data shows legitimate documents
/// rarely exceed 50 K objects; 500 K is a safe, generous upper bound. (#497)
pub const MAX_OBJECTS: usize = 500_000;
28
/// Maximum number of pages allowed in a single PDF.
///
/// Documents whose resolved page list is longer than this are rejected with
/// [`LoadPdfError::TooLarge`]. The comparison is made in
/// [`Pdf::new_with_password`] once the pages have been resolved, so an
/// oversized document produces an error rather than a shortened page list.
/// (#497)
pub const MAX_PAGES: usize = 50_000;
34
35/// An error that occurred while loading a PDF file.
36#[derive(Debug, Copy, Clone, PartialEq, Eq)]
37pub enum LoadPdfError {
38    /// An error occurred while processing an encrypted document.
39    Decryption(DecryptionError),
40    /// The PDF was invalid or could not be parsed due to some other unknown reason.
41    Invalid,
42    /// The PDF exceeds a configured size limit (object count or page count).
43    ///
44    /// The first field is the number of xref objects; the second is the page
45    /// count. Either or both may have triggered the limit. (#497)
46    TooLarge(usize, usize),
47}
48
49#[allow(clippy::len_without_is_empty)]
50impl Pdf {
51    /// Try to read the given PDF file.
52    ///
53    /// Returns `Err` if it was unable to read it.
54    pub fn new(data: impl Into<PdfData>) -> Result<Self, LoadPdfError> {
55        Self::new_with_password(data, "")
56    }
57
58    /// Try to read the given PDF file with a password.
59    ///
60    /// Returns `Err` if it was unable to read it or if the password is incorrect.
61    pub fn new_with_password(
62        data: impl Into<PdfData>,
63        password: &str,
64    ) -> Result<Self, LoadPdfError> {
65        let data = data.into();
66        let password = password.as_bytes();
67        let version = find_version(data.as_ref()).unwrap_or(PdfVersion::Pdf10);
68        let xref = match root_xref(data.clone(), password) {
69            Ok(x) => x,
70            Err(e) => match e {
71                XRefError::Unknown => {
72                    fallback(data.clone(), password).ok_or(LoadPdfError::Invalid)?
73                }
74                XRefError::Encryption(e) => return Err(LoadPdfError::Decryption(e)),
75            },
76        };
77        let xref = Arc::new(xref);
78
79        // Reject documents whose xref table exceeds the object limit.
80        // This fires before we decode any object data, so the cost is minimal.
81        // The limit prevents unbounded memory growth on adversarially large PDFs. (#497)
82        let object_count = xref.len();
83        if object_count > MAX_OBJECTS {
84            return Err(LoadPdfError::TooLarge(object_count, 0));
85        }
86
87        let pages = CachedPages::new(xref.clone()).ok_or(LoadPdfError::Invalid)?;
88
89        // Reject documents whose page tree resolves to more pages than allowed.
90        // resolve_pages already caps traversal at MAX_PAGE_COUNT (100 K); checking
91        // against our stricter MAX_PAGES (50 K) here gives a clean error instead
92        // of silently truncating. (#497)
93        let page_count = pages.get().len();
94        if page_count > MAX_PAGES {
95            return Err(LoadPdfError::TooLarge(object_count, page_count));
96        }
97
98        Ok(Self {
99            xref,
100            header_version: version,
101            pages,
102            data,
103        })
104    }
105
106    /// Return the number of objects present in the PDF file.
107    pub fn len(&self) -> usize {
108        self.xref.len()
109    }
110
111    /// Return an iterator over all objects defined in the PDF file.
112    pub fn objects(&self) -> impl IntoIterator<Item = Object<'_>> {
113        self.xref.objects()
114    }
115
116    /// Return the version of the PDF file.
117    pub fn version(&self) -> PdfVersion {
118        self.xref
119            .trailer_data()
120            .version
121            .unwrap_or(self.header_version)
122    }
123
124    /// Return the underlying data of the PDF file.
125    pub fn data(&self) -> &PdfData {
126        &self.data
127    }
128
129    /// Return the pages of the PDF file.
130    pub fn pages(&self) -> &Pages<'_> {
131        self.pages.get()
132    }
133
134    /// Return the xref of the PDF file.
135    pub fn xref(&self) -> &XRef {
136        &self.xref
137    }
138
139    /// Return the metadata in the document information dictionary of the document.
140    pub fn metadata(&self) -> &Metadata {
141        self.xref.metadata()
142    }
143}
144
145fn find_version(data: &[u8]) -> Option<PdfVersion> {
146    let data = &data[..data.len().min(2000)];
147    let mut r = Reader::new(data);
148
149    while r.forward_tag(b"%PDF-").is_none() {
150        r.read_byte()?;
151    }
152
153    PdfVersion::from_bytes(r.tail()?)
154}
155
/// The version of a PDF document.
///
/// Variants are declared from oldest to newest, so the derived ordering
/// compares versions chronologically (`Pdf10 < Pdf11 < … < Pdf20`).
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum PdfVersion {
    /// PDF 1.0.
    Pdf10,
    /// PDF 1.1.
    Pdf11,
    /// PDF 1.2.
    Pdf12,
    /// PDF 1.3.
    Pdf13,
    /// PDF 1.4.
    Pdf14,
    /// PDF 1.5.
    Pdf15,
    /// PDF 1.6.
    Pdf16,
    /// PDF 1.7.
    Pdf17,
    /// PDF 2.0.
    Pdf20,
}
178
179impl PdfVersion {
180    pub(crate) fn from_bytes(bytes: &[u8]) -> Option<Self> {
181        match bytes.get(..3)? {
182            b"1.0" => Some(Self::Pdf10),
183            b"1.1" => Some(Self::Pdf11),
184            b"1.2" => Some(Self::Pdf12),
185            b"1.3" => Some(Self::Pdf13),
186            b"1.4" => Some(Self::Pdf14),
187            b"1.5" => Some(Self::Pdf15),
188            b"1.6" => Some(Self::Pdf16),
189            b"1.7" => Some(Self::Pdf17),
190            b"2.0" => Some(Self::Pdf20),
191            _ => None,
192        }
193    }
194}
195
#[cfg(test)]
mod tests {
    use crate::pdf::{Pdf, PdfVersion};

    /// Loading an empty buffer must not panic; the result itself is ignored.
    #[test]
    fn issue_49() {
        let _ = Pdf::new(Vec::new());
    }

    /// Expects `version()` to report PDF 1.7 for this corpus file
    /// (going by the test name, taken from the file header).
    #[test]
    #[ignore = "requires hayro-tests corpus"]
    fn pdf_version_header() {
        let data = std::fs::read("../hayro-tests/downloads/pdfjs/alphatrans.pdf").unwrap();
        let pdf = Pdf::new(data).unwrap();

        assert_eq!(pdf.version(), PdfVersion::Pdf17);
    }

    /// Expects `version()` to report PDF 1.4 for this corpus file
    /// (going by the test name, taken from the document catalog).
    #[test]
    #[ignore = "requires hayro-tests corpus"]
    fn pdf_version_catalog() {
        let data = std::fs::read("../hayro-tests/downloads/pdfbox/2163.pdf").unwrap();
        let pdf = Pdf::new(data).unwrap();

        assert_eq!(pdf.version(), PdfVersion::Pdf14);
    }
}