Skip to main content

hayro_syntax/
pdf.rs

1//! The starting point for reading PDF files.
2
3use crate::PdfData;
4use crate::object::Object;
5use crate::page::Pages;
6use crate::page::cached::CachedPages;
7use crate::reader::Reader;
8use crate::sync::Arc;
9use crate::xref::{XRef, XRefError, fallback, root_xref};
10
11pub use crate::crypto::DecryptionError;
12use crate::metadata::Metadata;
13
14/// A PDF file.
15pub struct Pdf {
16    xref: Arc<XRef>,
17    header_version: PdfVersion,
18    pages: CachedPages,
19    data: PdfData,
20}
21
22/// An error that occurred while loading a PDF file.
23#[derive(Debug, Copy, Clone, PartialEq, Eq)]
24pub enum LoadPdfError {
25    /// An error occurred while processing an encrypted document.
26    Decryption(DecryptionError),
27    /// The PDF was invalid or could not be parsed due to some other unknown reason.
28    Invalid,
29}
30
31#[allow(clippy::len_without_is_empty)]
32impl Pdf {
33    /// Try to read the given PDF file.
34    ///
35    /// Returns `Err` if it was unable to read it.
36    pub fn new(data: impl Into<PdfData>) -> Result<Self, LoadPdfError> {
37        Self::new_with_password(data, "")
38    }
39
40    /// Try to read the given PDF file with a password.
41    ///
42    /// Returns `Err` if it was unable to read it or if the password is incorrect.
43    pub fn new_with_password(
44        data: impl Into<PdfData>,
45        password: &str,
46    ) -> Result<Self, LoadPdfError> {
47        let data = data.into();
48        let password = password.as_bytes();
49        let version = find_version(data.as_ref()).unwrap_or(PdfVersion::Pdf10);
50        let xref = match root_xref(data.clone(), password) {
51            Ok(x) => x,
52            Err(e) => match e {
53                XRefError::Unknown => {
54                    fallback(data.clone(), password).ok_or(LoadPdfError::Invalid)?
55                }
56                XRefError::Encryption(e) => return Err(LoadPdfError::Decryption(e)),
57            },
58        };
59        let xref = Arc::new(xref);
60
61        let pages = CachedPages::new(xref.clone()).ok_or(LoadPdfError::Invalid)?;
62
63        Ok(Self {
64            xref,
65            header_version: version,
66            pages,
67            data,
68        })
69    }
70
71    /// Return the number of objects present in the PDF file.
72    pub fn len(&self) -> usize {
73        self.xref.len()
74    }
75
76    /// Return an iterator over all objects defined in the PDF file.
77    pub fn objects(&self) -> impl IntoIterator<Item = Object<'_>> {
78        self.xref.objects()
79    }
80
81    /// Return the version of the PDF file.
82    pub fn version(&self) -> PdfVersion {
83        self.xref
84            .trailer_data()
85            .version
86            .unwrap_or(self.header_version)
87    }
88
89    /// Return the underlying data of the PDF file.
90    pub fn data(&self) -> &PdfData {
91        &self.data
92    }
93
94    /// Return the pages of the PDF file.
95    pub fn pages(&self) -> &Pages<'_> {
96        self.pages.get()
97    }
98
99    /// Return the xref of the PDF file.
100    pub fn xref(&self) -> &XRef {
101        &self.xref
102    }
103
104    /// Return the metadata in the document information dictionary of the document.
105    pub fn metadata(&self) -> &Metadata {
106        self.xref.metadata()
107    }
108}
109
110fn find_version(data: &[u8]) -> Option<PdfVersion> {
111    let data = &data[..data.len().min(2000)];
112    let mut r = Reader::new(data);
113
114    while r.forward_tag(b"%PDF-").is_none() {
115        r.read_byte()?;
116    }
117
118    PdfVersion::from_bytes(r.tail()?)
119}
120
/// The version of a PDF document.
///
/// Variants are declared from oldest to newest, so the derived ordering
/// ranks an older version below a newer one.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum PdfVersion {
    /// PDF 1.0.
    Pdf10,
    /// PDF 1.1.
    Pdf11,
    /// PDF 1.2.
    Pdf12,
    /// PDF 1.3.
    Pdf13,
    /// PDF 1.4.
    Pdf14,
    /// PDF 1.5.
    Pdf15,
    /// PDF 1.6.
    Pdf16,
    /// PDF 1.7.
    Pdf17,
    /// PDF 2.0.
    Pdf20,
}

impl PdfVersion {
    /// Parse a version from the first three bytes of `bytes`.
    ///
    /// Anything after those three bytes is ignored. Returns `None` if fewer
    /// than three bytes are available or if they do not spell one of the
    /// known versions (`1.0` through `1.7`, or `2.0`).
    pub(crate) fn from_bytes(bytes: &[u8]) -> Option<Self> {
        match bytes {
            [b'1', b'.', minor, ..] => match minor {
                b'0' => Some(Self::Pdf10),
                b'1' => Some(Self::Pdf11),
                b'2' => Some(Self::Pdf12),
                b'3' => Some(Self::Pdf13),
                b'4' => Some(Self::Pdf14),
                b'5' => Some(Self::Pdf15),
                b'6' => Some(Self::Pdf16),
                b'7' => Some(Self::Pdf17),
                _ => None,
            },
            [b'2', b'.', b'0', ..] => Some(Self::Pdf20),
            _ => None,
        }
    }
}
160
#[cfg(test)]
mod tests {
    use crate::pdf::{Pdf, PdfVersion};

    /// Read the fixture at `path` and load it as a PDF, panicking if either
    /// step fails.
    fn load(path: &str) -> Pdf {
        let bytes = std::fs::read(path).unwrap();

        Pdf::new(bytes).unwrap()
    }

    /// Loading empty input must not panic; the result itself is irrelevant.
    #[test]
    fn issue_49() {
        let _ = Pdf::new(Vec::new());
    }

    /// Checks the version reported for the `alphatrans.pdf` fixture
    /// (presumably taken from the file header, going by the test name).
    #[test]
    fn pdf_version_header() {
        let pdf = load("../hayro-tests/downloads/pdfjs/alphatrans.pdf");

        assert_eq!(pdf.version(), PdfVersion::Pdf17);
    }

    /// Checks the version reported for the `2163.pdf` fixture
    /// (presumably taken from the document catalog, going by the test name).
    #[test]
    fn pdf_version_catalog() {
        let pdf = load("../hayro-tests/downloads/pdfbox/2163.pdf");

        assert_eq!(pdf.version(), PdfVersion::Pdf14);
    }
}