hayro_syntax/
pdf.rs

1//! The starting point for reading PDF files.
2
3use crate::PdfData;
4use crate::object::Object;
5use crate::page::Pages;
6use crate::page::cached::CachedPages;
7use crate::reader::Reader;
8use crate::xref::{XRef, XRefError, fallback, root_xref};
9use std::sync::Arc;
10
11/// A PDF file.
12pub struct Pdf {
13    xref: Arc<XRef>,
14    header_version: PdfVersion,
15    pages: CachedPages,
16    data: PdfData,
17}
18
/// An error that occurred while loading a PDF file.
///
/// Implements [`std::fmt::Display`] and [`std::error::Error`], so it can be
/// printed and propagated with `?` into boxed or wrapping error types.
#[derive(Debug, Copy, Clone)]
pub enum LoadPdfError {
    /// The PDF was encrypted. Encrypted PDF files are currently not supported.
    Encryption,
    /// The PDF was invalid or could not be parsed due to some other unknown reason.
    Invalid,
}

impl std::fmt::Display for LoadPdfError {
    /// Write a short, human-readable description of the error.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let message = match self {
            Self::Encryption => "the PDF is encrypted, which is currently not supported",
            Self::Invalid => "the PDF is invalid or could not be parsed",
        };

        f.write_str(message)
    }
}

// No underlying source error is carried, so the default trait methods suffice.
impl std::error::Error for LoadPdfError {}
27
28#[allow(clippy::len_without_is_empty)]
29impl Pdf {
30    /// Try to read the given PDF file.
31    ///
32    /// Returns `None` if it was unable to read it.
33    pub fn new(data: PdfData) -> Result<Self, LoadPdfError> {
34        let version = find_version(data.as_ref().as_ref()).unwrap_or(PdfVersion::Pdf10);
35        let xref = match root_xref(data.clone()) {
36            Ok(x) => x,
37            Err(e) => match e {
38                XRefError::Unknown => fallback(data.clone()).ok_or(LoadPdfError::Invalid)?,
39                XRefError::Encrypted => return Err(LoadPdfError::Encryption),
40            },
41        };
42        let xref = Arc::new(xref);
43
44        let pages = CachedPages::new(xref.clone()).ok_or(LoadPdfError::Invalid)?;
45
46        Ok(Self {
47            xref,
48            header_version: version,
49            pages,
50            data,
51        })
52    }
53
54    /// Return the number of objects present in the PDF file.
55    pub fn len(&self) -> usize {
56        self.xref.len()
57    }
58
59    /// Return an iterator over all objects defined in the PDF file.
60    pub fn objects(&self) -> impl IntoIterator<Item = Object> {
61        self.xref.objects()
62    }
63
64    /// Return the version of the PDF file.
65    pub fn version(&self) -> PdfVersion {
66        self.xref
67            .trailer_data()
68            .version
69            .unwrap_or(self.header_version)
70    }
71
72    /// Return the underlying data of the PDF file.
73    pub fn data(&self) -> &PdfData {
74        &self.data
75    }
76
77    /// Return the pages of the PDF file.
78    pub fn pages(&self) -> &Pages {
79        self.pages.get()
80    }
81
82    /// Return the xref of the PDF file.
83    pub fn xref(&self) -> &XRef {
84        &self.xref
85    }
86}
87
88fn find_version(data: &[u8]) -> Option<PdfVersion> {
89    let data = &data[..data.len().min(2000)];
90    let mut r = Reader::new(data);
91
92    while r.forward_tag(b"%PDF-").is_none() {
93        r.read_byte()?;
94    }
95
96    PdfVersion::from_bytes(r.tail()?)
97}
98
/// The version of a PDF document.
///
/// Variants are declared from oldest to newest, so the derived ordering
/// compares an older version as less than a newer one.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum PdfVersion {
    /// PDF 1.0.
    Pdf10,
    /// PDF 1.1.
    Pdf11,
    /// PDF 1.2.
    Pdf12,
    /// PDF 1.3.
    Pdf13,
    /// PDF 1.4.
    Pdf14,
    /// PDF 1.5.
    Pdf15,
    /// PDF 1.6.
    Pdf16,
    /// PDF 1.7.
    Pdf17,
    /// PDF 2.0.
    Pdf20,
}

impl PdfVersion {
    /// Parse a version from the first three bytes of `bytes`.
    ///
    /// Anything after the third byte is ignored. Returns `None` if `bytes`
    /// holds fewer than three bytes or does not start with one of the known
    /// `major.minor` strings (`1.0` through `1.7`, or `2.0`).
    pub(crate) fn from_bytes(bytes: &[u8]) -> Option<PdfVersion> {
        let version = match bytes {
            [b'1', b'.', b'0', ..] => Self::Pdf10,
            [b'1', b'.', b'1', ..] => Self::Pdf11,
            [b'1', b'.', b'2', ..] => Self::Pdf12,
            [b'1', b'.', b'3', ..] => Self::Pdf13,
            [b'1', b'.', b'4', ..] => Self::Pdf14,
            [b'1', b'.', b'5', ..] => Self::Pdf15,
            [b'1', b'.', b'6', ..] => Self::Pdf16,
            [b'1', b'.', b'7', ..] => Self::Pdf17,
            [b'2', b'.', b'0', ..] => Self::Pdf20,
            // Too short, or not a version this type knows about.
            _ => return None,
        };

        Some(version)
    }
}
138
#[cfg(test)]
mod tests {
    use crate::pdf::{Pdf, PdfVersion};
    use std::sync::Arc;

    /// Read the file at `path`, load it as a PDF and return its version.
    ///
    /// Panics if the file cannot be read or the PDF cannot be loaded.
    fn version_of(path: &str) -> PdfVersion {
        let bytes = std::fs::read(path).unwrap();

        Pdf::new(Arc::new(bytes)).unwrap().version()
    }

    /// Loading empty data must return without panicking; the result itself
    /// is deliberately discarded.
    #[test]
    fn issue_49() {
        let empty = Arc::new([]);
        let _ = Pdf::new(empty);
    }

    /// Checks the version reported for the `alphatrans.pdf` fixture.
    #[test]
    fn pdf_version_header() {
        let version = version_of("../hayro-tests/pdfs/pdfjs/alphatrans.pdf");

        assert_eq!(version, PdfVersion::Pdf17);
    }

    /// Checks the version reported for the `2163.pdf` fixture.
    #[test]
    fn pdf_version_catalog() {
        let version = version_of("../hayro-tests/downloads/pdfbox/2163.pdf");

        assert_eq!(version, PdfVersion::Pdf14);
    }
}
165}