Skip to main content

zpdf_parser/
header.rs

1use zpdf_core::{Error, Result};
2
/// The PDF version declared by a file's `%PDF-M.m` header line.
///
/// Values compare and order by `(major, minor)`, so `1.4 < 1.7 < 2.0`. The
/// derived ordering is lexicographic over the fields in declaration order;
/// keep `major` declared before `minor` so that version comparisons stay
/// correct.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct PdfHeader {
    /// Major version digit (the `M` in `%PDF-M.m`).
    pub major: u8,
    /// Minor version digit (the `m` in `%PDF-M.m`).
    pub minor: u8,
}
8
9/// Version assumed when a `%PDF` marker is present but the version digits that
10/// should follow it are missing or malformed. Matching other robust readers,
11/// a bad version is treated as a modern document (PDF 1.7) rather than rejected.
12const DEFAULT_VERSION: PdfHeader = PdfHeader { major: 1, minor: 7 };
13
14/// Locate and parse the `%PDF` header. Real-world corpora are full of files
15/// whose version field is garbage (`%PDF-1.)`, `%PDF-0000000`, `%PDF-a.4`),
16/// missing entirely (`%PDF-\n2 0 obj`), or written without the conventional
17/// hyphen (`%PDF/DA2`). Matching mainstream readers, we accept any file that
18/// contains the literal `%PDF` and fall back to [`DEFAULT_VERSION`] whenever the
19/// trailing version cannot be read. `Err(NotAPdf)` is reserved for files with no
20/// `%PDF` marker at all — the caller then tries object-scan recovery, which can
21/// still open a headerless fragment that begins directly with `N G obj`.
22pub fn parse_header(data: &[u8]) -> Result<PdfHeader> {
23    let marker = b"%PDF";
24    let pos = data
25        .windows(marker.len())
26        .position(|w| w == marker)
27        .ok_or(Error::NotAPdf)?;
28
29    // The version conventionally follows as "-M.m"; tolerate a missing hyphen
30    // and a malformed/garbage version. A bad version does not make the body
31    // unparseable, so warn and assume a modern default rather than rejecting
32    // the file. (NotAPdf above is reserved for a wholly missing `%PDF` marker.)
33    let rest = &data[pos + marker.len()..];
34    let rest = rest.strip_prefix(b"-").unwrap_or(rest);
35    match parse_version(rest) {
36        Some(h) => Ok(h),
37        None => {
38            let shown = String::from_utf8_lossy(&rest[..rest.len().min(8)]);
39            tracing::warn!(
40                "malformed PDF header version {shown:?}; assuming PDF {}.{}",
41                DEFAULT_VERSION.major,
42                DEFAULT_VERSION.minor
43            );
44            Ok(DEFAULT_VERSION)
45        }
46    }
47}
48
49/// Best-effort `M.m` parse from the bytes following the `%PDF[-]` marker.
50/// Returns `None` (caller defaults) if the major digit, the `.`, or the minor
51/// digit is absent or out of range.
52fn parse_version(rest: &[u8]) -> Option<PdfHeader> {
53    let major = rest.first()?.checked_sub(b'0').filter(|&v| v <= 9)?;
54    if rest.get(1).copied()? != b'.' {
55        return None;
56    }
57    let minor = rest.get(2)?.checked_sub(b'0').filter(|&v| v <= 9)?;
58    Some(PdfHeader { major, minor })
59}
60
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `data` and returns `(major, minor)`, panicking with the
    /// offending input if the header is rejected.
    fn version_of(data: &[u8]) -> (u8, u8) {
        let h = parse_header(data)
            .unwrap_or_else(|e| panic!("input {data:?} should parse, got {e:?}"));
        (h.major, h.minor)
    }

    /// True when `data` is rejected specifically as "no `%PDF` marker".
    fn is_not_a_pdf(data: &[u8]) -> bool {
        matches!(parse_header(data), Err(Error::NotAPdf))
    }

    #[test]
    fn valid_header() {
        assert_eq!(version_of(b"%PDF-1.7\n"), (1, 7));
    }

    #[test]
    fn pdf_2_0() {
        assert_eq!(version_of(b"%PDF-2.0\n"), (2, 0));
    }

    #[test]
    fn garbage_before_header() {
        // A UTF-8 BOM ahead of the marker must not hide it.
        assert_eq!(version_of(b"\xef\xbb\xbf%PDF-1.4\n"), (1, 4));
    }

    #[test]
    fn hyphenless_valid_version_is_read() {
        // The hyphen is optional; a well-formed version right after the
        // marker is still honoured rather than defaulted.
        assert_eq!(version_of(b"%PDF1.4\n"), (1, 4));
    }

    #[test]
    fn not_a_pdf() {
        // Pin the variant, not just "some error", so a regression that
        // returns a different error kind is caught.
        assert!(is_not_a_pdf(b"not a pdf"));
    }

    #[test]
    fn empty_or_too_short_input_is_not_a_pdf() {
        assert!(is_not_a_pdf(b""));
        assert!(is_not_a_pdf(b"%PD"));
    }

    #[test]
    fn marker_without_hyphen_defaults_version() {
        // `%PDF/DA2 ...` — real Ghostscript output; accepted (default version).
        assert_eq!(version_of(b"%PDF/DA2 \x1d\n"), (1, 7));
    }

    #[test]
    fn malformed_version_defaults() {
        // veraPDF corpus 6.1.2 file-header test (`%PDF-a.4`) plus assorted
        // fuzzed/headerless variants: a bad version must not reject the file.
        for bytes in [
            &b"%PDF-a.4\n"[..],
            &b"%PDF-1.)"[..],
            &b"%PDF-0000000"[..],
            &b"%PDF-/Si3/De"[..],
            &b"%PDF-1e66666"[..],
            &b"%PDF-{<~00~"[..],
            &b"%PDF-\n2 0 obj"[..],
        ] {
            assert_eq!(version_of(bytes), (1, 7), "input {bytes:?}");
        }
    }

    #[test]
    fn truncated_version_defaults() {
        // Input ending at every point inside the version field: right after
        // the marker, after the hyphen, after the major digit, after the dot.
        for bytes in [
            &b"%PDF"[..],
            &b"%PDF-"[..],
            &b"%PDF-1"[..],
            &b"%PDF-1."[..],
        ] {
            assert_eq!(version_of(bytes), (1, 7), "input {bytes:?}");
        }
    }

    #[test]
    fn no_marker_at_all_is_err() {
        // A headerless object fragment has no `%PDF`; the parser rejects it here
        // and the caller falls back to object-scan recovery.
        assert!(is_not_a_pdf(b"1 0 obj<</Type/Catalog>>endobj"));
    }
}