Skip to main content

rpdfium_parser/
header.rs

// Derived from PDFium's cpdf_parser.cpp (CPDF_Parser::StartParse / ParseHeader)
// Original: Copyright 2014 The PDFium Authors
// Licensed under BSD-3-Clause / Apache-2.0
// See pdfium-upstream/LICENSE for the original license.

//! PDF header parsing — detects `%PDF-X.Y` and returns the version.
7
8use rpdfium_core::ParsingMode;
9use rpdfium_core::error::PdfError;
10
/// PDF file version extracted from the header.
///
/// Versions compare by `major` first and then by `minor`, so `1.7 < 2.0`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct PdfVersion {
    /// Major version number (the `X` in `%PDF-X.Y`).
    pub major: u8,
    /// Minor version number (the `Y` in `%PDF-X.Y`).
    pub minor: u8,
}

impl PdfVersion {
    /// Creates a version from its major and minor components.
    #[must_use]
    pub const fn new(major: u8, minor: u8) -> Self {
        Self { major, minor }
    }
}

impl std::fmt::Display for PdfVersion {
    /// Formats the version as `major.minor`, e.g. `1.7`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}.{}", self.major, self.minor)
    }
}
29
/// Size, in bytes, of the leading window of the source that is scanned for
/// the `%PDF-` header marker in lenient mode.
///
/// The value mirrors PDFium, which searches up to 1024 bytes.
const MAX_HEADER_SEARCH: usize = 1024;
33
34/// Parse the PDF header from the beginning of the source data.
35///
36/// Returns the detected version and the byte offset immediately after
37/// the header line (after the trailing newline, if present).
38///
39/// In lenient mode, allows garbage bytes before the header and extra whitespace.
40/// In strict mode, the header must appear at byte 0.
41pub fn parse_header(source: &[u8], mode: ParsingMode) -> Result<(PdfVersion, u64), PdfError> {
42    if source.len() < 8 {
43        return Err(PdfError::InvalidHeader);
44    }
45
46    let header_pos = find_header(source, mode)?;
47    let remaining = &source[header_pos..];
48
49    // Expect: %PDF-X.Y
50    if remaining.len() < 8 {
51        return Err(PdfError::InvalidHeader);
52    }
53
54    // Parse major version digit
55    let major = match remaining[5] {
56        b @ b'0'..=b'9' => b - b'0',
57        _ => return Err(PdfError::InvalidHeader),
58    };
59
60    // Expect '.'
61    if remaining[6] != b'.' {
62        return Err(PdfError::InvalidHeader);
63    }
64
65    // Parse minor version digit
66    let minor = match remaining[7] {
67        b @ b'0'..=b'9' => b - b'0',
68        _ => return Err(PdfError::InvalidHeader),
69    };
70
71    // Skip past the header line
72    let mut pos = header_pos + 8;
73
74    // Skip any remaining characters on the header line (e.g. extra digits, though unusual)
75    while pos < source.len() && source[pos] != b'\r' && source[pos] != b'\n' {
76        pos += 1;
77    }
78
79    // Skip the line ending
80    if pos < source.len() && source[pos] == b'\r' {
81        pos += 1;
82    }
83    if pos < source.len() && source[pos] == b'\n' {
84        pos += 1;
85    }
86
87    Ok((PdfVersion::new(major, minor), pos as u64))
88}
89
90/// Locate the `%PDF-` marker within the source.
91fn find_header(source: &[u8], mode: ParsingMode) -> Result<usize, PdfError> {
92    let marker = b"%PDF-";
93    let search_limit = source.len().min(MAX_HEADER_SEARCH);
94
95    match mode {
96        ParsingMode::Strict => {
97            if source.starts_with(marker) {
98                Ok(0)
99            } else {
100                Err(PdfError::InvalidHeader)
101            }
102        }
103        ParsingMode::Lenient => {
104            // Search for %PDF- within the first MAX_HEADER_SEARCH bytes
105            for i in 0..search_limit.saturating_sub(marker.len()) {
106                if source[i..].starts_with(marker) {
107                    if i > 0 {
108                        tracing::warn!(
109                            offset = i,
110                            "PDF header not at byte 0; found garbage before %PDF-"
111                        );
112                    }
113                    return Ok(i);
114                }
115            }
116            Err(PdfError::InvalidHeader)
117        }
118    }
119}
120
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that parsing `source` in `mode` fails with `PdfError::InvalidHeader`.
    fn assert_invalid_header(source: &[u8], mode: ParsingMode) {
        let result = parse_header(source, mode);
        assert!(
            matches!(result, Err(PdfError::InvalidHeader)),
            "expected InvalidHeader, got {:?}",
            result
        );
    }

    #[test]
    fn test_parse_valid_header_1_7() {
        let source = b"%PDF-1.7\n";
        let (version, offset) = parse_header(source, ParsingMode::Strict).unwrap();
        assert_eq!(version, PdfVersion::new(1, 7));
        assert_eq!(offset, 9);
    }

    #[test]
    fn test_parse_valid_header_2_0() {
        let source = b"%PDF-2.0\r\n";
        let (version, offset) = parse_header(source, ParsingMode::Strict).unwrap();
        assert_eq!(version, PdfVersion::new(2, 0));
        assert_eq!(offset, 10);
    }

    #[test]
    fn test_parse_header_1_4() {
        // Only the header line is consumed; the binary comment line is left alone.
        let source = b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n";
        let (version, offset) = parse_header(source, ParsingMode::Strict).unwrap();
        assert_eq!(version, PdfVersion::new(1, 4));
        assert_eq!(offset, 9);
    }

    #[test]
    fn test_parse_header_with_cr() {
        let source = b"%PDF-1.5\r";
        let (version, offset) = parse_header(source, ParsingMode::Strict).unwrap();
        assert_eq!(version, PdfVersion::new(1, 5));
        assert_eq!(offset, 9);
    }

    #[test]
    fn test_parse_header_no_newline() {
        // Without a line ending the rest of the data counts as the header line.
        let source = b"%PDF-1.6 rest of file";
        let (version, offset) = parse_header(source, ParsingMode::Strict).unwrap();
        assert_eq!(version, PdfVersion::new(1, 6));
        assert_eq!(offset, source.len() as u64);
    }

    #[test]
    fn test_strict_rejects_garbage_before_header() {
        assert_invalid_header(b"garbage%PDF-1.7\n", ParsingMode::Strict);
    }

    #[test]
    fn test_lenient_accepts_garbage_before_header() {
        // The returned offset is absolute, i.e. it includes the leading garbage.
        let source = b"\0\0\0%PDF-1.7\n";
        let (version, offset) = parse_header(source, ParsingMode::Lenient).unwrap();
        assert_eq!(version, PdfVersion::new(1, 7));
        assert_eq!(offset, 12);
    }

    #[test]
    fn test_lenient_rejects_missing_marker() {
        assert_invalid_header(b"this is not a pdf file", ParsingMode::Lenient);
    }

    #[test]
    fn test_too_short() {
        assert_invalid_header(b"%PDF-1", ParsingMode::Strict);
    }

    #[test]
    fn test_invalid_version_char() {
        assert_invalid_header(b"%PDF-X.Y\n", ParsingMode::Strict);
    }

    #[test]
    fn test_missing_version_dot() {
        assert_invalid_header(b"%PDF-17\n\n", ParsingMode::Strict);
    }

    #[test]
    fn test_version_display() {
        let v = PdfVersion::new(1, 7);
        assert_eq!(v.to_string(), "1.7");
    }

    #[test]
    fn test_empty_source() {
        assert_invalid_header(b"", ParsingMode::Lenient);
    }
}