oxidize_pdf/parser/
header.rs

1//! PDF Header Parser
2//!
3//! Parses PDF header and version according to ISO 32000-1 Section 7.5.2
4
5use super::{ParseError, ParseResult};
6use std::io::{BufRead, BufReader, Read};
7
/// PDF version information, as a `major.minor` pair.
///
/// Versions compare by `major` first and then by `minor`, so range checks
/// such as `version >= PdfVersion::new(1, 5)` work as expected. The type is
/// also hashable, which allows it to be used as a map or set key.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct PdfVersion {
    /// Major version number (the part before the dot).
    pub major: u8,
    /// Minor version number (the part after the dot).
    pub minor: u8,
}

impl PdfVersion {
    /// Create a new PDF version from its major and minor components.
    ///
    /// No validation is performed here; call [`PdfVersion::is_supported`] to
    /// find out whether the version is one this parser accepts.
    pub fn new(major: u8, minor: u8) -> Self {
        Self { major, minor }
    }

    /// Check if this version is supported.
    ///
    /// Returns `true` for 1.0 through 1.7 and for 2.0; every other
    /// combination (for example 1.8, 2.1 or 3.0) yields `false`.
    pub fn is_supported(&self) -> bool {
        matches!((self.major, self.minor), (1, 0..=7) | (2, 0))
    }
}

impl std::fmt::Display for PdfVersion {
    /// Formats the version as `major.minor`, e.g. `1.7`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}.{}", self.major, self.minor)
    }
}
33
34/// PDF Header information
35#[derive(Debug, Clone)]
36pub struct PdfHeader {
37    pub version: PdfVersion,
38    pub has_binary_marker: bool,
39}
40
41impl PdfHeader {
42    /// Parse PDF header from a reader
43    pub fn parse<R: Read>(reader: R) -> ParseResult<Self> {
44        let mut buf_reader = BufReader::new(reader);
45        let mut header = Self::parse_version_line(&mut buf_reader)?;
46
47        // Check for binary marker (recommended for PDF 1.2+)
48        header.has_binary_marker = Self::check_binary_marker(&mut buf_reader)?;
49
50        Ok(header)
51    }
52
53    /// Parse the PDF version line
54    fn parse_version_line<R: BufRead>(reader: &mut R) -> ParseResult<Self> {
55        // Read bytes until we find a newline, avoiding UTF-8 conversion
56        let mut line_bytes = Vec::new();
57
58        loop {
59            let mut byte = [0u8; 1];
60            match reader.read_exact(&mut byte) {
61                Ok(_) => {
62                    if byte[0] == b'\n' || byte[0] == b'\r' {
63                        // Handle CRLF
64                        if byte[0] == b'\r' {
65                            // Peek at next byte
66                            let mut next_byte = [0u8; 1];
67                            if reader.read_exact(&mut next_byte).is_ok() && next_byte[0] != b'\n' {
68                                // Not CRLF, put it back by seeking -1
69                                // Since we can't seek in BufRead, we'll just include it
70                                line_bytes.push(byte[0]);
71                            }
72                        }
73                        break;
74                    }
75                    line_bytes.push(byte[0]);
76                    // Limit line length
77                    if line_bytes.len() > 100 {
78                        return Err(ParseError::InvalidHeader);
79                    }
80                }
81                Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => {
82                    if line_bytes.is_empty() {
83                        return Err(ParseError::InvalidHeader);
84                    }
85                    break;
86                }
87                Err(e) => return Err(e.into()),
88            }
89        }
90
91        // Convert to string for parsing
92        // PDF headers should be ASCII, but be lenient about it
93        let line = String::from_utf8_lossy(&line_bytes).into_owned();
94
95        // PDF header must start with %PDF-
96        if !line.starts_with("%PDF-") {
97            return Err(ParseError::InvalidHeader);
98        }
99
100        // Extract version (trim any trailing whitespace/newlines)
101        let version_str = line[5..].trim();
102        let parts: Vec<&str> = version_str.split('.').collect();
103
104        if parts.len() != 2 {
105            return Err(ParseError::InvalidHeader);
106        }
107
108        let major = parts[0]
109            .parse::<u8>()
110            .map_err(|_| ParseError::InvalidHeader)?;
111        let minor = parts[1]
112            .parse::<u8>()
113            .map_err(|_| ParseError::InvalidHeader)?;
114
115        let version = PdfVersion::new(major, minor);
116
117        if !version.is_supported() {
118            return Err(ParseError::UnsupportedVersion(version.to_string()));
119        }
120
121        Ok(PdfHeader {
122            version,
123            has_binary_marker: false,
124        })
125    }
126
127    /// Check for binary marker comment
128    fn check_binary_marker<R: BufRead>(reader: &mut R) -> ParseResult<bool> {
129        let mut buffer = Vec::new();
130
131        // Read bytes until we find a newline or EOF
132        loop {
133            let mut byte = [0u8; 1];
134            match reader.read_exact(&mut byte) {
135                Ok(_) => {
136                    buffer.push(byte[0]);
137                    if byte[0] == b'\n' || byte[0] == b'\r' {
138                        break;
139                    }
140                    // Limit line length to prevent excessive memory usage
141                    if buffer.len() > 1024 {
142                        break;
143                    }
144                }
145                Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => {
146                    break;
147                }
148                Err(e) => return Err(e.into()),
149            }
150        }
151
152        if buffer.is_empty() {
153            return Ok(false);
154        }
155
156        // Binary marker should be a comment with at least 4 binary characters
157        if buffer.first() == Some(&b'%') {
158            let binary_count = buffer
159                .iter()
160                .skip(1) // Skip the %
161                .filter(|&&b| b >= 128)
162                .count();
163
164            Ok(binary_count >= 4)
165        } else {
166            // Not a comment, probably start of document content
167            Ok(false)
168        }
169    }
170}
171
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Runs the header parser over an in-memory byte slice.
    fn parse_bytes(bytes: &[u8]) -> ParseResult<PdfHeader> {
        PdfHeader::parse(Cursor::new(bytes))
    }

    /// A lone version line parses and reports no binary marker.
    #[test]
    fn test_parse_pdf_header_basic() {
        let parsed = parse_bytes(b"%PDF-1.7\n").unwrap();

        assert_eq!(parsed.version, PdfVersion::new(1, 7));
        assert!(!parsed.has_binary_marker);
    }

    /// A comment line with four high-bit bytes is detected as the marker.
    #[test]
    fn test_parse_pdf_header_with_binary_marker() {
        let parsed = parse_bytes(b"%PDF-1.4\n%\xE2\xE3\xCF\xD3\n").unwrap();

        assert_eq!(parsed.version, PdfVersion::new(1, 4));
        assert!(parsed.has_binary_marker);
    }

    /// PDF 2.0 is accepted.
    #[test]
    fn test_parse_pdf_20() {
        let parsed = parse_bytes(b"%PDF-2.0\n").unwrap();

        assert_eq!(parsed.version, PdfVersion::new(2, 0));
    }

    /// Input that does not start with `%PDF-` is rejected as an invalid header.
    #[test]
    fn test_invalid_header() {
        let outcome = parse_bytes(b"Not a PDF\n");

        assert!(matches!(outcome, Err(ParseError::InvalidHeader)));
    }

    /// A well-formed but unknown version is reported as unsupported.
    #[test]
    fn test_unsupported_version() {
        let outcome = parse_bytes(b"%PDF-3.0\n");

        assert!(matches!(outcome, Err(ParseError::UnsupportedVersion(_))));
    }
}
221}