// oxidize_pdf/parser/header.rs

//! PDF Header Parser
//!
//! Parses PDF header and version according to ISO 32000-1 Section 7.5.2

use super::{ParseError, ParseResult};
use std::io::{BufRead, BufReader, Read};

/// PDF version number as written in the file header (for example `1.7`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PdfVersion {
    /// Major version number (the part before the dot).
    pub major: u8,
    /// Minor version number (the part after the dot).
    pub minor: u8,
}

15impl PdfVersion {
16    /// Create a new PDF version
17    pub fn new(major: u8, minor: u8) -> Self {
18        Self { major, minor }
19    }
20
21    /// Check if this version is supported
22    pub fn is_supported(&self) -> bool {
23        // We support PDF 1.0 through 2.0
24        matches!((self.major, self.minor), (1, 0..=7) | (2, 0))
25    }
26}
27
28impl std::fmt::Display for PdfVersion {
29    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
30        write!(f, "{}.{}", self.major, self.minor)
31    }
32}
33
34/// PDF Header information
35#[derive(Debug, Clone)]
36pub struct PdfHeader {
37    pub version: PdfVersion,
38    pub has_binary_marker: bool,
39}
40
41impl PdfHeader {
42    /// Parse PDF header from a reader
43    pub fn parse<R: Read>(reader: R) -> ParseResult<Self> {
44        let mut buf_reader = BufReader::new(reader);
45        let mut header = Self::parse_version_line(&mut buf_reader)?;
46
47        // Check for binary marker (recommended for PDF 1.2+)
48        header.has_binary_marker = Self::check_binary_marker(&mut buf_reader)?;
49
50        Ok(header)
51    }
52
53    /// Parse the PDF version line
54    fn parse_version_line<R: BufRead>(reader: &mut R) -> ParseResult<Self> {
55        // Read bytes until we find a newline, avoiding UTF-8 conversion
56        let mut line_bytes = Vec::new();
57
58        loop {
59            let mut byte = [0u8; 1];
60            match reader.read_exact(&mut byte) {
61                Ok(_) => {
62                    if byte[0] == b'\n' || byte[0] == b'\r' {
63                        // Handle CRLF
64                        if byte[0] == b'\r' {
65                            // Peek at next byte
66                            let mut next_byte = [0u8; 1];
67                            if reader.read_exact(&mut next_byte).is_ok() && next_byte[0] != b'\n' {
68                                // Not CRLF, put it back by seeking -1
69                                // Since we can't seek in BufRead, we'll just include it
70                                line_bytes.push(byte[0]);
71                            }
72                        }
73                        break;
74                    }
75                    line_bytes.push(byte[0]);
76                    // Limit line length
77                    if line_bytes.len() > 100 {
78                        return Err(ParseError::InvalidHeader);
79                    }
80                }
81                Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => {
82                    if line_bytes.is_empty() {
83                        return Err(ParseError::InvalidHeader);
84                    }
85                    break;
86                }
87                Err(e) => return Err(e.into()),
88            }
89        }
90
91        // Convert to string for parsing
92        // PDF headers should be ASCII, but be lenient about it
93        let line = String::from_utf8_lossy(&line_bytes).into_owned();
94
95        // PDF header must start with %PDF-
96        if !line.starts_with("%PDF-") {
97            return Err(ParseError::InvalidHeader);
98        }
99
100        // Extract version (trim any trailing whitespace/newlines)
101        let version_str = line[5..].trim();
102        let parts: Vec<&str> = version_str.split('.').collect();
103
104        if parts.len() != 2 {
105            return Err(ParseError::InvalidHeader);
106        }
107
108        let major = parts[0]
109            .parse::<u8>()
110            .map_err(|_| ParseError::InvalidHeader)?;
111        let minor = parts[1]
112            .parse::<u8>()
113            .map_err(|_| ParseError::InvalidHeader)?;
114
115        let version = PdfVersion::new(major, minor);
116
117        if !version.is_supported() {
118            return Err(ParseError::UnsupportedVersion(version.to_string()));
119        }
120
121        Ok(PdfHeader {
122            version,
123            has_binary_marker: false,
124        })
125    }
126
127    /// Check for binary marker comment
128    fn check_binary_marker<R: BufRead>(reader: &mut R) -> ParseResult<bool> {
129        let mut buffer = Vec::new();
130
131        // Read bytes until we find a newline or EOF
132        loop {
133            let mut byte = [0u8; 1];
134            match reader.read_exact(&mut byte) {
135                Ok(_) => {
136                    buffer.push(byte[0]);
137                    if byte[0] == b'\n' || byte[0] == b'\r' {
138                        break;
139                    }
140                    // Limit line length to prevent excessive memory usage
141                    if buffer.len() > 1024 {
142                        break;
143                    }
144                }
145                Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => {
146                    break;
147                }
148                Err(e) => return Err(e.into()),
149            }
150        }
151
152        if buffer.is_empty() {
153            return Ok(false);
154        }
155
156        // Binary marker should be a comment with at least 4 binary characters
157        if buffer.first() == Some(&b'%') {
158            let binary_count = buffer
159                .iter()
160                .skip(1) // Skip the %
161                .filter(|&&b| b >= 128)
162                .count();
163
164            Ok(binary_count >= 4)
165        } else {
166            // Not a comment, probably start of document content
167            Ok(false)
168        }
169    }
170}
171
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Run `PdfHeader::parse` over an in-memory byte slice.
    fn parse_bytes(input: &[u8]) -> ParseResult<PdfHeader> {
        PdfHeader::parse(Cursor::new(input))
    }

    /// Parse `input` and assert it is rejected with `ParseError::InvalidHeader`.
    fn assert_invalid_header(input: &[u8]) {
        assert!(matches!(
            parse_bytes(input),
            Err(ParseError::InvalidHeader)
        ));
    }

    #[test]
    fn test_parse_pdf_header_basic() {
        let header = parse_bytes(b"%PDF-1.7\n").unwrap();

        assert_eq!(header.version.major, 1);
        assert_eq!(header.version.minor, 7);
        assert!(!header.has_binary_marker);
    }

    #[test]
    fn test_parse_pdf_header_with_binary_marker() {
        let header = parse_bytes(b"%PDF-1.4\n%\xE2\xE3\xCF\xD3\n").unwrap();

        assert_eq!(header.version.major, 1);
        assert_eq!(header.version.minor, 4);
        assert!(header.has_binary_marker);
    }

    #[test]
    fn test_parse_pdf_20() {
        let header = parse_bytes(b"%PDF-2.0\n").unwrap();

        assert_eq!(header.version.major, 2);
        assert_eq!(header.version.minor, 0);
    }

    #[test]
    fn test_invalid_header() {
        assert_invalid_header(b"Not a PDF\n");
    }

    #[test]
    fn test_unsupported_version() {
        let result = parse_bytes(b"%PDF-3.0\n");

        assert!(matches!(result, Err(ParseError::UnsupportedVersion(_))));
    }

    #[test]
    fn test_pdf_version_new() {
        let version = PdfVersion::new(1, 5);
        assert_eq!(version.major, 1);
        assert_eq!(version.minor, 5);
    }

    #[test]
    fn test_pdf_version_display() {
        let version = PdfVersion::new(1, 7);
        assert_eq!(version.to_string(), "1.7");
        assert_eq!(format!("{}", version), "1.7");
    }

    #[test]
    fn test_pdf_version_is_supported() {
        // Supported versions
        assert!(PdfVersion::new(1, 0).is_supported());
        assert!(PdfVersion::new(1, 1).is_supported());
        assert!(PdfVersion::new(1, 4).is_supported());
        assert!(PdfVersion::new(1, 7).is_supported());
        assert!(PdfVersion::new(2, 0).is_supported());

        // Unsupported versions
        assert!(!PdfVersion::new(0, 9).is_supported());
        assert!(!PdfVersion::new(1, 8).is_supported());
        assert!(!PdfVersion::new(2, 1).is_supported());
        assert!(!PdfVersion::new(3, 0).is_supported());
    }

    #[test]
    fn test_pdf_version_equality() {
        let v1 = PdfVersion::new(1, 5);
        let v2 = PdfVersion::new(1, 5);
        let v3 = PdfVersion::new(1, 6);

        assert_eq!(v1, v2);
        assert_ne!(v1, v3);
    }

    #[test]
    fn test_header_with_crlf() {
        let header = parse_bytes(b"%PDF-1.6\r\n").unwrap();

        assert_eq!(header.version.major, 1);
        assert_eq!(header.version.minor, 6);
    }

    #[test]
    fn test_header_with_crlf_and_binary_marker() {
        // The LF of the CRLF pair must be consumed together with the CR so
        // that the marker line starts at its `%`.
        let header = parse_bytes(b"%PDF-1.6\r\n%\xE2\xE3\xCF\xD3\r\n").unwrap();

        assert_eq!(header.version.major, 1);
        assert_eq!(header.version.minor, 6);
        assert!(header.has_binary_marker);
    }

    #[test]
    fn test_header_with_cr_only() {
        let header = parse_bytes(b"%PDF-1.3\r").unwrap();

        assert_eq!(header.version.major, 1);
        assert_eq!(header.version.minor, 3);
    }

    #[test]
    fn test_header_with_extra_whitespace() {
        let header = parse_bytes(b"%PDF-1.5   \n").unwrap();

        assert_eq!(header.version.major, 1);
        assert_eq!(header.version.minor, 5);
    }

    #[test]
    fn test_header_no_newline() {
        let header = parse_bytes(b"%PDF-1.2").unwrap();

        assert_eq!(header.version.major, 1);
        assert_eq!(header.version.minor, 2);
    }

    #[test]
    fn test_malformed_version_single_digit() {
        assert_invalid_header(b"%PDF-1\n");
    }

    #[test]
    fn test_malformed_version_too_many_parts() {
        assert_invalid_header(b"%PDF-1.4.2\n");
    }

    #[test]
    fn test_malformed_version_non_numeric() {
        assert_invalid_header(b"%PDF-1.x\n");
    }

    #[test]
    fn test_empty_input() {
        assert_invalid_header(b"");
    }

    #[test]
    fn test_header_too_long() {
        // Create a header line that's over 100 characters
        let long_header = format!("%PDF-1.0{}", "x".repeat(200));

        assert_invalid_header(long_header.as_bytes());
    }

    #[test]
    fn test_binary_marker_insufficient_bytes() {
        let header = parse_bytes(b"%PDF-1.4\n%\xE2\xE3\n").unwrap();

        assert!(!header.has_binary_marker); // Only 2 binary bytes, need 4+
    }

    #[test]
    fn test_binary_marker_exact_threshold() {
        let header = parse_bytes(b"%PDF-1.4\n%\xE2\xE3\xCF\xD3\n").unwrap();

        assert!(header.has_binary_marker); // Exactly 4 binary bytes
    }

    #[test]
    fn test_binary_marker_more_than_threshold() {
        let header = parse_bytes(b"%PDF-1.4\n%\xE2\xE3\xCF\xD3\x80\x81\n").unwrap();

        assert!(header.has_binary_marker); // More than 4 binary bytes
    }

    #[test]
    fn test_binary_marker_no_comment() {
        let header = parse_bytes(b"%PDF-1.4\n1 0 obj\n").unwrap();

        assert!(!header.has_binary_marker); // No % comment
    }

    #[test]
    fn test_binary_marker_ascii_only() {
        let header = parse_bytes(b"%PDF-1.4\n%This is a comment\n").unwrap();

        assert!(!header.has_binary_marker); // ASCII only comment
    }

    #[test]
    fn test_binary_marker_mixed_content() {
        let header = parse_bytes(b"%PDF-1.4\n%Some text \xE2\xE3\xCF\xD3 more text\n").unwrap();

        assert!(header.has_binary_marker); // Mixed content with sufficient binary
    }

    #[test]
    fn test_binary_marker_very_long_line() {
        let mut long_line = b"%PDF-1.4\n%".to_vec();
        // Add enough binary characters to exceed the limit
        long_line.extend(std::iter::repeat(0x80u8).take(2000));
        long_line.push(b'\n');

        let header = parse_bytes(&long_line).unwrap();

        assert!(header.has_binary_marker); // Should still detect binary marker
    }

    #[test]
    fn test_version_all_supported_ranges() {
        let supported_versions: [(u8, u8); 9] = [
            (1, 0),
            (1, 1),
            (1, 2),
            (1, 3),
            (1, 4),
            (1, 5),
            (1, 6),
            (1, 7),
            (2, 0),
        ];

        for &(major, minor) in supported_versions.iter() {
            let input = format!("%PDF-{}.{}\n", major, minor);
            let header = parse_bytes(input.as_bytes()).unwrap();

            assert_eq!(header.version.major, major);
            assert_eq!(header.version.minor, minor);
            assert!(header.version.is_supported());
        }
    }

    #[test]
    fn test_clone_and_debug() {
        let version = PdfVersion::new(1, 4);
        let cloned_version = version.clone();

        assert_eq!(version, cloned_version);
        assert_eq!(
            format!("{:?}", version),
            "PdfVersion { major: 1, minor: 4 }"
        );

        let header = PdfHeader {
            version: version.clone(),
            has_binary_marker: true,
        };
        let cloned_header = header.clone();

        assert_eq!(header.version, cloned_header.version);
        assert_eq!(header.has_binary_marker, cloned_header.has_binary_marker);
    }
}