oxidize_pdf/parser/
header.rs

1//! PDF Header Parser
2//!
3//! Parses PDF header and version according to ISO 32000-1 Section 7.5.2
4
5use super::{ParseError, ParseResult};
6use std::io::{BufRead, BufReader, Read};
7
/// PDF version number as declared in a file header, e.g. `1.7` or `2.0`.
///
/// Equality is total (both components are plain integers), so the type
/// derives `Eq` in addition to `PartialEq`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PdfVersion {
    /// Major component: the number before the dot.
    pub major: u8,
    /// Minor component: the number after the dot.
    pub minor: u8,
}
14
15impl PdfVersion {
16    /// Create a new PDF version
17    pub fn new(major: u8, minor: u8) -> Self {
18        Self { major, minor }
19    }
20
21    /// Check if this version is supported
22    pub fn is_supported(&self) -> bool {
23        // We support PDF 1.0 through 2.0
24        matches!((self.major, self.minor), (1, 0..=7) | (2, 0))
25    }
26}
27
28impl std::fmt::Display for PdfVersion {
29    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
30        write!(f, "{}.{}", self.major, self.minor)
31    }
32}
33
34/// PDF Header information
35#[derive(Debug, Clone)]
36pub struct PdfHeader {
37    pub version: PdfVersion,
38    pub has_binary_marker: bool,
39}
40
41impl PdfHeader {
42    /// Parse PDF header from a reader
43    pub fn parse<R: Read>(reader: R) -> ParseResult<Self> {
44        let mut buf_reader = BufReader::new(reader);
45        let mut header = Self::parse_version_line(&mut buf_reader)?;
46
47        // Check for binary marker (recommended for PDF 1.2+)
48        header.has_binary_marker = Self::check_binary_marker(&mut buf_reader)?;
49
50        Ok(header)
51    }
52
53    /// Parse the PDF version line
54    fn parse_version_line<R: BufRead>(reader: &mut R) -> ParseResult<Self> {
55        // Read the first line more flexibly - some PDFs might have non-standard formatting
56        let mut line_bytes = Vec::new();
57        let mut consecutive_nulls = 0;
58
59        loop {
60            let mut byte = [0u8; 1];
61            match reader.read_exact(&mut byte) {
62                Ok(_) => {
63                    // Track consecutive null bytes - if we see too many, likely not a PDF
64                    if byte[0] == 0 {
65                        consecutive_nulls += 1;
66                        if consecutive_nulls > 10 {
67                            return Err(ParseError::InvalidHeader);
68                        }
69                    } else {
70                        consecutive_nulls = 0;
71                    }
72
73                    if byte[0] == b'\n' || byte[0] == b'\r' {
74                        // Handle CRLF more robustly
75                        if byte[0] == b'\r' {
76                            // Peek at next byte
77                            let mut next_byte = [0u8; 1];
78                            if reader.read_exact(&mut next_byte).is_ok() && next_byte[0] != b'\n' {
79                                // Not CRLF, put back the byte (can't seek, so store it)
80                                line_bytes.push(byte[0]);
81                            }
82                        }
83                        break;
84                    }
85                    line_bytes.push(byte[0]);
86                    // Be more generous with line length - some PDFs have longer headers
87                    if line_bytes.len() > 200 {
88                        return Err(ParseError::InvalidHeader);
89                    }
90                }
91                Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => {
92                    if line_bytes.is_empty() {
93                        return Err(ParseError::InvalidHeader);
94                    }
95                    break;
96                }
97                Err(e) => return Err(e.into()),
98            }
99        }
100
101        // Convert to string for parsing - be more lenient with encoding
102        let line = String::from_utf8_lossy(&line_bytes).into_owned();
103
104        // Look for PDF header anywhere in the first line (some files have leading garbage)
105        let (pdf_start, pdf_prefix_len) = if let Some(pos) = line.find("%PDF-") {
106            (pos, 5) // "%PDF-" is 5 characters
107        } else {
108            // Try case-insensitive match
109            let lower_line = line.to_lowercase();
110            if let Some(pos) = lower_line.find("%pdf-") {
111                (pos, 5) // "%pdf-" is also 5 characters
112            } else {
113                return Err(ParseError::InvalidHeader);
114            }
115        };
116
117        // Extract the PDF header part
118        let pdf_line = &line[pdf_start..];
119        if pdf_line.len() < 7 {
120            // Not enough characters for "%PDF-XY" (minimum format)
121            return Err(ParseError::InvalidHeader);
122        }
123
124        // Extract version (trim any trailing whitespace/newlines)
125        let version_part = &pdf_line[pdf_prefix_len..]; // Skip "%PDF-" or "%pdf-"
126
127        // Extract version more flexibly - look for digits and dots up to the first non-version character
128        let mut version_chars = String::new();
129        for ch in version_part.chars() {
130            if ch.is_ascii_digit() || ch == '.' {
131                version_chars.push(ch);
132            } else if ch.is_whitespace() && !version_chars.is_empty() {
133                // Allow whitespace within version, but clean it up
134                continue;
135            } else if !version_chars.is_empty() {
136                // Found non-version character after version started, stop here
137                break;
138            }
139            // Skip leading non-version characters
140        }
141
142        let version_str = version_chars.trim();
143
144        // Handle various version formats
145        let (major, minor) = if version_str.contains('.') {
146            // Standard format with dot: "1.4", "2.0", etc.
147            let parts: Vec<&str> = version_str.split('.').collect();
148            if parts.len() != 2 {
149                return Err(ParseError::InvalidHeader);
150            }
151
152            let major = parts[0]
153                .trim()
154                .parse::<u8>()
155                .map_err(|_| ParseError::InvalidHeader)?;
156            let minor = parts[1]
157                .trim()
158                .parse::<u8>()
159                .map_err(|_| ParseError::InvalidHeader)?;
160
161            (major, minor)
162        } else {
163            // Try parsing without dot (some malformed PDFs like "%PDF-14")
164            let clean_version = version_str
165                .chars()
166                .filter(|c| c.is_ascii_digit())
167                .collect::<String>();
168
169            if clean_version.len() >= 2 {
170                let major_str = &clean_version[0..1];
171                let minor_str = &clean_version[1..2];
172
173                let major = major_str
174                    .parse::<u8>()
175                    .map_err(|_| ParseError::InvalidHeader)?;
176                let minor = minor_str
177                    .parse::<u8>()
178                    .map_err(|_| ParseError::InvalidHeader)?;
179
180                (major, minor)
181            } else {
182                return Err(ParseError::InvalidHeader);
183            }
184        };
185
186        let version = PdfVersion::new(major, minor);
187
188        if !version.is_supported() {
189            return Err(ParseError::UnsupportedVersion(version.to_string()));
190        }
191
192        Ok(PdfHeader {
193            version,
194            has_binary_marker: false,
195        })
196    }
197
198    /// Check for binary marker comment
199    fn check_binary_marker<R: BufRead>(reader: &mut R) -> ParseResult<bool> {
200        let mut buffer = Vec::new();
201
202        // Read bytes until we find a newline or EOF
203        loop {
204            let mut byte = [0u8; 1];
205            match reader.read_exact(&mut byte) {
206                Ok(_) => {
207                    buffer.push(byte[0]);
208                    if byte[0] == b'\n' || byte[0] == b'\r' {
209                        break;
210                    }
211                    // Limit line length to prevent excessive memory usage
212                    if buffer.len() > 1024 {
213                        break;
214                    }
215                }
216                Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => {
217                    break;
218                }
219                Err(e) => return Err(e.into()),
220            }
221        }
222
223        if buffer.is_empty() {
224            return Ok(false);
225        }
226
227        // Binary marker should be a comment with at least 4 binary characters
228        if buffer.first() == Some(&b'%') {
229            let binary_count = buffer
230                .iter()
231                .skip(1) // Skip the %
232                .filter(|&&b| b >= 128)
233                .count();
234
235            Ok(binary_count >= 4)
236        } else {
237            // Not a comment, probably start of document content
238            Ok(false)
239        }
240    }
241}
242
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Runs the header parser over an in-memory byte slice.
    fn parse_bytes(input: &[u8]) -> ParseResult<PdfHeader> {
        PdfHeader::parse(Cursor::new(input))
    }

    /// Parses `input`, panicking if the header is rejected.
    fn parse_ok(input: &[u8]) -> PdfHeader {
        parse_bytes(input).unwrap()
    }

    /// Asserts that `header` carries exactly version `major.minor`.
    fn assert_version(header: &PdfHeader, major: u8, minor: u8) {
        assert_eq!(header.version.major, major);
        assert_eq!(header.version.minor, minor);
    }

    /// Asserts that parsing `input` fails with `ParseError::InvalidHeader`.
    fn assert_invalid_header(input: &[u8]) {
        let result = parse_bytes(input);
        assert!(matches!(result, Err(ParseError::InvalidHeader)));
    }

    #[test]
    fn test_parse_pdf_header_basic() {
        let header = parse_ok(b"%PDF-1.7\n");

        assert_version(&header, 1, 7);
        assert!(!header.has_binary_marker);
    }

    #[test]
    fn test_parse_pdf_header_with_binary_marker() {
        let header = parse_ok(b"%PDF-1.4\n%\xE2\xE3\xCF\xD3\n");

        assert_version(&header, 1, 4);
        assert!(header.has_binary_marker);
    }

    #[test]
    fn test_parse_pdf_20() {
        let header = parse_ok(b"%PDF-2.0\n");

        assert_version(&header, 2, 0);
    }

    #[test]
    fn test_invalid_header() {
        assert_invalid_header(b"Not a PDF\n");
    }

    #[test]
    fn test_unsupported_version() {
        let result = parse_bytes(b"%PDF-3.0\n");

        assert!(matches!(result, Err(ParseError::UnsupportedVersion(_))));
    }

    #[test]
    fn test_pdf_version_new() {
        let PdfVersion { major, minor } = PdfVersion::new(1, 5);

        assert_eq!(major, 1);
        assert_eq!(minor, 5);
    }

    #[test]
    fn test_pdf_version_display() {
        let version = PdfVersion::new(1, 7);

        assert_eq!(version.to_string(), "1.7");
        assert_eq!(format!("{version}"), "1.7");
    }

    #[test]
    fn test_pdf_version_is_supported() {
        // Versions the parser accepts.
        for (major, minor) in [(1, 0), (1, 1), (1, 4), (1, 7), (2, 0)] {
            assert!(PdfVersion::new(major, minor).is_supported());
        }

        // Versions just outside the accepted range.
        for (major, minor) in [(0, 9), (1, 8), (2, 1), (3, 0)] {
            assert!(!PdfVersion::new(major, minor).is_supported());
        }
    }

    #[test]
    fn test_pdf_version_equality() {
        let base = PdfVersion::new(1, 5);
        let same = PdfVersion::new(1, 5);
        let different = PdfVersion::new(1, 6);

        assert_eq!(base, same);
        assert_ne!(base, different);
    }

    #[test]
    fn test_header_with_crlf() {
        assert_version(&parse_ok(b"%PDF-1.6\r\n"), 1, 6);
    }

    #[test]
    fn test_header_with_cr_only() {
        assert_version(&parse_ok(b"%PDF-1.3\r"), 1, 3);
    }

    #[test]
    fn test_header_with_extra_whitespace() {
        assert_version(&parse_ok(b"%PDF-1.5   \n"), 1, 5);
    }

    #[test]
    fn test_header_no_newline() {
        assert_version(&parse_ok(b"%PDF-1.2"), 1, 2);
    }

    #[test]
    fn test_malformed_version_single_digit() {
        assert_invalid_header(b"%PDF-1\n");
    }

    #[test]
    fn test_malformed_version_too_many_parts() {
        assert_invalid_header(b"%PDF-1.4.2\n");
    }

    #[test]
    fn test_malformed_version_non_numeric() {
        assert_invalid_header(b"%PDF-1.x\n");
    }

    #[test]
    fn test_empty_input() {
        assert_invalid_header(b"");
    }

    #[test]
    fn test_header_too_long() {
        // A single line well past the parser's header length limit.
        let long_header = format!("%PDF-1.0{}", "x".repeat(200));

        assert_invalid_header(long_header.as_bytes());
    }

    #[test]
    fn test_binary_marker_insufficient_bytes() {
        // Only two high-bit bytes; four are required.
        let header = parse_ok(b"%PDF-1.4\n%\xE2\xE3\n");

        assert!(!header.has_binary_marker);
    }

    #[test]
    fn test_binary_marker_exact_threshold() {
        // Exactly four high-bit bytes.
        let header = parse_ok(b"%PDF-1.4\n%\xE2\xE3\xCF\xD3\n");

        assert!(header.has_binary_marker);
    }

    #[test]
    fn test_binary_marker_more_than_threshold() {
        // Six high-bit bytes.
        let header = parse_ok(b"%PDF-1.4\n%\xE2\xE3\xCF\xD3\x80\x81\n");

        assert!(header.has_binary_marker);
    }

    #[test]
    fn test_binary_marker_no_comment() {
        // Second line is not a `%` comment at all.
        let header = parse_ok(b"%PDF-1.4\n1 0 obj\n");

        assert!(!header.has_binary_marker);
    }

    #[test]
    fn test_binary_marker_ascii_only() {
        // A comment, but with no high-bit bytes.
        let header = parse_ok(b"%PDF-1.4\n%This is a comment\n");

        assert!(!header.has_binary_marker);
    }

    #[test]
    fn test_binary_marker_mixed_content() {
        // ASCII text mixed with enough high-bit bytes.
        let header = parse_ok(b"%PDF-1.4\n%Some text \xE2\xE3\xCF\xD3 more text\n");

        assert!(header.has_binary_marker);
    }

    #[test]
    fn test_binary_marker_very_long_line() {
        // A marker line longer than the scan limit must still be detected.
        let mut long_line = b"%PDF-1.4\n%".to_vec();
        long_line.extend(std::iter::repeat(0x80u8).take(2000));
        long_line.push(b'\n');

        let header = parse_ok(&long_line);

        assert!(header.has_binary_marker);
    }

    #[test]
    fn test_version_all_supported_ranges() {
        let supported_versions: [(u8, u8); 9] = [
            (1, 0),
            (1, 1),
            (1, 2),
            (1, 3),
            (1, 4),
            (1, 5),
            (1, 6),
            (1, 7),
            (2, 0),
        ];

        for (major, minor) in supported_versions {
            let input = format!("%PDF-{major}.{minor}\n");
            let header = parse_ok(input.as_bytes());

            assert_version(&header, major, minor);
            assert!(header.version.is_supported());
        }
    }

    #[test]
    fn test_clone_and_debug() {
        let version = PdfVersion::new(1, 4);
        let cloned_version = version.clone();

        assert_eq!(version, cloned_version);
        assert_eq!(format!("{version:?}"), "PdfVersion { major: 1, minor: 4 }");

        let header = PdfHeader {
            version: version.clone(),
            has_binary_marker: true,
        };
        let cloned_header = header.clone();

        assert_eq!(header.version, cloned_header.version);
        assert_eq!(header.has_binary_marker, cloned_header.has_binary_marker);
    }

    // Leniency tests: inputs that deviate from the strict header format.

    #[test]
    fn test_header_with_leading_garbage() {
        assert_version(&parse_ok(b"junk%PDF-1.4\n"), 1, 4);
    }

    #[test]
    fn test_header_case_insensitive() {
        assert_version(&parse_ok(b"%pdf-1.5\n"), 1, 5);
    }

    #[test]
    fn test_header_version_without_dot() {
        assert_version(&parse_ok(b"%PDF-14\n"), 1, 4);
    }

    #[test]
    fn test_header_longer_line_limit() {
        // 158 bytes on the version line: long, but within the length limit.
        let mut long_header = b"%PDF-1.7".to_vec();
        long_header.resize(long_header.len() + 150, b' ');
        long_header.push(b'\n');

        assert_version(&parse_ok(&long_header), 1, 7);
    }

    #[test]
    fn test_header_with_multiple_spaces() {
        assert_version(&parse_ok(b"%PDF-  1  .  7  \n"), 1, 7);
    }

    #[test]
    fn test_header_null_byte_protection() {
        // A couple of leading NUL bytes are tolerated.
        assert_version(&parse_ok(b"\0\0%PDF-1.6\n"), 1, 6);
    }

    #[test]
    fn test_header_too_many_nulls() {
        // Fifteen consecutive NUL bytes exceed the tolerated run length.
        let mut input = vec![0u8; 15];
        input.extend_from_slice(b"%PDF-1.4\n");

        assert_invalid_header(&input);
    }

    #[test]
    fn test_header_minimal_length() {
        assert_version(&parse_ok(b"%PDF-1.0"), 1, 0);
    }

    #[test]
    fn test_header_too_short() {
        assert_invalid_header(b"%PDF-1");
    }

    #[test]
    fn test_header_version_extraction_edge_cases() {
        // (input, expected (major, minor)) pairs.
        let test_cases: [(&str, (u8, u8)); 1] = [("prefix%PDF-1.7\n", (1, 7))];

        for (input, (major, minor)) in test_cases {
            assert_version(&parse_ok(input.as_bytes()), major, minor);
        }
    }

    #[test]
    fn test_header_with_extra_text() {
        // Trailing text after the version must not affect the parsed version.
        assert_version(&parse_ok(b"%PDF-1.4   extra text\n"), 1, 4);
    }
}