Skip to main content

fop_render/pdf/
validator.rs

1//! PDF validation and quality checks
2//!
3//! Validates generated PDFs for correctness and quality.
4
5use std::collections::HashSet;
6
/// PDF validator for checking structural integrity and quality
///
/// The validator only carries configuration (the `strict` flag), so it is
/// cheap to clone and can be printed with `{:?}` for diagnostics.
#[derive(Debug, Clone)]
pub struct PdfValidator {
    /// Strict mode (fail on warnings)
    strict: bool,
}
12
/// Validation result
///
/// Outcome of a validation run. `Warning` and `Error` carry one
/// human-readable message per issue found.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ValidationResult {
    /// PDF is valid with no issues
    Valid,
    /// PDF is valid but has warnings (one message per warning)
    Warning(Vec<String>),
    /// PDF is invalid with errors (one message per error); a strict
    /// validator also uses this variant to report warnings
    Error(Vec<String>),
}
23
24impl ValidationResult {
25    /// Check if validation passed (valid or warning only)
26    pub fn is_ok(&self) -> bool {
27        matches!(self, ValidationResult::Valid | ValidationResult::Warning(_))
28    }
29
30    /// Check if there are errors
31    pub fn has_errors(&self) -> bool {
32        matches!(self, ValidationResult::Error(_))
33    }
34
35    /// Get all issues (warnings or errors)
36    pub fn issues(&self) -> Vec<String> {
37        match self {
38            ValidationResult::Valid => Vec::new(),
39            ValidationResult::Warning(warnings) => warnings.clone(),
40            ValidationResult::Error(errors) => errors.clone(),
41        }
42    }
43}
44
45impl PdfValidator {
46    /// Create a new PDF validator
47    pub fn new() -> Self {
48        Self { strict: false }
49    }
50
51    /// Create a new strict PDF validator (warnings are treated as errors)
52    pub fn new_strict() -> Self {
53        Self { strict: true }
54    }
55
56    /// Validate a PDF document
57    ///
58    /// # Arguments
59    /// * `pdf_bytes` - Raw PDF file bytes
60    ///
61    /// # Returns
62    /// ValidationResult indicating success, warnings, or errors
63    pub fn validate_pdf(&self, pdf_bytes: &[u8]) -> ValidationResult {
64        let mut warnings = Vec::new();
65        let mut errors = Vec::new();
66
67        // Check minimum size
68        if pdf_bytes.len() < 20 {
69            errors.push("PDF file is too small (minimum 20 bytes)".to_string());
70            return ValidationResult::Error(errors);
71        }
72
73        // Check PDF header
74        if let Err(e) = self.check_header(pdf_bytes) {
75            errors.push(e);
76        }
77
78        // Check EOF marker
79        if let Err(e) = self.check_eof(pdf_bytes) {
80            errors.push(e);
81        }
82
83        // Find xref offset
84        let xref_offset = match self.find_xref_offset(pdf_bytes) {
85            Ok(offset) => offset,
86            Err(e) => {
87                errors.push(e);
88                return ValidationResult::Error(errors);
89            }
90        };
91
92        // Validate cross-reference table
93        match self.validate_xref(pdf_bytes, xref_offset) {
94            Ok(warns) => warnings.extend(warns),
95            Err(e) => errors.push(e),
96        }
97
98        // Validate trailer
99        match self.validate_trailer(pdf_bytes, xref_offset) {
100            Ok(warns) => warnings.extend(warns),
101            Err(e) => errors.push(e),
102        }
103
104        // Validate object structure
105        match self.validate_objects(pdf_bytes) {
106            Ok(warns) => warnings.extend(warns),
107            Err(e) => errors.push(e),
108        }
109
110        // Check catalog
111        match self.check_catalog(pdf_bytes) {
112            Ok(warns) => warnings.extend(warns),
113            Err(e) => errors.push(e),
114        }
115
116        // Check pages
117        match self.check_pages(pdf_bytes) {
118            Ok(warns) => warnings.extend(warns),
119            Err(e) => errors.push(e),
120        }
121
122        // Check file size reasonableness
123        if let Some(warning) = self.check_size(pdf_bytes) {
124            warnings.push(warning);
125        }
126
127        // Return result
128        if !errors.is_empty() {
129            ValidationResult::Error(errors)
130        } else if !warnings.is_empty() {
131            if self.strict {
132                ValidationResult::Error(warnings)
133            } else {
134                ValidationResult::Warning(warnings)
135            }
136        } else {
137            ValidationResult::Valid
138        }
139    }
140
141    /// Check PDF header for correct version
142    fn check_header(&self, pdf_bytes: &[u8]) -> Result<(), String> {
143        if !pdf_bytes.starts_with(b"%PDF-") {
144            return Err("PDF header missing or invalid (expected %PDF-)".to_string());
145        }
146
147        // Extract version
148        if pdf_bytes.len() < 8 {
149            return Err("PDF header truncated".to_string());
150        }
151
152        let header_line = match find_line_end(pdf_bytes, 0) {
153            Some(end) => &pdf_bytes[0..end],
154            None => return Err("PDF header line incomplete".to_string()),
155        };
156
157        let header_str = String::from_utf8_lossy(header_line);
158
159        // Check for valid version numbers
160        if !header_str.starts_with("%PDF-1.") && !header_str.starts_with("%PDF-2.") {
161            return Err(format!("Unsupported PDF version: {}", header_str));
162        }
163
164        Ok(())
165    }
166
167    /// Check EOF marker
168    fn check_eof(&self, pdf_bytes: &[u8]) -> Result<(), String> {
169        // Find %%EOF from the end (allowing trailing whitespace)
170        let trimmed = trim_end_whitespace(pdf_bytes);
171
172        if !trimmed.ends_with(b"%%EOF") {
173            return Err("PDF file missing %%EOF marker".to_string());
174        }
175
176        Ok(())
177    }
178
179    /// Find startxref offset value
180    fn find_xref_offset(&self, pdf_bytes: &[u8]) -> Result<usize, String> {
181        // Search for "startxref" from the end
182        let content = String::from_utf8_lossy(pdf_bytes);
183
184        if let Some(pos) = content.rfind("startxref") {
185            // Read the number after startxref
186            let after_keyword = &content[pos + 9..];
187
188            // Find the first line with a number
189            for line in after_keyword.lines() {
190                let trimmed = line.trim();
191                if !trimmed.is_empty() {
192                    if let Ok(offset) = trimmed.parse::<usize>() {
193                        return Ok(offset);
194                    }
195                }
196            }
197
198            return Err("startxref value not found or invalid".to_string());
199        }
200
201        Err("startxref keyword not found".to_string())
202    }
203
204    /// Validate cross-reference table
205    fn validate_xref(&self, pdf_bytes: &[u8], xref_offset: usize) -> Result<Vec<String>, String> {
206        let mut warnings = Vec::new();
207
208        if xref_offset >= pdf_bytes.len() {
209            return Err("xref offset points beyond file end".to_string());
210        }
211
212        let xref_section = &pdf_bytes[xref_offset..];
213        let xref_str = String::from_utf8_lossy(xref_section);
214
215        // Check for "xref" keyword (with optional leading whitespace/newline)
216        let trimmed_xref = xref_str.trim_start();
217        if !trimmed_xref.starts_with("xref") {
218            return Err(format!(
219                "xref table missing 'xref' keyword (found: {:?})",
220                &xref_str.chars().take(20).collect::<String>()
221            ));
222        }
223
224        // Parse xref header (object number and count)
225        let mut lines = xref_str.lines();
226        let _ = lines.next(); // Skip "xref"
227
228        if let Some(header_line) = lines.next() {
229            let parts: Vec<&str> = header_line.split_whitespace().collect();
230            if parts.len() != 2 {
231                return Err("xref subsection header invalid".to_string());
232            }
233
234            // Parse object count
235            let count = parts[1]
236                .parse::<usize>()
237                .map_err(|_| "xref object count invalid".to_string())?;
238
239            // Validate xref entries (each should be 20 bytes: "nnnnnnnnnn ggggg x \n")
240            let mut entry_count = 0;
241            for line in lines {
242                if line.trim().starts_with("trailer") {
243                    break;
244                }
245
246                let trimmed = line.trim();
247                if trimmed.is_empty() {
248                    continue;
249                }
250
251                // Each entry should be: offset(10) space gen(5) space flag(1)
252                let parts: Vec<&str> = trimmed.split_whitespace().collect();
253                if parts.len() != 3 {
254                    warnings.push(format!("xref entry malformed: {}", trimmed));
255                    continue;
256                }
257
258                // Validate offset (10 digits)
259                if parts[0].len() != 10 {
260                    warnings.push(format!("xref offset not 10 digits: {}", parts[0]));
261                }
262
263                // Validate generation (5 digits)
264                if parts[1].len() != 5 {
265                    warnings.push(format!("xref generation not 5 digits: {}", parts[1]));
266                }
267
268                // Validate flag (n or f)
269                if parts[2] != "n" && parts[2] != "f" {
270                    warnings.push(format!(
271                        "xref flag invalid (expected 'n' or 'f'): {}",
272                        parts[2]
273                    ));
274                }
275
276                entry_count += 1;
277            }
278
279            // Check if entry count matches declared count
280            if entry_count != count {
281                warnings.push(format!(
282                    "xref entry count mismatch (declared: {}, found: {})",
283                    count, entry_count
284                ));
285            }
286        } else {
287            return Err("xref subsection header missing".to_string());
288        }
289
290        Ok(warnings)
291    }
292
293    /// Validate trailer dictionary
294    fn validate_trailer(
295        &self,
296        pdf_bytes: &[u8],
297        xref_offset: usize,
298    ) -> Result<Vec<String>, String> {
299        let mut warnings = Vec::new();
300
301        let xref_section = &pdf_bytes[xref_offset..];
302        let xref_str = String::from_utf8_lossy(xref_section);
303
304        // Find trailer keyword
305        if let Some(trailer_pos) = xref_str.find("trailer") {
306            let trailer_section = &xref_str[trailer_pos..];
307
308            // Check for required entries
309            if !trailer_section.contains("/Size") {
310                return Err("trailer missing required /Size entry".to_string());
311            }
312
313            if !trailer_section.contains("/Root") {
314                return Err("trailer missing required /Root entry".to_string());
315            }
316
317            // Optional but recommended
318            if !trailer_section.contains("/Info") {
319                warnings.push("trailer missing /Info dictionary (metadata)".to_string());
320            }
321
322            // Check for valid dictionary format
323            if !trailer_section.contains("<<") || !trailer_section.contains(">>") {
324                return Err("trailer dictionary malformed".to_string());
325            }
326        } else {
327            return Err("trailer keyword not found".to_string());
328        }
329
330        Ok(warnings)
331    }
332
333    /// Validate object structure
334    fn validate_objects(&self, pdf_bytes: &[u8]) -> Result<Vec<String>, String> {
335        let mut warnings = Vec::new();
336        let content = String::from_utf8_lossy(pdf_bytes);
337
338        // Find all obj...endobj pairs
339        let obj_starts: Vec<_> = content.match_indices(" obj\n").collect();
340        let obj_ends: Vec<_> = content.match_indices("\nendobj\n").collect();
341
342        if obj_starts.len() != obj_ends.len() {
343            return Err(format!(
344                "Mismatched obj/endobj pairs (obj: {}, endobj: {})",
345                obj_starts.len(),
346                obj_ends.len()
347            ));
348        }
349
350        // Validate each object has proper ID format
351        for (pos, _) in &obj_starts {
352            // Look backward for object ID (format: "n g obj")
353            let before = &content[..=*pos];
354            if let Some(line_start) = before.rfind('\n') {
355                let obj_line = &before[line_start + 1..=*pos];
356                let parts: Vec<&str> = obj_line.split_whitespace().collect();
357
358                if parts.len() < 3 {
359                    warnings.push(format!("Object header malformed near offset {}", pos));
360                    continue;
361                }
362
363                // Validate object number
364                if parts[0].parse::<u32>().is_err() {
365                    warnings.push(format!("Invalid object number: {}", parts[0]));
366                }
367
368                // Validate generation number
369                if parts[1].parse::<u16>().is_err() {
370                    warnings.push(format!("Invalid generation number: {}", parts[1]));
371                }
372            }
373        }
374
375        // Check for stream objects
376        let stream_count = content.matches("\nstream\n").count();
377        let endstream_count = content.matches("\nendstream\n").count();
378
379        if stream_count != endstream_count {
380            warnings.push(format!(
381                "Mismatched stream/endstream pairs (stream: {}, endstream: {})",
382                stream_count, endstream_count
383            ));
384        }
385
386        Ok(warnings)
387    }
388
389    /// Validate catalog dictionary
390    fn check_catalog(&self, pdf_bytes: &[u8]) -> Result<Vec<String>, String> {
391        let mut warnings = Vec::new();
392        let content = String::from_utf8_lossy(pdf_bytes);
393
394        // Find catalog object (should be referenced in trailer as /Root)
395        if !content.contains("/Type /Catalog") {
396            return Err("Catalog object (/Type /Catalog) not found".to_string());
397        }
398
399        // Check for required entries in catalog
400        if !content.contains("/Pages") {
401            return Err("Catalog missing required /Pages entry".to_string());
402        }
403
404        // Optional but useful entries
405        if !content.contains("/Outlines") {
406            warnings.push("Catalog missing /Outlines (bookmarks not present)".to_string());
407        }
408
409        Ok(warnings)
410    }
411
412    /// Validate page tree structure
413    fn check_pages(&self, pdf_bytes: &[u8]) -> Result<Vec<String>, String> {
414        let mut warnings = Vec::new();
415        let content = String::from_utf8_lossy(pdf_bytes);
416
417        // Find pages object
418        if !content.contains("/Type /Pages") {
419            return Err("Pages object (/Type /Pages) not found".to_string());
420        }
421
422        // Check for required entries
423        if !content.contains("/Kids") {
424            return Err("Pages object missing required /Kids array".to_string());
425        }
426
427        if !content.contains("/Count") {
428            return Err("Pages object missing required /Count entry".to_string());
429        }
430
431        // Find individual page objects
432        let page_count = content.matches("/Type /Page\n").count();
433
434        if page_count == 0 {
435            warnings.push("No page objects found in document".to_string());
436        }
437
438        // Validate page objects have required entries
439        let page_positions: Vec<_> = content.match_indices("/Type /Page\n").collect();
440
441        for (pos, _) in page_positions {
442            // Find the object containing this page
443            let before = &content[..pos];
444            if let Some(obj_start) = before.rfind(" obj\n") {
445                let after = &content[pos..];
446                if let Some(obj_end) = after.find("\nendobj\n") {
447                    let page_obj = &content[obj_start..pos + obj_end];
448
449                    // Check required page entries
450                    if !page_obj.contains("/Parent") {
451                        warnings.push("Page object missing /Parent reference".to_string());
452                    }
453
454                    if !page_obj.contains("/MediaBox") && !page_obj.contains("/CropBox") {
455                        warnings.push("Page object missing /MediaBox or /CropBox".to_string());
456                    }
457
458                    if !page_obj.contains("/Resources") {
459                        warnings.push("Page object missing /Resources dictionary".to_string());
460                    }
461                }
462            }
463        }
464
465        Ok(warnings)
466    }
467
468    /// Check if resources are properly referenced
469    #[allow(dead_code)]
470    fn check_resources(&self, pdf_bytes: &[u8]) -> Result<Vec<String>, String> {
471        let mut warnings = Vec::new();
472        let content = String::from_utf8_lossy(pdf_bytes);
473
474        // Check for font resources
475        let font_refs: Vec<_> = content.match_indices("/Font").collect();
476        let type1_fonts = content.matches("/Type1").count();
477        let truetype_fonts = content.matches("/TrueType").count();
478
479        if font_refs.is_empty() {
480            warnings.push("No font resources defined".to_string());
481        } else if type1_fonts == 0 && truetype_fonts == 0 {
482            warnings.push("Font resources defined but no font types found".to_string());
483        }
484
485        // Check for XObject resources (images, etc.)
486        if content.contains("/XObject") {
487            // Validate XObject dictionary exists
488            if !content.contains("/Type /XObject") {
489                warnings.push("XObject referenced but no XObject definitions found".to_string());
490            }
491        }
492
493        Ok(warnings)
494    }
495
496    /// Validate stream dictionaries
497    #[allow(dead_code)]
498    fn validate_stream(&self, pdf_bytes: &[u8]) -> Result<Vec<String>, String> {
499        let mut warnings = Vec::new();
500        let content = String::from_utf8_lossy(pdf_bytes);
501
502        // Find all stream objects
503        let stream_positions: Vec<_> = content.match_indices("\nstream\n").collect();
504
505        for (pos, _) in stream_positions {
506            // Look backward for the stream dictionary
507            let before = &content[..pos];
508
509            // Should have a /Length entry
510            if let Some(dict_start) = before.rfind("<<") {
511                let dict_section = &before[dict_start..];
512
513                if !dict_section.contains("/Length") {
514                    warnings.push("Stream dictionary missing /Length entry".to_string());
515                }
516            } else {
517                warnings.push("Stream missing dictionary".to_string());
518            }
519        }
520
521        Ok(warnings)
522    }
523
524    /// Validate object references
525    #[allow(dead_code)]
526    fn validate_object(&self, pdf_bytes: &[u8]) -> Result<Vec<String>, String> {
527        let mut warnings = Vec::new();
528        let content = String::from_utf8_lossy(pdf_bytes);
529
530        // Collect all defined object IDs
531        let mut defined_objects = HashSet::new();
532
533        for line in content.lines() {
534            if line.trim().ends_with(" obj") {
535                let parts: Vec<&str> = line.split_whitespace().collect();
536                if parts.len() >= 3 {
537                    if let Ok(obj_num) = parts[0].parse::<u32>() {
538                        defined_objects.insert(obj_num);
539                    }
540                }
541            }
542        }
543
544        // Find all object references (format: "n 0 R")
545        // Simple pattern matching without regex
546        for line in content.lines() {
547            for word_group in line.split_whitespace().collect::<Vec<_>>().windows(3) {
548                if word_group.len() == 3 && word_group[1] == "0" && word_group[2] == "R" {
549                    if let Ok(obj_num) = word_group[0].parse::<u32>() {
550                        if !defined_objects.contains(&obj_num) && obj_num > 0 {
551                            warnings
552                                .push(format!("Reference to undefined object: {} 0 R", obj_num));
553                        }
554                    }
555                }
556            }
557        }
558
559        Ok(warnings)
560    }
561
562    /// Check file size is reasonable
563    fn check_size(&self, pdf_bytes: &[u8]) -> Option<String> {
564        const MAX_REASONABLE_SIZE: usize = 100 * 1024 * 1024; // 100 MB
565        const MIN_REASONABLE_SIZE: usize = 100; // 100 bytes
566
567        let size = pdf_bytes.len();
568
569        if size < MIN_REASONABLE_SIZE {
570            Some(format!(
571                "PDF file is very small ({} bytes), may be incomplete",
572                size
573            ))
574        } else if size > MAX_REASONABLE_SIZE {
575            Some(format!(
576                "PDF file is very large ({} MB), consider optimization",
577                size / (1024 * 1024)
578            ))
579        } else {
580            None
581        }
582    }
583}
584
585impl Default for PdfValidator {
586    fn default() -> Self {
587        Self::new()
588    }
589}
590
/// Locate the line feed that ends the line beginning at `start`.
///
/// Returns the index of the first `\n` at or after `start`, or `None` when
/// there is none (including when `start` lies past the end of `data`). For a
/// CRLF-terminated line the returned index is that of the `\n`, so the `\r`
/// sits just before it.
fn find_line_end(data: &[u8], start: usize) -> Option<usize> {
    let tail = data.get(start..)?;
    tail.iter()
        .position(|&byte| byte == b'\n')
        .map(|relative| start + relative)
}
600
/// Return `data` without its trailing spaces, tabs, line feeds and
/// carriage returns.
fn trim_end_whitespace(data: &[u8]) -> &[u8] {
    // Index one past the last byte that is not whitespace; 0 when every
    // byte is whitespace or the slice is empty.
    let keep = data
        .iter()
        .rposition(|&byte| !matches!(byte, b' ' | b'\t' | b'\n' | b'\r'))
        .map_or(0, |last| last + 1);

    &data[..keep]
}
611
#[cfg(test)]
mod tests {
    use super::*;

    /// Both constructors set the `strict` flag as advertised.
    #[test]
    fn test_validator_creation() {
        let validator = PdfValidator::new();
        assert!(!validator.strict);

        let strict_validator = PdfValidator::new_strict();
        assert!(strict_validator.strict);
    }

    /// `is_ok` / `has_errors` for each variant.
    #[test]
    fn test_validation_result() {
        let valid = ValidationResult::Valid;
        assert!(valid.is_ok());
        assert!(!valid.has_errors());

        let warning = ValidationResult::Warning(vec!["test warning".to_string()]);
        assert!(warning.is_ok());
        assert!(!warning.has_errors());

        let error = ValidationResult::Error(vec!["test error".to_string()]);
        assert!(!error.is_ok());
        assert!(error.has_errors());
    }

    /// Empty input is rejected.
    #[test]
    fn test_empty_pdf() {
        let validator = PdfValidator::new();
        let result = validator.validate_pdf(b"");
        assert!(result.has_errors());
    }

    /// A bare header is below the minimum file size.
    #[test]
    fn test_minimal_pdf() {
        let validator = PdfValidator::new();

        // Too small
        let result = validator.validate_pdf(b"%PDF-1.4");
        assert!(result.has_errors());
    }

    /// Input that does not start with `%PDF-` is rejected.
    #[test]
    fn test_invalid_header() {
        let validator = PdfValidator::new();
        let invalid_pdf = b"INVALID HEADER\n%%EOF\n";
        let result = validator.validate_pdf(invalid_pdf);
        assert!(result.has_errors());
    }

    /// Input without a trailing `%%EOF` is rejected.
    #[test]
    fn test_missing_eof() {
        let validator = PdfValidator::new();
        let pdf = b"%PDF-1.4\n1 0 obj\n<< /Type /Catalog >>\nendobj\n";
        let result = validator.validate_pdf(pdf);
        assert!(result.has_errors());
    }

    /// A small but structurally complete document passes validation.
    #[test]
    fn test_basic_valid_pdf() {
        let validator = PdfValidator::new();

        // Create a minimal but valid PDF structure. The `\` continuations
        // strip the indentation, so the offsets below are exact byte
        // positions within the literal.
        let pdf = b"%PDF-1.4\n\
            1 0 obj\n\
            << /Type /Catalog /Pages 2 0 R >>\n\
            endobj\n\
            2 0 obj\n\
            << /Type /Pages /Kids [3 0 R] /Count 1 >>\n\
            endobj\n\
            3 0 obj\n\
            << /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Resources << /Font << /F1 4 0 R >> >> /Contents 5 0 R >>\n\
            endobj\n\
            4 0 obj\n\
            << /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>\n\
            endobj\n\
            5 0 obj\n\
            << /Length 44 >>\n\
            stream\n\
            BT\n/F1 12 Tf\n100 700 Td\n(Hello World) Tj\nET\n\
            endstream\n\
            endobj\n\
            xref\n\
            0 6\n\
            0000000000 65535 f \n\
            0000000009 00000 n \n\
            0000000058 00000 n \n\
            0000000115 00000 n \n\
            0000000241 00000 n \n\
            0000000311 00000 n \n\
            trailer\n\
            << /Size 6 /Root 1 0 R >>\n\
            startxref\n\
            404\n\
            %%EOF\n";

        // Guard the fixture itself: every offset recorded in the xref table
        // (and the startxref value) must point at the item it names.
        let expected_layout: [(usize, &[u8]); 6] = [
            (9, b"1 0 obj"),
            (58, b"2 0 obj"),
            (115, b"3 0 obj"),
            (241, b"4 0 obj"),
            (311, b"5 0 obj"),
            (404, b"xref"),
        ];
        for &(offset, marker) in expected_layout.iter() {
            assert!(
                pdf[offset..].starts_with(marker),
                "fixture offset {} does not point at {:?}",
                offset,
                String::from_utf8_lossy(marker)
            );
        }

        let result = validator.validate_pdf(pdf);
        assert!(result.is_ok(), "validation failed: {:?}", result);
    }

    /// `find_line_end` returns the index of the first line feed.
    #[test]
    fn test_find_line_end() {
        let data = b"Hello\nWorld";
        assert_eq!(find_line_end(data, 0), Some(5));

        let no_newline = b"Hello";
        assert_eq!(find_line_end(no_newline, 0), None);
    }

    /// Trailing spaces, tabs, CR and LF are removed; other input is untouched.
    #[test]
    fn test_trim_end_whitespace() {
        let data = b"Hello  \n\r\t";
        let trimmed = trim_end_whitespace(data);
        assert_eq!(trimmed, b"Hello");

        let no_whitespace = b"Hello";
        let trimmed2 = trim_end_whitespace(no_whitespace);
        assert_eq!(trimmed2, b"Hello");
    }

    /// Accepted and rejected header lines.
    #[test]
    fn test_check_header() {
        let validator = PdfValidator::new();

        // Valid headers
        assert!(validator.check_header(b"%PDF-1.4\n").is_ok());
        assert!(validator.check_header(b"%PDF-1.7\n").is_ok());
        assert!(validator.check_header(b"%PDF-2.0\n").is_ok());

        // Invalid headers
        assert!(validator.check_header(b"PDF-1.4\n").is_err());
        assert!(validator.check_header(b"%PDF-\n").is_err());
    }

    /// Size warnings for tiny input, none for ordinary input.
    #[test]
    fn test_check_size() {
        let validator = PdfValidator::new();

        // Too small
        let small = vec![0u8; 50];
        assert!(validator.check_size(&small).is_some());

        // Normal size
        let normal = vec![0u8; 1024];
        assert!(validator.check_size(&normal).is_none());

        // Very large (would test but don't want to allocate 100MB in test)
    }
}
772
773#[cfg(test)]
774mod tests_extended {
775    use super::*;
776
777    // ── ValidationResult API ─────────────────────────────────────────────────
778
779    #[test]
780    fn test_valid_is_ok() {
781        assert!(ValidationResult::Valid.is_ok());
782    }
783
784    #[test]
785    fn test_valid_has_no_errors() {
786        assert!(!ValidationResult::Valid.has_errors());
787    }
788
789    #[test]
790    fn test_valid_issues_empty() {
791        assert!(ValidationResult::Valid.issues().is_empty());
792    }
793
794    #[test]
795    fn test_warning_is_ok() {
796        let w = ValidationResult::Warning(vec!["w1".to_string()]);
797        assert!(w.is_ok());
798    }
799
800    #[test]
801    fn test_warning_has_no_errors() {
802        let w = ValidationResult::Warning(vec!["w1".to_string()]);
803        assert!(!w.has_errors());
804    }
805
806    #[test]
807    fn test_warning_issues_returns_messages() {
808        let w = ValidationResult::Warning(vec!["a".to_string(), "b".to_string()]);
809        let issues = w.issues();
810        assert_eq!(issues.len(), 2);
811        assert_eq!(issues[0], "a");
812        assert_eq!(issues[1], "b");
813    }
814
815    #[test]
816    fn test_error_not_ok() {
817        let e = ValidationResult::Error(vec!["bad".to_string()]);
818        assert!(!e.is_ok());
819    }
820
821    #[test]
822    fn test_error_has_errors() {
823        let e = ValidationResult::Error(vec!["bad".to_string()]);
824        assert!(e.has_errors());
825    }
826
827    #[test]
828    fn test_error_issues_returns_messages() {
829        let e = ValidationResult::Error(vec!["e1".to_string(), "e2".to_string()]);
830        let issues = e.issues();
831        assert_eq!(issues.len(), 2);
832        assert_eq!(issues[0], "e1");
833    }
834
835    // ── PdfValidator creation ────────────────────────────────────────────────
836
837    #[test]
838    fn test_default_is_non_strict() {
839        let v = PdfValidator::default();
840        assert!(!v.strict);
841    }
842
843    #[test]
844    fn test_new_strict_is_strict() {
845        let v = PdfValidator::new_strict();
846        assert!(v.strict);
847    }
848
849    // ── Header validation ────────────────────────────────────────────────────
850
851    #[test]
852    fn test_pdf_1_0_header_unsupported() {
853        // %PDF-1.0 is technically valid but our check passes it since it
854        // starts with "%PDF-1."
855        let v = PdfValidator::new();
856        assert!(v.check_header(b"%PDF-1.0\n").is_ok());
857    }
858
859    #[test]
860    fn test_pdf_1_7_header_valid() {
861        let v = PdfValidator::new();
862        assert!(v.check_header(b"%PDF-1.7\n").is_ok());
863    }
864
865    #[test]
866    fn test_pdf_2_0_header_valid() {
867        let v = PdfValidator::new();
868        assert!(v.check_header(b"%PDF-2.0\n").is_ok());
869    }
870
871    #[test]
872    fn test_missing_percent_header_invalid() {
873        let v = PdfValidator::new();
874        assert!(v.check_header(b"PDF-1.4\n").is_err());
875    }
876
877    #[test]
878    fn test_garbage_header_invalid() {
879        let v = PdfValidator::new();
880        assert!(v.check_header(b"garbage\n").is_err());
881    }
882
883    // ── EOF validation ───────────────────────────────────────────────────────
884
885    #[test]
886    fn test_check_eof_present() {
887        let v = PdfValidator::new();
888        // check_eof only looks at the bytes passed to it
889        let data = b"...content...%%EOF";
890        assert!(v.check_eof(data).is_ok());
891    }
892
893    #[test]
894    fn test_check_eof_missing() {
895        let v = PdfValidator::new();
896        let data = b"...content...no-eof-marker";
897        assert!(v.check_eof(data).is_err());
898    }
899
900    #[test]
901    fn test_check_eof_with_trailing_whitespace() {
902        let v = PdfValidator::new();
903        let data = b"%%EOF\n\r\n";
904        assert!(v.check_eof(data).is_ok());
905    }
906
907    // ── Size validation ──────────────────────────────────────────────────────
908
909    #[test]
910    fn test_tiny_pdf_triggers_size_warning() {
911        let v = PdfValidator::new();
912        let tiny = vec![0u8; 10];
913        assert!(v.check_size(&tiny).is_some());
914    }
915
916    #[test]
917    fn test_reasonable_size_no_warning() {
918        let v = PdfValidator::new();
919        let normal = vec![0u8; 4096];
920        assert!(v.check_size(&normal).is_none());
921    }
922
923    // ── Full validation — well-formed minimal PDF ────────────────────────────
924
925    /// Build a minimal well-formed PDF bytes string for validation tests.
926    ///
927    /// We build each object as a separate `Vec<u8>` so that Rust string
928    /// continuation (`\` at end of line) does NOT add whitespace and the
929    /// byte offsets we store in the xref table are exact.
930    fn minimal_valid_pdf() -> Vec<u8> {
931        let obj1: &[u8] = b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n";
932        let obj2: &[u8] = b"2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n";
933        let obj3: &[u8] = b"3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Resources << /Font << /F1 4 0 R >> >> >>\nendobj\n";
934        let obj4: &[u8] =
935            b"4 0 obj\n<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>\nendobj\n";
936
937        let header: &[u8] = b"%PDF-1.4\n";
938        let o1_off = header.len(); // 9
939        let o2_off = o1_off + obj1.len(); // 58
940        let o3_off = o2_off + obj2.len(); // 115
941        let o4_off = o3_off + obj3.len(); // 225
942        let xref_off = o4_off + obj4.len(); // 295
943
944        let xref = format!(
945            "xref\n             0 5\n             {o0:010} 65535 f \n             {o1:010} 00000 n \n             {o2:010} 00000 n \n             {o3:010} 00000 n \n             {o4:010} 00000 n \n             trailer\n             << /Size 5 /Root 1 0 R >>\n             startxref\n             {xref_off}\n             %%EOF\n",
946            o0 = 0,
947            o1 = o1_off,
948            o2 = o2_off,
949            o3 = o3_off,
950            o4 = o4_off,
951            xref_off = xref_off,
952        );
953
954        let mut pdf = Vec::new();
955        pdf.extend_from_slice(header);
956        pdf.extend_from_slice(obj1);
957        pdf.extend_from_slice(obj2);
958        pdf.extend_from_slice(obj3);
959        pdf.extend_from_slice(obj4);
960        pdf.extend_from_slice(xref.as_bytes());
961        pdf
962    }
963
964    #[test]
965    fn test_valid_pdf_passes_validation() {
966        let v = PdfValidator::new();
967        let result = v.validate_pdf(&minimal_valid_pdf());
968        assert!(result.is_ok(), "validation failed: {:?}", result);
969    }
970
971    #[test]
972    fn test_invalid_pdf_too_short() {
973        let v = PdfValidator::new();
974        let result = v.validate_pdf(b"%PDF-1");
975        assert!(result.has_errors());
976    }
977
978    #[test]
979    fn test_invalid_pdf_no_header() {
980        let v = PdfValidator::new();
981        let result = v.validate_pdf(b"JUNK JUNK JUNK JUNK JUNK\n%%EOF\n");
982        assert!(result.has_errors());
983    }
984
985    #[test]
986    fn test_invalid_pdf_no_eof() {
987        let v = PdfValidator::new();
988        let result = v.validate_pdf(b"%PDF-1.4\nsome content without eof marker");
989        assert!(result.has_errors());
990    }
991
992    // ── Strict mode ──────────────────────────────────────────────────────────
993
994    #[test]
995    fn test_strict_mode_treats_warnings_as_errors() {
996        // A PDF that is structurally OK but triggers a warning (no /Info dict).
997        // In strict mode warnings become errors.
998        let v_strict = PdfValidator::new_strict();
999        let pdf = minimal_valid_pdf(); // The minimal PDF has no /Info → warning
1000        let result = v_strict.validate_pdf(&pdf);
1001        // Strict mode: warning(s) → error
1002        // The minimal PDF will produce at least the /Info warning plus
1003        // the /Outlines warning → should be Error in strict mode.
1004        assert!(
1005            result.has_errors(),
1006            "strict mode should have errors (warnings promoted): {:?}",
1007            result
1008        );
1009    }
1010
1011    #[test]
1012    fn test_non_strict_warnings_not_errors() {
1013        let v = PdfValidator::new();
1014        let pdf = minimal_valid_pdf();
1015        let result = v.validate_pdf(&pdf);
1016        // Non-strict: warnings are ok, not errors
1017        assert!(result.is_ok(), "non-strict should be ok: {:?}", result);
1018    }
1019
1020    // ── trim_end_whitespace helper ───────────────────────────────────────────
1021
1022    #[test]
1023    fn test_trim_end_whitespace_empty() {
1024        assert_eq!(trim_end_whitespace(b""), b"");
1025    }
1026
1027    #[test]
1028    fn test_trim_end_whitespace_only_spaces() {
1029        assert_eq!(trim_end_whitespace(b"   "), b"");
1030    }
1031
1032    #[test]
1033    fn test_trim_end_whitespace_preserves_content() {
1034        assert_eq!(trim_end_whitespace(b"abc\n\r"), b"abc");
1035    }
1036
1037    // ── find_line_end helper ─────────────────────────────────────────────────
1038
1039    #[test]
1040    fn test_find_line_end_at_start() {
1041        assert_eq!(find_line_end(b"\nrest", 0), Some(0));
1042    }
1043
1044    #[test]
1045    fn test_find_line_end_mid_string() {
1046        assert_eq!(find_line_end(b"ab\ncd", 0), Some(2));
1047    }
1048
1049    #[test]
1050    fn test_find_line_end_none_without_newline() {
1051        assert_eq!(find_line_end(b"abcdef", 0), None);
1052    }
1053
1054    #[test]
1055    fn test_find_line_end_with_offset() {
1056        // Skip past first newline
1057        assert_eq!(find_line_end(b"a\nb\nc", 2), Some(3));
1058    }
1059}