1use crate::error::Result;
4use std::fs::File;
5use std::io::{BufReader, Read, Seek, SeekFrom};
6use std::path::Path;
7
/// Categories of structural damage a [`CorruptionReport`] can name.
#[derive(Clone, Debug, PartialEq)]
pub enum CorruptionType {
    /// The leading bytes could not be read or do not start with `%PDF-`.
    InvalidHeader,
    /// A diagnostic mentioning the cross-reference table ("cross-reference" / "xref") was recorded.
    CorruptXRef,
    /// A diagnostic mentioning the `%%EOF` marker was recorded.
    MissingEOF,
    /// Broken object references.
    /// NOTE(review): not assigned by any detection code visible in this file.
    BrokenReferences,
    /// Damaged stream data.
    /// NOTE(review): not assigned by any detection code visible in this file.
    CorruptStreams,
    /// Damaged page tree.
    /// NOTE(review): not assigned by any detection code visible in this file.
    InvalidPageTree,
    /// File ends prematurely.
    /// NOTE(review): not assigned by any detection code visible in this file.
    TruncatedFile,
    /// More than one category applies; the individual categories are listed in the order found.
    Multiple(Vec<CorruptionType>),
    /// Initial value of a report; also used when problems were recorded but none could be
    /// classified.
    Unknown,
}
30
31#[derive(Debug)]
33pub struct CorruptionReport {
34 pub corruption_type: CorruptionType,
36 pub severity: u8,
38 pub errors: Vec<String>,
40 pub recoverable_sections: Vec<RecoverableSection>,
42 pub file_stats: FileStats,
44}
45
46#[derive(Debug)]
48pub struct RecoverableSection {
49 pub section_type: SectionType,
51 pub start_offset: u64,
53 pub end_offset: u64,
55 pub confidence: f32,
57}
58
/// Kind of PDF structure described by a [`RecoverableSection`].
#[derive(Clone, Debug)]
pub enum SectionType {
    /// The `%PDF-` header at the start of the file.
    Header,
    /// The file body.
    Body,
    /// A cross-reference table.
    XRef,
    /// The trailer.
    Trailer,
    /// A page; NOTE(review): the number is presumably a page index or object id — confirm.
    Page(u32),
    /// An object; NOTE(review): the number is presumably the object number — confirm.
    Object(u32),
    /// A stream; NOTE(review): the number is presumably the owning object number — confirm.
    Stream(u32),
}
70
/// Counters collected while scanning a file; all of them default to zero.
#[derive(Default, Debug)]
pub struct FileStats {
    /// Length of the file in bytes.
    pub file_size: u64,
    /// Number of bytes the object scan managed to read.
    pub readable_bytes: u64,
    /// Number of object starts found by the object scan (a heuristic count).
    pub estimated_objects: usize,
    /// Number of objects that looked like pages.
    pub found_pages: usize,
}
83
84pub fn detect_corruption<P: AsRef<Path>>(path: P) -> Result<CorruptionReport> {
86 let mut file = File::open(path)?;
87 let mut reader = BufReader::new(&mut file);
88
89 let file_size = reader.seek(SeekFrom::End(0))?;
90 reader.seek(SeekFrom::Start(0))?;
91
92 let mut report = CorruptionReport {
93 corruption_type: CorruptionType::Unknown,
94 severity: 0,
95 errors: Vec::new(),
96 recoverable_sections: Vec::new(),
97 file_stats: FileStats {
98 file_size,
99 ..Default::default()
100 },
101 };
102
103 if !check_header(&mut reader, &mut report)? {
105 report.corruption_type = CorruptionType::InvalidHeader;
106 report.severity = 10;
107 return Ok(report);
108 }
109
110 check_eof(&mut reader, &mut report)?;
112
113 scan_xref(&mut reader, &mut report)?;
115
116 analyze_objects(&mut reader, &mut report)?;
118
119 determine_corruption_type(&mut report);
121
122 Ok(report)
123}
124
125fn check_header<R: Read + Seek>(reader: &mut R, report: &mut CorruptionReport) -> Result<bool> {
126 let mut header = [0u8; 8];
127 reader.seek(SeekFrom::Start(0))?;
128
129 match reader.read_exact(&mut header) {
130 Ok(_) => {
131 if &header[0..5] == b"%PDF-" {
132 report.recoverable_sections.push(RecoverableSection {
133 section_type: SectionType::Header,
134 start_offset: 0,
135 end_offset: 8,
136 confidence: 1.0,
137 });
138 Ok(true)
139 } else {
140 report.errors.push("Invalid PDF header".to_string());
141 Ok(false)
142 }
143 }
144 Err(e) => {
145 report.errors.push(format!("Cannot read header: {e}"));
146 Ok(false)
147 }
148 }
149}
150
151fn check_eof<R: Read + Seek>(reader: &mut R, report: &mut CorruptionReport) -> Result<()> {
152 let check_size = 1024.min(report.file_stats.file_size);
154 let start_pos = report.file_stats.file_size.saturating_sub(check_size);
155
156 reader.seek(SeekFrom::Start(start_pos))?;
157 let mut buffer = vec![0u8; check_size as usize];
158 reader.read_exact(&mut buffer)?;
159
160 if !buffer.windows(5).any(|w| w == b"%%EOF") {
161 report.errors.push("Missing %%EOF marker".to_string());
162 report.severity = report.severity.max(5);
163 }
164
165 if report.errors.is_empty() && report.severity == 0 {
167 report
168 .errors
169 .push("PDF structure analysis complete".to_string());
170 }
171
172 Ok(())
173}
174
175fn scan_xref<R: Read + Seek>(reader: &mut R, report: &mut CorruptionReport) -> Result<()> {
176 reader.seek(SeekFrom::Start(0))?;
177 let mut buffer = Vec::new();
178 reader.read_to_end(&mut buffer)?;
179
180 let mut xref_count = 0;
182 let mut pos = 0;
183
184 while let Some(xref_pos) = find_pattern(&buffer[pos..], b"xref") {
185 let absolute_pos = pos + xref_pos;
186 xref_count += 1;
187
188 report.recoverable_sections.push(RecoverableSection {
189 section_type: SectionType::XRef,
190 start_offset: absolute_pos as u64,
191 end_offset: (absolute_pos + 100) as u64, confidence: 0.8,
193 });
194
195 pos = absolute_pos + 4;
196 }
197
198 if xref_count == 0 {
199 report
200 .errors
201 .push("No cross-reference tables found".to_string());
202 report.severity = report.severity.max(8);
203 }
204
205 Ok(())
206}
207
208fn analyze_objects<R: Read + Seek>(reader: &mut R, report: &mut CorruptionReport) -> Result<()> {
209 reader.seek(SeekFrom::Start(0))?;
210 let mut buffer = Vec::new();
211 reader.read_to_end(&mut buffer)?;
212
213 let mut object_count = 0;
215 let mut page_count = 0;
216 let mut pos = 0;
217
218 while pos < buffer.len() {
220 if let Some(obj_pos) = find_pattern(&buffer[pos..], b" obj") {
221 let absolute_pos = pos + obj_pos;
222 object_count += 1;
223
224 let check_end = (absolute_pos + 200).min(buffer.len());
226 if find_pattern(&buffer[absolute_pos..check_end], b"/Type /Page").is_some() {
227 page_count += 1;
228 }
229
230 pos = absolute_pos + 4;
231 } else {
232 break;
233 }
234 }
235
236 report.file_stats.estimated_objects = object_count;
237 report.file_stats.found_pages = page_count;
238 report.file_stats.readable_bytes = buffer.len() as u64;
239
240 if object_count == 0 {
241 report.errors.push("No PDF objects found".to_string());
242 report.severity = 10;
243 }
244
245 Ok(())
246}
247
248fn determine_corruption_type(report: &mut CorruptionReport) {
249 let mut types = Vec::new();
250
251 for error in &report.errors {
252 if error.contains("header") {
253 types.push(CorruptionType::InvalidHeader);
254 } else if error.contains("EOF") {
255 types.push(CorruptionType::MissingEOF);
256 } else if error.contains("cross-reference") || error.contains("xref") {
257 types.push(CorruptionType::CorruptXRef);
258 }
259 }
260
261 if types.is_empty() && report.severity > 0 {
262 report.corruption_type = CorruptionType::Unknown;
263 } else if types.len() == 1 {
264 if let Some(corruption_type) = types.into_iter().next() {
266 report.corruption_type = corruption_type;
267 }
268 } else if types.len() > 1 {
269 report.corruption_type = CorruptionType::Multiple(types);
270 }
271}
272
/// Returns the index of the first occurrence of `needle` in `haystack`.
///
/// An empty `needle` matches at index 0, even in an empty `haystack`. `None` is returned
/// when `needle` does not occur, which includes every `needle` longer than `haystack`.
fn find_pattern(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    // Last index at which a full-length match could still begin; `None` if the needle is longer.
    let last_start = haystack.len().checked_sub(needle.len())?;
    (0..=last_start).find(|&start| haystack[start..].starts_with(needle))
}
281
282pub fn is_corrupted<P: AsRef<Path>>(path: P) -> bool {
284 detect_corruption(path)
285 .map(|report| report.severity > 0)
286 .unwrap_or(true)
287}
288
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;
    use std::path::PathBuf;

    /// Builds a report with no findings for a file of `file_size` bytes.
    fn blank_report(file_size: u64) -> CorruptionReport {
        CorruptionReport {
            corruption_type: CorruptionType::Unknown,
            severity: 0,
            errors: Vec::new(),
            recoverable_sections: Vec::new(),
            file_stats: FileStats {
                file_size,
                ..Default::default()
            },
        }
    }

    /// Builds a report that already carries the given severity and messages.
    fn report_with_errors(severity: u8, errors: &[&str]) -> CorruptionReport {
        CorruptionReport {
            severity,
            errors: errors.iter().map(|e| (*e).to_string()).collect(),
            ..blank_report(0)
        }
    }

    /// Path in the temp directory that is unique to this test process and `label`, so that
    /// concurrent test runs do not overwrite each other's files.
    fn unique_temp_path(label: &str) -> PathBuf {
        std::env::temp_dir().join(format!("corruption_{}_{label}.pdf", std::process::id()))
    }

    /// Temporary file that is removed again when the guard is dropped, even if the test fails.
    struct TempPdf(PathBuf);

    impl TempPdf {
        fn create(label: &str, contents: &[u8]) -> Self {
            let path = unique_temp_path(label);
            std::fs::write(&path, contents).expect("temp directory should be writable");
            TempPdf(path)
        }

        fn path(&self) -> &Path {
            &self.0
        }
    }

    impl Drop for TempPdf {
        fn drop(&mut self) {
            let _ = std::fs::remove_file(&self.0);
        }
    }

    #[test]
    fn test_corruption_type() {
        let corruption = CorruptionType::InvalidHeader;
        assert_eq!(corruption, CorruptionType::InvalidHeader);

        let multiple = CorruptionType::Multiple(vec![
            CorruptionType::InvalidHeader,
            CorruptionType::CorruptXRef,
        ]);
        assert!(matches!(multiple, CorruptionType::Multiple(_)));
    }

    #[test]
    fn test_find_pattern() {
        let haystack = b"Hello PDF world";
        assert_eq!(find_pattern(haystack, b"PDF"), Some(6));
        assert_eq!(find_pattern(haystack, b"XYZ"), None);
    }

    #[test]
    fn test_file_stats_default() {
        let stats = FileStats::default();
        assert_eq!(stats.file_size, 0);
        assert_eq!(stats.readable_bytes, 0);
        assert_eq!(stats.estimated_objects, 0);
        assert_eq!(stats.found_pages, 0);
    }

    #[test]
    fn test_corruption_type_debug_clone_eq() {
        let corruption = CorruptionType::InvalidHeader;
        let debug_str = format!("{corruption:?}");
        assert!(debug_str.contains("InvalidHeader"));

        let cloned = corruption.clone();
        assert_eq!(corruption, cloned);

        let variants = vec![
            CorruptionType::InvalidHeader,
            CorruptionType::CorruptXRef,
            CorruptionType::MissingEOF,
            CorruptionType::BrokenReferences,
            CorruptionType::CorruptStreams,
            CorruptionType::InvalidPageTree,
            CorruptionType::TruncatedFile,
            CorruptionType::Unknown,
        ];

        for variant in variants {
            assert!(!format!("{variant:?}").is_empty());
            assert_eq!(variant.clone(), variant);
        }
    }

    #[test]
    fn test_corruption_type_multiple() {
        let types = vec![
            CorruptionType::InvalidHeader,
            CorruptionType::CorruptXRef,
            CorruptionType::MissingEOF,
        ];
        let multiple = CorruptionType::Multiple(types);

        match &multiple {
            CorruptionType::Multiple(inner) => {
                assert_eq!(inner.len(), 3);
                assert_eq!(inner[0], CorruptionType::InvalidHeader);
            }
            _ => panic!("Should be Multiple variant"),
        }
    }

    #[test]
    fn test_section_type_debug_clone() {
        let sections = vec![
            SectionType::Header,
            SectionType::Body,
            SectionType::XRef,
            SectionType::Trailer,
            SectionType::Page(42),
            SectionType::Object(123),
            SectionType::Stream(456),
        ];

        for section in sections {
            let debug_str = format!("{section:?}");
            assert!(!debug_str.is_empty());

            let cloned = section.clone();
            match (section, cloned) {
                (SectionType::Page(n1), SectionType::Page(n2)) => assert_eq!(n1, n2),
                (SectionType::Object(n1), SectionType::Object(n2)) => assert_eq!(n1, n2),
                (SectionType::Stream(n1), SectionType::Stream(n2)) => assert_eq!(n1, n2),
                _ => {}
            }
        }
    }

    #[test]
    fn test_recoverable_section_creation() {
        let section = RecoverableSection {
            section_type: SectionType::Page(1),
            start_offset: 100,
            end_offset: 500,
            confidence: 0.95,
        };

        assert_eq!(section.start_offset, 100);
        assert_eq!(section.end_offset, 500);
        assert_eq!(section.confidence, 0.95);

        let debug_str = format!("{section:?}");
        assert!(debug_str.contains("RecoverableSection"));
    }

    #[test]
    fn test_corruption_report_creation() {
        let report = CorruptionReport {
            corruption_type: CorruptionType::CorruptXRef,
            severity: 7,
            errors: vec!["Error 1".to_string(), "Error 2".to_string()],
            recoverable_sections: vec![RecoverableSection {
                section_type: SectionType::Header,
                start_offset: 0,
                end_offset: 10,
                confidence: 1.0,
            }],
            file_stats: FileStats {
                file_size: 1000,
                readable_bytes: 900,
                estimated_objects: 10,
                found_pages: 3,
            },
        };

        assert_eq!(report.severity, 7);
        assert_eq!(report.errors.len(), 2);
        assert_eq!(report.recoverable_sections.len(), 1);
        assert_eq!(report.file_stats.file_size, 1000);
    }

    #[test]
    fn test_find_pattern_various_cases() {
        // Match at the start, at the end and in the middle.
        assert_eq!(find_pattern(b"xref table", b"xref"), Some(0));
        assert_eq!(find_pattern(b"table xref", b"xref"), Some(6));
        assert_eq!(find_pattern(b"PDF xref table", b"xref"), Some(4));

        // No match, including an empty haystack.
        assert_eq!(find_pattern(b"PDF table", b"xref"), None);
        assert_eq!(find_pattern(b"", b"xref"), None);

        // An empty needle matches immediately.
        assert_eq!(find_pattern(b"test", b""), Some(0));
    }

    #[test]
    fn test_determine_corruption_type_single() {
        let mut report = report_with_errors(5, &["Invalid header found"]);

        determine_corruption_type(&mut report);
        assert_eq!(report.corruption_type, CorruptionType::InvalidHeader);
    }

    #[test]
    fn test_determine_corruption_type_multiple() {
        let mut report = report_with_errors(
            8,
            &[
                "Invalid header",
                "Missing EOF marker",
                "Corrupt cross-reference table",
            ],
        );

        determine_corruption_type(&mut report);
        match report.corruption_type {
            CorruptionType::Multiple(types) => {
                assert_eq!(types.len(), 3);
                assert!(types.contains(&CorruptionType::InvalidHeader));
                assert!(types.contains(&CorruptionType::MissingEOF));
                assert!(types.contains(&CorruptionType::CorruptXRef));
            }
            _ => panic!("Should be Multiple corruption type"),
        }
    }

    #[test]
    fn test_determine_corruption_type_unknown() {
        let mut report = report_with_errors(3, &["Some generic error"]);

        determine_corruption_type(&mut report);
        assert_eq!(report.corruption_type, CorruptionType::Unknown);
    }

    #[test]
    fn test_check_header_valid() {
        let data = b"%PDF-1.7\nrest of content";
        let mut cursor = Cursor::new(data);
        let mut report = blank_report(0);

        let result = check_header(&mut cursor, &mut report).unwrap();
        assert!(result);
        assert_eq!(report.recoverable_sections.len(), 1);
        assert_eq!(report.recoverable_sections[0].confidence, 1.0);
    }

    #[test]
    fn test_check_header_invalid() {
        let data = b"INVALID HEADER\nrest of content";
        let mut cursor = Cursor::new(data);
        let mut report = blank_report(0);

        let result = check_header(&mut cursor, &mut report).unwrap();
        assert!(!result);
        assert!(!report.errors.is_empty());
        assert!(report.errors[0].contains("Invalid PDF header"));
    }

    #[test]
    fn test_check_header_too_short() {
        // Fewer than the eight bytes the header check reads.
        let data = b"PDF";
        let mut cursor = Cursor::new(data);
        let mut report = blank_report(0);

        let result = check_header(&mut cursor, &mut report).unwrap();
        assert!(!result);
        assert!(!report.errors.is_empty());
    }

    #[test]
    fn test_check_eof_present() {
        let data = b"%PDF-1.7\nsome content\n%%EOF\n";
        let mut cursor = Cursor::new(data);
        let mut report = blank_report(data.len() as u64);

        check_eof(&mut cursor, &mut report).unwrap();
        // A clean result is recorded as a single informational entry.
        assert_eq!(report.errors.len(), 1);
        assert!(report.errors[0].contains("analysis complete"));
        assert_eq!(report.severity, 0);
    }

    #[test]
    fn test_check_eof_missing() {
        let data = b"%PDF-1.7\nsome content without eof";
        let mut cursor = Cursor::new(data);
        let mut report = blank_report(data.len() as u64);

        check_eof(&mut cursor, &mut report).unwrap();
        assert!(!report.errors.is_empty());
        assert!(report.errors[0].contains("Missing %%EOF"));
        assert_eq!(report.severity, 5);
    }

    #[test]
    fn test_scan_xref_found() {
        let data = b"%PDF-1.7\nxref\n0 1\n0000000000 65535 f\ntrailer\n";
        let mut cursor = Cursor::new(data);
        let mut report = blank_report(0);

        scan_xref(&mut cursor, &mut report).unwrap();
        assert!(report
            .recoverable_sections
            .iter()
            .any(|s| matches!(s.section_type, SectionType::XRef)));
        assert!(report.errors.is_empty() || !report.errors[0].contains("No cross-reference"));
    }

    #[test]
    fn test_scan_xref_not_found() {
        let data = b"%PDF-1.7\nNo cross reference table here";
        let mut cursor = Cursor::new(data);
        let mut report = blank_report(0);

        scan_xref(&mut cursor, &mut report).unwrap();
        assert!(!report.errors.is_empty());
        assert!(report.errors[0].contains("No cross-reference tables found"));
        assert_eq!(report.severity, 8);
    }

    #[test]
    fn test_analyze_objects_with_pages() {
        let data = b"1 0 obj\n<< /Type /Page >>\nendobj\n2 0 obj\n<< /Type /Catalog >>\nendobj";
        let mut cursor = Cursor::new(data);
        let mut report = blank_report(0);

        analyze_objects(&mut cursor, &mut report).unwrap();
        assert_eq!(report.file_stats.estimated_objects, 2);
        assert_eq!(report.file_stats.found_pages, 1);
        assert_eq!(report.file_stats.readable_bytes, data.len() as u64);
    }

    #[test]
    fn test_analyze_objects_no_objects() {
        let data = b"No PDF items here";
        let mut cursor = Cursor::new(data);
        let mut report = blank_report(0);

        analyze_objects(&mut cursor, &mut report).unwrap();
        assert_eq!(report.file_stats.estimated_objects, 0);
        assert!(!report.errors.is_empty());
        assert!(report.errors[0].contains("No PDF objects"));
        assert_eq!(report.severity, 10);
    }

    #[test]
    fn test_is_corrupted_valid_file() {
        let pdf = TempPdf::create(
            "valid",
            b"%PDF-1.7\n1 0 obj\n<< >>\nendobj\nxref\n0 1\n0000000000 65535 f\ntrailer\n<< >>\nstartxref\n0\n%%EOF",
        );

        // Header, %%EOF, xref and one object are all present, so no check raises the severity.
        assert!(!is_corrupted(pdf.path()));
    }

    #[test]
    fn test_is_corrupted_invalid_file() {
        let pdf = TempPdf::create("invalid", b"This is not a PDF");

        assert!(is_corrupted(pdf.path()));
    }

    #[test]
    fn test_is_corrupted_nonexistent_file() {
        // The path is never created; a file that cannot be opened counts as corrupted.
        let missing = unique_temp_path("nonexistent");

        assert!(is_corrupted(&missing));
    }

    #[test]
    fn test_detect_corruption_comprehensive() {
        // Valid header and one page object, but neither an xref table nor %%EOF.
        let pdf = TempPdf::create(
            "comprehensive",
            b"%PDF-1.7\n1 0 obj\n<< /Type /Page >>\nendobj",
        );

        let report = detect_corruption(pdf.path()).unwrap();
        assert!(report.severity > 0);
        assert!(!report.errors.is_empty());
    }

    #[test]
    fn test_file_stats_debug() {
        let stats = FileStats {
            file_size: 1000,
            readable_bytes: 950,
            estimated_objects: 10,
            found_pages: 3,
        };

        // Field-qualified checks: a bare "10" would already be satisfied by "1000".
        let debug_str = format!("{stats:?}");
        assert!(debug_str.contains("FileStats"));
        assert!(debug_str.contains("file_size: 1000"));
        assert!(debug_str.contains("readable_bytes: 950"));
        assert!(debug_str.contains("estimated_objects: 10"));
        assert!(debug_str.contains("found_pages: 3"));
    }

    #[test]
    fn test_corruption_report_debug() {
        let report = report_with_errors(5, &["Test error"]);

        let debug_str = format!("{report:?}");
        assert!(debug_str.contains("CorruptionReport"));
        assert!(debug_str.contains("Unknown"));
        assert!(debug_str.contains("severity: 5"));
        assert!(debug_str.contains("Test error"));
    }
}