1use super::*;
2use crate::ast::{NodeType, PdfDocument};
3use std::collections::HashMap;
4
5pub struct DocumentDiagnostics {
7 config: DiagnosticsConfig,
8 checkers: Vec<Box<dyn HealthChecker>>,
9}
10
/// Options selecting which diagnostics are performed.
#[derive(Debug, Clone)]
pub struct DiagnosticsConfig {
    /// Requests deep analysis.
    // NOTE(review): no code visible in this file reads this flag — confirm
    // whether it is consumed elsewhere.
    pub deep_analysis: bool,
    /// Enables the integrity pass and registers the integrity checker.
    pub check_integrity: bool,
    /// Enables the structural analysis pass.
    pub analyze_structure: bool,
    /// Enables the object-reference validation pass.
    pub validate_references: bool,
    /// Enables the stream pass and registers the stream checker.
    pub check_streams: bool,
    /// Time budget in milliseconds.
    // NOTE(review): not enforced by any code visible in this file.
    pub timeout_ms: u64,
}
21
22impl Default for DiagnosticsConfig {
23 fn default() -> Self {
24 Self {
25 deep_analysis: true,
26 check_integrity: true,
27 analyze_structure: true,
28 validate_references: true,
29 check_streams: true,
30 timeout_ms: 30000, }
32 }
33}
34
35#[derive(Debug, Clone)]
37pub struct HealthReport {
38 pub overall_health: DocumentHealth,
39 pub structure_health: StructureHealth,
40 pub integrity_score: f64,
41 pub corruption_indicators: Vec<CorruptionIndicator>,
42 pub recommendations: Vec<Recommendation>,
43 pub detailed_findings: HashMap<String, Finding>,
44 pub statistics: DiagnosticStatistics,
45}
46
/// Structural facts gathered about a document and its raw bytes.
#[derive(Debug, Clone)]
pub struct StructureHealth {
    /// The raw data begins with `%PDF-`.
    pub has_valid_header: bool,
    /// The parsed AST exposes a root node.
    pub has_catalog: bool,
    /// The AST contains at least one `Pages` node.
    pub has_pages_tree: bool,
    /// The keyword `xref` occurs somewhere in the raw data.
    pub has_valid_xref: bool,
    /// The keyword `trailer` occurs somewhere in the raw data.
    pub has_trailer: bool,
    /// Fraction of object references considered resolvable.
    pub reference_integrity: f64,
    /// Fraction of located streams not flagged as corrupted.
    pub stream_integrity: f64,
}
58
59#[derive(Debug, Clone)]
61pub struct CorruptionIndicator {
62 pub indicator_type: CorruptionType,
63 pub severity: ErrorSeverity,
64 pub location: String,
65 pub description: String,
66 pub confidence: f64,
67}
68
/// Categories of corruption a [`CorruptionIndicator`] can report.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CorruptionType {
    /// Damage to the basic file structure.
    StructuralDamage,
    /// Damaged data.
    DataCorruption,
    /// Expected components are missing.
    MissingComponents,
    /// Invalid object references.
    InvalidReferences,
    /// Corrupted stream objects.
    StreamCorruption,
    /// Encoding problems.
    EncodingIssues,
    /// An integrity violation.
    IntegrityViolation,
}
80
81#[derive(Debug, Clone)]
83pub struct Recommendation {
84 pub priority: Priority,
85 pub action: RecommendedAction,
86 pub description: String,
87 pub estimated_success_rate: f64,
88}
89
/// Repair strategies a [`Recommendation`] can propose.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum RecommendedAction {
    /// Repair the basic document structure.
    StructureRepair,
    /// Resolve broken object references.
    ReferenceResolution,
    /// Reconstruct corrupted streams.
    StreamReconstruction,
    /// Fix encoding problems.
    EncodingFix,
    /// Attempt general data recovery.
    DataRecovery,
    /// Requires manual intervention.
    ManualIntervention,
}
100
/// Urgency of a [`Recommendation`].
///
/// The derived ordering follows the discriminants, so
/// `Low < Medium < High < Critical`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum Priority {
    /// Lowest urgency.
    Low = 1,
    /// Moderate urgency.
    Medium = 2,
    /// High urgency.
    High = 3,
    /// Highest urgency.
    Critical = 4,
}
109
110#[derive(Debug, Clone)]
112pub struct Finding {
113 pub check_name: String,
114 pub status: CheckStatus,
115 pub details: String,
116 pub metrics: HashMap<String, f64>,
117}
118
/// Result classification of a health check.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CheckStatus {
    /// The check found no problem.
    Passed,
    /// The check found something suspicious.
    Warning,
    /// The check found a problem.
    Failed,
    /// The check could not be completed.
    Error,
    /// The check was not run.
    Skipped,
}
128
/// Counters and timing collected during one analysis run.
#[derive(Debug, Clone, Default)]
pub struct DiagnosticStatistics {
    /// Number of built-in checks executed.
    pub checks_performed: usize,
    /// Number of built-in checks that passed.
    pub checks_passed: usize,
    /// Number of built-in checks that failed.
    pub checks_failed: usize,
    /// Number of warnings generated.
    // NOTE(review): never updated by code visible in this file.
    pub warnings_generated: usize,
    /// Wall-clock duration of the analysis in milliseconds.
    pub analysis_time_ms: u64,
    /// Number of AST nodes in the analysed document.
    pub nodes_analyzed: usize,
    /// Size of the analysed raw data in bytes.
    pub bytes_analyzed: u64,
}
140
141impl DocumentDiagnostics {
142 pub fn new(config: DiagnosticsConfig) -> Self {
144 let mut diagnostics = Self {
145 config: config.clone(),
146 checkers: Vec::new(),
147 };
148
149 diagnostics.initialize_checkers(&config);
150 diagnostics
151 }
152
153 pub fn analyze_health(&self, document: &PdfDocument, data: &[u8]) -> HealthReport {
155 let start_time = std::time::Instant::now();
156 let mut report = HealthReport {
157 overall_health: DocumentHealth::Healthy,
158 structure_health: StructureHealth::default(),
159 integrity_score: 1.0,
160 corruption_indicators: Vec::new(),
161 recommendations: Vec::new(),
162 detailed_findings: HashMap::new(),
163 statistics: DiagnosticStatistics::default(),
164 };
165
166 if self.config.analyze_structure {
168 self.analyze_structure(document, data, &mut report);
169 }
170
171 if self.config.check_integrity {
173 self.check_integrity(document, data, &mut report);
174 }
175
176 if self.config.validate_references {
178 self.validate_references(document, &mut report);
179 }
180
181 if self.config.check_streams {
183 self.check_streams(document, data, &mut report);
184 }
185
186 for checker in &self.checkers {
188 let finding = checker.check_health(document, data);
189 report
190 .detailed_findings
191 .insert(checker.name().to_string(), finding);
192 }
193
194 self.calculate_overall_health(&mut report);
196
197 self.generate_recommendations(&mut report);
199
200 let elapsed = start_time.elapsed().as_millis() as u64;
202 report.statistics.analysis_time_ms = elapsed;
203 report.statistics.nodes_analyzed = document.ast.get_all_nodes().len();
204 report.statistics.bytes_analyzed = data.len() as u64;
205
206 report
207 }
208
209 fn initialize_checkers(&mut self, config: &DiagnosticsConfig) {
211 self.checkers.push(Box::new(HeaderChecker::new()));
212 self.checkers.push(Box::new(StructureChecker::new()));
213 self.checkers.push(Box::new(ReferenceChecker::new()));
214
215 if config.check_streams {
216 self.checkers.push(Box::new(StreamChecker::new()));
217 }
218
219 if config.check_integrity {
220 self.checkers.push(Box::new(IntegrityChecker::new()));
221 }
222 }
223
224 fn analyze_structure(&self, document: &PdfDocument, data: &[u8], report: &mut HealthReport) {
226 let structure = StructureHealth {
227 has_valid_header: data.starts_with(b"%PDF-"),
228 has_catalog: document.ast.get_root().is_some(),
229 has_pages_tree: !document.ast.get_nodes_by_type(NodeType::Pages).is_empty(),
230 has_valid_xref: data.windows(4).any(|w| w == b"xref"),
231 has_trailer: data.windows(7).any(|w| w == b"trailer"),
232 reference_integrity: self.calculate_reference_integrity(document),
233 stream_integrity: self.calculate_stream_integrity(document, data),
234 };
235
236 let passed_checks =
238 if structure.has_valid_header && structure.has_catalog && structure.has_pages_tree {
239 3
240 } else {
241 0
242 };
243
244 report.structure_health = structure;
245 report.statistics.checks_performed += 5; report.statistics.checks_passed += passed_checks;
247 }
248
249 fn check_integrity(&self, document: &PdfDocument, data: &[u8], report: &mut HealthReport) {
251 let mut integrity_issues = 0;
252 let mut total_checks = 0;
253
254 total_checks += 1;
256 if !data.ends_with(b"%%EOF") {
257 integrity_issues += 1;
258 report.corruption_indicators.push(CorruptionIndicator {
259 indicator_type: CorruptionType::StructuralDamage,
260 severity: ErrorSeverity::Warning,
261 location: "End of file".to_string(),
262 description: "Missing or corrupted EOF marker".to_string(),
263 confidence: 0.9,
264 });
265 }
266
267 total_checks += 1;
269 if self.contains_inappropriate_nulls(data) {
270 integrity_issues += 1;
271 report.corruption_indicators.push(CorruptionIndicator {
272 indicator_type: CorruptionType::DataCorruption,
273 severity: ErrorSeverity::Warning,
274 location: "Throughout document".to_string(),
275 description: "Null bytes found in inappropriate locations".to_string(),
276 confidence: 0.7,
277 });
278 }
279
280 total_checks += 1;
282 let expected_objects = self.count_object_declarations(data);
283 let actual_objects = document.ast.get_all_nodes().len();
284 if expected_objects > 0 && actual_objects < expected_objects / 2 {
285 integrity_issues += 1;
286 report.corruption_indicators.push(CorruptionIndicator {
287 indicator_type: CorruptionType::MissingComponents,
288 severity: ErrorSeverity::Error,
289 location: "Object count".to_string(),
290 description: format!(
291 "Expected {} objects, found {}",
292 expected_objects, actual_objects
293 ),
294 confidence: 0.8,
295 });
296 }
297
298 report.integrity_score = if total_checks > 0 {
300 1.0 - (integrity_issues as f64 / total_checks as f64)
301 } else {
302 1.0
303 };
304
305 report.statistics.checks_performed += total_checks;
306 report.statistics.checks_passed += total_checks - integrity_issues;
307 report.statistics.checks_failed += integrity_issues;
308 }
309
310 fn validate_references(&self, document: &PdfDocument, report: &mut HealthReport) {
312 let nodes = document.ast.get_all_nodes();
313 let mut total_refs = 0;
314 let mut broken_refs = 0;
315
316 for node in &nodes {
317 let refs = self.extract_references(&node.value);
318 total_refs += refs.len();
319
320 for reference in refs {
321 if !self.reference_exists(document, &reference) {
322 broken_refs += 1;
323 }
324 }
325 }
326
327 if broken_refs > 0 {
328 report.corruption_indicators.push(CorruptionIndicator {
329 indicator_type: CorruptionType::InvalidReferences,
330 severity: if broken_refs > total_refs / 2 {
331 ErrorSeverity::Critical
332 } else {
333 ErrorSeverity::Warning
334 },
335 location: "Object references".to_string(),
336 description: format!("{} broken references out of {}", broken_refs, total_refs),
337 confidence: 0.95,
338 });
339 }
340
341 report.statistics.checks_performed += 1;
342 if broken_refs == 0 {
343 report.statistics.checks_passed += 1;
344 } else {
345 report.statistics.checks_failed += 1;
346 }
347 }
348
349 fn check_streams(&self, _document: &PdfDocument, data: &[u8], report: &mut HealthReport) {
351 let streams = self.find_streams_in_data(data);
352 let mut corrupted_streams = 0;
353
354 for stream in streams {
355 if self.is_stream_corrupted(&stream) {
356 corrupted_streams += 1;
357 }
358 }
359
360 if corrupted_streams > 0 {
361 report.corruption_indicators.push(CorruptionIndicator {
362 indicator_type: CorruptionType::StreamCorruption,
363 severity: ErrorSeverity::Warning,
364 location: "Stream objects".to_string(),
365 description: format!("{} corrupted streams detected", corrupted_streams),
366 confidence: 0.8,
367 });
368 }
369
370 report.statistics.checks_performed += 1;
371 if corrupted_streams == 0 {
372 report.statistics.checks_passed += 1;
373 } else {
374 report.statistics.checks_failed += 1;
375 }
376 }
377
378 fn calculate_overall_health(&self, report: &mut HealthReport) {
380 let mut health_score = report.integrity_score;
381 let structure = &report.structure_health;
382
383 if !structure.has_valid_header {
385 health_score -= 0.2;
386 }
387 if !structure.has_catalog {
388 health_score -= 0.3;
389 }
390 if !structure.has_pages_tree {
391 health_score -= 0.2;
392 }
393 if !structure.has_valid_xref {
394 health_score -= 0.1;
395 }
396 if !structure.has_trailer {
397 health_score -= 0.1;
398 }
399
400 let critical_count = report
402 .corruption_indicators
403 .iter()
404 .filter(|i| i.severity == ErrorSeverity::Critical)
405 .count();
406 let error_count = report
407 .corruption_indicators
408 .iter()
409 .filter(|i| i.severity == ErrorSeverity::Error)
410 .count();
411
412 health_score -= critical_count as f64 * 0.2;
413 health_score -= error_count as f64 * 0.1;
414
415 report.overall_health = if health_score >= 0.9 {
417 DocumentHealth::Healthy
418 } else if health_score >= 0.7 {
419 DocumentHealth::PartiallyRecovered
420 } else if health_score >= 0.4 {
421 DocumentHealth::Damaged
422 } else {
423 DocumentHealth::SeverelyDamaged
424 };
425 }
426
427 fn generate_recommendations(&self, report: &mut HealthReport) {
429 for indicator in &report.corruption_indicators {
430 let recommendation = match indicator.indicator_type {
431 CorruptionType::StructuralDamage => Recommendation {
432 priority: Priority::High,
433 action: RecommendedAction::StructureRepair,
434 description: "Repair basic PDF structure".to_string(),
435 estimated_success_rate: 0.8,
436 },
437 CorruptionType::InvalidReferences => Recommendation {
438 priority: Priority::Medium,
439 action: RecommendedAction::ReferenceResolution,
440 description: "Fix broken object references".to_string(),
441 estimated_success_rate: 0.7,
442 },
443 CorruptionType::StreamCorruption => Recommendation {
444 priority: Priority::Medium,
445 action: RecommendedAction::StreamReconstruction,
446 description: "Reconstruct corrupted streams".to_string(),
447 estimated_success_rate: 0.6,
448 },
449 CorruptionType::EncodingIssues => Recommendation {
450 priority: Priority::Low,
451 action: RecommendedAction::EncodingFix,
452 description: "Fix text encoding issues".to_string(),
453 estimated_success_rate: 0.9,
454 },
455 _ => Recommendation {
456 priority: Priority::Medium,
457 action: RecommendedAction::DataRecovery,
458 description: "Attempt general data recovery".to_string(),
459 estimated_success_rate: 0.5,
460 },
461 };
462
463 report.recommendations.push(recommendation);
464 }
465
466 report
468 .recommendations
469 .sort_by(|a, b| b.priority.cmp(&a.priority));
470 }
471
472 fn calculate_reference_integrity(&self, document: &PdfDocument) -> f64 {
474 let nodes = document.ast.get_all_nodes();
475 if nodes.is_empty() {
476 return 1.0;
477 }
478
479 let mut total_refs = 0;
480 let mut valid_refs = 0;
481
482 for node in &nodes {
483 let refs = self.extract_references(&node.value);
484 total_refs += refs.len();
485
486 for reference in refs {
487 if self.reference_exists(document, &reference) {
488 valid_refs += 1;
489 }
490 }
491 }
492
493 if total_refs == 0 {
494 1.0
495 } else {
496 valid_refs as f64 / total_refs as f64
497 }
498 }
499
500 fn calculate_stream_integrity(&self, _document: &PdfDocument, data: &[u8]) -> f64 {
501 let streams = self.find_streams_in_data(data);
502 if streams.is_empty() {
503 return 1.0;
504 }
505
506 let mut valid_streams = 0;
507 for stream in &streams {
508 if !self.is_stream_corrupted(stream) {
509 valid_streams += 1;
510 }
511 }
512
513 valid_streams as f64 / streams.len() as f64
514 }
515
516 fn contains_inappropriate_nulls(&self, data: &[u8]) -> bool {
517 let text_regions = self.find_text_regions(data);
519 for region in text_regions {
520 if region.contains(&0u8) {
521 return true;
522 }
523 }
524 false
525 }
526
527 fn count_object_declarations(&self, data: &[u8]) -> usize {
528 let data_str = String::from_utf8_lossy(data);
529 data_str.matches(" obj").count()
530 }
531
532 #[allow(clippy::only_used_in_recursion)]
533 fn extract_references(&self, value: &crate::types::PdfValue) -> Vec<String> {
534 let mut refs = Vec::new();
535
536 match value {
537 crate::types::PdfValue::Reference(r) => {
538 refs.push(format!(
539 "{} {} R",
540 r.object_id().number,
541 r.object_id().generation
542 ));
543 }
544 crate::types::PdfValue::Dictionary(dict) => {
545 for (_, v) in dict.iter() {
546 refs.extend(self.extract_references(v));
547 }
548 }
549 crate::types::PdfValue::Array(arr) => {
550 for v in arr.iter() {
551 refs.extend(self.extract_references(v));
552 }
553 }
554 _ => {}
555 }
556
557 refs
558 }
559
560 fn reference_exists(&self, document: &PdfDocument, _reference: &str) -> bool {
561 !document.ast.get_all_nodes().is_empty()
564 }
565
566 fn find_streams_in_data(&self, data: &[u8]) -> Vec<StreamInfo> {
567 let mut streams = Vec::new();
568 let mut pos = 0;
569
570 while let Some(start) = self.find_pattern(&data[pos..], b"stream") {
571 let abs_start = pos + start;
572 if let Some(end) = self.find_pattern(&data[abs_start..], b"endstream") {
573 let abs_end = abs_start + end;
574 streams.push(StreamInfo {
575 start: abs_start,
576 end: abs_end,
577 data: data[abs_start..abs_end].to_vec(),
578 });
579 pos = abs_end;
580 } else {
581 pos = abs_start + 6;
582 }
583 }
584
585 streams
586 }
587
588 fn is_stream_corrupted(&self, stream: &StreamInfo) -> bool {
589 let data = &stream.data;
591
592 if !data.starts_with(b"stream") {
594 return true;
595 }
596
597 if !data.ends_with(b"endstream") {
599 return true;
600 }
601
602 let content_start = 6; let content_end = data.len() - 9; if content_end > content_start {
606 let content = &data[content_start..content_end];
607 let null_count = content.iter().filter(|&&b| b == 0).count();
609 if null_count > content.len() / 4 {
610 return true;
611 }
612 }
613
614 false
615 }
616
617 fn find_text_regions<'a>(&self, data: &'a [u8]) -> Vec<&'a [u8]> {
618 vec![data] }
622
623 fn find_pattern(&self, data: &[u8], pattern: &[u8]) -> Option<usize> {
624 data.windows(pattern.len())
625 .position(|window| window == pattern)
626 }
627}
628
629impl Default for DocumentDiagnostics {
630 fn default() -> Self {
631 Self::new(DiagnosticsConfig::default())
632 }
633}
634
635impl Default for StructureHealth {
636 fn default() -> Self {
637 Self {
638 has_valid_header: false,
639 has_catalog: false,
640 has_pages_tree: false,
641 has_valid_xref: false,
642 has_trailer: false,
643 reference_integrity: 0.0,
644 stream_integrity: 0.0,
645 }
646 }
647}
648
/// A stream span located in the raw document bytes.
#[derive(Debug, Clone)]
// `start` and `end` are recorded but not read by the code in this file.
#[allow(dead_code)]
struct StreamInfo {
    /// Byte offset in the source buffer where the captured slice begins.
    start: usize,
    /// Byte offset in the source buffer where the captured slice stops
    /// (exclusive).
    end: usize,
    /// Owned copy of the captured bytes.
    data: Vec<u8>,
}
656
657pub trait HealthChecker: Send + Sync {
659 fn name(&self) -> &str;
660 fn check_health(&self, document: &PdfDocument, data: &[u8]) -> Finding;
661}
662
/// Checker that validates the PDF header.
#[derive(Default)]
pub struct HeaderChecker;

impl HeaderChecker {
    /// Creates a header checker.
    pub fn new() -> Self {
        HeaderChecker
    }
}
677
678impl HealthChecker for HeaderChecker {
679 fn name(&self) -> &str {
680 "HeaderChecker"
681 }
682
683 fn check_health(&self, _document: &PdfDocument, data: &[u8]) -> Finding {
684 let has_header = data.starts_with(b"%PDF-");
685 let mut metrics = HashMap::new();
686 metrics.insert("has_header".to_string(), if has_header { 1.0 } else { 0.0 });
687
688 Finding {
689 check_name: "Header Validation".to_string(),
690 status: if has_header {
691 CheckStatus::Passed
692 } else {
693 CheckStatus::Failed
694 },
695 details: if has_header {
696 "Valid PDF header found".to_string()
697 } else {
698 "Missing or invalid PDF header".to_string()
699 },
700 metrics,
701 }
702 }
703}
704
/// Checker that validates the presence of a root and of AST nodes.
#[derive(Default)]
pub struct StructureChecker;

impl StructureChecker {
    /// Creates a structure checker.
    pub fn new() -> Self {
        StructureChecker
    }
}
719
720impl HealthChecker for StructureChecker {
721 fn name(&self) -> &str {
722 "StructureChecker"
723 }
724
725 fn check_health(&self, document: &PdfDocument, _data: &[u8]) -> Finding {
726 let has_root = document.ast.get_root().is_some();
727 let node_count = document.ast.get_all_nodes().len();
728
729 let mut metrics = HashMap::new();
730 metrics.insert("has_root".to_string(), if has_root { 1.0 } else { 0.0 });
731 metrics.insert("node_count".to_string(), node_count as f64);
732
733 let status = if has_root && node_count > 0 {
734 CheckStatus::Passed
735 } else if has_root {
736 CheckStatus::Warning
737 } else {
738 CheckStatus::Failed
739 };
740
741 Finding {
742 check_name: "Structure Validation".to_string(),
743 status,
744 details: format!("Document has {} nodes, root: {}", node_count, has_root),
745 metrics,
746 }
747 }
748}
749
/// Checker that reports on object references.
#[derive(Default)]
pub struct ReferenceChecker;

impl ReferenceChecker {
    /// Creates a reference checker.
    pub fn new() -> Self {
        ReferenceChecker
    }
}
764
765impl HealthChecker for ReferenceChecker {
766 fn name(&self) -> &str {
767 "ReferenceChecker"
768 }
769
770 fn check_health(&self, document: &PdfDocument, _data: &[u8]) -> Finding {
771 let nodes = document.ast.get_all_nodes();
772 let mut metrics = HashMap::new();
773 metrics.insert("total_nodes".to_string(), nodes.len() as f64);
774
775 Finding {
776 check_name: "Reference Validation".to_string(),
777 status: CheckStatus::Passed, details: "Reference integrity check completed".to_string(),
779 metrics,
780 }
781 }
782}
783
/// Checker that reports on stream objects in the raw data.
#[derive(Default)]
pub struct StreamChecker;

impl StreamChecker {
    /// Creates a stream checker.
    pub fn new() -> Self {
        StreamChecker
    }
}
798
799impl HealthChecker for StreamChecker {
800 fn name(&self) -> &str {
801 "StreamChecker"
802 }
803
804 fn check_health(&self, _document: &PdfDocument, data: &[u8]) -> Finding {
805 let stream_count = data.windows(6).filter(|w| *w == b"stream").count();
806 let mut metrics = HashMap::new();
807 metrics.insert("stream_count".to_string(), stream_count as f64);
808
809 Finding {
810 check_name: "Stream Validation".to_string(),
811 status: CheckStatus::Passed,
812 details: format!("Found {} streams", stream_count),
813 metrics,
814 }
815 }
816}
817
/// Checker that looks for the end-of-file marker.
#[derive(Default)]
pub struct IntegrityChecker;

impl IntegrityChecker {
    /// Creates an integrity checker.
    pub fn new() -> Self {
        IntegrityChecker
    }
}
832
833impl HealthChecker for IntegrityChecker {
834 fn name(&self) -> &str {
835 "IntegrityChecker"
836 }
837
838 fn check_health(&self, _document: &PdfDocument, data: &[u8]) -> Finding {
839 let has_eof = data.ends_with(b"%%EOF") || data.ends_with(b"%%EOF\n");
840 let mut metrics = HashMap::new();
841 metrics.insert("has_eof".to_string(), if has_eof { 1.0 } else { 0.0 });
842 metrics.insert("file_size".to_string(), data.len() as f64);
843
844 Finding {
845 check_name: "Integrity Validation".to_string(),
846 status: if has_eof {
847 CheckStatus::Passed
848 } else {
849 CheckStatus::Warning
850 },
851 details: if has_eof {
852 "File integrity appears intact".to_string()
853 } else {
854 "Missing EOF marker - file may be truncated".to_string()
855 },
856 metrics,
857 }
858 }
859}
860
861pub fn quick_health_check(document: &PdfDocument, data: &[u8]) -> DocumentHealth {
863 let diagnostics = DocumentDiagnostics::new(DiagnosticsConfig {
864 deep_analysis: false,
865 ..DiagnosticsConfig::default()
866 });
867
868 let report = diagnostics.analyze_health(document, data);
869 report.overall_health
870}