// pdf_ast/recovery/mod.rs

//! Advanced error recovery and resilient parsing.
//!
//! This module provides sophisticated error recovery mechanisms that allow
//! the parser to continue processing even when encountering malformed or
//! corrupted PDF data, making it suitable for forensic analysis and
//! handling real-world, imperfect PDF documents.
7use crate::ast::{AstNode, AstResult, NodeId, NodeMetadata, NodeType, PdfDocument};
8use crate::parser::PdfParser;
9use crate::types::PdfValue;
10use std::collections::HashMap;
11use std::io::Cursor;
12
13pub mod diagnostics;
14pub mod reconstruction;
15pub mod strategies;
16
17pub use diagnostics::*;
18pub use reconstruction::*;
19pub use strategies::*;
20
21/// Recovery-enabled parser that can handle malformed PDFs
22pub struct RecoveryParser {
23    base_parser: PdfParser,
24    recovery_config: RecoveryConfig,
25    recovery_strategies: Vec<Box<dyn RecoveryStrategy>>,
26    error_log: Vec<RecoveryError>,
27    statistics: RecoveryStatistics,
28}
29
30/// Configuration for error recovery
31#[derive(Debug, Clone)]
32pub struct RecoveryConfig {
33    pub max_errors: usize,
34    pub skip_corrupted_objects: bool,
35    pub attempt_structure_reconstruction: bool,
36    pub use_heuristic_parsing: bool,
37    pub preserve_partial_objects: bool,
38    pub enable_fuzzy_matching: bool,
39    pub recovery_aggressiveness: RecoveryLevel,
40    pub timeout_ms: u64,
41}
42
/// Level of recovery aggressiveness.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RecoveryLevel {
    /// Only fix obvious errors.
    Conservative,
    /// Apply common fixes and heuristics.
    Moderate,
    /// Try all available recovery strategies.
    Aggressive,
    /// Use experimental and risky recovery methods.
    Experimental,
}
51
52impl Default for RecoveryConfig {
53    fn default() -> Self {
54        Self {
55            max_errors: 1000,
56            skip_corrupted_objects: true,
57            attempt_structure_reconstruction: true,
58            use_heuristic_parsing: true,
59            preserve_partial_objects: true,
60            enable_fuzzy_matching: true,
61            recovery_aggressiveness: RecoveryLevel::Moderate,
62            timeout_ms: 60000, // 1 minute
63        }
64    }
65}
66
/// Statistics for recovery operations.
///
/// All counters start at zero (`Default`).
#[derive(Debug, Clone, Default)]
pub struct RecoveryStatistics {
    /// Number of errors logged by the parser.
    pub total_errors_encountered: usize,
    /// Number of strategy applications that reported success.
    pub errors_recovered: usize,
    /// Objects skipped during recovery (not updated in this file).
    pub objects_skipped: usize,
    /// Objects reconstructed during recovery (not updated in this file).
    pub objects_reconstructed: usize,
    /// Heuristic fixes applied (not updated in this file).
    pub heuristic_fixes_applied: usize,
    /// Fuzzy matches made (not updated in this file).
    pub fuzzy_matches: usize,
    /// Wall-clock duration of the most recent recovery run, in milliseconds.
    pub recovery_time_ms: u64,
    /// Ratio of recovered to encountered errors, computed at the end of a
    /// recovery run.
    pub success_rate: f64,
}
79
80/// Error encountered during parsing with recovery context
81#[derive(Debug, Clone)]
82pub struct RecoveryError {
83    pub error_type: RecoveryErrorType,
84    pub location: ErrorLocation,
85    pub original_error: String,
86    pub recovery_attempt: Option<RecoveryAttempt>,
87    pub severity: ErrorSeverity,
88    pub context: ErrorContext,
89}
90
/// Type of recovery error.
///
/// Only `ParseError` is produced in this file (for the failed initial parse).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum RecoveryErrorType {
    ParseError,
    StructuralError,
    ReferenceError,
    StreamError,
    EncodingError,
    IntegrityError,
    UnknownFormat,
}
102
/// Location where an error occurred.
#[derive(Debug, Clone)]
pub struct ErrorLocation {
    /// Byte offset into the input at which the error was observed.
    pub byte_offset: u64,
    /// Line number, when known.
    pub line_number: Option<usize>,
    /// Number of the PDF object involved, when known.
    pub object_number: Option<u32>,
    /// Free-form description of what was being processed.
    pub context_description: String,
}
111
/// Recovery attempt information.
#[derive(Debug, Clone)]
pub struct RecoveryAttempt {
    /// Name of the strategy that was applied.
    pub strategy_used: String,
    /// Whether the attempt succeeded.
    pub success: bool,
    /// Human-readable summary of the outcome.
    pub result_description: String,
    /// Duration of the attempt in milliseconds.
    pub time_taken_ms: u64,
}
120
/// Severity of an error.
///
/// The derived ordering follows declaration order, so
/// `Info < Warning < Error < Critical < Fatal`. Reordering the variants
/// changes comparison results.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum ErrorSeverity {
    Info,
    Warning,
    Error,
    Critical,
    Fatal,
}
130
/// Context information for error recovery.
#[derive(Debug, Clone)]
pub struct ErrorContext {
    /// Raw input bytes captured near the error.
    pub surrounding_data: Vec<u8>,
    /// Names of the enclosing objects (left empty by the code in this file).
    pub object_hierarchy: Vec<String>,
    /// References followed to reach the error (left empty by the code in this file).
    pub reference_chain: Vec<String>,
    /// Free-form key/value hints (left empty by the code in this file).
    pub hints: HashMap<String, String>,
}
139
140impl RecoveryParser {
141    /// Create a new recovery parser
142    pub fn new(config: RecoveryConfig) -> Self {
143        let mut parser = Self {
144            base_parser: PdfParser::new(),
145            recovery_config: config.clone(),
146            recovery_strategies: Vec::new(),
147            error_log: Vec::new(),
148            statistics: RecoveryStatistics::default(),
149        };
150
151        parser.initialize_strategies(&config);
152        parser
153    }
154
155    /// Parse a PDF document with error recovery
156    pub fn parse_with_recovery(&mut self, data: &[u8]) -> AstResult<(PdfDocument, RecoveryReport)> {
157        let start_time = std::time::Instant::now();
158
159        // First attempt normal parsing
160        match self.base_parser.parse(&mut Cursor::new(data)) {
161            Ok(document) => {
162                // Normal parsing succeeded
163                let report = RecoveryReport {
164                    success: true,
165                    errors_encountered: Vec::new(),
166                    recovery_actions: Vec::new(),
167                    statistics: self.statistics.clone(),
168                    final_document_health: DocumentHealth::Healthy,
169                };
170                return Ok((document, report));
171            }
172            Err(initial_error) => {
173                // Normal parsing failed, begin recovery
174                self.log_error(RecoveryError {
175                    error_type: RecoveryErrorType::ParseError,
176                    location: ErrorLocation {
177                        byte_offset: 0,
178                        line_number: None,
179                        object_number: None,
180                        context_description: "Initial parse attempt".to_string(),
181                    },
182                    original_error: format!("{:?}", initial_error),
183                    recovery_attempt: None,
184                    severity: ErrorSeverity::Error,
185                    context: ErrorContext {
186                        surrounding_data: data.get(0..100).unwrap_or(data).to_vec(),
187                        object_hierarchy: Vec::new(),
188                        reference_chain: Vec::new(),
189                        hints: HashMap::new(),
190                    },
191                });
192            }
193        }
194
195        // Begin recovery process
196        let recovery_result = self.attempt_recovery(data)?;
197        let elapsed = start_time.elapsed().as_millis() as u64;
198        self.statistics.recovery_time_ms = elapsed;
199
200        // Calculate success rate
201        if self.statistics.total_errors_encountered > 0 {
202            self.statistics.success_rate = self.statistics.errors_recovered as f64
203                / self.statistics.total_errors_encountered as f64;
204        }
205
206        Ok(recovery_result)
207    }
208
209    /// Attempt recovery using all available strategies
210    fn attempt_recovery(&mut self, data: &[u8]) -> AstResult<(PdfDocument, RecoveryReport)> {
211        let mut document = PdfDocument::new(crate::ast::PdfVersion { major: 1, minor: 4 });
212        let mut recovery_actions = Vec::new();
213        let mut current_data = data.to_vec();
214
215        // Apply recovery strategies in order of preference
216        for strategy in &self.recovery_strategies {
217            let context = RecoveryContext {
218                original_data: data,
219                current_data: &current_data,
220                document: &document,
221                config: &self.recovery_config,
222                error_log: &self.error_log,
223            };
224
225            match strategy.apply_recovery(context) {
226                Ok(result) => {
227                    recovery_actions.push(RecoveryAction {
228                        strategy_name: strategy.name().to_string(),
229                        action_type: result.action_type,
230                        description: result.description,
231                        success: true,
232                        data_modified: result.data_changed,
233                    });
234
235                    if result.data_changed {
236                        current_data = result.modified_data.unwrap_or(current_data);
237                    }
238
239                    if result.document_changed {
240                        if let Some(new_doc) = result.modified_document {
241                            document = new_doc;
242                        }
243                    }
244
245                    self.statistics.errors_recovered += 1;
246                }
247                Err(e) => {
248                    recovery_actions.push(RecoveryAction {
249                        strategy_name: strategy.name().to_string(),
250                        action_type: RecoveryActionType::Failed,
251                        description: format!("Strategy failed: {:?}", e),
252                        success: false,
253                        data_modified: false,
254                    });
255                }
256            }
257        }
258
259        // Final parsing attempt with recovered data
260        let final_document = match self.base_parser.parse(&mut Cursor::new(&current_data)) {
261            Ok(doc) => doc,
262            Err(_) => {
263                // If all recovery failed, return best-effort document
264                self.create_best_effort_document(&current_data)?
265            }
266        };
267
268        let health = self.assess_document_health(&final_document);
269
270        let report = RecoveryReport {
271            success: !recovery_actions.is_empty(),
272            errors_encountered: self.error_log.clone(),
273            recovery_actions,
274            statistics: self.statistics.clone(),
275            final_document_health: health,
276        };
277
278        Ok((final_document, report))
279    }
280
281    /// Initialize recovery strategies based on configuration
282    fn initialize_strategies(&mut self, config: &RecoveryConfig) {
283        // Add strategies based on recovery level
284        match config.recovery_aggressiveness {
285            RecoveryLevel::Conservative => {
286                self.recovery_strategies
287                    .push(Box::new(BasicStructureRecovery::new()));
288                self.recovery_strategies
289                    .push(Box::new(ReferenceRecovery::new()));
290                self.recovery_strategies
291                    .push(Box::new(StructureRepairStrategy::new()));
292            }
293            RecoveryLevel::Moderate => {
294                self.recovery_strategies
295                    .push(Box::new(BasicStructureRecovery::new()));
296                self.recovery_strategies
297                    .push(Box::new(StructureRepairStrategy::new()));
298                self.recovery_strategies
299                    .push(Box::new(XRefRebuildStrategy::new()));
300                self.recovery_strategies
301                    .push(Box::new(ReferenceRecovery::new()));
302                self.recovery_strategies
303                    .push(Box::new(StreamRecovery::new()));
304                self.recovery_strategies
305                    .push(Box::new(StreamRepairStrategy::new()));
306                self.recovery_strategies
307                    .push(Box::new(DataRecoveryStrategy::new()));
308                self.recovery_strategies
309                    .push(Box::new(EncodingRecovery::new()));
310            }
311            RecoveryLevel::Aggressive => {
312                self.recovery_strategies
313                    .push(Box::new(BasicStructureRecovery::new()));
314                self.recovery_strategies
315                    .push(Box::new(StructureRepairStrategy::new()));
316                self.recovery_strategies
317                    .push(Box::new(XRefRebuildStrategy::new()));
318                self.recovery_strategies
319                    .push(Box::new(ReferenceRecovery::new()));
320                self.recovery_strategies
321                    .push(Box::new(StreamRecovery::new()));
322                self.recovery_strategies
323                    .push(Box::new(StreamRepairStrategy::new()));
324                self.recovery_strategies
325                    .push(Box::new(DataRecoveryStrategy::new()));
326                self.recovery_strategies
327                    .push(Box::new(EncodingRecovery::new()));
328                self.recovery_strategies
329                    .push(Box::new(HeuristicRecovery::new()));
330                self.recovery_strategies
331                    .push(Box::new(FuzzyMatchingRecovery::new()));
332            }
333            RecoveryLevel::Experimental => {
334                // Add all strategies including experimental ones
335                self.recovery_strategies
336                    .push(Box::new(BasicStructureRecovery::new()));
337                self.recovery_strategies
338                    .push(Box::new(StructureRepairStrategy::new()));
339                self.recovery_strategies
340                    .push(Box::new(XRefRebuildStrategy::new()));
341                self.recovery_strategies
342                    .push(Box::new(ReferenceRecovery::new()));
343                self.recovery_strategies
344                    .push(Box::new(StreamRecovery::new()));
345                self.recovery_strategies
346                    .push(Box::new(StreamRepairStrategy::new()));
347                self.recovery_strategies
348                    .push(Box::new(DataRecoveryStrategy::new()));
349                self.recovery_strategies
350                    .push(Box::new(EncodingRecovery::new()));
351                self.recovery_strategies
352                    .push(Box::new(HeuristicRecovery::new()));
353                self.recovery_strategies
354                    .push(Box::new(FuzzyMatchingRecovery::new()));
355                self.recovery_strategies
356                    .push(Box::new(ExperimentalRecovery::new()));
357            }
358        }
359    }
360
361    /// Log a recovery error
362    fn log_error(&mut self, error: RecoveryError) {
363        self.statistics.total_errors_encountered += 1;
364        self.error_log.push(error);
365
366        // Limit error log size
367        if self.error_log.len() > self.recovery_config.max_errors {
368            self.error_log.remove(0);
369        }
370    }
371
372    /// Create a best-effort document when all recovery fails
373    fn create_best_effort_document(&self, data: &[u8]) -> AstResult<PdfDocument> {
374        let mut document = PdfDocument::new(crate::ast::PdfVersion { major: 1, minor: 4 });
375
376        // Create minimal document structure
377        let catalog_id = document.ast.create_node(
378            NodeType::Catalog,
379            PdfValue::Dictionary({
380                let mut dict = crate::types::PdfDictionary::new();
381                dict.insert(
382                    "Type",
383                    PdfValue::Name(crate::types::PdfName::new("Catalog")),
384                );
385                dict
386            }),
387        );
388        document.ast.set_root(catalog_id);
389
390        // Try to extract any recognizable objects
391        let objects = self.extract_salvageable_objects(data);
392        for object in objects {
393            let node_id = document.ast.create_node(object.node_type, object.value);
394            // Link to catalog if possible
395            document
396                .ast
397                .add_edge(catalog_id, node_id, crate::ast::EdgeType::Child);
398        }
399
400        Ok(document)
401    }
402
403    /// Extract any objects that can be salvaged from corrupted data
404    fn extract_salvageable_objects(&self, data: &[u8]) -> Vec<AstNode> {
405        let mut objects = Vec::new();
406        let mut pos = 0;
407
408        while pos < data.len() {
409            // Look for object markers
410            if let Some(obj_start) = self.find_object_start(&data[pos..]) {
411                pos += obj_start;
412
413                if let Some(obj_end) = self.find_object_end(&data[pos..]) {
414                    let obj_data = &data[pos..pos + obj_end];
415
416                    // Try to parse this object
417                    if let Ok(node) = self.parse_partial_object(obj_data) {
418                        objects.push(node);
419                    }
420
421                    pos += obj_end;
422                } else {
423                    break;
424                }
425            } else {
426                break;
427            }
428        }
429
430        objects
431    }
432
433    /// Find the start of a PDF object
434    fn find_object_start(&self, data: &[u8]) -> Option<usize> {
435        // Look for pattern like "123 0 obj"
436        (0..data.len().saturating_sub(6)).find(|&i| {
437            data[i..].starts_with(b" obj") || (i > 0 && data[i - 1..].starts_with(b" obj"))
438        })
439    }
440
441    /// Find the end of a PDF object
442    fn find_object_end(&self, data: &[u8]) -> Option<usize> {
443        // Look for "endobj"
444        for i in 0..data.len().saturating_sub(6) {
445            if data[i..].starts_with(b"endobj") {
446                return Some(i + 6);
447            }
448        }
449        None
450    }
451
452    /// Parse a partial object with lenient rules
453    fn parse_partial_object(&self, data: &[u8]) -> AstResult<AstNode> {
454        // Very simplified object parsing for recovery
455        let node_id = NodeId(rand::random());
456
457        // Try to determine object type from content
458        let node_type = if data.windows(4).any(|w| w == b"Type") {
459            if data.windows(7).any(|w| w == b"Catalog") {
460                NodeType::Catalog
461            } else if data.windows(4).any(|w| w == b"Page") {
462                NodeType::Page
463            } else if data.windows(4).any(|w| w == b"Font") {
464                NodeType::Font
465            } else {
466                NodeType::Other
467            }
468        } else {
469            NodeType::Other
470        };
471
472        Ok(AstNode {
473            id: node_id,
474            node_type,
475            value: PdfValue::String(crate::types::PdfString::new_literal(data)),
476            metadata: NodeMetadata::default(),
477            children: Vec::new(),
478            references: Vec::new(),
479        })
480    }
481
482    /// Assess the health of the recovered document
483    fn assess_document_health(&self, document: &PdfDocument) -> DocumentHealth {
484        let nodes = document.ast.get_all_nodes();
485        let has_catalog = document.ast.get_root().is_some();
486        let error_rate = if self.statistics.total_errors_encountered > 0 {
487            1.0 - self.statistics.success_rate
488        } else {
489            0.0
490        };
491
492        if !has_catalog || nodes.is_empty() {
493            DocumentHealth::SeverelyDamaged
494        } else if error_rate > 0.5 {
495            DocumentHealth::Damaged
496        } else if error_rate > 0.1 {
497            DocumentHealth::PartiallyRecovered
498        } else {
499            DocumentHealth::Healthy
500        }
501    }
502
503    /// Get recovery statistics
504    pub fn get_statistics(&self) -> &RecoveryStatistics {
505        &self.statistics
506    }
507
508    /// Get error log
509    pub fn get_error_log(&self) -> &[RecoveryError] {
510        &self.error_log
511    }
512}
513
514/// Report generated after recovery attempt
515#[derive(Debug, Clone)]
516pub struct RecoveryReport {
517    pub success: bool,
518    pub errors_encountered: Vec<RecoveryError>,
519    pub recovery_actions: Vec<RecoveryAction>,
520    pub statistics: RecoveryStatistics,
521    pub final_document_health: DocumentHealth,
522}
523
524/// Action taken during recovery
525#[derive(Debug, Clone)]
526pub struct RecoveryAction {
527    pub strategy_name: String,
528    pub action_type: RecoveryActionType,
529    pub description: String,
530    pub success: bool,
531    pub data_modified: bool,
532}
533
/// Type of recovery action.
///
/// `Failed` is recorded for a strategy that returned an error. The other
/// variants are supplied by the strategies themselves.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum RecoveryActionType {
    StructureRepair,
    ReferenceResolution,
    StreamDecoding,
    EncodingFix,
    HeuristicPatch,
    FuzzyMatch,
    DataReconstruction,
    Failed,
}
546
/// Overall health of the document after recovery, from best to worst.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DocumentHealth {
    /// Parsed cleanly, or recovered with a low share of unrecovered errors.
    Healthy,
    /// Recovered, with a moderate share of unrecovered errors.
    PartiallyRecovered,
    /// Recovered, with a high share of unrecovered errors.
    Damaged,
    /// The document has no root or no nodes at all.
    SeverelyDamaged,
}
555
556/// Parse a PDF with automatic error recovery
557pub fn parse_with_automatic_recovery(data: &[u8]) -> AstResult<(PdfDocument, RecoveryReport)> {
558    let mut parser = RecoveryParser::new(RecoveryConfig::default());
559    parser.parse_with_recovery(data)
560}