1use crate::ast::{AstNode, AstResult, NodeId, NodeMetadata, NodeType, PdfDocument};
8use crate::parser::PdfParser;
9use crate::types::PdfValue;
10use std::collections::HashMap;
11use std::io::Cursor;
12
13pub mod diagnostics;
14pub mod reconstruction;
15pub mod strategies;
16
17pub use diagnostics::*;
18pub use reconstruction::*;
19pub use strategies::*;
20
21pub struct RecoveryParser {
23 base_parser: PdfParser,
24 recovery_config: RecoveryConfig,
25 recovery_strategies: Vec<Box<dyn RecoveryStrategy>>,
26 error_log: Vec<RecoveryError>,
27 statistics: RecoveryStatistics,
28}
29
30#[derive(Debug, Clone)]
32pub struct RecoveryConfig {
33 pub max_errors: usize,
34 pub skip_corrupted_objects: bool,
35 pub attempt_structure_reconstruction: bool,
36 pub use_heuristic_parsing: bool,
37 pub preserve_partial_objects: bool,
38 pub enable_fuzzy_matching: bool,
39 pub recovery_aggressiveness: RecoveryLevel,
40 pub timeout_ms: u64,
41}
42
/// How many recovery strategies the parser is allowed to use.
///
/// The parser picks its strategy list from this value when it is constructed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RecoveryLevel {
    /// Smallest strategy set.
    Conservative,
    /// Default level (see `RecoveryConfig::default`).
    Moderate,
    /// Adds heuristic and fuzzy-matching strategies to the moderate set.
    Aggressive,
    /// Adds the experimental strategy on top of the aggressive set.
    Experimental,
}
51
52impl Default for RecoveryConfig {
53 fn default() -> Self {
54 Self {
55 max_errors: 1000,
56 skip_corrupted_objects: true,
57 attempt_structure_reconstruction: true,
58 use_heuristic_parsing: true,
59 preserve_partial_objects: true,
60 enable_fuzzy_matching: true,
61 recovery_aggressiveness: RecoveryLevel::Moderate,
62 timeout_ms: 60000, }
64 }
65}
66
/// Counters describing the work done by a recovery parser.
///
/// All fields start at zero via `Default`.
#[derive(Debug, Clone, Default)]
pub struct RecoveryStatistics {
    /// Incremented once for every error written to the error log.
    pub total_errors_encountered: usize,
    /// Incremented once for every strategy that reports success.
    pub errors_recovered: usize,
    /// Not updated in this file; presumably maintained by strategies.
    pub objects_skipped: usize,
    /// Not updated in this file; presumably maintained by strategies.
    pub objects_reconstructed: usize,
    /// Not updated in this file; presumably maintained by strategies.
    pub heuristic_fixes_applied: usize,
    /// Not updated in this file; presumably maintained by strategies.
    pub fuzzy_matches: usize,
    /// Wall-clock duration of the most recent recovery run, in milliseconds.
    pub recovery_time_ms: u64,
    /// Ratio of recovered errors to encountered errors, as computed by the parser.
    pub success_rate: f64,
}
79
80#[derive(Debug, Clone)]
82pub struct RecoveryError {
83 pub error_type: RecoveryErrorType,
84 pub location: ErrorLocation,
85 pub original_error: String,
86 pub recovery_attempt: Option<RecoveryAttempt>,
87 pub severity: ErrorSeverity,
88 pub context: ErrorContext,
89}
90
/// Category attached to a logged recovery error.
///
/// Only `ParseError` is produced in this file; the other variants are
/// presumably used by strategies or diagnostics.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum RecoveryErrorType {
    /// The parser rejected the input.
    ParseError,
    /// Structural problem (per the variant name).
    StructuralError,
    /// Reference problem (per the variant name).
    ReferenceError,
    /// Stream problem (per the variant name).
    StreamError,
    /// Encoding problem (per the variant name).
    EncodingError,
    /// Integrity problem (per the variant name).
    IntegrityError,
    /// Input format not recognised (per the variant name).
    UnknownFormat,
}
102
/// Position information for a logged recovery error.
#[derive(Debug, Clone)]
pub struct ErrorLocation {
    /// Byte offset into the input (0 for the initial whole-file parse failure).
    pub byte_offset: u64,
    /// Line number, when known.
    pub line_number: Option<usize>,
    /// PDF object number, when known.
    pub object_number: Option<u32>,
    /// Free-form description of what was being processed.
    pub context_description: String,
}
111
/// Outcome of a single recovery attempt attached to a logged error.
#[derive(Debug, Clone)]
pub struct RecoveryAttempt {
    /// Name of the strategy that was tried.
    pub strategy_used: String,
    /// Whether the attempt succeeded.
    pub success: bool,
    /// Human-readable summary of the outcome.
    pub result_description: String,
    /// Duration of the attempt in milliseconds.
    pub time_taken_ms: u64,
}
120
/// Severity of a logged recovery error.
///
/// Variants are declared from least to most severe, so the derived ordering
/// satisfies `Info < Warning < Error < Critical < Fatal`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum ErrorSeverity {
    /// Lowest severity.
    Info,
    /// Above `Info`.
    Warning,
    /// Severity used for the initial parse failure.
    Error,
    /// Above `Error`.
    Critical,
    /// Highest severity.
    Fatal,
}
130
/// Supplementary data captured alongside a logged recovery error.
#[derive(Debug, Clone)]
pub struct ErrorContext {
    /// Raw bytes near the failure (the parser stores up to the first 100 input
    /// bytes for an initial parse failure).
    pub surrounding_data: Vec<u8>,
    /// Object hierarchy entries; left empty by this file.
    pub object_hierarchy: Vec<String>,
    /// Reference chain entries; left empty by this file.
    pub reference_chain: Vec<String>,
    /// Free-form key/value hints; left empty by this file.
    pub hints: HashMap<String, String>,
}
139
140impl RecoveryParser {
141 pub fn new(config: RecoveryConfig) -> Self {
143 let mut parser = Self {
144 base_parser: PdfParser::new(),
145 recovery_config: config.clone(),
146 recovery_strategies: Vec::new(),
147 error_log: Vec::new(),
148 statistics: RecoveryStatistics::default(),
149 };
150
151 parser.initialize_strategies(&config);
152 parser
153 }
154
155 pub fn parse_with_recovery(&mut self, data: &[u8]) -> AstResult<(PdfDocument, RecoveryReport)> {
157 let start_time = std::time::Instant::now();
158
159 match self.base_parser.parse(&mut Cursor::new(data)) {
161 Ok(document) => {
162 let report = RecoveryReport {
164 success: true,
165 errors_encountered: Vec::new(),
166 recovery_actions: Vec::new(),
167 statistics: self.statistics.clone(),
168 final_document_health: DocumentHealth::Healthy,
169 };
170 return Ok((document, report));
171 }
172 Err(initial_error) => {
173 self.log_error(RecoveryError {
175 error_type: RecoveryErrorType::ParseError,
176 location: ErrorLocation {
177 byte_offset: 0,
178 line_number: None,
179 object_number: None,
180 context_description: "Initial parse attempt".to_string(),
181 },
182 original_error: format!("{:?}", initial_error),
183 recovery_attempt: None,
184 severity: ErrorSeverity::Error,
185 context: ErrorContext {
186 surrounding_data: data.get(0..100).unwrap_or(data).to_vec(),
187 object_hierarchy: Vec::new(),
188 reference_chain: Vec::new(),
189 hints: HashMap::new(),
190 },
191 });
192 }
193 }
194
195 let recovery_result = self.attempt_recovery(data)?;
197 let elapsed = start_time.elapsed().as_millis() as u64;
198 self.statistics.recovery_time_ms = elapsed;
199
200 if self.statistics.total_errors_encountered > 0 {
202 self.statistics.success_rate = self.statistics.errors_recovered as f64
203 / self.statistics.total_errors_encountered as f64;
204 }
205
206 Ok(recovery_result)
207 }
208
209 fn attempt_recovery(&mut self, data: &[u8]) -> AstResult<(PdfDocument, RecoveryReport)> {
211 let mut document = PdfDocument::new(crate::ast::PdfVersion { major: 1, minor: 4 });
212 let mut recovery_actions = Vec::new();
213 let mut current_data = data.to_vec();
214
215 for strategy in &self.recovery_strategies {
217 let context = RecoveryContext {
218 original_data: data,
219 current_data: ¤t_data,
220 document: &document,
221 config: &self.recovery_config,
222 error_log: &self.error_log,
223 };
224
225 match strategy.apply_recovery(context) {
226 Ok(result) => {
227 recovery_actions.push(RecoveryAction {
228 strategy_name: strategy.name().to_string(),
229 action_type: result.action_type,
230 description: result.description,
231 success: true,
232 data_modified: result.data_changed,
233 });
234
235 if result.data_changed {
236 current_data = result.modified_data.unwrap_or(current_data);
237 }
238
239 if result.document_changed {
240 if let Some(new_doc) = result.modified_document {
241 document = new_doc;
242 }
243 }
244
245 self.statistics.errors_recovered += 1;
246 }
247 Err(e) => {
248 recovery_actions.push(RecoveryAction {
249 strategy_name: strategy.name().to_string(),
250 action_type: RecoveryActionType::Failed,
251 description: format!("Strategy failed: {:?}", e),
252 success: false,
253 data_modified: false,
254 });
255 }
256 }
257 }
258
259 let final_document = match self.base_parser.parse(&mut Cursor::new(¤t_data)) {
261 Ok(doc) => doc,
262 Err(_) => {
263 self.create_best_effort_document(¤t_data)?
265 }
266 };
267
268 let health = self.assess_document_health(&final_document);
269
270 let report = RecoveryReport {
271 success: !recovery_actions.is_empty(),
272 errors_encountered: self.error_log.clone(),
273 recovery_actions,
274 statistics: self.statistics.clone(),
275 final_document_health: health,
276 };
277
278 Ok((final_document, report))
279 }
280
281 fn initialize_strategies(&mut self, config: &RecoveryConfig) {
283 match config.recovery_aggressiveness {
285 RecoveryLevel::Conservative => {
286 self.recovery_strategies
287 .push(Box::new(BasicStructureRecovery::new()));
288 self.recovery_strategies
289 .push(Box::new(ReferenceRecovery::new()));
290 self.recovery_strategies
291 .push(Box::new(StructureRepairStrategy::new()));
292 }
293 RecoveryLevel::Moderate => {
294 self.recovery_strategies
295 .push(Box::new(BasicStructureRecovery::new()));
296 self.recovery_strategies
297 .push(Box::new(StructureRepairStrategy::new()));
298 self.recovery_strategies
299 .push(Box::new(XRefRebuildStrategy::new()));
300 self.recovery_strategies
301 .push(Box::new(ReferenceRecovery::new()));
302 self.recovery_strategies
303 .push(Box::new(StreamRecovery::new()));
304 self.recovery_strategies
305 .push(Box::new(StreamRepairStrategy::new()));
306 self.recovery_strategies
307 .push(Box::new(DataRecoveryStrategy::new()));
308 self.recovery_strategies
309 .push(Box::new(EncodingRecovery::new()));
310 }
311 RecoveryLevel::Aggressive => {
312 self.recovery_strategies
313 .push(Box::new(BasicStructureRecovery::new()));
314 self.recovery_strategies
315 .push(Box::new(StructureRepairStrategy::new()));
316 self.recovery_strategies
317 .push(Box::new(XRefRebuildStrategy::new()));
318 self.recovery_strategies
319 .push(Box::new(ReferenceRecovery::new()));
320 self.recovery_strategies
321 .push(Box::new(StreamRecovery::new()));
322 self.recovery_strategies
323 .push(Box::new(StreamRepairStrategy::new()));
324 self.recovery_strategies
325 .push(Box::new(DataRecoveryStrategy::new()));
326 self.recovery_strategies
327 .push(Box::new(EncodingRecovery::new()));
328 self.recovery_strategies
329 .push(Box::new(HeuristicRecovery::new()));
330 self.recovery_strategies
331 .push(Box::new(FuzzyMatchingRecovery::new()));
332 }
333 RecoveryLevel::Experimental => {
334 self.recovery_strategies
336 .push(Box::new(BasicStructureRecovery::new()));
337 self.recovery_strategies
338 .push(Box::new(StructureRepairStrategy::new()));
339 self.recovery_strategies
340 .push(Box::new(XRefRebuildStrategy::new()));
341 self.recovery_strategies
342 .push(Box::new(ReferenceRecovery::new()));
343 self.recovery_strategies
344 .push(Box::new(StreamRecovery::new()));
345 self.recovery_strategies
346 .push(Box::new(StreamRepairStrategy::new()));
347 self.recovery_strategies
348 .push(Box::new(DataRecoveryStrategy::new()));
349 self.recovery_strategies
350 .push(Box::new(EncodingRecovery::new()));
351 self.recovery_strategies
352 .push(Box::new(HeuristicRecovery::new()));
353 self.recovery_strategies
354 .push(Box::new(FuzzyMatchingRecovery::new()));
355 self.recovery_strategies
356 .push(Box::new(ExperimentalRecovery::new()));
357 }
358 }
359 }
360
361 fn log_error(&mut self, error: RecoveryError) {
363 self.statistics.total_errors_encountered += 1;
364 self.error_log.push(error);
365
366 if self.error_log.len() > self.recovery_config.max_errors {
368 self.error_log.remove(0);
369 }
370 }
371
372 fn create_best_effort_document(&self, data: &[u8]) -> AstResult<PdfDocument> {
374 let mut document = PdfDocument::new(crate::ast::PdfVersion { major: 1, minor: 4 });
375
376 let catalog_id = document.ast.create_node(
378 NodeType::Catalog,
379 PdfValue::Dictionary({
380 let mut dict = crate::types::PdfDictionary::new();
381 dict.insert(
382 "Type",
383 PdfValue::Name(crate::types::PdfName::new("Catalog")),
384 );
385 dict
386 }),
387 );
388 document.ast.set_root(catalog_id);
389
390 let objects = self.extract_salvageable_objects(data);
392 for object in objects {
393 let node_id = document.ast.create_node(object.node_type, object.value);
394 document
396 .ast
397 .add_edge(catalog_id, node_id, crate::ast::EdgeType::Child);
398 }
399
400 Ok(document)
401 }
402
403 fn extract_salvageable_objects(&self, data: &[u8]) -> Vec<AstNode> {
405 let mut objects = Vec::new();
406 let mut pos = 0;
407
408 while pos < data.len() {
409 if let Some(obj_start) = self.find_object_start(&data[pos..]) {
411 pos += obj_start;
412
413 if let Some(obj_end) = self.find_object_end(&data[pos..]) {
414 let obj_data = &data[pos..pos + obj_end];
415
416 if let Ok(node) = self.parse_partial_object(obj_data) {
418 objects.push(node);
419 }
420
421 pos += obj_end;
422 } else {
423 break;
424 }
425 } else {
426 break;
427 }
428 }
429
430 objects
431 }
432
433 fn find_object_start(&self, data: &[u8]) -> Option<usize> {
435 (0..data.len().saturating_sub(6)).find(|&i| {
437 data[i..].starts_with(b" obj") || (i > 0 && data[i - 1..].starts_with(b" obj"))
438 })
439 }
440
441 fn find_object_end(&self, data: &[u8]) -> Option<usize> {
443 for i in 0..data.len().saturating_sub(6) {
445 if data[i..].starts_with(b"endobj") {
446 return Some(i + 6);
447 }
448 }
449 None
450 }
451
452 fn parse_partial_object(&self, data: &[u8]) -> AstResult<AstNode> {
454 let node_id = NodeId(rand::random());
456
457 let node_type = if data.windows(4).any(|w| w == b"Type") {
459 if data.windows(7).any(|w| w == b"Catalog") {
460 NodeType::Catalog
461 } else if data.windows(4).any(|w| w == b"Page") {
462 NodeType::Page
463 } else if data.windows(4).any(|w| w == b"Font") {
464 NodeType::Font
465 } else {
466 NodeType::Other
467 }
468 } else {
469 NodeType::Other
470 };
471
472 Ok(AstNode {
473 id: node_id,
474 node_type,
475 value: PdfValue::String(crate::types::PdfString::new_literal(data)),
476 metadata: NodeMetadata::default(),
477 children: Vec::new(),
478 references: Vec::new(),
479 })
480 }
481
482 fn assess_document_health(&self, document: &PdfDocument) -> DocumentHealth {
484 let nodes = document.ast.get_all_nodes();
485 let has_catalog = document.ast.get_root().is_some();
486 let error_rate = if self.statistics.total_errors_encountered > 0 {
487 1.0 - self.statistics.success_rate
488 } else {
489 0.0
490 };
491
492 if !has_catalog || nodes.is_empty() {
493 DocumentHealth::SeverelyDamaged
494 } else if error_rate > 0.5 {
495 DocumentHealth::Damaged
496 } else if error_rate > 0.1 {
497 DocumentHealth::PartiallyRecovered
498 } else {
499 DocumentHealth::Healthy
500 }
501 }
502
503 pub fn get_statistics(&self) -> &RecoveryStatistics {
505 &self.statistics
506 }
507
508 pub fn get_error_log(&self) -> &[RecoveryError] {
510 &self.error_log
511 }
512}
513
514#[derive(Debug, Clone)]
516pub struct RecoveryReport {
517 pub success: bool,
518 pub errors_encountered: Vec<RecoveryError>,
519 pub recovery_actions: Vec<RecoveryAction>,
520 pub statistics: RecoveryStatistics,
521 pub final_document_health: DocumentHealth,
522}
523
524#[derive(Debug, Clone)]
526pub struct RecoveryAction {
527 pub strategy_name: String,
528 pub action_type: RecoveryActionType,
529 pub description: String,
530 pub success: bool,
531 pub data_modified: bool,
532}
533
/// Kind of work a recovery action performed.
///
/// Only `Failed` is assigned in this file; the remaining variants are
/// reported by the strategies themselves.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum RecoveryActionType {
    /// Structure repair (per the variant name).
    StructureRepair,
    /// Reference resolution (per the variant name).
    ReferenceResolution,
    /// Stream decoding (per the variant name).
    StreamDecoding,
    /// Encoding fix (per the variant name).
    EncodingFix,
    /// Heuristic patch (per the variant name).
    HeuristicPatch,
    /// Fuzzy match (per the variant name).
    FuzzyMatch,
    /// Data reconstruction (per the variant name).
    DataReconstruction,
    /// The strategy returned an error.
    Failed,
}
546
/// Health grade assigned to a parsed or recovered document.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DocumentHealth {
    /// Clean parse, or an error rate of at most 0.1.
    Healthy,
    /// Error rate above 0.1 and at most 0.5.
    PartiallyRecovered,
    /// Error rate above 0.5.
    Damaged,
    /// The document has no root or no nodes at all.
    SeverelyDamaged,
}
555
556pub fn parse_with_automatic_recovery(data: &[u8]) -> AstResult<(PdfDocument, RecoveryReport)> {
558 let mut parser = RecoveryParser::new(RecoveryConfig::default());
559 parser.parse_with_recovery(data)
560}