use crate::error::{PdfError, Result};
use crate::parser::PdfReader;
use crate::Document;
use std::collections::HashMap;
use std::io::{Read, Seek};
use std::path::Path;
pub mod corruption;
pub mod repair;
pub mod scanner;
pub mod validator;
pub mod xref_recovery;
pub use corruption::{detect_corruption, CorruptionReport, CorruptionType};
pub use repair::{repair_document, RepairResult, RepairStrategy};
pub use scanner::{ObjectScanner, ScanResult};
pub use validator::{validate_pdf, ValidationError, ValidationResult};
pub use xref_recovery::{needs_xref_recovery, recover_xref, XRefRecovery};
/// Settings that steer a PDF recovery attempt.
///
/// [`PdfRecovery`] stores one of these and passes it by reference to
/// `repair_document`. The "Default" values quoted on each field are the ones
/// set by the `Default` implementation.
#[derive(Debug, Clone)]
pub struct RecoveryOptions {
/// Presumably enables more invasive repair heuristics — TODO confirm against
/// the `repair` module. Default: `false`.
pub aggressive_recovery: bool,
/// Presumably allows a result that contains only part of the document —
/// TODO confirm where this is consulted. Default: `true`.
pub partial_content: bool,
/// Presumably the number of errors tolerated before recovery gives up —
/// TODO confirm where this is enforced. Default: `100`.
pub max_errors: usize,
/// Presumably requests rebuilding of the cross-reference table — TODO
/// confirm against the `xref_recovery` module. Default: `true`.
pub rebuild_xref: bool,
/// Presumably requests recovery of embedded files — TODO confirm.
/// Default: `false`.
pub recover_embedded: bool,
/// Presumably skips validation of the recovered output — TODO confirm
/// against the `validator` module. Default: `false`.
pub skip_validation: bool,
/// Memory budget for recovery; presumably a byte count, since the default is
/// `500 * 1024 * 1024` — TODO confirm the unit.
pub memory_limit: usize,
}
/// Baseline configuration: partial content and xref rebuilding are switched
/// on, every other flag is off, the error cap is 100 and the memory limit is
/// `500 * 1024 * 1024`.
impl Default for RecoveryOptions {
    fn default() -> Self {
        // Numeric limits are named so the literal values are explained once.
        const DEFAULT_MAX_ERRORS: usize = 100;
        const DEFAULT_MEMORY_LIMIT: usize = 500 * 1024 * 1024;

        Self {
            // Behaviours that are enabled out of the box.
            partial_content: true,
            rebuild_xref: true,
            // Opt-in behaviours.
            aggressive_recovery: false,
            recover_embedded: false,
            skip_validation: false,
            // Resource limits.
            max_errors: DEFAULT_MAX_ERRORS,
            memory_limit: DEFAULT_MEMORY_LIMIT,
        }
    }
}
/// Chainable setters for [`RecoveryOptions`].
///
/// Every method takes the options by value, overwrites a single field and
/// hands the updated value back, so calls can be strung together starting
/// from `RecoveryOptions::default()`. The return value is `#[must_use]`
/// because discarding it means the setting was silently lost.
impl RecoveryOptions {
    /// Sets the `aggressive_recovery` flag.
    #[must_use]
    pub fn with_aggressive_recovery(mut self, aggressive: bool) -> Self {
        self.aggressive_recovery = aggressive;
        self
    }

    /// Sets the `partial_content` flag.
    #[must_use]
    pub fn with_partial_content(mut self, partial: bool) -> Self {
        self.partial_content = partial;
        self
    }

    /// Sets the `max_errors` limit.
    #[must_use]
    pub fn with_max_errors(mut self, max: usize) -> Self {
        self.max_errors = max;
        self
    }

    /// Sets the `memory_limit` value.
    #[must_use]
    pub fn with_memory_limit(mut self, limit: usize) -> Self {
        self.memory_limit = limit;
        self
    }

    /// Sets the `rebuild_xref` flag.
    #[must_use]
    pub fn with_rebuild_xref(mut self, rebuild: bool) -> Self {
        self.rebuild_xref = rebuild;
        self
    }

    /// Sets the `recover_embedded` flag.
    #[must_use]
    pub fn with_recover_embedded(mut self, recover: bool) -> Self {
        self.recover_embedded = recover;
        self
    }

    /// Sets the `skip_validation` flag.
    #[must_use]
    pub fn with_skip_validation(mut self, skip: bool) -> Self {
        self.skip_validation = skip;
        self
    }
}
/// Stateful driver for recovering content from damaged PDF files.
///
/// Holds the [`RecoveryOptions`] for the session together with the
/// diagnostics gathered so far. Derives `Debug` like the other public types
/// in this module so a session can be logged or inspected in tests.
#[derive(Debug)]
pub struct PdfRecovery {
    /// Settings handed to the repair step.
    options: RecoveryOptions,
    /// Running error tally; reset to zero by `clear_warnings`.
    error_count: usize,
    /// Human-readable diagnostics, in the order they were recorded.
    warnings: Vec<String>,
}
/// Recovery operations and diagnostics bookkeeping for [`PdfRecovery`].
impl PdfRecovery {
    /// Creates a recovery session with the given options, no recorded
    /// warnings and an error count of zero.
    pub fn new(options: RecoveryOptions) -> Self {
        Self {
            options,
            error_count: 0,
            warnings: Vec::new(),
        }
    }

    /// Attempts to produce a [`Document`] from the file at `path`.
    ///
    /// The file is first opened with the regular parser; if that succeeds its
    /// pages are converted directly. Otherwise the parse error is recorded as
    /// a warning, the corruption type is detected, a repair strategy is chosen
    /// for it and `repair_document` is run with this session's options.
    ///
    /// # Errors
    ///
    /// Propagates errors from page-count retrieval, `detect_corruption` and
    /// `repair_document`, and returns `PdfError::InvalidStructure` when the
    /// repair step finishes without a recovered document.
    pub fn recover_document<P: AsRef<Path>>(&mut self, path: P) -> Result<Document> {
        let path = path.as_ref();

        // Fast path: a file the normal parser accepts needs no repair.
        match PdfReader::open_document(path) {
            Ok(doc) => {
                self.warnings
                    .push("Document opened normally, no recovery needed".to_string());
                return self.convert_to_document(doc);
            }
            Err(e) => {
                self.warnings.push(format!("Standard parsing failed: {e}"));
            }
        }

        let corruption = detect_corruption(path)?;
        self.warnings.push(format!(
            "Detected corruption: {:?}",
            corruption.corruption_type
        ));

        let strategy = RepairStrategy::for_corruption(&corruption.corruption_type);
        let repair_result = repair_document(path, strategy, &self.options)?;

        repair_result
            .recovered_document
            .ok_or_else(|| PdfError::InvalidStructure("Failed to recover document".to_string()))
    }

    /// Collects whatever can be salvaged from the file at `path`.
    ///
    /// Object counts come from an [`ObjectScanner`] pass. One page entry is
    /// attempted per estimated page, then metadata is attempted. A page or
    /// metadata failure does not abort the run; it is recorded as a warning
    /// and counted in the session's error tally. The session's warnings are
    /// cloned into the result.
    ///
    /// # Errors
    ///
    /// Returns the scanner's error if the initial scan fails.
    pub fn recover_partial<P: AsRef<Path>>(&mut self, path: P) -> Result<PartialRecovery> {
        let path = path.as_ref();
        let mut partial = PartialRecovery::default();

        let mut scanner = ObjectScanner::new();
        let scan_result = scanner.scan_file(path)?;
        partial.total_objects = scan_result.total_objects;
        partial.recovered_objects = scan_result.valid_objects;

        for page_num in 0..scan_result.estimated_pages {
            match self.recover_page(path, page_num) {
                Ok(content) => {
                    partial.recovered_pages.push(RecoveredPage {
                        page_number: page_num,
                        content,
                        // Placeholder flags: the content is not inspected here.
                        has_text: true,
                        has_images: false,
                    });
                }
                Err(e) => {
                    self.record_error(format!("Failed to recover page {page_num}: {e}"));
                }
            }
        }

        match self.recover_metadata(path) {
            Ok(metadata) => {
                partial.metadata = Some(metadata);
            }
            Err(e) => {
                self.record_error(format!("Failed to recover metadata: {e}"));
            }
        }

        partial.recovery_warnings = self.warnings.clone();
        Ok(partial)
    }

    /// Returns the warnings recorded so far, oldest first.
    pub fn warnings(&self) -> &[String] {
        &self.warnings
    }

    /// Discards all recorded warnings and resets the error count to zero.
    pub fn clear_warnings(&mut self) {
        self.warnings.clear();
        self.error_count = 0;
    }

    /// Builds a fresh [`Document`] with one blank page per readable page of
    /// `pdf_doc`, copying only each page's width and height.
    ///
    /// A page that cannot be read is skipped; the skip is recorded as a
    /// warning and counted as an error instead of being dropped silently.
    /// The error count is not compared against `max_errors` here.
    ///
    /// # Errors
    ///
    /// Returns `PdfError::InvalidStructure` if the page count is unavailable.
    fn convert_to_document<R: Read + Seek>(
        &mut self,
        pdf_doc: crate::parser::PdfDocument<R>,
    ) -> Result<Document> {
        let mut doc = Document::new();
        let page_count = pdf_doc
            .page_count()
            .map_err(|e| PdfError::InvalidStructure(e.to_string()))?;

        for i in 0..page_count {
            match pdf_doc.get_page(i) {
                Ok(page) => {
                    let new_page = crate::Page::new(page.width(), page.height());
                    doc.add_page(new_page);
                }
                Err(e) => {
                    self.record_error(format!("Skipped unreadable page {i}: {e}"));
                }
            }
        }

        Ok(doc)
    }

    /// Records a non-fatal problem: bumps the error tally and stores the
    /// message alongside the other warnings.
    fn record_error(&mut self, message: String) {
        self.error_count += 1;
        self.warnings.push(message);
    }

    /// Placeholder page recovery: ignores the file and returns a fixed
    /// description string containing `page_num`. Never fails at present.
    fn recover_page<P: AsRef<Path>>(&mut self, _path: P, page_num: u32) -> Result<String> {
        Ok(format!("Recovered content for page {page_num}"))
    }

    /// Placeholder metadata recovery: ignores the file and returns a fixed
    /// `Title` plus a `RecoveryDate` holding the current UTC time rendered
    /// with `to_string()`. Never fails at present.
    fn recover_metadata<P: AsRef<Path>>(&mut self, _path: P) -> Result<HashMap<String, String>> {
        let mut metadata = HashMap::new();
        metadata.insert("Title".to_string(), "Recovered Document".to_string());
        metadata.insert("RecoveryDate".to_string(), chrono::Utc::now().to_string());
        Ok(metadata)
    }
}
/// Outcome of `PdfRecovery::recover_partial`: whatever could be salvaged from
/// a file, plus the diagnostics recorded along the way.
#[derive(Debug, Default)]
pub struct PartialRecovery {
/// Pages for which recovery succeeded, in the order they were attempted.
pub recovered_pages: Vec<RecoveredPage>,
/// Copied from the scanner result's `total_objects`.
pub total_objects: usize,
/// Copied from the scanner result's `valid_objects`.
pub recovered_objects: usize,
/// `Some` when metadata recovery succeeded, otherwise `None`.
pub metadata: Option<HashMap<String, String>>,
/// Copy of the recovery session's warnings taken when this result was built.
pub recovery_warnings: Vec<String>,
}
/// A single page salvaged during partial recovery.
#[derive(Debug)]
pub struct RecoveredPage {
/// Page index; `recover_partial` numbers pages from zero.
pub page_number: u32,
/// Recovered content of the page, as text.
pub content: String,
/// Whether the page has text. `recover_partial` currently always sets `true`.
pub has_text: bool,
/// Whether the page has images. `recover_partial` currently always sets `false`.
pub has_images: bool,
}
/// One-shot recovery of the file at `path` using `RecoveryOptions::default()`.
///
/// A throwaway [`PdfRecovery`] session is created for the call, so any
/// warnings it collects are not available to the caller.
///
/// # Errors
///
/// Returns whatever error `PdfRecovery::recover_document` reports.
pub fn quick_recover<P: AsRef<Path>>(path: P) -> Result<Document> {
    PdfRecovery::new(RecoveryOptions::default()).recover_document(path)
}
/// Produces a [`CorruptionReport`] for the file at `path`.
///
/// This is a direct delegation to [`detect_corruption`]; no repair is
/// attempted.
///
/// # Errors
///
/// Returns whatever error `detect_corruption` reports.
pub fn analyze_corruption<P: AsRef<Path>>(path: P) -> Result<CorruptionReport> {
detect_corruption(path)
}
// Unit tests for the recovery module: option defaults and builders,
// `PdfRecovery` bookkeeping, the placeholder page/metadata helpers, the two
// free functions and the plain result types.
#[cfg(test)]
mod tests {
use super::*;
use std::fs;
use tempfile::TempDir;
// Defaults are as expected and two chained setters override them.
#[test]
fn test_recovery_options() {
let options = RecoveryOptions::default();
assert!(!options.aggressive_recovery);
assert!(options.partial_content);
assert_eq!(options.max_errors, 100);
let options = options.with_aggressive_recovery(true).with_max_errors(50);
assert!(options.aggressive_recovery);
assert_eq!(options.max_errors, 50);
}
// A new session starts with no errors and no warnings.
#[test]
fn test_pdf_recovery_creation() {
let recovery = PdfRecovery::new(RecoveryOptions::default());
assert_eq!(recovery.error_count, 0);
assert!(recovery.warnings.is_empty());
}
// `PartialRecovery::default()` is completely empty.
#[test]
fn test_partial_recovery_default() {
let partial = PartialRecovery::default();
assert!(partial.recovered_pages.is_empty());
assert_eq!(partial.total_objects, 0);
assert_eq!(partial.recovered_objects, 0);
assert!(partial.metadata.is_none());
}
// Every builder method is applied; fields without a builder keep defaults.
#[test]
fn test_recovery_options_all_setters() {
let options = RecoveryOptions::default()
.with_aggressive_recovery(true)
.with_partial_content(false)
.with_max_errors(200)
.with_memory_limit(1024 * 1024 * 1024);
assert!(options.aggressive_recovery);
assert!(!options.partial_content);
assert_eq!(options.max_errors, 200);
assert_eq!(options.memory_limit, 1024 * 1024 * 1024);
assert!(options.rebuild_xref);
assert!(!options.recover_embedded);
assert!(!options.skip_validation);
}
// A clone carries the same field values as its source.
#[test]
fn test_recovery_options_clone() {
let options1 = RecoveryOptions::default()
.with_aggressive_recovery(true)
.with_max_errors(50);
let options2 = options1.clone();
assert_eq!(options1.aggressive_recovery, options2.aggressive_recovery);
assert_eq!(options1.max_errors, options2.max_errors);
assert_eq!(options1.memory_limit, options2.memory_limit);
}
// The derived `Debug` output names the type and its fields.
#[test]
fn test_recovery_options_debug() {
let options = RecoveryOptions::default();
let debug_str = format!("{options:?}");
assert!(debug_str.contains("RecoveryOptions"));
assert!(debug_str.contains("aggressive_recovery"));
assert!(debug_str.contains("max_errors"));
}
// `clear_warnings` empties the warning list and zeroes the error count.
#[test]
fn test_pdf_recovery_clear_warnings() {
let mut recovery = PdfRecovery::new(RecoveryOptions::default());
recovery.warnings.push("Warning 1".to_string());
recovery.warnings.push("Warning 2".to_string());
recovery.error_count = 5;
assert_eq!(recovery.warnings.len(), 2);
assert_eq!(recovery.error_count, 5);
recovery.clear_warnings();
assert!(recovery.warnings.is_empty());
assert_eq!(recovery.error_count, 0);
}
// A `RecoveredPage` literal keeps the values it was built with.
#[test]
fn test_recovered_page_creation() {
let page = RecoveredPage {
page_number: 0,
content: "Test content".to_string(),
has_text: true,
has_images: false,
};
assert_eq!(page.page_number, 0);
assert_eq!(page.content, "Test content");
assert!(page.has_text);
assert!(!page.has_images);
}
// The derived `Debug` output of `RecoveredPage` includes the page number.
#[test]
fn test_recovered_page_debug() {
let page = RecoveredPage {
page_number: 5,
content: "Page content".to_string(),
has_text: true,
has_images: true,
};
let debug_str = format!("{page:?}");
assert!(debug_str.contains("RecoveredPage"));
assert!(debug_str.contains("page_number: 5"));
}
// A `PartialRecovery` populated by hand reports what was put into it.
#[test]
fn test_partial_recovery_with_data() {
let mut partial = PartialRecovery::default();
partial.recovered_pages.push(RecoveredPage {
page_number: 0,
content: "Page 1".to_string(),
has_text: true,
has_images: false,
});
partial.recovered_pages.push(RecoveredPage {
page_number: 1,
content: "Page 2".to_string(),
has_text: true,
has_images: true,
});
partial.total_objects = 100;
partial.recovered_objects = 85;
let mut metadata = HashMap::new();
metadata.insert("Title".to_string(), "Test Document".to_string());
metadata.insert("Author".to_string(), "Test Author".to_string());
partial.metadata = Some(metadata);
partial.recovery_warnings.push("Warning 1".to_string());
partial.recovery_warnings.push("Warning 2".to_string());
assert_eq!(partial.recovered_pages.len(), 2);
assert_eq!(partial.total_objects, 100);
assert_eq!(partial.recovered_objects, 85);
assert!(partial.metadata.is_some());
assert_eq!(partial.recovery_warnings.len(), 2);
}
// The derived `Debug` output of `PartialRecovery` includes the counters.
#[test]
fn test_partial_recovery_debug() {
let partial = PartialRecovery {
recovered_pages: vec![],
total_objects: 50,
recovered_objects: 45,
metadata: None,
recovery_warnings: vec!["Test warning".to_string()],
};
let debug_str = format!("{partial:?}");
assert!(debug_str.contains("PartialRecovery"));
assert!(debug_str.contains("total_objects: 50"));
assert!(debug_str.contains("recovered_objects: 45"));
}
// A custom memory limit is stored in the session's options.
#[test]
fn test_recovery_with_memory_limit() {
let options = RecoveryOptions::default().with_memory_limit(1024 * 1024);
let recovery = PdfRecovery::new(options);
assert_eq!(recovery.options.memory_limit, 1024 * 1024);
}
// `warnings()` returns the messages in the order they were pushed.
#[test]
fn test_recovery_warnings_accumulation() {
let mut recovery = PdfRecovery::new(RecoveryOptions::default());
recovery.warnings.push("First warning".to_string());
recovery.warnings.push("Second warning".to_string());
let warnings = recovery.warnings();
assert_eq!(warnings.len(), 2);
assert_eq!(warnings[0], "First warning");
assert_eq!(warnings[1], "Second warning");
}
// All seven fields can be set through a struct literal.
#[test]
fn test_recovery_options_with_all_flags() {
let options = RecoveryOptions {
aggressive_recovery: true,
partial_content: true,
max_errors: 200,
rebuild_xref: true,
recover_embedded: true,
skip_validation: true,
memory_limit: 1024 * 1024 * 1024,
};
assert!(options.aggressive_recovery);
assert!(options.partial_content);
assert_eq!(options.max_errors, 200);
assert!(options.rebuild_xref);
assert!(options.recover_embedded);
assert!(options.skip_validation);
assert_eq!(options.memory_limit, 1024 * 1024 * 1024);
}
// `quick_recover` reports an error for a path that does not exist.
#[test]
fn test_quick_recover_function() {
let temp_dir = TempDir::new().unwrap();
let path = temp_dir.path().join("nonexistent.pdf");
let result = quick_recover(&path);
assert!(result.is_err());
}
// Runs `analyze_corruption` on a file that is not a PDF.
// NOTE(review): both match arms are empty, so this only checks that the call
// does not panic; nothing is asserted about the result.
#[test]
fn test_analyze_corruption_function() {
let temp_dir = TempDir::new().unwrap();
let path = temp_dir.path().join("test.pdf");
fs::write(&path, b"Not a PDF").unwrap();
let result = analyze_corruption(&path);
match result {
Ok(_report) => {
}
Err(_) => {
}
}
}
// The error counter can be incremented and is reset by `clear_warnings`.
#[test]
fn test_recovery_error_count() {
let mut recovery = PdfRecovery::new(RecoveryOptions::default());
assert_eq!(recovery.error_count, 0);
recovery.error_count += 1;
assert_eq!(recovery.error_count, 1);
recovery.error_count += 5;
assert_eq!(recovery.error_count, 6);
recovery.clear_warnings();
assert_eq!(recovery.error_count, 0);
}
// `recover_metadata` returns the fixed title and a `RecoveryDate` entry.
#[test]
fn test_recovery_metadata_extraction() {
let mut recovery = PdfRecovery::new(RecoveryOptions::default());
let temp_dir = TempDir::new().unwrap();
let path = temp_dir.path().join("test.pdf");
fs::write(&path, b"test").unwrap();
let metadata = recovery.recover_metadata(&path).unwrap();
assert!(metadata.contains_key("Title"));
assert!(metadata.contains_key("RecoveryDate"));
assert_eq!(metadata.get("Title").unwrap(), "Recovered Document");
}
// `recover_page` returns its fixed message with the page number embedded.
#[test]
fn test_recovery_page_extraction() {
let mut recovery = PdfRecovery::new(RecoveryOptions::default());
let temp_dir = TempDir::new().unwrap();
let path = temp_dir.path().join("test.pdf");
fs::write(&path, b"test").unwrap();
let content = recovery.recover_page(&path, 0).unwrap();
assert_eq!(content, "Recovered content for page 0");
let content2 = recovery.recover_page(&path, 5).unwrap();
assert_eq!(content2, "Recovered content for page 5");
}
// Every default value, including the `500 * 1024 * 1024` memory limit.
#[test]
fn test_recovery_options_defaults() {
let options = RecoveryOptions::default();
assert!(!options.aggressive_recovery);
assert!(options.partial_content);
assert_eq!(options.max_errors, 100);
assert!(options.rebuild_xref);
assert!(!options.recover_embedded);
assert!(!options.skip_validation);
assert_eq!(options.memory_limit, 500 * 1024 * 1024); }
// `skip_validation` set via struct-update syntax reaches the session.
#[test]
fn test_recovery_with_skip_validation() {
let options = RecoveryOptions {
skip_validation: true,
..Default::default()
};
let recovery = PdfRecovery::new(options);
assert!(recovery.options.skip_validation);
}
// `recover_embedded` set via struct-update syntax reaches the session.
#[test]
fn test_recovery_with_embedded_files() {
let options = RecoveryOptions {
recover_embedded: true,
..Default::default()
};
let recovery = PdfRecovery::new(options);
assert!(recovery.options.recover_embedded);
}
// A `PartialRecovery` built with an empty warning list stays empty.
#[test]
fn test_partial_recovery_empty_warnings() {
let partial = PartialRecovery {
recovered_pages: vec![],
total_objects: 0,
recovered_objects: 0,
metadata: None,
recovery_warnings: vec![],
};
assert!(partial.recovery_warnings.is_empty());
}
// When a setter is called twice in a chain, the last call wins.
#[test]
fn test_recovery_options_chaining() {
let options = RecoveryOptions::default()
.with_aggressive_recovery(true)
.with_partial_content(false)
.with_max_errors(50)
.with_memory_limit(256 * 1024 * 1024)
.with_aggressive_recovery(false);
assert!(!options.aggressive_recovery); assert!(!options.partial_content);
assert_eq!(options.max_errors, 50);
assert_eq!(options.memory_limit, 256 * 1024 * 1024);
}
// Checks that the `RecoveryDate` string contains "20" and "T".
// NOTE(review): the value comes from `chrono::Utc::now().to_string()`; if that
// rendering ends in "UTC", the "T" check is satisfied by the suffix rather
// than by a date/time separator — confirm the intended format.
#[test]
fn test_recovery_metadata_with_dates() {
let mut recovery = PdfRecovery::new(RecoveryOptions::default());
let temp_dir = TempDir::new().unwrap();
let path = temp_dir.path().join("test.pdf");
fs::write(&path, b"test").unwrap();
let metadata = recovery.recover_metadata(&path).unwrap();
let recovery_date = metadata.get("RecoveryDate").unwrap();
assert!(recovery_date.contains("20")); assert!(recovery_date.contains("T")); }
// Pages stay in insertion order; nothing sorts them by `page_number`.
#[test]
fn test_partial_recovery_page_ordering() {
let mut partial = PartialRecovery::default();
partial.recovered_pages.push(RecoveredPage {
page_number: 2,
content: "Page 3".to_string(),
has_text: true,
has_images: false,
});
partial.recovered_pages.push(RecoveredPage {
page_number: 0,
content: "Page 1".to_string(),
has_text: true,
has_images: false,
});
partial.recovered_pages.push(RecoveredPage {
page_number: 1,
content: "Page 2".to_string(),
has_text: false,
has_images: true,
});
assert_eq!(partial.recovered_pages[0].page_number, 2);
assert_eq!(partial.recovered_pages[1].page_number, 0);
assert_eq!(partial.recovered_pages[2].page_number, 1);
}
// The error count can exceed `max_errors`; the test sets it by hand and only
// compares the two values.
#[test]
fn test_recovery_with_max_errors_limit() {
let options = RecoveryOptions::default().with_max_errors(1);
let mut recovery = PdfRecovery::new(options);
recovery.error_count = 2;
assert!(recovery.error_count > recovery.options.max_errors);
}
// A page may be flagged as having both text and images.
#[test]
fn test_recovered_page_mixed_content() {
let page = RecoveredPage {
page_number: 10,
content: "Mixed content with text and images".to_string(),
has_text: true,
has_images: true,
};
assert!(page.has_text);
assert!(page.has_images);
assert_eq!(page.page_number, 10);
}
// `warnings()` hands out a shared slice; once that borrow is no longer used
// the underlying vector can be mutated again.
#[test]
fn test_recovery_warnings_immutable_access() {
let mut recovery = PdfRecovery::new(RecoveryOptions::default());
recovery.warnings.push("Test warning".to_string());
let warnings_ref = recovery.warnings();
assert_eq!(warnings_ref.len(), 1);
assert_eq!(warnings_ref[0], "Test warning");
recovery.warnings.push("Another warning".to_string());
assert_eq!(recovery.warnings.len(), 2);
}
// Derives a recovery percentage and per-kind page counts from a hand-built
// `PartialRecovery` (142 of 150 objects, one text-only and one mixed page).
#[test]
fn test_partial_recovery_statistics() {
let partial = PartialRecovery {
recovered_pages: vec![
RecoveredPage {
page_number: 0,
content: "Page 1".to_string(),
has_text: true,
has_images: false,
},
RecoveredPage {
page_number: 1,
content: "Page 2".to_string(),
has_text: true,
has_images: true,
},
],
total_objects: 150,
recovered_objects: 142,
metadata: Some(HashMap::new()),
recovery_warnings: vec!["Minor issue".to_string()],
};
let recovery_percentage =
(partial.recovered_objects as f64 / partial.total_objects as f64) * 100.0;
assert!(recovery_percentage > 94.0 && recovery_percentage < 95.0);
let text_only_pages = partial
.recovered_pages
.iter()
.filter(|p| p.has_text && !p.has_images)
.count();
assert_eq!(text_only_pages, 1);
let mixed_content_pages = partial
.recovered_pages
.iter()
.filter(|p| p.has_text && p.has_images)
.count();
assert_eq!(mixed_content_pages, 1);
}
}