1use serde::Serialize;
5
6use crate::reconstruct::PageStats;
7
8#[derive(Debug, Serialize)]
9pub struct ConversionReport {
10 pub input: String,
11 pub output: String,
12 pub pages: usize,
13 pub blocks: usize,
14 pub reconstructed_chars: usize,
16 pub baseline_chars: usize,
18 pub coverage_percent: f64,
22 pub two_column_pages: usize,
23 pub page_stats: Vec<PageStats>,
24 pub warnings: Vec<String>,
25}
26
27impl ConversionReport {
28 pub fn build(
29 input: &str,
30 output: &str,
31 page_stats: Vec<PageStats>,
32 blocks: usize,
33 reconstructed_chars: usize,
34 baseline_chars: usize,
35 ) -> Self {
36 let coverage_percent = if baseline_chars == 0 {
37 100.0
38 } else {
39 (reconstructed_chars as f64 / baseline_chars as f64 * 100.0).min(100.0)
40 };
41
42 let mut warnings = Vec::new();
43 if coverage_percent < 95.0 {
44 warnings.push(format!(
45 "reconstructed text covers only {coverage_percent:.1}% of the pdftotext baseline; some content was not captured"
46 ));
47 }
48 for page in &page_stats {
49 if page.chars == 0 {
50 warnings.push(format!(
51 "page {}: no text reconstructed (image-only page, or extraction failure)",
52 page.page
53 ));
54 }
55 }
56
57 Self {
58 input: input.to_string(),
59 output: output.to_string(),
60 pages: page_stats.len(),
61 blocks,
62 reconstructed_chars,
63 baseline_chars,
64 coverage_percent,
65 two_column_pages: page_stats.iter().filter(|page| page.two_column).count(),
66 page_stats,
67 warnings,
68 }
69 }
70
71 pub fn summary(&self) -> String {
72 let mut out = format!(
73 "Pages: {}\nBlocks: {}\nTwo-column pages: {}\nText coverage vs pdftotext: {:.1}% ({} of {} characters)\n",
74 self.pages,
75 self.blocks,
76 self.two_column_pages,
77 self.coverage_percent,
78 self.reconstructed_chars,
79 self.baseline_chars,
80 );
81 if self.warnings.is_empty() {
82 out.push_str("Warnings: none\n");
83 } else {
84 out.push_str("Warnings:\n");
85 for warning in &self.warnings {
86 out.push_str(&format!(" - {warning}\n"));
87 }
88 }
89 out
90 }
91}