Skip to main content

bookforge_pdf/
report.rs

//! Conversion fidelity report. The contract from ROADMAP §9b: pages
//! that reconstruct badly are flagged, never hidden.
3
4use serde::Serialize;
5
6use crate::reconstruct::PageStats;
7
/// Fidelity summary of one conversion, comparing the reconstructed text
/// against a raw `pdftotext` baseline.
///
/// Built by `ConversionReport::build`; serializable via `serde`, and
/// renderable as plain text through `summary`.
#[derive(Debug, Serialize)]
pub struct ConversionReport {
    /// Input identifier, copied verbatim from the `input` argument of
    /// `build` (presumably the source PDF path; confirm with the caller).
    pub input: String,
    /// Output identifier, copied verbatim from the `output` argument of
    /// `build` (presumably the generated file path; confirm with the caller).
    pub output: String,
    /// Number of entries in `page_stats`.
    pub pages: usize,
    /// Block count as supplied to `build`.
    pub blocks: usize,
    /// Non-whitespace characters in reconstructed blocks.
    pub reconstructed_chars: usize,
    /// Non-whitespace characters in the raw `pdftotext` baseline.
    pub baseline_chars: usize,
    /// reconstructed/baseline as a percentage, capped at 100 and reported
    /// as 100 when the baseline is empty. Because of the cap, a
    /// reconstruction that captured more text than the baseline (rare)
    /// also reads as 100; compare `reconstructed_chars` with
    /// `baseline_chars` to spot that case. Far below 100 means
    /// reconstruction dropped content and the page list below says where.
    pub coverage_percent: f64,
    /// Number of entries in `page_stats` whose `two_column` flag is set.
    pub two_column_pages: usize,
    /// Per-page statistics, stored exactly as passed to `build`.
    pub page_stats: Vec<PageStats>,
    /// Human-readable warnings: overall coverage below 95%, followed by
    /// one entry per page with no reconstructed text.
    pub warnings: Vec<String>,
}
26
27impl ConversionReport {
28    pub fn build(
29        input: &str,
30        output: &str,
31        page_stats: Vec<PageStats>,
32        blocks: usize,
33        reconstructed_chars: usize,
34        baseline_chars: usize,
35    ) -> Self {
36        let coverage_percent = if baseline_chars == 0 {
37            100.0
38        } else {
39            (reconstructed_chars as f64 / baseline_chars as f64 * 100.0).min(100.0)
40        };
41
42        let mut warnings = Vec::new();
43        if coverage_percent < 95.0 {
44            warnings.push(format!(
45                "reconstructed text covers only {coverage_percent:.1}% of the pdftotext baseline; some content was not captured"
46            ));
47        }
48        for page in &page_stats {
49            if page.chars == 0 {
50                warnings.push(format!(
51                    "page {}: no text reconstructed (image-only page, or extraction failure)",
52                    page.page
53                ));
54            }
55        }
56
57        Self {
58            input: input.to_string(),
59            output: output.to_string(),
60            pages: page_stats.len(),
61            blocks,
62            reconstructed_chars,
63            baseline_chars,
64            coverage_percent,
65            two_column_pages: page_stats.iter().filter(|page| page.two_column).count(),
66            page_stats,
67            warnings,
68        }
69    }
70
71    pub fn summary(&self) -> String {
72        let mut out = format!(
73            "Pages: {}\nBlocks: {}\nTwo-column pages: {}\nText coverage vs pdftotext: {:.1}% ({} of {} characters)\n",
74            self.pages,
75            self.blocks,
76            self.two_column_pages,
77            self.coverage_percent,
78            self.reconstructed_chars,
79            self.baseline_chars,
80        );
81        if self.warnings.is_empty() {
82            out.push_str("Warnings: none\n");
83        } else {
84            out.push_str("Warnings:\n");
85            for warning in &self.warnings {
86                out.push_str(&format!("  - {warning}\n"));
87            }
88        }
89        out
90    }
91}