spdfdiff_cli 0.1.2

use clap::{Parser, Subcommand, ValueEnum};
use diff_core::{DiffConfig, diff_semantic_documents};
use pdf_content::{ContentOp, ContentProgram};
use serde::{Deserialize, Serialize};
use spdfdiff_types::{
    ByteRange, ChangeKind, ChangeSeverity, Diagnostic, DiffDocument, FileRole, ObjectId,
    ParseConfig, PdfDiffError, Provenance, Rect, SemanticChange, SemanticNodeEvidence,
};
use std::collections::BTreeMap;
use std::path::{Path, PathBuf};
use std::process::Command as ProcessCommand;
use std::time::Instant;

// Top-level command-line arguments for the `spdfdiff` binary.
// Plain `//` comments are used here on purpose: clap's derive turns `///`
// doc comments into help text, which would change the CLI output.
#[derive(Debug, Parser)]
#[command(name = "spdfdiff", version, about = "Semantic PDF diff CLI")]
struct Cli {
    // The subcommand chosen by the user; dispatched in `run`.
    #[command(subcommand)]
    command: Command,
}

// Subcommands understood by the CLI.
// Plain `//` comments are used because clap's derive would turn `///` doc
// comments into user-visible help text.
#[derive(Debug, Subcommand)]
enum Command {
    // Diff two PDF files and render the resulting change document.
    Diff {
        // Path of the baseline ("old") PDF.
        old_pdf: PathBuf,
        // Path of the revised ("new") PDF.
        new_pdf: PathBuf,
        // Rendering format of the diff report (JSON by default).
        #[arg(long, value_enum, default_value_t = DiffReportFormat::Json)]
        format: DiffReportFormat,
        // File to write the report to; stdout is used when omitted.
        #[arg(long)]
        output: Option<PathBuf>,
        // Forwarded into `DiffConfig::layout_tolerance_pt`.
        #[arg(long, default_value_t = 2.0)]
        layout_tolerance_pt: f32,
        // When set, the process exits with code 1 if any change was found.
        #[arg(long)]
        fail_on_changes: bool,
    },
    // Parse a single PDF and render an inspection report.
    Inspect {
        // Path of the PDF to inspect.
        file: PathBuf,
        // Rendering format of the report (JSON by default).
        #[arg(long, value_enum, default_value_t = ReportFormat::Json)]
        format: ReportFormat,
        // File to write the report to; stdout is used when omitted.
        #[arg(long)]
        output: Option<PathBuf>,
    },
    // Build the semantic document for a single PDF and render an extraction report.
    Extract {
        // Path of the PDF to extract from.
        file: PathBuf,
        // Rendering format of the report (JSON by default).
        #[arg(long, value_enum, default_value_t = ReportFormat::Json)]
        format: ReportFormat,
        // File to write the report to; stdout is used when omitted.
        #[arg(long)]
        output: Option<PathBuf>,
    },
    // Process every `.pdf` file in a folder and write a JSON corpus report.
    Corpus {
        // Folder that is scanned for `.pdf` files.
        folder: PathBuf,
        // Optional JSON manifest with required files, diff pairs and gate thresholds.
        #[arg(long)]
        manifest: Option<PathBuf>,
        // File the JSON report is written to (mandatory for this subcommand).
        #[arg(long)]
        output: PathBuf,
        // When set, the process exits with code 1 if the manifest gate did not pass.
        #[arg(long)]
        fail_on_gate: bool,
    },
    // Run the synthetic benchmark and write its JSON report.
    Benchmark {
        // Page count handed to the synthetic benchmark.
        #[arg(long, default_value_t = 50)]
        pages: usize,
        // File the JSON report is written to.
        #[arg(long)]
        output: PathBuf,
    },
}

// Report formats selectable through `--format` on the `diff` subcommand.
// (`//` rather than `///` so clap does not pick the text up as value help.)
#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)]
enum DiffReportFormat {
    // Default format for `diff`.
    Json,
    AiJson,
    Md,
    Html,
}

// Report formats selectable through `--format` on the `inspect` and `extract`
// subcommands. (`//` rather than `///` so clap does not use it as value help.)
#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)]
enum ReportFormat {
    // Default format for both subcommands.
    Json,
    Md,
    Html,
}

fn main() {
    let cli = Cli::parse();

    match run(cli) {
        Ok(exit_code) => {
            if exit_code != 0 {
                std::process::exit(exit_code);
            }
        }
        Err(error) => {
            eprintln!("{error}");
            std::process::exit(exit_code_for_error(&error));
        }
    }
}

/// Maps an error to the process exit code reported by the CLI.
///
/// * `4` – internal invariant violation
/// * `3` – unsupported PDF whose message mentions `UNSUPPORTED_ENCRYPTION`
/// * `2` – every other unsupported PDF, resource limit, or invalid input
fn exit_code_for_error(error: &PdfDiffError) -> i32 {
    match error {
        PdfDiffError::InternalInvariant(_) => 4,
        PdfDiffError::UnsupportedPdf(message) => {
            if message.contains("UNSUPPORTED_ENCRYPTION") {
                3
            } else {
                2
            }
        }
        PdfDiffError::ResourceLimitExceeded(_) | PdfDiffError::InvalidInput(_) => 2,
    }
}

fn run(cli: Cli) -> Result<i32, PdfDiffError> {
    match cli.command {
        Command::Diff {
            old_pdf,
            new_pdf,
            format,
            output,
            layout_tolerance_pt,
            fail_on_changes,
        } => {
            let old_bytes = std::fs::read(&old_pdf)
                .map_err(|error| PdfDiffError::InvalidInput(error.to_string()))?;
            let new_bytes = std::fs::read(&new_pdf)
                .map_err(|error| PdfDiffError::InvalidInput(error.to_string()))?;
            let document = diff_pdf_bytes(
                &display_file_name(&old_pdf),
                &old_bytes,
                &display_file_name(&new_pdf),
                &new_bytes,
                DiffConfig {
                    layout_tolerance_pt,
                    ..DiffConfig::default()
                },
            )?;
            let rendered = render_diff(&document, format);
            write_or_print(rendered, output)?;
            return Ok(if fail_on_changes && !document.changes.is_empty() {
                1
            } else {
                0
            });
        }
        Command::Inspect {
            file,
            format,
            output,
        } => {
            let bytes = std::fs::read(&file)
                .map_err(|error| PdfDiffError::InvalidInput(error.to_string()))?;
            let parsed = pdf_core::PdfDocument::parse_with_config(&bytes, ParseConfig::default())?;
            let rendered = render_inspect_report(&display_file_name(&file), &parsed, format);
            write_or_print(rendered, output)?;
        }
        Command::Extract {
            file,
            format,
            output,
        } => {
            let bytes = std::fs::read(&file)
                .map_err(|error| PdfDiffError::InvalidInput(error.to_string()))?;
            let semantic = semantic_document_from_pdf(
                &display_file_name(&file),
                &bytes,
                ParseConfig::default(),
            )?;
            let rendered = render_extract_report(&semantic, format);
            write_or_print(rendered, output)?;
        }
        Command::Corpus {
            folder,
            manifest,
            output,
            fail_on_gate,
        } => {
            let manifest = manifest.as_deref().map(load_corpus_manifest).transpose()?;
            let report =
                build_corpus_report_model(&folder, ParseConfig::default(), manifest.as_ref())?;
            let gate_failed = report.gate.as_ref().is_some_and(|gate| !gate.passed);
            let rendered = to_json_pretty(&report)?;
            std::fs::write(&output, rendered)
                .map_err(|error| PdfDiffError::InvalidInput(error.to_string()))?;
            if fail_on_gate && gate_failed {
                return Ok(1);
            }
        }
        Command::Benchmark { pages, output } => {
            let report = run_synthetic_benchmark(pages)?;
            let rendered = to_json_pretty(&report)?;
            std::fs::write(&output, rendered)
                .map_err(|error| PdfDiffError::InvalidInput(error.to_string()))?;
        }
    }
    Ok(0)
}

/// JSON report produced by the `corpus` subcommand for one folder of PDFs.
#[derive(Debug, Serialize)]
struct CorpusReport {
    /// Display name (final path component) of the scanned folder.
    folder: String,
    /// Number of `.pdf` files discovered in the folder.
    total: usize,
    /// Files for which a semantic document could be built (includes `partial`).
    parsed: usize,
    /// Parsed files that carried at least one diagnostic.
    partial: usize,
    /// Files that could not be read or parsed.
    failed: usize,
    /// Occurrences of each diagnostic code across all parsed files.
    diagnostic_counts: BTreeMap<String, usize>,
    /// Occurrences of each diagnostic code across all successfully diffed pairs.
    diff_diagnostic_counts: BTreeMap<String, usize>,
    /// One entry per diff pair listed in the manifest (empty without a manifest).
    diff_pairs: Vec<CorpusDiffPairReport>,
    /// Gate evaluation; `None` when no manifest was supplied.
    gate: Option<CorpusGateReport>,
    /// Per-file results, in sorted path order.
    files: Vec<CorpusFileReport>,
}

/// Corpus manifest deserialized from JSON; drives diff pairs and the gate.
#[derive(Debug, Deserialize)]
struct CorpusManifest {
    /// Manifest schema version; echoed into the gate report. Required field.
    schema_version: String,
    /// File names that are expected among the discovered files (defaults to empty).
    #[serde(default)]
    required_files: Vec<String>,
    /// File pairs to diff against each other (defaults to empty).
    #[serde(default)]
    diff_pairs: Vec<CorpusManifestDiffPair>,
    /// Gate limits; falls back to `CorpusGateThresholds::default()` when absent.
    #[serde(default)]
    thresholds: CorpusGateThresholds,
}

/// One manifest entry naming two files that should be diffed.
#[derive(Debug, Deserialize)]
struct CorpusManifestDiffPair {
    /// Label copied into the pair's report entry.
    name: String,
    /// Old-side file, resolved by joining it onto the corpus folder.
    old_file: String,
    /// New-side file, resolved by joining it onto the corpus folder.
    new_file: String,
}

/// Limits the corpus gate is evaluated against. Every field is optional in
/// the manifest; missing numeric limits default to zero.
#[derive(Debug, Clone, Copy, Default, Deserialize, Serialize)]
struct CorpusGateThresholds {
    /// Minimum number of parsed files; the check is skipped when `None`.
    #[serde(default)]
    min_parsed_files: Option<usize>,
    /// Highest tolerated number of missing required files.
    #[serde(default)]
    max_missing_required_files: usize,
    /// Highest tolerated number of failed files.
    #[serde(default)]
    max_failed_files: usize,
    /// Highest tolerated number of failed diff pairs.
    #[serde(default)]
    max_failed_diff_pairs: usize,
}

/// Outcome of diffing one manifest pair.
#[derive(Debug, Serialize)]
struct CorpusDiffPairReport {
    /// Pair label from the manifest.
    name: String,
    /// Old-side file name as written in the manifest.
    old_file: String,
    /// New-side file name as written in the manifest.
    new_file: String,
    /// Whether the pair was diffed or failed.
    status: CorpusDiffPairStatus,
    /// Number of changes in the diff; zero for failed pairs.
    changes: usize,
    /// Diagnostic codes attached to the diff; empty for failed pairs.
    diagnostics: Vec<String>,
    /// Failure description; `None` when the pair was diffed.
    error: Option<String>,
}

/// Result state of a corpus diff pair; serialized in `snake_case`.
#[derive(Debug, Serialize)]
#[serde(rename_all = "snake_case")]
enum CorpusDiffPairStatus {
    /// Both files were read and the diff completed.
    Diffed,
    /// A file could not be read or the diff returned an error.
    Failed,
}

/// Result of evaluating the manifest thresholds against a corpus run.
#[derive(Debug, Serialize)]
struct CorpusGateReport {
    /// `schema_version` copied from the manifest.
    manifest_schema_version: String,
    /// `true` when `failures` is empty.
    passed: bool,
    /// The thresholds that were applied.
    thresholds: CorpusGateThresholds,
    /// Required files that were not discovered, sorted.
    missing_required_files: Vec<String>,
    /// One human-readable message per violated threshold.
    failures: Vec<String>,
}

/// Per-file entry of the corpus report.
#[derive(Debug, Serialize)]
struct CorpusFileReport {
    /// Display name (final path component) of the file.
    file: String,
    /// Parse outcome for the file.
    status: CorpusFileStatus,
    /// Number of nodes in the semantic document; zero for failed files.
    extracted_nodes: usize,
    /// Diagnostic codes reported for the file; empty for failed files.
    diagnostics: Vec<String>,
    /// Read or parse error text; `None` unless the file failed.
    error: Option<String>,
}

/// Parse outcome of a corpus file; serialized in `snake_case`.
#[derive(Debug, Serialize)]
#[serde(rename_all = "snake_case")]
enum CorpusFileStatus {
    /// Semantic document built with no diagnostics.
    Parsed,
    /// Semantic document built, but diagnostics were reported.
    Partial,
    /// The file could not be read or parsed.
    Failed,
}

/// Serializable summary used for `inspect` output.
/// NOTE(review): the code that fills this struct is outside this excerpt;
/// field meanings below are read off the names and should be confirmed there.
#[derive(Debug, Serialize)]
struct InspectReport<'a> {
    /// Name of the inspected file (borrowed from the caller).
    file: &'a str,
    /// Presumably the number of parsed PDF objects — confirm.
    object_count: usize,
    /// Presumably the number of parser diagnostics — confirm.
    diagnostic_count: usize,
    /// Presumably the number of content streams on the first page — confirm.
    first_page_streams: usize,
    /// Tagged-structure summary for the document.
    tagged_structure: TaggedStructureReport,
}

/// Serializable summary used for `extract` output.
/// NOTE(review): the code that fills this struct is outside this excerpt;
/// field meanings below are read off the names and should be confirmed there.
#[derive(Debug, Serialize)]
struct ExtractReport<'a> {
    /// Name of the extracted file (borrowed from the caller).
    file: &'a str,
    /// Presumably the number of paragraph nodes — confirm.
    paragraphs: usize,
    /// Presumably the number of detected table candidates — confirm.
    table_candidates: usize,
    /// Presumably the total number of table cells — confirm.
    table_cells: usize,
    /// Per-table details.
    tables: Vec<ExtractTableReport>,
    /// Presumably the number of diagnostics on the semantic document — confirm.
    diagnostic_count: usize,
    /// Tagged-structure summary; optional.
    tagged_structure: Option<TaggedStructureReport>,
}

/// Serializable description of one extracted table.
/// NOTE(review): populated outside this excerpt; confirm field semantics there.
#[derive(Debug, Serialize)]
struct ExtractTableReport {
    /// Identifier of the table's semantic node.
    node_id: String,
    /// Page the table belongs to (index base not visible here — confirm).
    page: usize,
    /// Row count.
    rows: usize,
    /// Column count.
    columns: usize,
    /// Cell texts; presumably outer vector = rows, inner = columns — confirm.
    cells: Vec<Vec<String>>,
    /// Detection confidence (range not visible here — confirm).
    confidence: f32,
}

/// Serializable summary of a document's tagged-PDF structure.
/// NOTE(review): populated outside this excerpt; confirm field semantics there.
#[derive(Debug, Clone, Serialize)]
struct TaggedStructureReport {
    /// Whether a tagged structure was found.
    detected: bool,
    /// Textual reference to the structure root object, if any.
    root_object: Option<String>,
    /// Presumably the number of structure elements — confirm.
    element_count: usize,
    /// Presumably the number of marked-content IDs — confirm.
    mcid_count: usize,
    /// Presumably the number of parent-tree entries — confirm.
    parent_tree_entries: usize,
    /// Structure type names encountered.
    structure_types: Vec<String>,
    /// Diagnostic codes or messages related to the tagged structure.
    diagnostics: Vec<String>,
}

/// JSON report written by the `benchmark` subcommand.
/// NOTE(review): populated by `run_synthetic_benchmark`, which is outside this
/// excerpt; confirm field semantics there.
#[derive(Debug, Serialize)]
struct BenchmarkReport {
    /// Page count of the synthetic benchmark.
    pages: usize,
    /// Total-time budget, in milliseconds per the field name.
    target_total_ms: u128,
    /// Presumably `true` when the total time stayed within the target — confirm.
    under_target: bool,
    /// Per-phase timings.
    timings_ms: BenchmarkTimings,
    /// Peak memory in bytes, when available.
    peak_memory_bytes: Option<u64>,
    /// Free-text note accompanying the memory figure.
    memory_note: String,
    /// Summary of the diff produced during the benchmark.
    summary: spdfdiff_types::DiffSummary,
    /// Diagnostics collected during the benchmark.
    diagnostics: Vec<String>,
}

/// Per-phase benchmark timings.
/// NOTE(review): presumably milliseconds, going by the `timings_ms` field that
/// holds this struct — confirm where the values are measured.
#[derive(Debug, Serialize)]
struct BenchmarkTimings {
    /// Time spent parsing.
    parse: u128,
    /// Time spent on text extraction.
    extract: u128,
    /// Time spent building semantic documents.
    semantic: u128,
    /// Time spent diffing.
    diff: u128,
    /// Time spent rendering the report.
    report: u128,
    /// Overall time.
    total: u128,
}

fn load_corpus_manifest(path: &Path) -> Result<CorpusManifest, PdfDiffError> {
    let bytes =
        std::fs::read(path).map_err(|error| PdfDiffError::InvalidInput(error.to_string()))?;
    serde_json::from_slice(&bytes).map_err(|error| {
        PdfDiffError::InvalidInput(format!(
            "failed to parse corpus manifest {}: {error}",
            path.display()
        ))
    })
}

/// Test-only convenience: builds the corpus report for `folder` without a
/// manifest and returns it as pretty-printed JSON.
#[cfg(test)]
fn build_corpus_report(folder: &Path, config: ParseConfig) -> Result<String, PdfDiffError> {
    let report = build_corpus_report_model(folder, config, None)?;
    to_json_pretty(&report)
}

fn build_corpus_report_model(
    folder: &Path,
    config: ParseConfig,
    manifest: Option<&CorpusManifest>,
) -> Result<CorpusReport, PdfDiffError> {
    let mut paths = Vec::new();
    for entry in
        std::fs::read_dir(folder).map_err(|error| PdfDiffError::InvalidInput(error.to_string()))?
    {
        let entry = entry.map_err(|error| PdfDiffError::InvalidInput(error.to_string()))?;
        let path = entry.path();
        if path.extension().and_then(|ext| ext.to_str()) == Some("pdf") {
            paths.push(path);
        }
    }
    paths.sort();

    let total = paths.len();
    let mut parsed = 0usize;
    let mut failed = 0usize;
    let mut partial = 0usize;
    let mut files = Vec::new();
    let mut diagnostic_counts = BTreeMap::new();
    let discovered_files = paths
        .iter()
        .map(|path| display_file_name(path))
        .collect::<Vec<_>>();
    for path in paths {
        let file = display_file_name(&path);
        match std::fs::read(&path) {
            Ok(bytes) => match semantic_document_from_pdf(&file, &bytes, config) {
                Ok(document) => {
                    let diagnostics = document
                        .diagnostics
                        .iter()
                        .map(|diagnostic| diagnostic.code.clone())
                        .collect::<Vec<_>>();
                    for code in &diagnostics {
                        *diagnostic_counts.entry(code.clone()).or_insert(0) += 1;
                    }
                    parsed += 1;
                    let status = if diagnostics.is_empty() {
                        CorpusFileStatus::Parsed
                    } else {
                        partial += 1;
                        CorpusFileStatus::Partial
                    };
                    files.push(CorpusFileReport {
                        file,
                        status,
                        extracted_nodes: document.nodes.len(),
                        diagnostics,
                        error: None,
                    });
                }
                Err(error) => {
                    failed += 1;
                    files.push(CorpusFileReport {
                        file,
                        status: CorpusFileStatus::Failed,
                        extracted_nodes: 0,
                        diagnostics: Vec::new(),
                        error: Some(error.to_string()),
                    });
                }
            },
            Err(error) => {
                failed += 1;
                files.push(CorpusFileReport {
                    file,
                    status: CorpusFileStatus::Failed,
                    extracted_nodes: 0,
                    diagnostics: Vec::new(),
                    error: Some(error.to_string()),
                });
            }
        }
    }

    let (diff_pairs, diff_diagnostic_counts) = build_corpus_diff_pair_reports(folder, manifest)?;
    let gate = manifest.map(|manifest| {
        build_corpus_gate_report(manifest, &discovered_files, parsed, failed, &diff_pairs)
    });

    Ok(CorpusReport {
        folder: display_file_name(folder),
        total,
        parsed,
        partial,
        failed,
        diagnostic_counts,
        diff_diagnostic_counts,
        diff_pairs,
        gate,
        files,
    })
}

fn build_corpus_diff_pair_reports(
    folder: &Path,
    manifest: Option<&CorpusManifest>,
) -> Result<(Vec<CorpusDiffPairReport>, BTreeMap<String, usize>), PdfDiffError> {
    let Some(manifest) = manifest else {
        return Ok((Vec::new(), BTreeMap::new()));
    };
    let mut reports = Vec::new();
    let mut diagnostic_counts = BTreeMap::new();
    for pair in &manifest.diff_pairs {
        let old_path = folder.join(&pair.old_file);
        let new_path = folder.join(&pair.new_file);
        let report = match (std::fs::read(&old_path), std::fs::read(&new_path)) {
            (Ok(old_bytes), Ok(new_bytes)) => match diff_pdf_bytes(
                &pair.old_file,
                &old_bytes,
                &pair.new_file,
                &new_bytes,
                DiffConfig::default(),
            ) {
                Ok(document) => {
                    let diagnostics = document
                        .diagnostics
                        .iter()
                        .map(|diagnostic| diagnostic.code.clone())
                        .collect::<Vec<_>>();
                    for code in &diagnostics {
                        *diagnostic_counts.entry(code.clone()).or_insert(0) += 1;
                    }
                    CorpusDiffPairReport {
                        name: pair.name.clone(),
                        old_file: pair.old_file.clone(),
                        new_file: pair.new_file.clone(),
                        status: CorpusDiffPairStatus::Diffed,
                        changes: document.changes.len(),
                        diagnostics,
                        error: None,
                    }
                }
                Err(error) => CorpusDiffPairReport {
                    name: pair.name.clone(),
                    old_file: pair.old_file.clone(),
                    new_file: pair.new_file.clone(),
                    status: CorpusDiffPairStatus::Failed,
                    changes: 0,
                    diagnostics: Vec::new(),
                    error: Some(error.to_string()),
                },
            },
            (Err(error), _) => CorpusDiffPairReport {
                name: pair.name.clone(),
                old_file: pair.old_file.clone(),
                new_file: pair.new_file.clone(),
                status: CorpusDiffPairStatus::Failed,
                changes: 0,
                diagnostics: Vec::new(),
                error: Some(format!("failed to read {}: {error}", pair.old_file)),
            },
            (_, Err(error)) => CorpusDiffPairReport {
                name: pair.name.clone(),
                old_file: pair.old_file.clone(),
                new_file: pair.new_file.clone(),
                status: CorpusDiffPairStatus::Failed,
                changes: 0,
                diagnostics: Vec::new(),
                error: Some(format!("failed to read {}: {error}", pair.new_file)),
            },
        };
        reports.push(report);
    }
    Ok((reports, diagnostic_counts))
}

/// Evaluates the manifest thresholds against the results of a corpus run.
///
/// * `discovered_files` – display names of the `.pdf` files found in the folder
/// * `parsed` / `failed` – file counters from the corpus run
/// * `diff_pairs` – outcomes of the manifest's diff pairs
///
/// The gate passes when no threshold is violated; each violation adds one
/// message to `failures`.
fn build_corpus_gate_report(
    manifest: &CorpusManifest,
    discovered_files: &[String],
    parsed: usize,
    failed: usize,
    diff_pairs: &[CorpusDiffPairReport],
) -> CorpusGateReport {
    let thresholds = manifest.thresholds;

    let mut missing_required_files: Vec<String> = Vec::new();
    for required in &manifest.required_files {
        if !discovered_files.contains(required) {
            missing_required_files.push(required.clone());
        }
    }
    missing_required_files.sort();

    let failed_diff_pairs: usize = diff_pairs
        .iter()
        .map(|pair| usize::from(matches!(pair.status, CorpusDiffPairStatus::Failed)))
        .sum();

    // Each slot holds the message for one violated threshold, in report order.
    let checks = [
        thresholds
            .min_parsed_files
            .filter(|&minimum| parsed < minimum)
            .map(|minimum| format!("parsed file count {parsed} is below minimum {minimum}")),
        (missing_required_files.len() > thresholds.max_missing_required_files).then(|| {
            format!(
                "missing required file count {} exceeds maximum {}",
                missing_required_files.len(),
                thresholds.max_missing_required_files
            )
        }),
        (failed > thresholds.max_failed_files).then(|| {
            format!(
                "failed file count {failed} exceeds maximum {}",
                thresholds.max_failed_files
            )
        }),
        (failed_diff_pairs > thresholds.max_failed_diff_pairs).then(|| {
            format!(
                "failed diff pair count {failed_diff_pairs} exceeds maximum {}",
                thresholds.max_failed_diff_pairs
            )
        }),
    ];
    let failures: Vec<String> = checks.into_iter().flatten().collect();

    CorpusGateReport {
        manifest_schema_version: manifest.schema_version.clone(),
        passed: failures.is_empty(),
        thresholds,
        missing_required_files,
        failures,
    }
}

/// Returns the final component of `path` as a displayable string.
///
/// Paths without a final component (such as `.`, `..` or `/`) yield `"."`.
/// Names that are not valid UTF-8 are converted lossily (invalid sequences
/// become U+FFFD) instead of collapsing to `"."`, so distinct files keep
/// distinct names in reports and fingerprints.
fn display_file_name(path: &Path) -> String {
    path.file_name().map_or_else(
        || ".".to_owned(),
        |file_name| file_name.to_string_lossy().into_owned(),
    )
}

fn to_json_pretty(value: &impl Serialize) -> Result<String, PdfDiffError> {
    serde_json::to_string_pretty(value)
        .map_err(|error| PdfDiffError::InternalInvariant(error.to_string()))
}

fn write_or_print(rendered: String, output: Option<PathBuf>) -> Result<(), PdfDiffError> {
    if let Some(output) = output {
        std::fs::write(output, rendered)
            .map_err(|error| PdfDiffError::InvalidInput(error.to_string()))
    } else {
        println!("{rendered}");
        Ok(())
    }
}

/// Parses two PDFs from memory and produces their diff document.
///
/// Both inputs are parsed with `ParseConfig::default()`. The semantic diff is
/// computed first, then image-payload and document-surface changes are
/// appended to it.
///
/// # Errors
/// Propagates the parse error of the old document first, then of the new one.
fn diff_pdf_bytes(
    old_fingerprint: &str,
    old_bytes: &[u8],
    new_fingerprint: &str,
    new_bytes: &[u8],
    diff_config: DiffConfig,
) -> Result<DiffDocument, PdfDiffError> {
    let parse_config = ParseConfig::default();
    let old_document = pdf_core::PdfDocument::parse_with_config(old_bytes, parse_config)?;
    let new_document = pdf_core::PdfDocument::parse_with_config(new_bytes, parse_config)?;

    let old_semantic = semantic_document_from_document(old_fingerprint, &old_document, parse_config);
    let new_semantic = semantic_document_from_document(new_fingerprint, &new_document, parse_config);

    let mut result = diff_semantic_documents(&old_semantic, &new_semantic, diff_config);
    append_image_payload_changes(&mut result, &old_document, &new_document);
    append_document_surface_changes(&mut result, &old_document, &new_document);
    Ok(result)
}

/// Parses `bytes` as a PDF and builds its semantic document.
///
/// # Errors
/// Propagates the parser's error when the bytes cannot be parsed.
fn semantic_document_from_pdf(
    fingerprint: &str,
    bytes: &[u8],
    config: ParseConfig,
) -> Result<pdf_semantic::SemanticDocument, PdfDiffError> {
    let parsed = pdf_core::PdfDocument::parse_with_config(bytes, config)?;
    let semantic = semantic_document_from_document(fingerprint, &parsed, config);
    Ok(semantic)
}

fn semantic_document_from_document(
    fingerprint: &str,
    document: &pdf_core::PdfDocument,
    config: ParseConfig,
) -> pdf_semantic::SemanticDocument {
    let extraction = extract_text_runs_from_document(document, config);
    let tagged_structure = document.tagged_structure(config);
    let tagged_summary = tagged_structure
        .root_object_id
        .is_some()
        .then(|| semantic_tagged_structure_summary(&tagged_structure));
    pdf_semantic::build_semantic_document_with_tagged_structure(
        fingerprint,
        &extraction.runs,
        extraction.diagnostics,
        tagged_summary,
    )
}

/// Result of a text-extraction pass: the runs found plus any diagnostics
/// gathered along the way.
struct ExtractedTextRuns {
    /// Extracted text runs (possibly empty).
    runs: Vec<pdf_text::TextRun>,
    /// Diagnostics accumulated during extraction.
    diagnostics: Vec<Diagnostic>,
}

/// Settings for the external OCR program.
#[derive(Debug, Clone, PartialEq, Eq)]
struct OcrConfig {
    /// Program that is spawned for each image.
    command: String,
    /// Selects which command-line arguments are passed to `command`.
    mode: OcrCommandMode,
}

/// Argument convention used when invoking the OCR program.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum OcrCommandMode {
    /// Invoked as `<command> <image> stdout --psm 6`.
    Tesseract,
    /// Invoked as `<command> <image>` with no further arguments.
    Plain,
}

impl OcrConfig {
    /// Derives the OCR configuration from the process environment.
    ///
    /// `SPDFDIFF_OCR_COMMAND` is used when it is set, valid Unicode and not
    /// blank; the mode is `Tesseract` if the value contains "tesseract"
    /// (case-insensitively) and `Plain` otherwise. In every other case the
    /// default `tesseract` command in `Tesseract` mode is returned.
    fn from_environment() -> Self {
        let configured = std::env::var_os("SPDFDIFF_OCR_COMMAND")
            .and_then(|value| value.into_string().ok())
            .filter(|value| !value.trim().is_empty());

        match configured {
            Some(command) => {
                let looks_like_tesseract = command.to_ascii_lowercase().contains("tesseract");
                let mode = if looks_like_tesseract {
                    OcrCommandMode::Tesseract
                } else {
                    OcrCommandMode::Plain
                };
                Self { command, mode }
            }
            None => Self {
                command: "tesseract".into(),
                mode: OcrCommandMode::Tesseract,
            },
        }
    }
}

fn extract_text_runs_from_document(
    document: &pdf_core::PdfDocument,
    config: ParseConfig,
) -> ExtractedTextRuns {
    let contents = document.page_contents();
    if contents.is_empty() {
        let mut diagnostics = document.diagnostics.clone();
        diagnostics.push(spdfdiff_types::Diagnostic::warning(
            "MISSING_PAGE_CONTENT",
            "no page content stream was available for extraction",
        ));
        append_unsupported_feature_diagnostics(document, true, false, &mut diagnostics);
        return ExtractedTextRuns {
            runs: Vec::new(),
            diagnostics,
        };
    }
    let mut programs: Vec<(usize, ContentProgram)> = Vec::new();
    for content in &contents {
        let mut stream_program = pdf_content::parse_content_stream_with_limits(
            content.bytes,
            content.page_index,
            Some(content.stream_object_id),
            config.limits,
        );
        if let Some((_, page_program)) = programs
            .iter_mut()
            .find(|(page_index, _)| *page_index == content.page_index)
        {
            page_program
                .operations
                .append(&mut stream_program.operations);
            page_program
                .diagnostics
                .append(&mut stream_program.diagnostics);
        } else {
            programs.push((content.page_index, stream_program));
        }
    }
    let mut runs = Vec::new();
    let mut diagnostics = document.diagnostics.clone();
    let mut has_vector_graphics = false;
    for (page_index, mut program) in programs {
        has_vector_graphics |= program_has_vector_graphics(&program);
        let tounicode_result = apply_tounicode_maps(&mut program, document);
        let applied_tounicode = tounicode_result.applied;
        diagnostics.extend(tounicode_result.diagnostics);
        let extraction = pdf_text::extract_text_runs(&program, page_index);
        diagnostics.extend(
            extraction
                .diagnostics
                .into_iter()
                .filter(|diagnostic| !applied_tounicode || diagnostic.code != "MISSING_TOUNICODE"),
        );
        runs.extend(extraction.runs);
    }
    if runs.is_empty() && document_has_token(document, "/Subtype /Image") {
        let ocr = extract_ocr_text_runs_from_document(document, OcrConfig::from_environment());
        diagnostics.extend(ocr.diagnostics);
        runs.extend(ocr.runs);
    }
    append_unsupported_feature_diagnostics(
        document,
        runs.is_empty(),
        has_vector_graphics,
        &mut diagnostics,
    );
    ExtractedTextRuns { runs, diagnostics }
}

/// Appends warnings for document features that are not compared.
///
/// Covers vector graphics, image-only documents with no text runs,
/// annotations/links and form fields, then delegates to the font and
/// tagged-PDF diagnostic helpers.
fn append_unsupported_feature_diagnostics(
    document: &pdf_core::PdfDocument,
    has_no_text_runs: bool,
    has_vector_graphics: bool,
    diagnostics: &mut Vec<Diagnostic>,
) {
    let mut warn = |code: &'static str, message: &'static str| {
        diagnostics.push(Diagnostic::warning(code, message));
    };

    if has_vector_graphics {
        warn(
            "UNSUPPORTED_VECTOR_GRAPHIC_DIFF",
            "native vector path comparison is not implemented",
        );
    }

    let image_only = document_has_token(document, "/Subtype /Image") && has_no_text_runs;
    if image_only {
        warn(
            "MISSING_TEXT_LAYER",
            "no extractable text layer was found and OCR did not produce text",
        );
    }

    let has_annotations =
        document_has_any_token(document, &["/Annots", "/Subtype /Link", "/Annot"]);
    if has_annotations {
        warn(
            "UNSUPPORTED_ANNOTATION_DIFF",
            "annotation and link target comparison is not implemented",
        );
    }

    let has_form_fields = document_has_any_token(document, &["/AcroForm", "/Widget"]);
    if has_form_fields {
        warn(
            "UNSUPPORTED_FORM_FIELD_DIFF",
            "interactive form field comparison is not implemented",
        );
    }

    append_font_diagnostics(document, diagnostics);
    append_tagged_pdf_diagnostics(document, diagnostics);
}

/// An image XObject decoded into raw RGB pixels, ready to hand to OCR.
#[derive(Debug, Clone, PartialEq, Eq)]
struct OcrImage {
    /// Position among the images accepted for OCR so far (starts at 0).
    index: usize,
    /// Identifier of the PDF object the image came from.
    object_id: ObjectId,
    /// Byte range of the image's stream.
    byte_range: ByteRange,
    /// Value of the image's `/Width` entry.
    width: usize,
    /// Value of the image's `/Height` entry.
    height: usize,
    /// 8-bit RGB samples; grayscale input is expanded to three equal channels.
    pixels_rgb: Vec<u8>,
    /// `stable_hash` of `pixels_rgb`.
    hash: String,
}

/// Runs OCR over the document's images and turns recognized text into runs.
///
/// Images whose OCR output normalizes to an empty string are skipped. If the
/// OCR call fails with `NotFound`, processing stops without a diagnostic;
/// any other failure is recorded as an `OCR_ENGINE_FAILED` warning and the
/// next image is tried.
fn extract_ocr_text_runs_from_document(
    document: &pdf_core::PdfDocument,
    config: OcrConfig,
) -> ExtractedTextRuns {
    let mut diagnostics = Vec::new();
    let mut runs = Vec::new();

    for image in ocr_images(document, &mut diagnostics) {
        let text = match run_ocr_for_image(&image, &config) {
            Ok(text) => text,
            Err(error) => {
                if error.kind() == std::io::ErrorKind::NotFound {
                    break;
                }
                diagnostics.push(Diagnostic::warning(
                    "OCR_ENGINE_FAILED",
                    format!(
                        "OCR engine failed for image XObject {} 0 R: {error}",
                        image.object_id.number
                    ),
                ));
                continue;
            }
        };

        let normalized = normalize_ocr_text(&text);
        if normalized.is_empty() {
            continue;
        }
        diagnostics.push(Diagnostic::info(
            "OCR_TEXT_EXTRACTED",
            format!(
                "extracted OCR text from image XObject {} 0 R",
                image.object_id.number
            ),
        ));
        runs.push(ocr_text_run(&image, normalized));
    }

    ExtractedTextRuns { runs, diagnostics }
}

/// Collects the image XObjects of `document` that can be decoded for OCR.
///
/// Objects referenced as soft masks are ignored. Images that cannot be
/// decoded add an `OCR_IMAGE_UNSUPPORTED` warning to `diagnostics` and are
/// left out; image objects without a stream are skipped silently.
fn ocr_images(
    document: &pdf_core::PdfDocument,
    diagnostics: &mut Vec<Diagnostic>,
) -> Vec<OcrImage> {
    let soft_masks = soft_mask_object_ids(document);
    let mut images = Vec::new();

    for object in document.objects.iter() {
        let is_image = object.body.contains("/Subtype /Image");
        if !is_image || soft_masks.contains(&object.id) {
            continue;
        }

        // The index counts accepted images only, not every candidate object.
        let index = images.len();
        match ocr_image_from_object(index, object) {
            Ok(Some(image)) => images.push(image),
            Ok(None) => {}
            Err(message) => {
                diagnostics.push(Diagnostic::warning(
                    "OCR_IMAGE_UNSUPPORTED",
                    format!(
                        "image XObject {} 0 R is not supported for OCR extraction: {message}",
                        object.id.number
                    ),
                ));
            }
        }
    }

    images
}

/// Returns the object ids referenced through an `SMask` key by any object in
/// the document, in document order.
fn soft_mask_object_ids(document: &pdf_core::PdfDocument) -> Vec<ObjectId> {
    let mut ids = Vec::new();
    for object in document.objects.iter() {
        if let Some(id) = reference_after_key(&object.body, "SMask") {
            ids.push(id);
        }
    }
    ids
}

/// Decodes one image object into an [`OcrImage`].
///
/// Returns `Ok(None)` when the object has no stream. Only decoded streams
/// with 8 bits per component in `DeviceRGB` or `DeviceGray` are supported;
/// grayscale samples are expanded to RGB.
///
/// # Errors
/// Returns a description of the first unsupported or missing property.
fn ocr_image_from_object(
    index: usize,
    object: &pdf_core::PdfObject,
) -> Result<Option<OcrImage>, String> {
    let stream = match object.stream.as_ref() {
        Some(stream) => stream,
        None => return Ok(None),
    };
    if !stream.decoded {
        return Err("stream bytes were not decoded".into());
    }

    // Looks up a mandatory integer entry of the image dictionary.
    let required = |name: &'static str| {
        pdf_usize_after_name(&object.body, name).ok_or_else(|| format!("missing /{name}"))
    };

    let width = required("Width")?;
    let height = required("Height")?;
    let bits_per_component = required("BitsPerComponent")?;
    if bits_per_component != 8 {
        return Err(format!(
            "BitsPerComponent {bits_per_component} is not supported"
        ));
    }

    let color_space = match value_after_pdf_name(&object.body, "ColorSpace") {
        Some(color_space) => color_space,
        None => return Err("missing /ColorSpace".to_owned()),
    };
    let components = match color_space.as_str() {
        "DeviceGray" => 1,
        "DeviceRGB" => 3,
        other => return Err(format!("ColorSpace /{other} is not supported")),
    };

    // Absent decode parameters are taken to agree with the image itself.
    let columns = pdf_usize_after_name(&object.body, "Columns").unwrap_or(width);
    let colors = pdf_usize_after_name(&object.body, "Colors").unwrap_or(components);
    if !(columns == width && colors == components) {
        return Err("DecodeParms columns/colors do not match image dimensions".into());
    }

    let predictor = pdf_usize_after_name(&object.body, "Predictor").unwrap_or(1);
    let samples = decode_image_samples(&stream.bytes, width, height, components, predictor)?;
    let pixels_rgb: Vec<u8> = match components {
        3 => samples,
        _ => samples
            .iter()
            .flat_map(|&gray| std::iter::repeat(gray).take(3))
            .collect(),
    };
    let hash = stable_hash(&pixels_rgb);

    Ok(Some(OcrImage {
        index,
        object_id: object.id,
        byte_range: stream.byte_range,
        width,
        height,
        pixels_rgb,
        hash,
    }))
}

/// Turns decoded stream bytes into raw 8-bit samples, undoing a PNG predictor
/// when one is declared.
///
/// * `predictor == 1` – the first `width * components * height` bytes are
///   returned as they are.
/// * `predictor` in `10..=15` – every row starts with a filter byte
///   (0 none, 1 sub, 2 up, 3 average, 4 Paeth) that is reversed here.
///
/// # Errors
/// Returns a message when a size computation overflows, the stream is too
/// short, the predictor is unsupported, or a row uses an unknown filter.
fn decode_image_samples(
    bytes: &[u8],
    width: usize,
    height: usize,
    components: usize,
    predictor: usize,
) -> Result<Vec<u8>, String> {
    let row_len = match width.checked_mul(components) {
        Some(row_len) => row_len,
        None => return Err("image row size overflowed".to_owned()),
    };
    let expected = match row_len.checked_mul(height) {
        Some(expected) => expected,
        None => return Err("image size overflowed".to_owned()),
    };

    if predictor == 1 {
        return match bytes.get(..expected) {
            Some(samples) => Ok(samples.to_vec()),
            None => Err(format!(
                "decoded stream has {} bytes but expected at least {expected}",
                bytes.len()
            )),
        };
    }

    if predictor < 10 || predictor > 15 {
        return Err(format!("Predictor {predictor} is not supported"));
    }

    // Each encoded row carries one leading filter byte.
    let encoded_row_len = match row_len.checked_add(1) {
        Some(encoded_row_len) => encoded_row_len,
        None => return Err("PNG predictor row size overflowed".to_owned()),
    };
    let expected_encoded = match encoded_row_len.checked_mul(height) {
        Some(expected_encoded) => expected_encoded,
        None => return Err("PNG predictor image size overflowed".to_owned()),
    };
    if bytes.len() < expected_encoded {
        return Err(format!(
            "decoded stream has {} bytes but expected at least {expected_encoded}",
            bytes.len()
        ));
    }

    let mut decoded = vec![0u8; expected];
    let encoded_rows = bytes.chunks_exact(encoded_row_len).take(height);
    for (row, encoded) in encoded_rows.enumerate() {
        let filter = encoded[0];
        let payload = &encoded[1..];
        let base = row * row_len;
        for (column, &raw) in payload.iter().enumerate() {
            // Neighbours outside the image count as zero.
            let has_left = column >= components;
            let has_up = row > 0;
            let left = if has_left {
                decoded[base + column - components]
            } else {
                0
            };
            let up = if has_up {
                decoded[base + column - row_len]
            } else {
                0
            };
            let up_left = if has_up && has_left {
                decoded[base + column - row_len - components]
            } else {
                0
            };
            let prediction = match filter {
                0 => 0,
                1 => left,
                2 => up,
                3 => ((u16::from(left) + u16::from(up)) / 2) as u8,
                4 => paeth_predictor(left, up, up_left),
                other => return Err(format!("PNG row filter {other} is not supported")),
            };
            decoded[base + column] = raw.wrapping_add(prediction);
        }
    }

    Ok(decoded)
}

/// Picks the Paeth predictor byte for one sample.
///
/// The estimate is `left + up - up_left`; the neighbour closest to that estimate is
/// returned, preferring `left`, then `up`, then `up_left` when distances tie.
fn paeth_predictor(left: u8, up: u8, up_left: u8) -> u8 {
    let (a, b, c) = (i32::from(left), i32::from(up), i32::from(up_left));
    let estimate = a + b - c;
    let distance_a = (estimate - a).abs();
    let distance_b = (estimate - b).abs();
    let distance_c = (estimate - c).abs();
    if distance_a <= distance_b && distance_a <= distance_c {
        return left;
    }
    if distance_b <= distance_c {
        up
    } else {
        up_left
    }
}

/// Runs the configured OCR command on `image` and returns its captured stdout.
///
/// The image is first written to a temporary PPM file whose path is passed as the
/// first argument. In `Tesseract` mode the arguments `stdout --psm 6` follow the path;
/// in `Plain` mode the path is the only argument. The object number, image index and
/// image hash are exposed to the child through `SPDFDIFF_OCR_*` environment variables.
///
/// # Errors
///
/// Returns the I/O error from writing the temporary file or spawning the command, or
/// an error carrying the child's stderr text when it exits unsuccessfully.
fn run_ocr_for_image(image: &OcrImage, config: &OcrConfig) -> std::io::Result<String> {
    let ppm_path = write_temp_ppm(image)?;
    let mut process = ProcessCommand::new(&config.command);
    process.arg(&ppm_path);
    match config.mode {
        OcrCommandMode::Tesseract => {
            process.args(["stdout", "--psm", "6"]);
        }
        OcrCommandMode::Plain => {}
    }
    process
        .env("SPDFDIFF_OCR_OBJECT_ID", image.object_id.number.to_string())
        .env("SPDFDIFF_OCR_IMAGE_INDEX", image.index.to_string())
        .env("SPDFDIFF_OCR_IMAGE_HASH", &image.hash);

    // Capture the outcome first so the temporary file is removed even when spawning fails;
    // a failed removal is deliberately ignored.
    let outcome = process.output();
    let _ = std::fs::remove_file(&ppm_path);
    let outcome = outcome?;

    if !outcome.status.success() {
        return Err(std::io::Error::other(
            String::from_utf8_lossy(&outcome.stderr).into_owned(),
        ));
    }
    Ok(String::from_utf8_lossy(&outcome.stdout).into_owned())
}

/// Writes `image` as a binary PPM (`P6`, max value 255) into the system temp directory.
///
/// The file name combines the current process id, the image's object number and its
/// hash. Returns the path of the written file.
///
/// # Errors
///
/// Returns the I/O error reported by `std::fs::write`.
fn write_temp_ppm(image: &OcrImage) -> std::io::Result<PathBuf> {
    let file_name = format!(
        "spdfdiff-ocr-{}-{}-{}.ppm",
        std::process::id(),
        image.object_id.number,
        image.hash
    );
    let path = std::env::temp_dir().join(file_name);

    let header = format!("P6\n{} {}\n255\n", image.width, image.height);
    let mut contents = header.into_bytes();
    contents.extend_from_slice(&image.pixels_rgb);

    std::fs::write(&path, contents)?;
    Ok(path)
}

/// Wraps OCR output for `image` in a glyph-less text run.
///
/// The run's bounding box spans the image's pixel dimensions from the origin, and its
/// provenance points at the image object (page index 0, no content-operation index).
/// `text` is used both as the raw and as the normalized text.
fn ocr_text_run(image: &OcrImage, text: String) -> pdf_text::TextRun {
    let object_id = image.object_id;
    let bbox = Rect {
        x0: 0.0,
        y0: 0.0,
        x1: image.width as f32,
        y1: image.height as f32,
    };
    let source = Provenance {
        file_role: None,
        object_id: Some(object_id),
        page_index: Some(0),
        stream_object_id: Some(object_id),
        content_op_index: None,
        byte_range: Some(image.byte_range),
    };
    pdf_text::TextRun {
        id: format!("ocr-image-{:04}", image.index),
        normalized_text: text.clone(),
        text,
        glyphs: Vec::new(),
        bbox,
        source,
        marked_content: None,
    }
}

/// Collapses every run of whitespace in `text` to a single space and drops leading and
/// trailing whitespace.
fn normalize_ocr_text(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}

/// Reads the value following the PDF name `/{key}` in `body` and parses it as `usize`.
///
/// Returns `None` when the name is absent or its value is not a valid `usize`.
fn pdf_usize_after_name(body: &str, key: &str) -> Option<usize> {
    let raw_value = value_after_pdf_name(body, key)?;
    raw_value.parse::<usize>().ok()
}

/// Appends a warning when CID/Type0 font objects lack a `/ToUnicode` entry.
///
/// An object counts when its body contains `/Subtype /Type0` or `/CIDFontType` but not
/// `/ToUnicode`. Nothing is appended when no such object exists.
fn append_font_diagnostics(document: &pdf_core::PdfDocument, diagnostics: &mut Vec<Diagnostic>) {
    let mut cid_missing_count = 0usize;
    for object in &document.objects {
        let is_cid_font = document_has_object_token(object, "/Subtype /Type0")
            || document_has_object_token(object, "/CIDFontType");
        if is_cid_font && !document_has_object_token(object, "/ToUnicode") {
            cid_missing_count += 1;
        }
    }
    if cid_missing_count == 0 {
        return;
    }
    diagnostics.push(Diagnostic::warning(
        "MISSING_TOUNICODE_CID_FONT",
        format!(
            "{cid_missing_count} CID/Type0 font objects have no ToUnicode map; extraction falls back to literal bytes with lower confidence"
        ),
    ));
}

/// Appends tagged-PDF diagnostics for `document`.
///
/// In order: an info entry when a structure root was parsed, the diagnostics produced
/// by the tagged-structure parse itself, and an info entry when any stream contains the
/// byte pattern `/MCID`.
fn append_tagged_pdf_diagnostics(
    document: &pdf_core::PdfDocument,
    diagnostics: &mut Vec<Diagnostic>,
) {
    let tagged_structure = document.tagged_structure(ParseConfig::default());
    if tagged_structure.root_object_id.is_some() {
        let element_total = tagged_element_count(&tagged_structure.roots);
        let mcid_reference_total = tagged_mcid_count(&tagged_structure.roots);
        diagnostics.push(Diagnostic::info(
            "TAGGED_PDF_STRUCTURE_DETECTED",
            format!(
                "parsed tagged PDF structure with {} elements and {} MCID references; untagged layout heuristics remain the fallback when MCID mapping is incomplete",
                element_total, mcid_reference_total
            ),
        ));
    }
    diagnostics.extend(tagged_structure.diagnostics);

    // Count raw `/MCID` occurrences across every stream in the document.
    let mut mcid_count = 0usize;
    for object in &document.objects {
        if let Some(stream) = object.stream.as_ref() {
            mcid_count += byte_pattern_count(&stream.bytes, b"/MCID");
        }
    }
    if mcid_count == 0 {
        return;
    }
    diagnostics.push(Diagnostic::info(
        "TAGGED_MCID_DETECTED",
        format!("detected {mcid_count} marked-content IDs available for semantic node mapping"),
    ));
}

/// Converts a parsed tagged structure into the semantic-layer summary.
///
/// Structure types are reported sorted and de-duplicated. Confidence is `0.8` when the
/// structure carries no diagnostics and `0.5` otherwise.
fn semantic_tagged_structure_summary(
    structure: &pdf_core::TaggedStructure,
) -> pdf_semantic::TaggedStructureSummary {
    let roots = &structure.roots;

    let mut structure_types = Vec::new();
    collect_tagged_structure_types(roots, &mut structure_types);
    structure_types.sort_unstable();
    structure_types.dedup();

    let confidence = if structure.diagnostics.is_empty() {
        0.8
    } else {
        0.5
    };

    pdf_semantic::TaggedStructureSummary {
        root_object_id: structure.root_object_id,
        element_count: tagged_element_count(roots),
        mcid_count: tagged_mcid_count(roots),
        parent_tree_entries: structure.parent_tree.len(),
        structure_types,
        elements: semantic_tagged_elements(roots),
        confidence,
    }
}

/// Recursively copies tagged structure elements into their semantic summaries,
/// preserving order and nesting.
fn semantic_tagged_elements(
    elements: &[pdf_core::TaggedStructureElement],
) -> Vec<pdf_semantic::TaggedStructureElementSummary> {
    let mut summaries = Vec::with_capacity(elements.len());
    for element in elements {
        summaries.push(pdf_semantic::TaggedStructureElementSummary {
            structure_type: element.structure_type.clone(),
            mcids: element.mcids.clone(),
            children: semantic_tagged_elements(&element.children),
        });
    }
    summaries
}

/// Builds the report view of a parsed tagged structure.
///
/// The root object is rendered as `"<number> <generation> R"`, structure types are
/// sorted and de-duplicated, and diagnostics are reduced to their codes.
fn tagged_structure_report(structure: &pdf_core::TaggedStructure) -> TaggedStructureReport {
    let root_object = structure
        .root_object_id
        .map(|id| format!("{} {} R", id.number, id.generation));

    let mut structure_types = Vec::new();
    collect_tagged_structure_types(&structure.roots, &mut structure_types);
    structure_types.sort_unstable();
    structure_types.dedup();

    let mut diagnostic_codes = Vec::new();
    for diagnostic in structure.diagnostics.iter() {
        diagnostic_codes.push(diagnostic.code.clone());
    }

    TaggedStructureReport {
        detected: root_object.is_some(),
        root_object,
        element_count: tagged_element_count(&structure.roots),
        mcid_count: tagged_mcid_count(&structure.roots),
        parent_tree_entries: structure.parent_tree.len(),
        structure_types,
        diagnostics: diagnostic_codes,
    }
}

/// Builds the report view from an already-computed semantic summary.
///
/// Counts and structure types are copied from `summary`; the diagnostics list is left
/// empty.
fn tagged_structure_report_from_semantic(
    summary: &pdf_semantic::TaggedStructureSummary,
) -> TaggedStructureReport {
    let root_object = summary
        .root_object_id
        .map(|id| format!("{} {} R", id.number, id.generation));
    TaggedStructureReport {
        detected: root_object.is_some(),
        root_object,
        element_count: summary.element_count,
        mcid_count: summary.mcid_count,
        parent_tree_entries: summary.parent_tree_entries,
        structure_types: summary.structure_types.clone(),
        diagnostics: Vec::new(),
    }
}

/// Counts every element in the forest rooted at `elements`, descendants included.
fn tagged_element_count(elements: &[pdf_core::TaggedStructureElement]) -> usize {
    let mut total = elements.len();
    for element in elements {
        total += tagged_element_count(&element.children);
    }
    total
}

/// Sums the number of MCIDs attached to every element in the forest rooted at
/// `elements`, descendants included.
fn tagged_mcid_count(elements: &[pdf_core::TaggedStructureElement]) -> usize {
    let mut total = 0;
    for element in elements {
        total += element.mcids.len();
        total += tagged_mcid_count(&element.children);
    }
    total
}

/// Appends the structure type of every element to `structure_types` in pre-order
/// (each element before its children). Duplicates are kept.
fn collect_tagged_structure_types(
    elements: &[pdf_core::TaggedStructureElement],
    structure_types: &mut Vec<String>,
) {
    elements.iter().for_each(|element| {
        structure_types.push(element.structure_type.clone());
        collect_tagged_structure_types(&element.children, structure_types);
    });
}

/// One image stream found in a document, as collected by `image_payloads`.
#[derive(Debug, Clone, PartialEq, Eq)]
struct ImagePayload {
    /// Zero-based position among the image streams found in the document.
    index: usize,
    /// Identifier of the object that holds the image stream.
    object_id: ObjectId,
    /// Byte range recorded on the image stream.
    byte_range: ByteRange,
    /// Length in bytes of the stream payload.
    byte_len: usize,
    /// `stable_hash` of the stream payload, used to compare old and new images.
    hash: String,
}

/// Compares image payloads of both documents position by position and records changes.
///
/// Images at the same index are compared by hash; a mismatch yields a "differs"
/// change. Indexes present on only one side yield an "exists only in" change.
fn append_image_payload_changes(
    document: &mut DiffDocument,
    old_document: &pdf_core::PdfDocument,
    new_document: &pdf_core::PdfDocument,
) {
    let old_images = image_payloads(old_document);
    let new_images = image_payloads(new_document);
    let paired = old_images.len().min(new_images.len());

    // Indexes present in both documents.
    for (index, (old_image, new_image)) in old_images.iter().zip(&new_images).enumerate() {
        if old_image.hash == new_image.hash {
            continue;
        }
        let reason = format!(
            "image payload differs at image index {index} (old hash {} -> new hash {})",
            old_image.hash, new_image.hash
        );
        push_image_payload_change(document, Some(old_image), Some(new_image), reason);
    }

    // At most one of the two tails below is non-empty.
    for (index, old_image) in old_images.iter().enumerate().skip(paired) {
        let reason = format!("image payload at index {index} exists only in old document");
        push_image_payload_change(document, Some(old_image), None, reason);
    }
    for (index, new_image) in new_images.iter().enumerate().skip(paired) {
        let reason = format!("image payload at index {index} exists only in new document");
        push_image_payload_change(document, None, Some(new_image), reason);
    }
}

/// One non-content "surface" object (annotation, form field, outline, metadata or
/// attachment) found in a document, as collected by `document_surfaces`.
#[derive(Debug, Clone, PartialEq, Eq)]
struct DocumentSurface {
    /// Category the object was matched under.
    category: SurfaceCategory,
    /// Zero-based position among the objects matching `category`.
    index: usize,
    /// Identifier of the matched object.
    object_id: ObjectId,
    /// Human-readable summary produced by `summarize_surface`.
    summary: String,
    /// `stable_hash` of the stream payload when the object has a stream, otherwise of
    /// the object body text.
    hash: String,
    /// Byte range of the object's stream; `None` when the object has no stream.
    byte_range: Option<ByteRange>,
}

/// Kinds of document surface that are diffed outside of page content.
///
/// The derived ordering follows declaration order and is part of the sort key used by
/// `document_surfaces`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
enum SurfaceCategory {
    /// Annotations and links (widgets excluded by `surface_matches_category`).
    Annotation,
    /// AcroForm dictionaries and widget annotations.
    FormField,
    /// Outline / bookmark objects.
    Outline,
    /// Metadata / XMP / info objects.
    Metadata,
    /// Embedded files, file specifications and file-attachment annotations.
    Attachment,
}

fn append_document_surface_changes(
    document: &mut DiffDocument,
    old_document: &pdf_core::PdfDocument,
    new_document: &pdf_core::PdfDocument,
) {
    for category in [
        SurfaceCategory::Annotation,
        SurfaceCategory::FormField,
        SurfaceCategory::Outline,
        SurfaceCategory::Metadata,
        SurfaceCategory::Attachment,
    ] {
        let old_surfaces = document_surfaces(old_document, category);
        let new_surfaces = document_surfaces(new_document, category);
        for index in 0..old_surfaces.len().max(new_surfaces.len()) {
            match (old_surfaces.get(index), new_surfaces.get(index)) {
                (Some(old_surface), Some(new_surface)) if old_surface.hash == new_surface.hash => {}
                (Some(old_surface), Some(new_surface)) => push_surface_change(
                    document,
                    Some(old_surface),
                    Some(new_surface),
                    format!(
                        "{} differs at index {index}",
                        old_surface.category.report_label()
                    ),
                ),
                (Some(old_surface), None) => push_surface_change(
                    document,
                    Some(old_surface),
                    None,
                    format!(
                        "{} exists only in old document at index {index}",
                        old_surface.category.report_label()
                    ),
                ),
                (None, Some(new_surface)) => push_surface_change(
                    document,
                    None,
                    Some(new_surface),
                    format!(
                        "{} exists only in new document at index {index}",
                        new_surface.category.report_label()
                    ),
                ),
                (None, None) => {}
            }
        }
    }
}

impl SurfaceCategory {
    /// Change kind reported for this category. Attachments share the annotation kind
    /// and outlines share the metadata kind.
    fn change_kind(self) -> ChangeKind {
        match self {
            Self::FormField => ChangeKind::FormFieldChanged,
            Self::Annotation | Self::Attachment => ChangeKind::AnnotationChanged,
            Self::Outline | Self::Metadata => ChangeKind::MetadataChanged,
        }
    }

    /// Human-readable label used in change reasons and evidence text.
    fn report_label(self) -> &'static str {
        match self {
            Self::Annotation => "annotation/link surface",
            Self::FormField => "form field surface",
            Self::Outline => "outline/bookmark surface",
            Self::Metadata => "metadata/XMP surface",
            Self::Attachment => "embedded attachment surface",
        }
    }

    /// Prefix used when building evidence node ids.
    fn node_prefix(self) -> &'static str {
        match self {
            Self::Annotation => "annotation",
            Self::FormField => "form",
            Self::Outline => "outline",
            Self::Metadata => "metadata",
            Self::Attachment => "attachment",
        }
    }
}

/// Collects every object of `document` whose body matches `category`.
///
/// Each surface is hashed over its stream payload when it has a stream and over its
/// body text otherwise. The result is sorted by `(category, index, object_id)`.
fn document_surfaces(
    document: &pdf_core::PdfDocument,
    category: SurfaceCategory,
) -> Vec<DocumentSurface> {
    let matching = document
        .objects
        .iter()
        .filter(|object| surface_matches_category(&object.body, category));

    let mut surfaces = Vec::new();
    for (index, object) in matching.enumerate() {
        let stream = object.stream.as_ref();
        let hashed_bytes = match stream {
            Some(stream) => stream.bytes.as_slice(),
            None => object.body.as_bytes(),
        };
        surfaces.push(DocumentSurface {
            category,
            index,
            object_id: object.id,
            summary: summarize_surface(&object.body, category),
            hash: stable_hash(hashed_bytes),
            byte_range: stream.map(|stream| stream.byte_range),
        });
    }
    surfaces.sort_by_key(|surface| (surface.category, surface.index, surface.object_id));
    surfaces
}

/// Reports whether an object body belongs to `category`, using substring checks on the
/// body text.
fn surface_matches_category(body: &str, category: SurfaceCategory) -> bool {
    let has = |token: &str| body.contains(token);
    let has_any = |tokens: &[&str]| tokens.iter().any(|token| body.contains(token));

    match category {
        SurfaceCategory::Annotation => {
            // Widgets are form fields, so a plain `/Type /Annot` only counts without them.
            has("/Subtype /Link") || (has("/Type /Annot") && !has("/Subtype /Widget"))
        }
        SurfaceCategory::FormField => has_any(&["/AcroForm", "/Subtype /Widget"]),
        SurfaceCategory::Outline => {
            has("/Outlines")
                || (has("/Title") && has_any(&["/Dest", "/Parent", "/First", "/Next"]))
        }
        SurfaceCategory::Metadata => has_any(&["/Type /Metadata", "/Metadata", "/Info"]),
        SurfaceCategory::Attachment => {
            has_any(&["/EmbeddedFiles", "/Filespec", "/Subtype /FileAttachment"])
        }
    }
}

/// Builds a one-line summary of a surface object.
///
/// The summary starts with the category's report label, followed by a `key=value`
/// entry for each of a fixed set of dictionary keys that has a value in `body`,
/// separated by single spaces.
fn summarize_surface(body: &str, category: SurfaceCategory) -> String {
    const SUMMARY_KEYS: [&str; 8] = ["Subtype", "URI", "Title", "F", "Desc", "Contents", "T", "V"];

    let label = std::iter::once(category.report_label().to_owned());
    let entries = SUMMARY_KEYS.iter().filter_map(|key| {
        value_after_pdf_name(body, key).map(|value| format!("{key}={value}"))
    });
    label.chain(entries).collect::<Vec<_>>().join(" ")
}

/// Extracts a short textual value that follows the PDF name `/{key}` in `body`.
///
/// The name must match completely: an occurrence of `/{key}` that is merely the prefix
/// of a longer name (for example `/T` inside `/Type`) is skipped and the search
/// continues with the next occurrence.
///
/// The value is interpreted as follows:
/// - `(literal)` returns the text up to the first `)`;
/// - `/Name` returns the name without the leading slash;
/// - anything else returns the first whitespace-delimited token.
///
/// Names and bare tokens are cut at the first PDF delimiter, so values glued to the
/// next entry (`100/Height`, `Link>>`) yield `100` and `Link`. A token that itself
/// starts with a delimiter (such as `[3`) is returned unchanged. The result is limited
/// to 120 characters.
///
/// Returns `None` when the name is absent, when a literal string is unterminated, or
/// when nothing follows the name.
fn value_after_pdf_name(body: &str, key: &str) -> Option<String> {
    fn is_delimiter(character: char) -> bool {
        "()<>[]{}/%".contains(character)
    }

    let needle = format!("/{key}");
    let mut search_from = 0;
    let remaining = loop {
        let name_end = search_from + body[search_from..].find(&needle)? + needle.len();
        let rest = &body[name_end..];
        // The name only ends here if the next character cannot continue a PDF name.
        let at_name_end = match rest.chars().next() {
            None => true,
            Some(next) => next.is_whitespace() || next == '\0' || is_delimiter(next),
        };
        if at_name_end {
            break rest.trim_start();
        }
        search_from = name_end;
    };

    if let Some(literal) = remaining.strip_prefix('(') {
        return literal
            .split_once(')')
            .map(|(value, _)| value.chars().take(120).collect());
    }

    let token = remaining
        .strip_prefix('/')
        .unwrap_or(remaining)
        .split_whitespace()
        .next()?;
    let cut = token.find(is_delimiter).unwrap_or(token.len());
    // Keep tokens that start with a delimiter (arrays, dictionaries) as they are.
    let token = if cut == 0 { token } else { &token[..cut] };
    Some(token.chars().take(120).collect())
}

/// Appends one surface change to `document`.
///
/// The category is taken from the old surface, else from the new one, and defaults to
/// `Metadata` when neither is present. The change id is derived from the number of
/// changes already recorded.
fn push_surface_change(
    document: &mut DiffDocument,
    old_surface: Option<&DocumentSurface>,
    new_surface: Option<&DocumentSurface>,
    reason: String,
) {
    let category = old_surface
        .or(new_surface)
        .map_or(SurfaceCategory::Metadata, |surface| surface.category);
    let id = format!("change-{:04}", document.changes.len());
    let old_node = old_surface.map(|surface| surface_evidence(FileRole::Old, surface));
    let new_node = new_surface.map(|surface| surface_evidence(FileRole::New, surface));

    document.changes.push(SemanticChange {
        id,
        kind: category.change_kind(),
        severity: ChangeSeverity::Info,
        old_node,
        new_node,
        text_hunks: Vec::new(),
        layout_diff: None,
        confidence: 0.8,
        reason,
    });
}

/// Builds the evidence node describing `surface` for the given side of the diff.
///
/// The evidence text names the object as `"<number> <generation> R"` using the
/// object's real generation number (previously always printed as `0`), followed by the
/// surface hash and summary. The node carries no bounding box and reports page `0`.
fn surface_evidence(file_role: FileRole, surface: &DocumentSurface) -> SemanticNodeEvidence {
    SemanticNodeEvidence {
        node_id: format!("{}-{:04}", surface.category.node_prefix(), surface.index),
        page: 0,
        bbox: None,
        text: Some(format!(
            "{} object {} {} R hash={} {}",
            surface.category.report_label(),
            surface.object_id.number,
            surface.object_id.generation,
            surface.hash,
            surface.summary
        )),
        source: vec![Provenance {
            file_role: Some(file_role),
            object_id: Some(surface.object_id),
            page_index: None,
            stream_object_id: Some(surface.object_id),
            content_op_index: None,
            byte_range: surface.byte_range,
        }],
    }
}

/// Collects the image streams of `document` in object order.
///
/// An object qualifies when its body contains `/Subtype /Image` and it has a stream.
/// Indexes are assigned only to qualifying objects.
fn image_payloads(document: &pdf_core::PdfDocument) -> Vec<ImagePayload> {
    let mut payloads = Vec::new();
    for object in &document.objects {
        if !object.body.contains("/Subtype /Image") {
            continue;
        }
        let Some(stream) = object.stream.as_ref() else {
            continue;
        };
        let bytes = stream.bytes.as_slice();
        let index = payloads.len();
        payloads.push(ImagePayload {
            index,
            object_id: object.id,
            byte_range: stream.byte_range,
            byte_len: bytes.len(),
            hash: stable_hash(bytes),
        });
    }
    payloads
}

/// Appends one image-payload change (kind `ObjectChanged`, confidence `1.0`) to
/// `document`. The change id is derived from the number of changes already recorded.
fn push_image_payload_change(
    document: &mut DiffDocument,
    old_image: Option<&ImagePayload>,
    new_image: Option<&ImagePayload>,
    reason: String,
) {
    let id = format!("change-{:04}", document.changes.len());
    let old_node = old_image.map(|image| image_payload_evidence(FileRole::Old, image));
    let new_node = new_image.map(|image| image_payload_evidence(FileRole::New, image));

    document.changes.push(SemanticChange {
        id,
        kind: ChangeKind::ObjectChanged,
        severity: ChangeSeverity::Info,
        old_node,
        new_node,
        text_hunks: Vec::new(),
        layout_diff: None,
        confidence: 1.0,
        reason,
    });
}

/// Builds the evidence node describing `image` for the given side of the diff.
///
/// The evidence text names the object as `"<number> <generation> R"` using the
/// object's real generation number (previously always printed as `0`), followed by the
/// payload length and hash. The node carries no bounding box and reports page `0`.
fn image_payload_evidence(file_role: FileRole, image: &ImagePayload) -> SemanticNodeEvidence {
    SemanticNodeEvidence {
        node_id: format!("image-{:04}", image.index),
        page: 0,
        bbox: None,
        text: Some(format!(
            "image XObject {} {} R bytes={} hash={}",
            image.object_id.number, image.object_id.generation, image.byte_len, image.hash
        )),
        source: vec![Provenance {
            file_role: Some(file_role),
            object_id: Some(image.object_id),
            page_index: None,
            stream_object_id: Some(image.object_id),
            content_op_index: None,
            byte_range: Some(image.byte_range),
        }],
    }
}

/// Hashes `bytes` with 64-bit FNV-1a and renders the digest as 16 lowercase hex digits.
fn stable_hash(bytes: &[u8]) -> String {
    const FNV_OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const FNV_PRIME: u64 = 0x0000_0100_0000_01b3;

    let digest = bytes.iter().fold(FNV_OFFSET_BASIS, |state, &byte| {
        (state ^ u64::from(byte)).wrapping_mul(FNV_PRIME)
    });
    format!("{digest:016x}")
}

/// Reports whether any object body in `document` contains at least one of `tokens`.
fn document_has_any_token(document: &pdf_core::PdfDocument, tokens: &[&str]) -> bool {
    for token in tokens {
        if document_has_token(document, token) {
            return true;
        }
    }
    false
}

/// Reports whether any object body in `document` contains `token`.
fn document_has_token(document: &pdf_core::PdfDocument, token: &str) -> bool {
    for object in &document.objects {
        if document_has_object_token(object, token) {
            return true;
        }
    }
    false
}

/// Reports whether the body text of `object` contains `token` as a substring.
fn document_has_object_token(object: &pdf_core::PdfObject, token: &str) -> bool {
    let haystack: &str = &object.body;
    haystack.contains(token)
}

/// Counts the occurrences of `pattern` in `bytes`, overlapping matches included.
///
/// An empty `pattern` yields `0`; `slice::windows` panics on a window size of zero, so
/// that case is handled before scanning.
fn byte_pattern_count(bytes: &[u8], pattern: &[u8]) -> usize {
    if pattern.is_empty() {
        return 0;
    }
    bytes
        .windows(pattern.len())
        .filter(|window| *window == pattern)
        .count()
}

/// Reports whether `program` contains at least one recognized non-text operation whose
/// operator is a vector-graphics operator.
fn program_has_vector_graphics(program: &ContentProgram) -> bool {
    for operation in &program.operations {
        if let ContentOp::RecognizedNonText { operator, .. } = operation {
            if is_vector_graphics_operator(operator) {
                return true;
            }
        }
    }
    false
}

/// Reports whether `operator` is one of the path construction, painting, clipping or
/// shading operators treated as vector graphics. The comparison is case-sensitive.
fn is_vector_graphics_operator(operator: &str) -> bool {
    const VECTOR_GRAPHICS_OPERATORS: &[&str] = &[
        "m", "l", "c", "v", "y", "h", "re", "S", "s", "f", "F", "f*", "B", "B*", "b", "b*", "n",
        "W", "W*", "sh",
    ];
    VECTOR_GRAPHICS_OPERATORS.contains(&operator)
}

/// Outcome of `apply_tounicode_maps`.
struct ToUnicodeApplyResult {
    /// `true` when at least one text-showing operation had its text replaced.
    applied: bool,
    /// Diagnostics gathered while building the ToUnicode maps.
    diagnostics: Vec<Diagnostic>,
}

fn apply_tounicode_maps(
    program: &mut ContentProgram,
    document: &pdf_core::PdfDocument,
) -> ToUnicodeApplyResult {
    let maps = font_tounicode_maps(document);
    if maps.maps.is_empty() {
        return ToUnicodeApplyResult {
            applied: false,
            diagnostics: maps.diagnostics,
        };
    }

    let mut current_font: Option<String> = None;
    let mut applied = false;
    for operation in &mut program.operations {
        match operation {
            ContentOp::SetFont { name, .. } => {
                current_font = Some(name.clone());
            }
            ContentOp::ShowText {
                text, raw_bytes, ..
            }
            | ContentOp::ShowAdjustedText {
                text, raw_bytes, ..
            } => {
                let Some(font_name) = current_font.as_deref() else {
                    continue;
                };
                let Some(map) = maps.maps.get(font_name) else {
                    continue;
                };
                if let Some(decoded) = decode_with_tounicode(raw_bytes, map) {
                    *text = decoded;
                    applied = true;
                }
            }
            ContentOp::BeginText { .. }
            | ContentOp::EndText { .. }
            | ContentOp::MoveTextPosition { .. }
            | ContentOp::MoveToNextLine { .. }
            | ContentOp::SetTextLeading { .. }
            | ContentOp::SetCharacterSpacing { .. }
            | ContentOp::SetWordSpacing { .. }
            | ContentOp::SetHorizontalScaling { .. }
            | ContentOp::SetTextMatrix { .. }
            | ContentOp::SaveGraphicsState { .. }
            | ContentOp::RestoreGraphicsState { .. }
            | ContentOp::ConcatMatrix { .. }
            | ContentOp::BeginMarkedContent { .. }
            | ContentOp::EndMarkedContent { .. }
            | ContentOp::RecognizedNonText { .. }
            | ContentOp::Unknown { .. } => {}
        }
    }

    ToUnicodeApplyResult {
        applied,
        diagnostics: maps.diagnostics,
    }
}

/// ToUnicode maps collected for a document by `font_tounicode_maps`.
struct ToUnicodeMaps {
    /// Per-font maps from source code bytes to decoded text, keyed by the name that
    /// references the font object (looked up with the `SetFont` operand name).
    maps: BTreeMap<String, BTreeMap<Vec<u8>, String>>,
    /// Diagnostics produced while parsing the CMap streams.
    diagnostics: Vec<Diagnostic>,
}

/// Builds the per-font ToUnicode maps of `document`.
///
/// Every object with a `/ToUnicode` reference is treated as a font. Each `/Name n g R`
/// reference to such a font, found in any object body, registers the parsed CMap of
/// that font under `Name`. Empty CMaps are not registered; parse diagnostics are
/// collected in encounter order.
fn font_tounicode_maps(document: &pdf_core::PdfDocument) -> ToUnicodeMaps {
    let objects_by_id: BTreeMap<_, _> = document
        .objects
        .iter()
        .map(|object| (object.id, object))
        .collect();
    let font_to_cmap: BTreeMap<_, _> = document
        .objects
        .iter()
        .filter_map(|object| {
            reference_after_key(&object.body, "ToUnicode").map(|cmap_id| (object.id, cmap_id))
        })
        .collect();

    let mut maps = BTreeMap::new();
    let mut diagnostics = Vec::new();
    for object in &document.objects {
        for (font_name, font_object_id) in named_references(&object.body) {
            let cmap_stream = font_to_cmap
                .get(&font_object_id)
                .and_then(|cmap_object_id| objects_by_id.get(cmap_object_id))
                .and_then(|cmap_object| cmap_object.stream.as_ref());
            let Some(cmap_stream) = cmap_stream else {
                continue;
            };
            let cmap = parse_tounicode_cmap_with_diagnostics(&cmap_stream.bytes);
            diagnostics.extend(cmap.diagnostics);
            if cmap.map.is_empty() {
                continue;
            }
            maps.insert(font_name, cmap.map);
        }
    }
    ToUnicodeMaps { maps, diagnostics }
}

/// Parses the indirect reference that follows the first occurrence of `/{key}` in
/// `body`. Returns `None` when the key is absent or is not followed by `n g R`.
fn reference_after_key(body: &str, key: &str) -> Option<ObjectId> {
    let marker = format!("/{key}");
    let position = body.find(&marker)?;
    let after_key = &body[position + marker.len()..];
    parse_reference_at(after_key)
}

/// Finds every `/Name n g R` sequence in `body` and returns the name (without the
/// slash) together with the referenced object id, in order of appearance.
fn named_references(body: &str) -> Vec<(String, ObjectId)> {
    let tokens = body_tokens(body);
    tokens
        .windows(4)
        .filter_map(|window| {
            let name = window[0].strip_prefix('/')?;
            let number = window[1].parse::<u32>().ok()?;
            let generation = window[2].parse::<u16>().ok()?;
            (window[3] == "R").then(|| (name.to_owned(), ObjectId { number, generation }))
        })
        .collect()
}

/// Parses an indirect reference (`n g R`) from the first three tokens of `body`.
///
/// Returns `None` when fewer than three tokens exist, the numbers do not parse, or the
/// third token is not `R`.
fn parse_reference_at(body: &str) -> Option<ObjectId> {
    let tokens = body_tokens(body);
    let [number, generation, marker, ..] = tokens.as_slice() else {
        return None;
    };
    let number = number.parse().ok()?;
    let generation = generation.parse().ok()?;
    if marker != "R" {
        return None;
    }
    Some(ObjectId { number, generation })
}

/// Splits an object body into whitespace-delimited tokens.
///
/// Before splitting, `<<`, `>>`, `[` and `]` are turned into spaces and every `/` is
/// given a leading space, so names always start a new token. Single `<` and `>` are
/// kept as ordinary characters.
fn body_tokens(body: &str) -> Vec<String> {
    let mut spaced = String::with_capacity(body.len() + 8);
    let mut characters = body.chars().peekable();
    while let Some(character) = characters.next() {
        match character {
            // A doubled angle bracket is a dictionary delimiter; consume both halves.
            '<' | '>' if characters.peek() == Some(&character) => {
                characters.next();
                spaced.push(' ');
            }
            '/' => spaced.push_str(" /"),
            '[' | ']' => spaced.push(' '),
            other => spaced.push(other),
        }
    }
    spaced.split_whitespace().map(str::to_owned).collect()
}

/// Result of parsing one ToUnicode CMap stream.
struct ToUnicodeCMap {
    /// Mapping from source code bytes to the decoded text they stand for.
    map: BTreeMap<Vec<u8>, String>,
    /// Warnings raised for range entries that could not be inserted.
    diagnostics: Vec<Diagnostic>,
}

/// Parses the `bfchar` and `bfrange` entries of a ToUnicode CMap stream.
///
/// The stream is read line by line (invalid UTF-8 is replaced lossily):
/// - inside a `beginbfrange` … `endbfrange` section, a line with three or more hex
///   strings is inserted as a range; failures become `CMAP_UNSUPPORTED_RANGE` warnings;
/// - any other line with exactly two hex strings is inserted as a single mapping;
/// - lines belonging to a `begincodespacerange` … `endcodespacerange` section are
///   skipped. They also consist of two hex strings (for example `<0000> <FFFF>`) but
///   declare the code space, not a mapping, and used to be inserted as a bogus entry.
///
/// A `beginbfchar` or `beginbfrange` marker ends a code-space section even when its
/// `endcodespacerange` marker is missing, so a malformed header cannot hide mappings.
fn parse_tounicode_cmap_with_diagnostics(bytes: &[u8]) -> ToUnicodeCMap {
    let text = String::from_utf8_lossy(bytes);
    let mut map = BTreeMap::new();
    let mut diagnostics = Vec::new();
    let mut in_bfrange = false;
    let mut in_codespacerange = false;
    for line in text.lines() {
        let trimmed = line.trim();
        if trimmed.ends_with("beginbfrange") {
            in_bfrange = true;
            in_codespacerange = false;
            continue;
        }
        if trimmed == "endbfrange" {
            in_bfrange = false;
            continue;
        }
        if trimmed.ends_with("beginbfchar") {
            in_codespacerange = false;
            continue;
        }
        if trimmed == "endbfchar" {
            continue;
        }
        // Checked before the begin marker so a one-line section is skipped as a whole.
        if trimmed.contains("endcodespacerange") {
            in_codespacerange = false;
            continue;
        }
        if trimmed.contains("begincodespacerange") {
            in_codespacerange = true;
            continue;
        }
        if in_codespacerange {
            continue;
        }

        let hex_tokens = hex_tokens_in_line(line);
        if in_bfrange && hex_tokens.len() >= 3 {
            if let Err(message) = insert_bfrange(&mut map, &hex_tokens, trimmed.contains('[')) {
                diagnostics.push(Diagnostic::warning("CMAP_UNSUPPORTED_RANGE", message));
            }
        } else if hex_tokens.len() == 2 {
            if let Some(decoded) = unicode_hex_to_string(&hex_tokens[1]) {
                map.insert(hex_tokens[0].clone(), decoded);
            }
        }
    }
    ToUnicodeCMap { map, diagnostics }
}

/// Inserts the mappings described by one `bfrange` entry into `map`.
///
/// `hex_tokens[0]` and `hex_tokens[1]` are the first and last source codes. Source keys
/// are written with the byte width of the first source code.
///
/// - When `array_mode` is `false`, `hex_tokens[2]` is the destination of the first
///   code and each following code maps to the next destination value.
/// - When `array_mode` is `true`, `hex_tokens[2..]` supplies one destination per code.
///
/// All arithmetic is checked: a range covering the full 32-bit code space no longer
/// overflows the entry count, and a destination that would overflow `u32` or no longer
/// fit the byte width of `hex_tokens[2]` is reported instead of wrapping silently.
///
/// # Errors
///
/// Returns a message when operands are missing or wider than four bytes, the range is
/// reversed or too large, the destination array is too short, a destination overflows,
/// or a destination is not valid UTF-16BE. Entries inserted before the failing one stay
/// in `map`.
fn insert_bfrange(
    map: &mut BTreeMap<Vec<u8>, String>,
    hex_tokens: &[Vec<u8>],
    array_mode: bool,
) -> Result<(), String> {
    if hex_tokens.len() < 3 {
        return Err("bfrange entry has fewer than three hex operands".into());
    }
    let source_width = hex_tokens[0].len();
    let start = bytes_to_u32(&hex_tokens[0]).ok_or("bfrange start is too wide")?;
    let end = bytes_to_u32(&hex_tokens[1]).ok_or("bfrange end is too wide")?;
    if end < start {
        return Err("bfrange end is before start".into());
    }
    // `end - start` cannot underflow here; the `+ 1` is checked separately because
    // the full range 0..=u32::MAX has 2^32 entries.
    let count = usize::try_from(end - start)
        .ok()
        .and_then(|span| span.checked_add(1))
        .ok_or("bfrange is too large")?;

    if array_mode {
        if hex_tokens.len() - 2 < count {
            return Err("bfrange destination array is shorter than source range".into());
        }
        for (source_code, destination) in (start..=end).zip(&hex_tokens[2..]) {
            let Some(decoded) = unicode_hex_to_string(destination) else {
                return Err("bfrange array destination is not valid UTF-16BE".into());
            };
            map.insert(int_to_be_bytes(source_code, source_width), decoded);
        }
    } else {
        let destination_width = hex_tokens[2].len();
        let destination_start =
            bytes_to_u32(&hex_tokens[2]).ok_or("bfrange destination is too wide")?;
        // Largest value that still fits into `destination_width` bytes.
        let destination_max = match destination_width {
            0 => 0,
            1..=3 => (1u32 << (8 * destination_width)) - 1,
            _ => u32::MAX,
        };
        for source_code in start..=end {
            let destination_code = destination_start
                .checked_add(source_code - start)
                .filter(|code| *code <= destination_max)
                .ok_or("bfrange destination overflows its byte width")?;
            let destination = int_to_be_bytes(destination_code, destination_width);
            let Some(decoded) = unicode_hex_to_string(&destination) else {
                return Err("bfrange destination is not valid UTF-16BE".into());
            };
            map.insert(int_to_be_bytes(source_code, source_width), decoded);
        }
    }
    Ok(())
}

/// Interprets up to four bytes as a big-endian unsigned integer.
///
/// Returns `None` for more than four bytes; an empty slice yields `0`.
fn bytes_to_u32(bytes: &[u8]) -> Option<u32> {
    (bytes.len() <= 4).then(|| {
        bytes
            .iter()
            .fold(0u32, |value, &byte| (value << 8) | u32::from(byte))
    })
}

/// Returns the lowest `width` bytes of `value` in big-endian order.
///
/// A `width` above four yields all four bytes (no padding); a `width` of zero yields an
/// empty vector.
fn int_to_be_bytes(value: u32, width: usize) -> Vec<u8> {
    let big_endian = value.to_be_bytes();
    let kept = width.min(big_endian.len());
    big_endian[big_endian.len() - kept..].to_vec()
}

/// Decodes a PDF hex string token such as `<0041>` into its bytes.
///
/// Returns `None` when the token is not wrapped in `<` … `>`, is empty, has an odd
/// number of digits, or contains anything other than ASCII hex digits.
///
/// Digits are decoded byte-wise, so tokens containing multi-byte characters (for
/// example the replacement character left by lossy UTF-8 decoding) are rejected
/// instead of panicking on a string slice, and a sign such as `+f` is not accepted.
fn hex_token_bytes(token: &str) -> Option<Vec<u8>> {
    let digits = token.strip_prefix('<')?.strip_suffix('>')?.as_bytes();
    if digits.is_empty() || digits.len() % 2 != 0 {
        return None;
    }
    digits
        .chunks_exact(2)
        .map(|pair| {
            let high = char::from(pair[0]).to_digit(16)?;
            let low = char::from(pair[1]).to_digit(16)?;
            u8::try_from(high * 16 + low).ok()
        })
        .collect()
}

fn hex_tokens_in_line(line: &str) -> Vec<Vec<u8>> {
    let bytes = line.as_bytes();
    let mut tokens = Vec::new();
    let mut index = 0;
    while index < bytes.len() {
        if bytes[index] != b'<' || bytes.get(index + 1) == Some(&b'<') {
            index += 1;
            continue;
        }
        let Some(relative_end) = bytes[index + 1..].iter().position(|byte| *byte == b'>') else {
            break;
        };
        let end = index + 1 + relative_end;
        if let Some(token) = line.get(index..=end).and_then(hex_token_bytes) {
            tokens.push(token);
        }
        index = end + 1;
    }
    tokens
}

/// Decodes big-endian UTF-16 bytes into a string.
///
/// Surrogate pairs are combined, so destinations outside the Basic Multilingual Plane
/// decode correctly; previously every surrogate code unit caused the whole value to be
/// rejected.
///
/// Returns `None` when the byte count is odd or the code units contain an unpaired
/// surrogate. Empty input yields an empty string.
fn unicode_hex_to_string(bytes: &[u8]) -> Option<String> {
    if bytes.len() % 2 != 0 {
        return None;
    }
    let code_units = bytes
        .chunks_exact(2)
        .map(|pair| u16::from_be_bytes([pair[0], pair[1]]));
    char::decode_utf16(code_units)
        .collect::<Result<String, _>>()
        .ok()
}

/// Decodes `raw_bytes` with a ToUnicode `map`, always taking the longest matching code.
///
/// At each position, codes of four bytes down to one byte are tried. Returns `None` as
/// soon as no code matches; empty input yields an empty string.
fn decode_with_tounicode(raw_bytes: &[u8], map: &BTreeMap<Vec<u8>, String>) -> Option<String> {
    let mut decoded = String::new();
    let mut remaining = raw_bytes;
    while !remaining.is_empty() {
        let longest = remaining.len().min(4);
        let (width, value) = (1..=longest)
            .rev()
            .find_map(|width| map.get(&remaining[..width]).map(|value| (width, value)))?;
        decoded.push_str(value);
        remaining = &remaining[width..];
    }
    Some(decoded)
}

/// Times each pipeline phase on a pair of generated PDFs and returns the
/// measurements as a `BenchmarkReport`.
///
/// Two documents are generated with `synthetic_text_pdf`: an unmodified one
/// and one whose page `pages / 2` carries the extra word `revised`. The parse,
/// extract, semantic, diff and report phases are timed separately with
/// `Instant`, in whole milliseconds, alongside the overall total.
///
/// # Errors
///
/// Propagates the `PdfDiffError` returned when either generated document
/// fails to parse.
fn run_synthetic_benchmark(pages: usize) -> Result<BenchmarkReport, PdfDiffError> {
    const TARGET_TOTAL_MS: u128 = 5_000;
    let total_start = Instant::now();
    let old_bytes = synthetic_text_pdf(pages, None);
    let new_bytes = synthetic_text_pdf(pages, Some((pages / 2, "revised")));
    let config = ParseConfig::default();

    let parse_start = Instant::now();
    let old_document = pdf_core::PdfDocument::parse_with_config(&old_bytes, config)?;
    let new_document = pdf_core::PdfDocument::parse_with_config(&new_bytes, config)?;
    let parse = parse_start.elapsed().as_millis();

    let extract_start = Instant::now();
    let old_extraction = extract_text_runs_from_document(&old_document, config);
    let new_extraction = extract_text_runs_from_document(&new_document, config);
    let extract = extract_start.elapsed().as_millis();

    let semantic_start = Instant::now();
    let old_semantic = pdf_semantic::build_semantic_document(
        "benchmark-old",
        &old_extraction.runs,
        old_extraction.diagnostics,
    );
    let new_semantic = pdf_semantic::build_semantic_document(
        "benchmark-new",
        &new_extraction.runs,
        new_extraction.diagnostics,
    );
    let semantic = semantic_start.elapsed().as_millis();

    let diff_start = Instant::now();
    let diff = diff_semantic_documents(&old_semantic, &new_semantic, DiffConfig::default());
    let diff_ms = diff_start.elapsed().as_millis();

    // The rendered output is discarded; only the time spent producing it matters.
    let report_start = Instant::now();
    let _rendered = render_diff(&diff, DiffReportFormat::Json);
    let report = report_start.elapsed().as_millis();
    let total = total_start.elapsed().as_millis();

    // Probe memory exactly once so the reported value and the explanatory note
    // always describe the same sample.
    let peak_memory_bytes = current_process_memory_bytes();
    let memory_note = if peak_memory_bytes.is_some() {
        "current process memory sample"
    } else {
        "memory usage is unavailable without a platform-specific safe probe"
    };

    Ok(BenchmarkReport {
        pages,
        target_total_ms: TARGET_TOTAL_MS,
        under_target: total <= TARGET_TOTAL_MS,
        timings_ms: BenchmarkTimings {
            parse,
            extract,
            semantic,
            diff: diff_ms,
            report,
            total,
        },
        peak_memory_bytes,
        memory_note: memory_note.into(),
        summary: diff.summary,
        diagnostics: diff
            .diagnostics
            .iter()
            .map(|diagnostic| diagnostic.code.clone())
            .collect(),
    })
}

/// Best-effort memory figure for the current process, in bytes.
///
/// On Linux this reads `/proc/self/status`, takes the number on the `VmHWM:`
/// line (falling back to `VmRSS:` when that line is absent) and multiplies it
/// by 1024. Any read or parse failure yields `None`. On every other target the
/// function always returns `None`.
fn current_process_memory_bytes() -> Option<u64> {
    #[cfg(target_os = "linux")]
    {
        /// Returns the remainder of the first line that starts with `label`.
        fn status_value<'a>(status: &'a str, label: &str) -> Option<&'a str> {
            status.lines().find_map(|line| line.strip_prefix(label))
        }

        let status = std::fs::read_to_string("/proc/self/status").ok()?;
        let raw_value = match status_value(&status, "VmHWM:") {
            Some(value) => value,
            None => status_value(&status, "VmRSS:")?,
        };
        // Only the first whitespace-separated field is the number; the unit
        // suffix that follows it is ignored.
        let kilobytes: u64 = raw_value.split_whitespace().next()?.parse().ok()?;
        Some(kilobytes * 1024)
    }
    #[cfg(not(target_os = "linux"))]
    {
        None
    }
}

/// Builds the bytes of a small text-only PDF with one text line per page.
///
/// At least one page is always emitted, even when `pages` is zero. Object 1 is
/// the catalog and object 2 the page tree; page `i` uses object `3 + 2 * i`
/// with its content stream in the following object. When `replacement` is
/// `Some((page, word))`, `word` is appended (after a space) to that page's
/// text. The output consists of the header and the objects only — no
/// cross-reference table or trailer is written.
fn synthetic_text_pdf(pages: usize, replacement: Option<(usize, &str)>) -> Vec<u8> {
    let page_count = pages.max(1);
    let page_object_id = |page_index: usize| 3 + (page_index * 2);

    let kids = (0..page_count)
        .map(|page_index| format!("{} 0 R", page_object_id(page_index)))
        .collect::<Vec<_>>()
        .join(" ");

    let mut pdf: Vec<u8> = b"%PDF-1.7\n".to_vec();
    pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
    pdf.extend_from_slice(
        format!("2 0 obj\n<< /Type /Pages /Kids [{kids}] /Count {page_count} >>\nendobj\n")
            .as_bytes(),
    );

    for page_index in 0..page_count {
        let page_id = page_object_id(page_index);
        let stream_id = page_id + 1;

        let mut text = format!("Benchmark page {page_index} stable paragraph");
        if let Some((target_page, suffix)) = replacement {
            if target_page == page_index {
                text.push(' ');
                text.push_str(suffix);
            }
        }

        let content = format!("BT /F1 12 Tf 72 720 Td ({text}) Tj ET");
        let length = content.len();
        pdf.extend_from_slice(
            format!(
                "{page_id} 0 obj\n<< /Type /Page /Parent 2 0 R /Contents {stream_id} 0 R >>\nendobj\n"
            )
            .as_bytes(),
        );
        pdf.extend_from_slice(
            format!(
                "{stream_id} 0 obj\n<< /Length {length} >>\nstream\n{content}\nendstream\nendobj\n"
            )
            .as_bytes(),
        );
    }
    pdf
}

/// Renders `document` in the requested report `format`.
///
/// The two JSON flavours can fail to serialize; in that case the returned
/// string is a small `{"error":"..."}` object carrying the error's display
/// text. Markdown and HTML rendering return their output directly.
fn render_diff(document: &DiffDocument, format: DiffReportFormat) -> String {
    /// Fallback payload used when a JSON serializer reports an error.
    fn error_payload(error: impl std::fmt::Display) -> String {
        format!("{{\"error\":\"{error}\"}}")
    }

    match format {
        DiffReportFormat::Md => diff_report::to_markdown(document),
        DiffReportFormat::Html => diff_report::to_html(document),
        DiffReportFormat::Json => diff_report::to_json(document).unwrap_or_else(error_payload),
        DiffReportFormat::AiJson => {
            diff_report::to_ai_review_json(document).unwrap_or_else(error_payload)
        }
    }
}

/// Renders a short structural summary of a parsed PDF in the requested
/// `format`.
///
/// The summary covers the object count, the diagnostic count, the number of
/// content streams on the first page (0 when `first_page_contents` yields
/// nothing) and the tagged-structure element / MCID counts. `fingerprint` is
/// the label printed as the file name; only the HTML variant passes it through
/// `escape_html`. If JSON serialization fails, an `{"error":"..."}` object is
/// returned instead.
fn render_inspect_report(
    fingerprint: &str,
    document: &pdf_core::PdfDocument,
    format: ReportFormat,
) -> String {
    let object_count = document.objects.len();
    let diagnostic_count = document.diagnostics.len();
    let first_page_streams = document
        .first_page_contents()
        .map_or(0, |contents| contents.len());
    let tagged_structure =
        tagged_structure_report(&document.tagged_structure(ParseConfig::default()));
    match format {
        ReportFormat::Json => {
            // Only the JSON arm needs the serializable report, so it is built
            // here and takes ownership of `tagged_structure` without a clone.
            let report = InspectReport {
                file: fingerprint,
                object_count,
                diagnostic_count,
                first_page_streams,
                tagged_structure,
            };
            to_json_pretty(&report).unwrap_or_else(|error| format!("{{\"error\":\"{error}\"}}"))
        }
        ReportFormat::Md => format!(
            "# PDF Inspect\n\n- File: `{}`\n- Objects: {}\n- Diagnostics: {}\n- First-page streams: {}\n- Tagged structure: {} elements, {} MCIDs\n",
            fingerprint,
            object_count,
            diagnostic_count,
            first_page_streams,
            tagged_structure.element_count,
            tagged_structure.mcid_count
        ),
        ReportFormat::Html => format!(
            "<!doctype html><meta charset=\"utf-8\"><pre># PDF Inspect\n\n- File: `{}`\n- Objects: {}\n- Diagnostics: {}\n- First-page streams: {}\n- Tagged structure: {} elements, {} MCIDs\n</pre>",
            escape_html(fingerprint),
            object_count,
            diagnostic_count,
            first_page_streams,
            tagged_structure.element_count,
            tagged_structure.mcid_count
        ),
    }
}

/// Renders the extracted content of a semantic document in the requested
/// `format`.
///
/// * JSON — counts plus per-table cell text, serialized with `to_json_pretty`;
///   a serialization failure yields an `{"error":"..."}` object.
/// * Markdown — one bullet per node that has normalized text, followed by a
///   table summary line when that node also carries table data.
/// * HTML — the Markdown output, HTML-escaped and wrapped in `<pre>`.
fn render_extract_report(
    document: &pdf_semantic::SemanticDocument,
    format: ReportFormat,
) -> String {
    match format {
        ReportFormat::Html => {
            let body = escape_html(&render_extract_report(document, ReportFormat::Md));
            format!("<!doctype html><meta charset=\"utf-8\"><pre>{body}</pre>")
        }
        ReportFormat::Md => {
            let mut rendered =
                format!("# Extracted Text\n\nFile: `{}`\n\n", document.fingerprint);
            if let Some(structure) = document.tagged_structure.as_ref() {
                rendered += &format!(
                    "Tagged structure: {} elements, {} MCIDs\n\n",
                    structure.element_count, structure.mcid_count
                );
            }
            for node in &document.nodes {
                // Nodes without text are skipped entirely, table data included.
                let Some(text) = node.normalized_text.as_ref() else {
                    continue;
                };
                rendered += &format!("- {text}\n");
                let Some(table) = node.table.as_ref() else {
                    continue;
                };
                rendered += &format!(
                    "  table: {} rows x {} columns, confidence {:.2}\n",
                    table.rows.len(),
                    table.column_x_positions.len(),
                    table.confidence
                );
            }
            rendered
        }
        ReportFormat::Json => {
            let tables = extract_table_reports(document);
            let table_cells = tables
                .iter()
                .flat_map(|table| table.cells.iter())
                .map(Vec::len)
                .sum();
            let report = ExtractReport {
                file: &document.fingerprint,
                paragraphs: document.nodes.len(),
                table_candidates: tables.len(),
                table_cells,
                tables,
                diagnostic_count: document.diagnostics.len(),
                tagged_structure: document
                    .tagged_structure
                    .as_ref()
                    .map(tagged_structure_report_from_semantic),
            };
            match to_json_pretty(&report) {
                Ok(json) => json,
                Err(error) => format!("{{\"error\":\"{error}\"}}"),
            }
        }
    }
}

/// Builds one `ExtractTableReport` per node that is a table candidate and
/// actually carries table data, preserving document order.
///
/// Each report records the node id, page index, row count, column count
/// (number of column x-positions), the cell texts row by row, and the table's
/// confidence value.
fn extract_table_reports(document: &pdf_semantic::SemanticDocument) -> Vec<ExtractTableReport> {
    let mut reports = Vec::new();
    for node in document.nodes.iter() {
        if node.kind != pdf_semantic::SemanticNodeKind::TableCandidate {
            continue;
        }
        let Some(table) = node.table.as_ref() else {
            continue;
        };
        reports.push(ExtractTableReport {
            node_id: node.id.clone(),
            page: node.page_index,
            rows: table.rows.len(),
            columns: table.column_x_positions.len(),
            cells: table
                .rows
                .iter()
                .map(|row| row.cells.iter().map(|cell| cell.text.clone()).collect())
                .collect(),
            confidence: table.confidence,
        });
    }
    reports
}

/// Escapes `value` for safe inclusion in HTML.
///
/// Replaces `&`, `<`, `>`, `"` and `'` with their entity forms in a single
/// pass. Escaping both quote characters keeps the result safe inside quoted
/// attribute values as well as in element content; all other characters are
/// copied through unchanged.
fn escape_html(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for character in value.chars() {
        match character {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}

// Unit tests for the CLI pipeline. The PDF fixtures are assembled in memory by
// the helper functions at the bottom of this module; they consist of a header
// and objects only (no cross-reference table or trailer).
#[cfg(test)]
mod tests {
    use super::*;

    // Changing the text of the single text run is reported as one modification.
    #[test]
    fn diffs_minimal_pdf_text() {
        let old_pdf = minimal_pdf("Hello");
        let new_pdf = minimal_pdf("Hello world");
        let diff = diff_pdf_bytes("old", &old_pdf, "new", &new_pdf, DiffConfig::default())
            .expect("minimal vertical slice should diff");

        assert_eq!(diff.summary.modified, 1);
    }

    // The page's text object is split across two content streams; the expected
    // node text joins both parts ("Hello" + the second stream's word).
    #[test]
    fn diffs_text_across_multiple_content_streams() {
        let old_pdf = multi_stream_pdf("world");
        let new_pdf = multi_stream_pdf("there");
        let diff = diff_pdf_bytes("old", &old_pdf, "new", &new_pdf, DiffConfig::default())
            .expect("multi-stream vertical slice should diff");

        assert_eq!(diff.summary.modified, 1);
        assert_eq!(
            diff.changes[0].old_node.as_ref().unwrap().text.as_deref(),
            Some("Hello world")
        );
        assert_eq!(
            diff.changes[0].new_node.as_ref().unwrap().text.as_deref(),
            Some("Hello there")
        );
    }

    // Text from both pages of a two-page document must appear in the output.
    #[test]
    fn extracts_text_across_multiple_pages() {
        let semantic =
            semantic_document_from_pdf("sample", &multi_page_pdf(), ParseConfig::default())
                .expect("multi-page extraction should succeed");

        let extracted = semantic
            .nodes
            .iter()
            .filter_map(|node| node.normalized_text.as_deref())
            .collect::<Vec<_>>()
            .join("\n");

        assert!(extracted.contains("First page"));
        assert!(extracted.contains("Second page"));
    }

    // A page that only draws an image yields no nodes and a
    // MISSING_TEXT_LAYER diagnostic.
    #[test]
    fn reports_image_only_pdf_as_missing_text_layer() {
        let semantic =
            semantic_document_from_pdf("image-only", &image_only_pdf(), ParseConfig::default())
                .expect("image-only extraction should complete with diagnostics");

        assert!(semantic.nodes.is_empty());
        assert!(
            semantic
                .diagnostics
                .iter()
                .any(|diagnostic| diagnostic.code == "MISSING_TEXT_LAYER")
        );
    }

    // Two documents that differ only in the image stream payload must produce
    // an ObjectChanged entry, must not emit UNSUPPORTED_IMAGE_DIFF, and still
    // report the missing text layer.
    #[test]
    fn reports_image_payload_changes() {
        let diff = diff_pdf_bytes(
            "old",
            &image_payload_pdf(b"x"),
            "new",
            &image_payload_pdf(b"y"),
            DiffConfig::default(),
        )
        .expect("image payload diff should complete");

        assert!(
            diff.changes
                .iter()
                .any(|change| change.kind == ChangeKind::ObjectChanged
                    && change.reason.contains("image payload differs"))
        );
        assert!(
            diff.diagnostics
                .iter()
                .all(|diagnostic| diagnostic.code != "UNSUPPORTED_IMAGE_DIFF")
        );
        assert!(
            diff.diagnostics
                .iter()
                .any(|diagnostic| diagnostic.code == "MISSING_TEXT_LAYER")
        );
    }

    // Path-drawing operators next to text are flagged with
    // UNSUPPORTED_VECTOR_GRAPHIC_DIFF.
    #[test]
    fn reports_vector_graphics_as_unsupported_diff_surface() {
        let semantic =
            semantic_document_from_pdf("vector", &vector_graphics_pdf(), ParseConfig::default())
                .expect("vector extraction should complete with diagnostics");

        assert!(
            semantic
                .diagnostics
                .iter()
                .any(|diagnostic| diagnostic.code == "UNSUPPORTED_VECTOR_GRAPHIC_DIFF")
        );
    }

    // A Type0 font whose dictionary has no /ToUnicode entry is flagged with
    // MISSING_TOUNICODE_CID_FONT.
    #[test]
    fn reports_cid_font_without_tounicode() {
        let semantic = semantic_document_from_pdf(
            "cid-font",
            &cid_font_without_tounicode_pdf(),
            ParseConfig::default(),
        )
        .expect("CID-font extraction should complete with diagnostics");

        assert!(
            semantic
                .diagnostics
                .iter()
                .any(|diagnostic| diagnostic.code == "MISSING_TOUNICODE_CID_FONT")
        );
    }

    // The tagged fixture has one /P structure element and one MCID; both the
    // diagnostics and the parsed structure summary must reflect that.
    #[test]
    fn reports_tagged_pdf_structure_markers() {
        let semantic = semantic_document_from_pdf("tagged", &tagged_pdf(), ParseConfig::default())
            .expect("tagged PDF extraction should complete with diagnostics");

        assert!(
            semantic
                .diagnostics
                .iter()
                .any(|diagnostic| diagnostic.code == "TAGGED_PDF_STRUCTURE_DETECTED")
        );
        assert!(
            semantic
                .diagnostics
                .iter()
                .any(|diagnostic| diagnostic.code == "TAGGED_MCID_DETECTED")
        );
        let tagged_structure = semantic
            .tagged_structure
            .as_ref()
            .expect("simple structure tree should be parsed");
        assert_eq!(tagged_structure.element_count, 1);
        assert_eq!(tagged_structure.mcid_count, 1);
        assert_eq!(tagged_structure.structure_types, vec!["P".to_owned()]);
    }

    // NOTE(review): this asserts `under_target`, i.e. a wall-clock bound on the
    // whole benchmark, so the result depends on how fast the machine running
    // the tests is.
    #[test]
    fn synthetic_benchmark_reports_all_m8_t5_phases() {
        let report = run_synthetic_benchmark(50).expect("benchmark should run");

        assert_eq!(report.pages, 50);
        assert!(report.under_target);
        assert!(report.timings_ms.total <= report.target_total_ms);
        assert!(report.summary.modified >= 1);
    }

    // The JSON inspect report exposes an `object_count` key.
    #[test]
    fn inspect_report_includes_object_count() {
        let parsed = pdf_core::PdfDocument::parse(minimal_pdf("Hello").as_slice()).unwrap();
        let json = render_inspect_report("sample.pdf", &parsed, ReportFormat::Json);
        assert!(json.contains("\"object_count\""));
    }

    // The JSON inspect report carries the tagged-structure summary fields.
    #[test]
    fn inspect_report_includes_tagged_structure_summary() {
        let parsed = pdf_core::PdfDocument::parse(tagged_pdf().as_slice()).unwrap();
        let json = render_inspect_report("tagged.pdf", &parsed, ReportFormat::Json);
        let value: serde_json::Value = serde_json::from_str(&json).expect("JSON should parse");

        assert_eq!(value["tagged_structure"]["detected"], true);
        assert_eq!(value["tagged_structure"]["element_count"], 1);
        assert_eq!(value["tagged_structure"]["mcid_count"], 1);
        assert_eq!(value["tagged_structure"]["structure_types"][0], "P");
    }

    // Quotes and backslashes in the file label must survive a JSON round trip.
    #[test]
    fn inspect_json_escapes_file_name() {
        let parsed = pdf_core::PdfDocument::parse(minimal_pdf("Hello").as_slice()).unwrap();
        let json =
            render_inspect_report("sample \"quoted\" \\ file.pdf", &parsed, ReportFormat::Json);
        let value: serde_json::Value = serde_json::from_str(&json).expect("JSON should parse");

        assert_eq!(value["file"], "sample \"quoted\" \\ file.pdf");
    }

    // The Markdown extract report lists each text node as a bullet.
    #[test]
    fn extract_report_lists_text() {
        let semantic =
            semantic_document_from_pdf("sample", &minimal_pdf("Hello"), ParseConfig::default())
                .expect("extract should succeed");
        let markdown = render_extract_report(&semantic, ReportFormat::Md);
        assert!(markdown.contains("- Hello"));
    }

    // Four runs laid out on a 2x2 grid are expected to become one table
    // candidate, visible in both the JSON and the Markdown report.
    #[test]
    fn extract_report_serializes_table_candidate_evidence() {
        let semantic = pdf_semantic::build_semantic_document(
            "table",
            &[
                text_run("a1", "A1", 10.0, 100.0),
                text_run("a2", "A2", 70.0, 100.0),
                text_run("b1", "B1", 10.0, 84.0),
                text_run("b2", "B2", 70.0, 84.0),
            ],
            Vec::new(),
        );
        let json = render_extract_report(&semantic, ReportFormat::Json);
        let value: serde_json::Value =
            serde_json::from_str(&json).expect("extract JSON should parse");

        assert_eq!(value["table_candidates"], 1);
        assert_eq!(value["table_cells"], 4);
        assert_eq!(value["tables"][0]["rows"], 2);
        assert_eq!(value["tables"][0]["columns"], 2);
        assert_eq!(value["tables"][0]["cells"][1][1], "B2");

        let markdown = render_extract_report(&semantic, ReportFormat::Md);
        assert!(markdown.contains("table: 2 rows x 2 columns"));
    }

    // Builds a text run on page 0 with a 10 x 12 bounding box whose lower-left
    // corner is at (x, y); raw and normalized text are the same string.
    fn text_run(id: &str, text: &str, x: f32, y: f32) -> pdf_text::TextRun {
        pdf_text::TextRun {
            id: id.to_owned(),
            text: text.to_owned(),
            normalized_text: text.to_owned(),
            glyphs: Vec::new(),
            bbox: Rect {
                x0: x,
                y0: y,
                x1: x + 10.0,
                y1: y + 12.0,
            },
            source: Provenance {
                page_index: Some(0),
                ..Provenance::unknown()
            },
            marked_content: None,
        }
    }

    // A two-entry bfchar section maps the codes 0x0026 and 0x004f to "C" and "l".
    #[test]
    fn parses_and_applies_tounicode_cmap() {
        let cmap = parse_tounicode_cmap_with_diagnostics(
            b"2 beginbfchar\n<0026> <0043>\n<004f> <006c>\nendbfchar\n",
        );

        assert_eq!(
            decode_with_tounicode(&[0x00, 0x26, 0x00, 0x4f], &cmap.map).as_deref(),
            Some("Cl")
        );
    }

    // Covers a contiguous bfrange, an array-form bfrange, and a range whose
    // array is shorter than the range, which must raise CMAP_UNSUPPORTED_RANGE.
    #[test]
    fn parses_tounicode_bfrange_and_reports_unsupported_syntax() {
        let cmap = parse_tounicode_cmap_with_diagnostics(
            b"1 beginbfrange\n<0001> <0003> <0041>\n<0004> <0005> [<0058> <0059>]\n<0006> <0008> [<005a>]\nendbfrange\n",
        );

        assert_eq!(
            decode_with_tounicode(&[0, 1, 0, 2, 0, 3], &cmap.map).as_deref(),
            Some("ABC")
        );
        assert_eq!(
            decode_with_tounicode(&[0, 4, 0, 5], &cmap.map).as_deref(),
            Some("XY")
        );
        assert!(
            cmap.diagnostics
                .iter()
                .any(|diagnostic| diagnostic.code == "CMAP_UNSUPPORTED_RANGE")
        );
    }

    // Name/reference pairs are returned in dictionary order, even when the
    // entries are not separated by whitespace.
    #[test]
    fn finds_font_resource_references_for_tounicode_maps() {
        let refs = named_references("<</LQYSYM 18 0 R/KFDXKX 22 0 R>>");

        assert_eq!(
            refs,
            vec![
                (
                    "LQYSYM".into(),
                    ObjectId {
                        number: 18,
                        generation: 0
                    }
                ),
                (
                    "KFDXKX".into(),
                    ObjectId {
                        number: 22,
                        generation: 0
                    }
                )
            ]
        );
    }

    // Writes one valid and one invalid file into a scratch folder under
    // `target/` (relative to the working directory) and checks the report
    // totals. `a.pdf` is expected first although `b.pdf` is written first.
    // The final cleanup only runs when every assertion above it passes; the
    // best-effort removal at the start clears leftovers from earlier runs.
    #[test]
    fn corpus_report_lists_files_and_diagnostic_counts() {
        let folder = PathBuf::from("target/spdfdiff_cli_tests/corpus_report");
        let _ = std::fs::remove_dir_all(&folder);
        std::fs::create_dir_all(&folder).expect("fixture folder should be created");
        std::fs::write(folder.join("b.pdf"), minimal_pdf("Hello"))
            .expect("valid fixture should be written");
        std::fs::write(folder.join("a.pdf"), b"not a pdf")
            .expect("invalid fixture should be written");

        let report = build_corpus_report(&folder, ParseConfig::default())
            .expect("corpus report should render");
        let value: serde_json::Value =
            serde_json::from_str(&report).expect("corpus JSON should parse");

        assert_eq!(value["folder"], "corpus_report");
        assert_eq!(value["total"], 2);
        assert_eq!(value["parsed"], 1);
        assert_eq!(value["partial"], 1);
        assert_eq!(value["failed"], 1);
        assert_eq!(value["files"][0]["file"], "a.pdf");
        assert_eq!(value["files"][1]["file"], "b.pdf");
        assert_eq!(value["diagnostic_counts"]["MISSING_TOUNICODE"], 1);

        std::fs::remove_dir_all(&folder).expect("fixture folder should be removed");
    }

    // The manifest requires a file that is never written and asks for three
    // parsed files while only two exist, so the gate must fail with exactly
    // two failures; the single diff pair must still be evaluated.
    #[test]
    fn corpus_report_evaluates_manifest_gate_and_diff_pairs() {
        let folder = PathBuf::from("target/spdfdiff_cli_tests/corpus_manifest");
        let _ = std::fs::remove_dir_all(&folder);
        std::fs::create_dir_all(&folder).expect("fixture folder should be created");
        std::fs::write(folder.join("old.pdf"), minimal_pdf("Hello"))
            .expect("old fixture should be written");
        std::fs::write(folder.join("new.pdf"), minimal_pdf("Hello world"))
            .expect("new fixture should be written");
        let manifest = CorpusManifest {
            schema_version: "1".to_owned(),
            required_files: vec![
                "old.pdf".to_owned(),
                "new.pdf".to_owned(),
                "missing.pdf".to_owned(),
            ],
            diff_pairs: vec![CorpusManifestDiffPair {
                name: "fixture".to_owned(),
                old_file: "old.pdf".to_owned(),
                new_file: "new.pdf".to_owned(),
            }],
            thresholds: CorpusGateThresholds {
                min_parsed_files: Some(3),
                max_missing_required_files: 0,
                max_failed_files: 0,
                max_failed_diff_pairs: 0,
            },
        };

        let report = build_corpus_report_model(&folder, ParseConfig::default(), Some(&manifest))
            .expect("manifest corpus report should render");
        assert_eq!(report.diff_pairs.len(), 1);
        assert_eq!(report.diff_pairs[0].changes, 1);
        let gate = report.gate.expect("manifest should produce gate report");
        assert!(!gate.passed);
        assert_eq!(gate.missing_required_files, vec!["missing.pdf"]);
        assert_eq!(gate.failures.len(), 2);

        std::fs::remove_dir_all(&folder).expect("fixture folder should be removed");
    }

    // Single-page fixture whose content stream shows `text` with one `Tj`.
    fn minimal_pdf(text: &str) -> Vec<u8> {
        format!("%PDF-1.7\n1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n3 0 obj\n<< /Type /Page /Parent 2 0 R /Contents 4 0 R >>\nendobj\n4 0 obj\n<< /Length {} >>\nstream\nBT /F1 12 Tf 72 720 Td ({text}) Tj ET\nendstream\nendobj\n", text.len() + 32).into_bytes()
    }

    // Single-page fixture whose /Contents is an array of two streams: the first
    // opens the text object and shows "Hello", the second shows `second_text`
    // and closes the text object.
    fn multi_stream_pdf(second_text: &str) -> Vec<u8> {
        format!(
            "%PDF-1.7
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /Contents [4 0 R 5 0 R] >>
endobj
4 0 obj
<< /Length 33 >>
stream
BT /F1 12 Tf 72 720 Td (Hello) Tj
endstream
endobj
5 0 obj
<< /Length {} >>
stream
({second_text}) Tj ET
endstream
endobj
",
            second_text.len() + 9
        )
        .into_bytes()
    }

    // Two-page fixture with one text line per page.
    // NOTE(review): the declared /Length values (43 and 44) look larger than
    // the stream contents (41 and 42 bytes by my count); presumably the parser
    // does not rely on /Length being exact — confirm before tightening it.
    fn multi_page_pdf() -> Vec<u8> {
        "%PDF-1.7
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [3 0 R 5 0 R] /Count 2 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /Contents 4 0 R >>
endobj
4 0 obj
<< /Length 43 >>
stream
BT /F1 12 Tf 72 720 Td (First page) Tj ET
endstream
endobj
5 0 obj
<< /Type /Page /Parent 2 0 R /Contents 6 0 R >>
endobj
6 0 obj
<< /Length 44 >>
stream
BT /F1 12 Tf 72 720 Td (Second page) Tj ET
endstream
endobj
"
        .as_bytes()
        .to_vec()
    }

    // Image-only fixture: the image payload fixture with the payload `x`.
    fn image_only_pdf() -> Vec<u8> {
        image_payload_pdf(b"x")
    }

    // Single-page fixture that paints one image XObject and has no text. The
    // template's placeholder payload `x` is swapped for `payload` (converted
    // lossily to UTF-8); the declared image /Length stays 1 regardless of the
    // payload size.
    fn image_payload_pdf(payload: &[u8]) -> Vec<u8> {
        let payload_text = String::from_utf8_lossy(payload);
        "%PDF-1.7
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /Resources << /XObject << /Im1 5 0 R >> >> /Contents 4 0 R >>
endobj
4 0 obj
<< /Length 21 >>
stream
q 10 0 0 10 0 0 cm /Im1 Do Q
endstream
endobj
5 0 obj
<< /Type /XObject /Subtype /Image /Width 1 /Height 1 /ColorSpace /DeviceGray /BitsPerComponent 8 /Length 1 >>
stream
x
endstream
endobj
"
        .replace("stream\nx\nendstream", &format!("stream\n{payload_text}\nendstream"))
        .into_bytes()
    }

    // Single-page fixture with a text line followed by a stroked line segment.
    fn vector_graphics_pdf() -> Vec<u8> {
        "%PDF-1.7
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /Contents 4 0 R >>
endobj
4 0 obj
<< /Length 44 >>
stream
BT /F1 12 Tf 72 720 Td (Chart) Tj ET 0 0 m 10 10 l S
endstream
endobj
"
        .as_bytes()
        .to_vec()
    }

    // Single-page fixture whose /F1 font is a Type0 font with a CIDFontType2
    // descendant and no /ToUnicode entry.
    fn cid_font_without_tounicode_pdf() -> Vec<u8> {
        "%PDF-1.7
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 5 0 R >> >> /Contents 4 0 R >>
endobj
4 0 obj
<< /Length 38 >>
stream
BT /F1 12 Tf 72 720 Td (Hello) Tj ET
endstream
endobj
5 0 obj
<< /Type /Font /Subtype /Type0 /BaseFont /CIDFont /DescendantFonts [6 0 R] >>
endobj
6 0 obj
<< /Type /Font /Subtype /CIDFontType2 /BaseFont /CIDFont >>
endobj
"
        .as_bytes()
        .to_vec()
    }

    // Single-page tagged fixture: the catalog points at a structure tree root
    // with one /P element, and the page content wraps its text in a marked
    // content sequence with MCID 0. Object number 5 is intentionally or
    // accidentally unused (the numbering jumps from 4 to 6).
    fn tagged_pdf() -> Vec<u8> {
        "%PDF-1.7
1 0 obj
<< /Type /Catalog /Pages 2 0 R /StructTreeRoot 6 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /StructParents 0 /Contents 4 0 R >>
endobj
4 0 obj
<< /Length 61 >>
stream
BT /P << /MCID 0 >> BDC /F1 12 Tf 72 720 Td (Tagged) Tj EMC ET
endstream
endobj
6 0 obj
<< /Type /StructTreeRoot /K [7 0 R] >>
endobj
7 0 obj
<< /Type /StructElem /S /P /K 0 /Pg 3 0 R >>
endobj
"
        .as_bytes()
        .to_vec()
    }
}