provenant-cli 0.0.33

// SPDX-FileCopyrightText: Provenant contributors
// SPDX-License-Identifier: Apache-2.0

use std::collections::{BTreeMap, BTreeSet, HashMap};
use std::fs;
use std::path::{Path, PathBuf};

use anyhow::{Context, Result, bail};
use serde::Serialize;
use serde_json::{Map, Value, json};

use crate::utils::spdx::combine_license_expressions;
use crate::version::BUILD_VERSION;

const COMPARISON_MODE: &str = "direct_json";
const FILES_COUNT_SOURCE: &str = "files[]";
const PACKAGES_COUNT_SOURCE: &str = "packages[]";
const PACKAGE_DATA_COUNT_SOURCE: &str = "packages[] empty; files[].package_data present";
const DEPENDENCIES_COUNT_SOURCE: &str = "dependencies[]";
const PACKAGE_DATA_DEPENDENCIES_COUNT_SOURCE: &str =
    "dependencies[] empty; files[].package_data[].dependencies present";
const LICENSE_DETECTIONS_COUNT_SOURCE: &str = "license_detections[]";
const LICENSE_REFERENCES_COUNT_SOURCE: &str = "license_references[]";
const LICENSE_RULE_REFERENCES_COUNT_SOURCE: &str = "license_rule_references[]";

#[derive(Debug, Clone)]
pub(crate) struct CompareArtifactLayout {
    pub artifact_dir: PathBuf,
    pub raw_dir: PathBuf,
    pub scancode_json: PathBuf,
    pub provenant_json: PathBuf,
    pub comparison_dir: PathBuf,
    pub samples_dir: PathBuf,
    pub summary_json: PathBuf,
    pub summary_tsv: PathBuf,
    pub manifest_path: PathBuf,
}

#[derive(Debug, Clone)]
pub(crate) struct CompareCommandResult {
    pub comparison_status: String,
    pub artifact_dir: PathBuf,
    pub scancode_json: PathBuf,
    pub provenant_json: PathBuf,
    pub summary_json: PathBuf,
    pub summary_tsv: PathBuf,
    pub samples_dir: PathBuf,
    pub manifest_path: PathBuf,
}

#[derive(Debug, Serialize)]
struct CompareManifest {
    mode: &'static str,
    tool_version: &'static str,
    created_at: String,
    inputs: CompareInputManifest,
    artifacts: CompareArtifactManifest,
}

#[derive(Debug, Serialize)]
struct CompareInputManifest {
    scancode_json_source: PathBuf,
    provenant_json_source: PathBuf,
}

#[derive(Debug, Serialize)]
struct CompareArtifactManifest {
    artifact_dir: PathBuf,
    raw_dir: PathBuf,
    scancode_json: PathBuf,
    provenant_json: PathBuf,
    comparison_dir: PathBuf,
    summary_json: PathBuf,
    summary_tsv: PathBuf,
    samples_dir: PathBuf,
}

#[derive(Debug, Serialize)]
struct ValueCountEntry {
    value: String,
    count: usize,
}

#[derive(Debug, Serialize)]
struct CountDeltaEntry {
    path: String,
    scancode: usize,
    provenant: usize,
    delta: isize,
    scancode_sample_values: Vec<String>,
    provenant_sample_values: Vec<String>,
}

#[derive(Debug, Serialize)]
struct ValueDifferenceEntry {
    path: String,
    scancode: usize,
    provenant: usize,
    missing_in_provenant: Vec<ValueCountEntry>,
    extra_in_provenant: Vec<ValueCountEntry>,
}

#[derive(Debug, Serialize)]
struct ScalarDifferenceEntry {
    path: String,
    scancode: Option<String>,
    provenant: Option<String>,
}

#[derive(Debug, Serialize)]
struct TopLevelSectionDifferenceEntry {
    section: String,
    scancode: Option<Value>,
    provenant: Option<Value>,
}

#[derive(Debug, Clone)]
struct TopLevelCounts {
    counts: HashMap<&'static str, i64>,
    sources: HashMap<&'static str, &'static str>,
}

impl TopLevelCounts {
    fn count(&self, key: &str) -> i64 {
        *self.counts.get(key).expect("top-level count exists")
    }

    fn source(&self, key: &str) -> &'static str {
        self.sources
            .get(key)
            .copied()
            .expect("top-level count source exists")
    }

    fn counts_json(&self) -> BTreeMap<String, i64> {
        self.counts
            .iter()
            .map(|(key, value)| ((*key).to_string(), *value))
            .collect()
    }

    fn sources_json(&self) -> BTreeMap<String, String> {
        self.sources
            .iter()
            .map(|(key, value)| ((*key).to_string(), (*value).to_string()))
            .collect()
    }
}

pub(crate) fn compare_json_files(
    scancode_source: &Path,
    provenant_source: &Path,
    artifact_dir: Option<&Path>,
) -> Result<CompareCommandResult> {
    validate_json_input(scancode_source, "--scancode-json")?;
    validate_json_input(provenant_source, "--provenant-json")?;

    let artifact_dir = resolve_artifact_dir(artifact_dir)?;
    let layout = prepare_layout(&artifact_dir)?;
    materialize_file(scancode_source, &layout.scancode_json)?;
    materialize_file(provenant_source, &layout.provenant_json)?;

    let summary =
        write_comparison_artifacts(&layout.scancode_json, &layout.provenant_json, &layout, &[])?;
    write_manifest(scancode_source, provenant_source, &layout)?;

    let comparison_status = summary
        .get("comparison_status")
        .and_then(Value::as_str)
        .unwrap_or("unknown")
        .to_string();

    Ok(CompareCommandResult {
        comparison_status,
        artifact_dir: layout.artifact_dir.clone(),
        scancode_json: layout.scancode_json.clone(),
        provenant_json: layout.provenant_json.clone(),
        summary_json: layout.summary_json.clone(),
        summary_tsv: layout.summary_tsv.clone(),
        samples_dir: layout.samples_dir.clone(),
        manifest_path: layout.manifest_path.clone(),
    })
}

fn resolve_artifact_dir(artifact_dir: Option<&Path>) -> Result<PathBuf> {
    if let Some(artifact_dir) = artifact_dir {
        return Ok(artifact_dir.to_path_buf());
    }

    let cwd = std::env::current_dir().context("failed to determine current working directory")?;
    let timestamp = chrono::Utc::now().format("%Y%m%dT%H%M%SZ");
    Ok(cwd.join(format!("provenant-compare-{timestamp}")))
}

pub(crate) fn write_comparison_artifacts(
    scancode_json_path: &Path,
    provenant_json_path: &Path,
    layout: &CompareArtifactLayout,
    scan_args: &[String],
) -> Result<Value> {
    let scancode: Value =
        serde_json::from_str(&fs::read_to_string(scancode_json_path).with_context(|| {
            format!(
                "failed to read ScanCode JSON {}",
                scancode_json_path.display()
            )
        })?)?;
    let provenant: Value =
        serde_json::from_str(&fs::read_to_string(provenant_json_path).with_context(|| {
            format!(
                "failed to read Provenant JSON {}",
                provenant_json_path.display()
            )
        })?)?;

    let scancode_files = files_by_path(&scancode);
    let provenant_files = files_by_path(&provenant);
    let scancode_resources = resources_by_path(&scancode);
    let provenant_resources = resources_by_path(&provenant);
    let scancode_paths: BTreeSet<String> = scancode_files.keys().cloned().collect();
    let provenant_paths: BTreeSet<String> = provenant_files.keys().cloned().collect();
    let scancode_resource_paths: BTreeSet<String> = scancode_resources.keys().cloned().collect();
    let provenant_resource_paths: BTreeSet<String> = provenant_resources.keys().cloned().collect();
    let common_paths: Vec<String> = scancode_paths
        .intersection(&provenant_paths)
        .cloned()
        .collect();
    let scancode_only_output_paths: Vec<String> = scancode_paths
        .difference(&provenant_paths)
        .cloned()
        .collect();
    let provenant_only_output_paths: Vec<String> = provenant_paths
        .difference(&scancode_paths)
        .cloned()
        .collect();
    let common_resource_paths: Vec<String> = scancode_resource_paths
        .intersection(&provenant_resource_paths)
        .cloned()
        .collect();
    let scancode_only_output_resource_paths: Vec<String> = scancode_resource_paths
        .difference(&provenant_resource_paths)
        .cloned()
        .collect();
    let provenant_only_output_resource_paths: Vec<String> = provenant_resource_paths
        .difference(&scancode_resource_paths)
        .cloned()
        .collect();
    let only_findings_active = compare_uses_only_findings(scan_args, &scancode, &provenant);
    let path_presence_note = only_findings_active.then_some(
        "This compare run used --only-findings. Path-presence buckets reflect final filtered outputs, not proven scan coverage gaps: a missing path may simply have had no findings after filtering.",
    );

    let metrics = [
        "license_detections",
        "license_clues",
        "license_policy",
        "package_data",
        "copyrights",
        "holders",
        "authors",
        "emails",
        "urls",
        "scan_errors",
    ];
    let info_metrics = [
        "mime_type",
        "file_type",
        "programming_language",
        "sha1",
        "md5",
        "sha256",
        "sha1_git",
        "is_binary",
        "is_text",
        "is_archive",
        "is_media",
        "is_source",
        "is_script",
        "files_count",
        "dirs_count",
        "size_count",
        "source_count",
    ];
    let classify_metrics = [
        "is_legal",
        "is_manifest",
        "is_readme",
        "is_top_level",
        "is_key_file",
        "is_community",
    ];
    let row2_value_metrics = ["facets", "tallies"];
    let row2_top_level_sections = [
        "summary",
        "tallies",
        "tallies_of_key_files",
        "tallies_by_facet",
    ];

    let info_mode = scan_args
        .iter()
        .any(|arg| matches!(arg.as_str(), "--info" | "--mark-source"))
        || resources_contain_any_field(&scancode_resources, &info_metrics)
        || resources_contain_any_field(&provenant_resources, &info_metrics);
    let row2_mode = scan_args.iter().any(|arg| {
        matches!(
            arg.as_str(),
            "--classify"
                | "--summary"
                | "--license-clarity-score"
                | "--tallies"
                | "--tallies-key-files"
                | "--tallies-with-details"
                | "--tallies-by-facet"
                | "--facet"
        )
    }) || resources_contain_any_field(&scancode_resources, &classify_metrics)
        || resources_contain_any_field(&provenant_resources, &classify_metrics)
        || resources_contain_any_field(&scancode_resources, &row2_value_metrics)
        || resources_contain_any_field(&provenant_resources, &row2_value_metrics)
        || value_contains_any_section(&scancode, &row2_top_level_sections)
        || value_contains_any_section(&provenant, &row2_top_level_sections);

    let mut lower_counts: BTreeMap<String, Vec<CountDeltaEntry>> = metrics
        .iter()
        .map(|metric| ((*metric).to_string(), Vec::new()))
        .collect();
    let mut higher_counts: BTreeMap<String, Vec<CountDeltaEntry>> = metrics
        .iter()
        .map(|metric| ((*metric).to_string(), Vec::new()))
        .collect();
    let mut value_differences: BTreeMap<String, Vec<ValueDifferenceEntry>> = metrics
        .iter()
        .map(|metric| ((*metric).to_string(), Vec::new()))
        .collect();
    let mut info_value_differences: BTreeMap<String, Vec<ScalarDifferenceEntry>> = info_metrics
        .iter()
        .map(|metric| ((*metric).to_string(), Vec::new()))
        .collect();
    let mut classify_value_differences: BTreeMap<String, Vec<ScalarDifferenceEntry>> =
        classify_metrics
            .iter()
            .map(|metric| ((*metric).to_string(), Vec::new()))
            .collect();
    let mut row2_value_differences: BTreeMap<String, Vec<ScalarDifferenceEntry>> =
        row2_value_metrics
            .iter()
            .map(|metric| ((*metric).to_string(), Vec::new()))
            .collect();
    let mut row2_top_level_differences = Vec::new();

    for path in &common_paths {
        let scancode_file = scancode_files.get(path).expect("common path exists");
        let provenant_file = provenant_files.get(path).expect("common path exists");
        for metric in metrics {
            let sc_count = metric_count(scancode_file, metric);
            let pr_count = metric_count(provenant_file, metric);
            let sc_values = metric_values(scancode_file, metric);
            let pr_values = metric_values(provenant_file, metric);
            if pr_count < sc_count {
                lower_counts
                    .get_mut(metric)
                    .expect("metric bucket exists")
                    .push(CountDeltaEntry {
                        path: path.clone(),
                        scancode: sc_count,
                        provenant: pr_count,
                        delta: pr_count as isize - sc_count as isize,
                        scancode_sample_values: sample_values(&sc_values),
                        provenant_sample_values: sample_values(&pr_values),
                    });
            } else if pr_count > sc_count {
                higher_counts
                    .get_mut(metric)
                    .expect("metric bucket exists")
                    .push(CountDeltaEntry {
                        path: path.clone(),
                        scancode: sc_count,
                        provenant: pr_count,
                        delta: pr_count as isize - sc_count as isize,
                        scancode_sample_values: sample_values(&sc_values),
                        provenant_sample_values: sample_values(&pr_values),
                    });
            }
            let sc_counter = value_counter(&sc_values);
            let pr_counter = value_counter(&pr_values);
            let missing = subtract_counters(&sc_counter, &pr_counter);
            let extra = subtract_counters(&pr_counter, &sc_counter);
            if !missing.is_empty() || !extra.is_empty() {
                value_differences
                    .get_mut(metric)
                    .expect("metric bucket exists")
                    .push(ValueDifferenceEntry {
                        path: path.clone(),
                        scancode: sc_count,
                        provenant: pr_count,
                        missing_in_provenant: counter_entries(&missing),
                        extra_in_provenant: counter_entries(&extra),
                    });
            }
        }
    }

    for path in &common_resource_paths {
        let scancode_resource = scancode_resources
            .get(path)
            .expect("common resource exists");
        let provenant_resource = provenant_resources
            .get(path)
            .expect("common resource exists");
        for metric in info_metrics {
            let scancode_value = scalar_field_value(scancode_resource, metric);
            let provenant_value = scalar_field_value(provenant_resource, metric);
            if scancode_value != provenant_value {
                info_value_differences
                    .get_mut(metric)
                    .expect("metric bucket exists")
                    .push(ScalarDifferenceEntry {
                        path: path.clone(),
                        scancode: scancode_value,
                        provenant: provenant_value,
                    });
            }
        }
        for metric in classify_metrics {
            let scancode_value = classify_scalar_value(scancode_resource, metric);
            let provenant_value = classify_scalar_value(provenant_resource, metric);
            if scancode_value != provenant_value {
                classify_value_differences
                    .get_mut(metric)
                    .expect("metric bucket exists")
                    .push(ScalarDifferenceEntry {
                        path: path.clone(),
                        scancode: scancode_value,
                        provenant: provenant_value,
                    });
            }
        }
        for metric in row2_value_metrics {
            let scancode_value = structured_field_value(scancode_resource, metric);
            let provenant_value = structured_field_value(provenant_resource, metric);
            if scancode_value != provenant_value {
                row2_value_differences
                    .get_mut(metric)
                    .expect("metric bucket exists")
                    .push(ScalarDifferenceEntry {
                        path: path.clone(),
                        scancode: scancode_value,
                        provenant: provenant_value,
                    });
            }
        }
    }

    for section in row2_top_level_sections {
        let scancode_value = canonical_section_value(&scancode, section);
        let provenant_value = canonical_section_value(&provenant, section);
        if scancode_value != provenant_value {
            row2_top_level_differences.push(TopLevelSectionDifferenceEntry {
                section: section.to_string(),
                scancode: scancode_value,
                provenant: provenant_value,
            });
        }
    }

    let sc_top = top_level_counts(&scancode);
    let pr_top = top_level_counts(&provenant);
    let license_deltas = top_level_license_deltas(&scancode, &provenant);
    let top_level_regressions_map = top_level_regressions(&sc_top, &pr_top, true);
    let top_level_higher_counts = top_level_regressions(&pr_top, &sc_top, false);
    let skipped_comparisons = skipped_comparisons(&sc_top, &pr_top);

    let mut file_metric_summary = Map::new();
    let mut rows = vec![];
    for key in [
        "files",
        "packages",
        "dependencies",
        "license_detections",
        "license_references",
        "license_rule_references",
    ] {
        rows.push(tsv_row(
            key,
            sc_top.count(key),
            pr_top.count(key),
            pr_top.count(key) - sc_top.count(key),
            &top_level_count_note(key, &sc_top, &pr_top),
        ));
    }
    rows.push(tsv_row(
        "common_file_paths",
        common_paths.len() as i64,
        common_paths.len() as i64,
        0,
        "paths present in both final outputs",
    ));
    rows.push(tsv_row(
        "scancode_only_output_file_paths",
        scancode_only_output_paths.len() as i64,
        0,
        -(scancode_only_output_paths.len() as i64),
        &output_only_path_note("ScanCode", "file", only_findings_active),
    ));
    rows.push(tsv_row(
        "provenant_only_output_file_paths",
        0,
        provenant_only_output_paths.len() as i64,
        provenant_only_output_paths.len() as i64,
        &output_only_path_note("Provenant", "file", only_findings_active),
    ));
    rows.push(tsv_row(
        "common_resource_paths",
        common_resource_paths.len() as i64,
        common_resource_paths.len() as i64,
        0,
        "resource paths present in both final outputs",
    ));
    rows.push(tsv_row(
        "scancode_only_output_resource_paths",
        scancode_only_output_resource_paths.len() as i64,
        0,
        -(scancode_only_output_resource_paths.len() as i64),
        &output_only_path_note("ScanCode", "resource", only_findings_active),
    ));
    rows.push(tsv_row(
        "provenant_only_output_resource_paths",
        0,
        provenant_only_output_resource_paths.len() as i64,
        provenant_only_output_resource_paths.len() as i64,
        &output_only_path_note("Provenant", "resource", only_findings_active),
    ));

    let mut potential_regressions =
        scancode_only_output_paths.len() + top_level_regressions_map.len();
    let mut potential_higher = provenant_only_output_paths.len() + top_level_higher_counts.len();
    if info_mode {
        potential_regressions += scancode_only_output_resource_paths.len();
        potential_higher += provenant_only_output_resource_paths.len();
    }
    if row2_mode {
        potential_regressions += row2_top_level_differences.len();
    }

    for metric in metrics {
        let missing = value_differences[metric]
            .iter()
            .filter(|entry| !entry.missing_in_provenant.is_empty())
            .count();
        let extra = value_differences[metric]
            .iter()
            .filter(|entry| !entry.extra_in_provenant.is_empty())
            .count();
        file_metric_summary.insert(
            metric.to_string(),
            json!({
                "lower_counts": lower_counts[metric].len(),
                "higher_counts": higher_counts[metric].len(),
                "missing_in_provenant": missing,
                "extra_in_provenant": extra,
            }),
        );
        if metric == "scan_errors" {
            potential_regressions += higher_counts[metric].len();
            potential_regressions += extra;
            potential_higher += missing;
        } else {
            potential_regressions += lower_counts[metric].len();
            potential_higher += higher_counts[metric].len();
            potential_regressions += missing;
            potential_higher += extra;
        }
        rows.push(tsv_row(
            &format!("{metric}_lower_counts"),
            lower_counts[metric].len() as i64,
            0,
            -(lower_counts[metric].len() as i64),
            "common-path files where Provenant count is lower",
        ));
        rows.push(tsv_row(
            &format!("{metric}_higher_counts"),
            0,
            higher_counts[metric].len() as i64,
            higher_counts[metric].len() as i64,
            "common-path files where Provenant count is higher",
        ));
        rows.push(tsv_row(
            &format!("{metric}_missing_in_provenant"),
            missing as i64,
            0,
            -(missing as i64),
            "paths where normalized values exist only in ScanCode output",
        ));
        rows.push(tsv_row(
            &format!("{metric}_extra_in_provenant"),
            0,
            extra as i64,
            extra as i64,
            "paths where normalized values exist only in Provenant output",
        ));
    }

    let mut info_metric_summary = Map::new();
    for metric in info_metrics {
        let differences = info_value_differences[metric].len();
        info_metric_summary.insert(
            metric.to_string(),
            json!({
                "value_differences": differences,
            }),
        );
        if info_mode {
            potential_regressions += differences;
        }
        rows.push(tsv_row(
            &format!("info_{metric}_value_differences"),
            differences as i64,
            differences as i64,
            0,
            "common-path resources where info values differ",
        ));
    }

    let mut classify_metric_summary = Map::new();
    for metric in classify_metrics {
        let differences = classify_value_differences[metric].len();
        classify_metric_summary.insert(
            metric.to_string(),
            json!({
                "value_differences": differences,
            }),
        );
        if row2_mode {
            potential_regressions += differences;
        }
        rows.push(tsv_row(
            &format!("classify_{metric}_value_differences"),
            differences as i64,
            differences as i64,
            0,
            "common-path resources where classify values differ",
        ));
    }

    let mut row2_metric_summary = Map::new();
    for metric in row2_value_metrics {
        let differences = row2_value_differences[metric].len();
        row2_metric_summary.insert(
            metric.to_string(),
            json!({
                "value_differences": differences,
            }),
        );
        if row2_mode {
            potential_regressions += differences;
        }
        rows.push(tsv_row(
            &format!("row2_{metric}_value_differences"),
            differences as i64,
            differences as i64,
            0,
            "common-path resources where row-2 workflow values differ",
        ));
    }

    rows.push(tsv_row(
        "row2_top_level_section_differences",
        row2_top_level_differences.len() as i64,
        row2_top_level_differences.len() as i64,
        0,
        "top-level row-2 workflow sections with normalized JSON differences",
    ));

    let top_level_package_skip_reason = skipped_comparisons.get("packages").cloned();
    let top_level_package_value_differences = top_level_package_differences(&scancode, &provenant);
    let top_level_package_missing = top_level_package_value_differences
        .iter()
        .filter(|entry| !entry.missing_in_provenant.is_empty())
        .map(|entry| entry.missing_in_provenant.len())
        .sum::<usize>();
    let top_level_package_extra = top_level_package_value_differences
        .iter()
        .filter(|entry| !entry.extra_in_provenant.is_empty())
        .map(|entry| entry.extra_in_provenant.len())
        .sum::<usize>();
    let top_level_dependency_skip_reason = skipped_comparisons.get("dependencies").cloned();
    let top_level_dependency_value_differences =
        top_level_dependency_differences(&scancode, &provenant);
    let top_level_dependency_missing = top_level_dependency_value_differences
        .iter()
        .filter(|entry| !entry.missing_in_provenant.is_empty())
        .map(|entry| entry.missing_in_provenant.len())
        .sum::<usize>();
    let top_level_dependency_extra = top_level_dependency_value_differences
        .iter()
        .filter(|entry| !entry.extra_in_provenant.is_empty())
        .map(|entry| entry.extra_in_provenant.len())
        .sum::<usize>();
    let raw_dependency_value_differences = raw_dependency_differences(&scancode, &provenant);
    let raw_dependency_missing = raw_dependency_value_differences
        .iter()
        .filter(|entry| !entry.missing_in_provenant.is_empty())
        .map(|entry| entry.missing_in_provenant.len())
        .sum::<usize>();
    let raw_dependency_extra = raw_dependency_value_differences
        .iter()
        .filter(|entry| !entry.extra_in_provenant.is_empty())
        .map(|entry| entry.extra_in_provenant.len())
        .sum::<usize>();
    let top_level_package_summary = json!({
        "missing_in_provenant": top_level_package_missing,
        "extra_in_provenant": top_level_package_extra,
        "comparison_skipped": top_level_package_skip_reason.is_some(),
        "skip_reason": top_level_package_skip_reason,
    });
    let top_level_dependency_summary = json!({
        "missing_in_provenant": top_level_dependency_missing,
        "extra_in_provenant": top_level_dependency_extra,
        "comparison_skipped": top_level_dependency_skip_reason.is_some(),
        "skip_reason": top_level_dependency_skip_reason,
    });
    let raw_dependency_summary = json!({
        "missing_in_provenant": raw_dependency_missing,
        "extra_in_provenant": raw_dependency_extra,
    });
    file_metric_summary.insert(
        "raw_package_dependencies".to_string(),
        json!({
            "missing_in_provenant": raw_dependency_missing,
            "extra_in_provenant": raw_dependency_extra,
        }),
    );
    if top_level_package_skip_reason.is_none() {
        potential_regressions += top_level_package_missing;
        potential_higher += top_level_package_extra;
    }
    if top_level_dependency_skip_reason.is_none() {
        potential_regressions += top_level_dependency_missing;
        potential_higher += top_level_dependency_extra;
    }
    potential_regressions += raw_dependency_missing;
    potential_higher += raw_dependency_extra;
    rows.push(tsv_row(
        "top_level_packages_missing_in_provenant",
        top_level_package_missing as i64,
        0,
        -(top_level_package_missing as i64),
        top_level_package_skip_reason
            .as_deref()
            .unwrap_or("top-level package identities present only in ScanCode output"),
    ));
    rows.push(tsv_row(
        "top_level_packages_extra_in_provenant",
        0,
        top_level_package_extra as i64,
        top_level_package_extra as i64,
        top_level_package_skip_reason
            .as_deref()
            .unwrap_or("top-level package identities present only in Provenant output"),
    ));
    rows.push(tsv_row(
        "top_level_dependencies_missing_in_provenant",
        top_level_dependency_missing as i64,
        0,
        -(top_level_dependency_missing as i64),
        top_level_dependency_skip_reason
            .as_deref()
            .unwrap_or("top-level dependency identities present only in ScanCode output"),
    ));
    rows.push(tsv_row(
        "top_level_dependencies_extra_in_provenant",
        0,
        top_level_dependency_extra as i64,
        top_level_dependency_extra as i64,
        top_level_dependency_skip_reason
            .as_deref()
            .unwrap_or("top-level dependency identities present only in Provenant output"),
    ));
    rows.push(tsv_row(
        "raw_package_dependencies_missing_in_provenant",
        raw_dependency_missing as i64,
        0,
        -(raw_dependency_missing as i64),
        "raw dependency identities present only in ScanCode file-level package_data output",
    ));
    rows.push(tsv_row(
        "raw_package_dependencies_extra_in_provenant",
        0,
        raw_dependency_extra as i64,
        raw_dependency_extra as i64,
        "raw dependency identities present only in Provenant file-level package_data output",
    ));
    rows.push(tsv_row(
        "top_level_license_expression_deltas",
        license_deltas.len() as i64,
        license_deltas.len() as i64,
        0,
        "expressions with different top-level detection counts",
    ));

    let comparison_status = if potential_regressions > 0 {
        "potential_regressions_detected"
    } else if potential_higher > 0 || !license_deltas.is_empty() {
        "differences_detected"
    } else {
        "no_detected_differences"
    };

    let sample_paths = [
        (
            "scancode_only_output_paths",
            layout.samples_dir.join("scancode_only_output_paths.json"),
        ),
        (
            "provenant_only_output_paths",
            layout.samples_dir.join("provenant_only_output_paths.json"),
        ),
        (
            "file_metric_lower_counts",
            layout.samples_dir.join("file_metric_lower_counts.json"),
        ),
        (
            "file_metric_higher_counts",
            layout.samples_dir.join("file_metric_higher_counts.json"),
        ),
        (
            "file_metric_value_differences",
            layout
                .samples_dir
                .join("file_metric_value_differences.json"),
        ),
        (
            "top_level_license_expression_deltas",
            layout
                .samples_dir
                .join("top_level_license_expression_deltas.json"),
        ),
        (
            "top_level_package_value_differences",
            layout
                .samples_dir
                .join("top_level_package_value_differences.json"),
        ),
        (
            "top_level_dependency_value_differences",
            layout
                .samples_dir
                .join("top_level_dependency_value_differences.json"),
        ),
        (
            "raw_dependency_value_differences",
            layout
                .samples_dir
                .join("raw_dependency_value_differences.json"),
        ),
        (
            "info_value_differences",
            layout.samples_dir.join("info_value_differences.json"),
        ),
        (
            "classify_value_differences",
            layout.samples_dir.join("classify_value_differences.json"),
        ),
        (
            "row2_value_differences",
            layout.samples_dir.join("row2_value_differences.json"),
        ),
        (
            "row2_top_level_differences",
            layout.samples_dir.join("row2_top_level_differences.json"),
        ),
    ];

    write_pretty_json(&sample_paths[0].1, &scancode_only_output_paths)?;
    write_pretty_json(&sample_paths[1].1, &provenant_only_output_paths)?;
    write_pretty_json(&sample_paths[2].1, &lower_counts)?;
    write_pretty_json(&sample_paths[3].1, &higher_counts)?;
    write_pretty_json(&sample_paths[4].1, &value_differences)?;
    write_pretty_json(&sample_paths[5].1, &license_deltas)?;
    write_pretty_json(&sample_paths[6].1, &top_level_package_value_differences)?;
    write_pretty_json(&sample_paths[7].1, &top_level_dependency_value_differences)?;
    write_pretty_json(&sample_paths[8].1, &raw_dependency_value_differences)?;
    write_pretty_json(&sample_paths[9].1, &info_value_differences)?;
    write_pretty_json(&sample_paths[10].1, &classify_value_differences)?;
    write_pretty_json(&sample_paths[11].1, &row2_value_differences)?;
    write_pretty_json(&sample_paths[12].1, &row2_top_level_differences)?;

    let summary = json!({
        "comparison_status": comparison_status,
        "top_level_counts": {
            "scancode": sc_top.counts_json(),
            "provenant": pr_top.counts_json(),
            "delta": {
                "files": pr_top.count("files") - sc_top.count("files"),
                "packages": pr_top.count("packages") - sc_top.count("packages"),
                "dependencies": pr_top.count("dependencies") - sc_top.count("dependencies"),
                "license_detections": pr_top.count("license_detections") - sc_top.count("license_detections"),
                "license_references": pr_top.count("license_references") - sc_top.count("license_references"),
                "license_rule_references": pr_top.count("license_rule_references") - sc_top.count("license_rule_references"),
            },
            "sources": {
                "scancode": sc_top.sources_json(),
                "provenant": pr_top.sources_json(),
            },
        },
        "skipped_comparisons": skipped_comparisons,
        "top_level_package_summary": top_level_package_summary,
        "top_level_dependency_summary": top_level_dependency_summary,
        "raw_dependency_summary": raw_dependency_summary,
        "comparison_context": {
            "only_findings_active": only_findings_active,
            "path_presence_semantics": "final_output_membership",
            "path_presence_note": path_presence_note,
        },
        "file_path_comparison": {
            "common_paths": common_paths.len(),
            "scancode_only_output_paths": scancode_only_output_paths.len(),
            "provenant_only_output_paths": provenant_only_output_paths.len(),
        },
        "resource_path_comparison": {
            "common_paths": common_resource_paths.len(),
            "scancode_only_output_paths": scancode_only_output_resource_paths.len(),
            "provenant_only_output_paths": provenant_only_output_resource_paths.len(),
        },
        "file_metric_summary": file_metric_summary,
        "info_metric_summary": info_metric_summary,
        "classify_metric_summary": classify_metric_summary,
        "row2_metric_summary": row2_metric_summary,
        "row2_top_level_section_difference_count": row2_top_level_differences.len(),
        "top_level_regressions": top_level_regressions_map,
        "top_level_higher_counts": top_level_higher_counts,
        "top_level_license_expression_delta_count": license_deltas.len(),
        "sample_artifacts": BTreeMap::from(sample_paths.map(|(name, path)| (name.to_string(), path.display().to_string()))),
    });

    write_pretty_json(&layout.summary_json, &summary)?;
    write_tsv(
        &layout.summary_tsv,
        &["metric", "scancode", "provenant", "delta", "notes"],
        &rows,
    )?;

    Ok(summary)
}

fn prepare_layout(artifact_dir: &Path) -> Result<CompareArtifactLayout> {
    if artifact_dir.exists() && !artifact_dir.is_dir() {
        bail!(
            "compare artifact path is not a directory: {}",
            artifact_dir.display()
        );
    }

    fs::create_dir_all(artifact_dir).with_context(|| {
        format!(
            "failed to create compare artifact directory {}",
            artifact_dir.display()
        )
    })?;

    let raw_dir = artifact_dir.join("raw");
    let comparison_dir = artifact_dir.join("comparison");
    let samples_dir = comparison_dir.join("samples");
    fs::create_dir_all(&raw_dir)?;
    fs::create_dir_all(&samples_dir)?;

    Ok(CompareArtifactLayout {
        artifact_dir: artifact_dir.to_path_buf(),
        raw_dir: raw_dir.clone(),
        scancode_json: raw_dir.join("scancode.json"),
        provenant_json: raw_dir.join("provenant.json"),
        comparison_dir: comparison_dir.clone(),
        samples_dir: samples_dir.clone(),
        summary_json: comparison_dir.join("summary.json"),
        summary_tsv: comparison_dir.join("summary.tsv"),
        manifest_path: artifact_dir.join("run-manifest.json"),
    })
}

fn validate_json_input(path: &Path, flag_name: &str) -> Result<()> {
    let metadata = fs::metadata(path)
        .with_context(|| format!("failed to read {flag_name} {}", path.display()))?;
    if !metadata.is_file() {
        bail!(
            "{flag_name} must point to a regular file: {}",
            path.display()
        );
    }
    Ok(())
}

fn write_manifest(
    scancode_source: &Path,
    provenant_source: &Path,
    layout: &CompareArtifactLayout,
) -> Result<()> {
    let manifest = CompareManifest {
        mode: COMPARISON_MODE,
        tool_version: BUILD_VERSION,
        created_at: chrono::Utc::now().to_rfc3339(),
        inputs: CompareInputManifest {
            scancode_json_source: scancode_source.to_path_buf(),
            provenant_json_source: provenant_source.to_path_buf(),
        },
        artifacts: CompareArtifactManifest {
            artifact_dir: layout.artifact_dir.clone(),
            raw_dir: layout.raw_dir.clone(),
            scancode_json: layout.scancode_json.clone(),
            provenant_json: layout.provenant_json.clone(),
            comparison_dir: layout.comparison_dir.clone(),
            summary_json: layout.summary_json.clone(),
            summary_tsv: layout.summary_tsv.clone(),
            samples_dir: layout.samples_dir.clone(),
        },
    };
    write_pretty_json(&layout.manifest_path, &manifest)
}

fn materialize_file(src: &Path, dst: &Path) -> Result<()> {
    if let Some(parent) = dst.parent() {
        fs::create_dir_all(parent)
            .with_context(|| format!("failed to create parent directory {}", parent.display()))?;
    }
    if dst.exists() {
        fs::remove_file(dst)
            .with_context(|| format!("failed to remove existing file {}", dst.display()))?;
    }
    match fs::hard_link(src, dst) {
        Ok(()) => Ok(()),
        Err(_) => {
            fs::copy(src, dst).with_context(|| {
                format!(
                    "failed to copy compare artifact {} -> {}",
                    src.display(),
                    dst.display()
                )
            })?;
            Ok(())
        }
    }
}

fn write_pretty_json<T: ?Sized + Serialize>(path: &Path, value: &T) -> Result<()> {
    if let Some(parent) = path.parent() {
        fs::create_dir_all(parent)?;
    }
    let bytes = serde_json::to_vec_pretty(value)?;
    fs::write(path, bytes).with_context(|| format!("failed to write {}", path.display()))?;
    Ok(())
}

fn write_tsv(path: &Path, headers: &[&str], rows: &[Vec<String>]) -> Result<()> {
    if let Some(parent) = path.parent() {
        fs::create_dir_all(parent)?;
    }
    let mut content = String::new();
    content.push_str(&headers.join("\t"));
    content.push('\n');
    for row in rows {
        content.push_str(&row.join("\t"));
        content.push('\n');
    }
    fs::write(path, content).with_context(|| format!("failed to write {}", path.display()))?;
    Ok(())
}

fn resources_contain_any_field(resources: &BTreeMap<String, Value>, fields: &[&str]) -> bool {
    resources.values().any(|entry| {
        fields.iter().any(|field| {
            entry.get(field).is_some_and(|value| match value {
                Value::Null => false,
                Value::Array(values) => !values.is_empty(),
                Value::String(text) => !text.trim().is_empty(),
                _ => true,
            })
        })
    })
}

fn value_contains_any_section(value: &Value, sections: &[&str]) -> bool {
    sections
        .iter()
        .any(|section| value.get(section).is_some_and(|entry| !entry.is_null()))
}

fn files_by_path(value: &Value) -> BTreeMap<String, Value> {
    value
        .get("files")
        .and_then(Value::as_array)
        .into_iter()
        .flatten()
        .filter_map(|entry| {
            if entry.get("type").and_then(Value::as_str) != Some("file") {
                return None;
            }
            entry
                .get("path")
                .and_then(Value::as_str)
                .map(|path| (normalize_compare_path(path), entry.clone()))
        })
        .collect()
}

fn resources_by_path(value: &Value) -> BTreeMap<String, Value> {
    value
        .get("files")
        .and_then(Value::as_array)
        .into_iter()
        .flatten()
        .filter_map(|entry| {
            entry
                .get("path")
                .and_then(Value::as_str)
                .map(|path| (normalize_compare_path(path), entry.clone()))
        })
        .collect()
}

fn metric_count(entry: &Value, key: &str) -> usize {
    entry
        .get(key)
        .and_then(Value::as_array)
        .map(|values| values.len())
        .unwrap_or(0)
}

fn normalize_text(value: &str) -> String {
    value.split_whitespace().collect::<Vec<_>>().join(" ")
}

fn metric_values(entry: &Value, metric: &str) -> Vec<String> {
    let Some(values) = entry.get(metric).and_then(Value::as_array) else {
        return Vec::new();
    };
    values
        .iter()
        .filter_map(|item| {
            let value = match metric {
                "license_detections" => item
                    .get("license_expression_spdx")
                    .or_else(|| item.get("license_expression"))
                    .or_else(|| item.get("identifier"))
                    .and_then(Value::as_str)
                    .map(normalize_license_expression),
                "license_clues" | "license_policy" => Some(canonical_value_string(item)),
                "package_data" => package_identity(item)
                    .map(str::to_string)
                    .or_else(|| package_fallback_identity(item)),
                "copyrights" => item
                    .get("copyright")
                    .and_then(Value::as_str)
                    .map(str::to_string),
                "holders" => item
                    .get("holder")
                    .and_then(Value::as_str)
                    .map(str::to_string),
                "authors" => item
                    .get("author")
                    .and_then(Value::as_str)
                    .map(str::to_string),
                "emails" => item
                    .get("email")
                    .and_then(Value::as_str)
                    .map(str::to_string),
                "urls" => item.get("url").and_then(Value::as_str).map(str::to_string),
                "scan_errors" => scan_error_identity(item).map(str::to_string),
                _ => None,
            }?;
            let normalized = normalize_text(&value);
            (!normalized.is_empty()).then_some(normalized)
        })
        .collect()
}

fn package_identity(item: &Value) -> Option<&str> {
    item.get("purl")
        .and_then(Value::as_str)
        .or_else(|| item.get("package_url").and_then(Value::as_str))
}

fn package_fallback_identity(item: &Value) -> Option<String> {
    let mut parts = Vec::new();
    for key in [
        "type",
        "package_type",
        "scope",
        "namespace",
        "name",
        "version",
        "datasource_id",
    ] {
        if let Some(value) = item.get(key).and_then(Value::as_str) {
            let normalized = normalize_text(value);
            if !normalized.is_empty() {
                parts.push(format!("{key}={normalized}"));
            }
        }
    }
    if parts.is_empty() {
        None
    } else {
        Some(parts.join("|"))
    }
}

fn scan_error_identity(item: &Value) -> Option<&str> {
    item.as_str()
        .or_else(|| item.get("error").and_then(Value::as_str))
        .or_else(|| item.get("message").and_then(Value::as_str))
        .or_else(|| item.get("scan_error").and_then(Value::as_str))
        .or_else(|| item.get("details").and_then(Value::as_str))
}

fn normalize_compare_path(path: &str) -> String {
    let trimmed = path.trim();
    if matches!(trimmed, "" | "." | "input" | "/input") {
        "<root>".to_string()
    } else {
        trimmed
            .trim_start_matches("./")
            .trim_start_matches("/input/")
            .trim_start_matches("input/")
            .to_string()
    }
}

fn normalize_license_expression(value: &str) -> String {
    let normalized = normalize_text(value);
    if normalized.is_empty() {
        return normalized;
    }

    let stripped = strip_trivial_outer_parens(&normalized);
    let canonical =
        combine_license_expressions(std::iter::once(stripped.clone())).unwrap_or(stripped);
    strip_trivial_outer_parens(&canonical)
}

fn strip_trivial_outer_parens(value: &str) -> String {
    let mut current = value.trim();
    while has_trivial_outer_parens(current) {
        current = current[1..current.len() - 1].trim();
    }
    current.to_string()
}

fn has_trivial_outer_parens(value: &str) -> bool {
    if !(value.starts_with('(') && value.ends_with(')')) {
        return false;
    }

    let mut depth = 0usize;
    for (index, ch) in value.char_indices() {
        match ch {
            '(' => depth += 1,
            ')' => {
                if depth == 0 {
                    return false;
                }
                depth -= 1;
                if depth == 0 && index != value.len() - 1 {
                    return false;
                }
            }
            _ => {}
        }
    }

    depth == 0
}

fn scalar_field_value(entry: &Value, key: &str) -> Option<String> {
    let value = entry.get(key)?;
    let normalized = match value {
        Value::Null => return None,
        Value::String(text) => normalize_text(text),
        Value::Bool(flag) => flag.to_string(),
        Value::Number(number) => number.to_string(),
        _ => normalize_text(&value.to_string()),
    };
    (!normalized.is_empty()).then_some(normalized)
}

fn structured_field_value(entry: &Value, key: &str) -> Option<String> {
    let value = entry.get(key)?;
    if value.is_null() {
        return None;
    }
    match key {
        "facets" if value.as_array().is_some_and(|items| items.is_empty()) => None,
        "tallies" => canonical_tallies_field_string(value),
        _ => Some(canonical_value_string(value)),
    }
}

fn classify_scalar_value(entry: &Value, key: &str) -> Option<String> {
    match entry.get(key) {
        Some(Value::Bool(flag)) => Some(flag.to_string()),
        Some(Value::Null) | None => Some("false".to_string()),
        Some(other) => scalar_field_value(&json!({ key: other }), key),
    }
}

fn canonical_section_value(value: &Value, key: &str) -> Option<Value> {
    let section = value.get(key)?;
    match key {
        "summary" => Some(canonicalize_summary_section(section)),
        "tallies" | "tallies_of_key_files" => canonical_tallies_section(section),
        "tallies_by_facet" => canonical_tallies_by_facet_section(section),
        _ => Some(canonicalize_json_value(section)),
    }
}

fn canonical_value_string(value: &Value) -> String {
    serde_json::to_string(&canonicalize_json_value(value)).unwrap_or_else(|_| value.to_string())
}

fn canonicalize_json_value(value: &Value) -> Value {
    match value {
        Value::Array(values) => {
            let mut normalized: Vec<Value> = values.iter().map(canonicalize_json_value).collect();
            normalized.sort_by_cached_key(canonical_value_string);
            Value::Array(normalized)
        }
        Value::Object(map) => {
            let mut entries: Vec<_> = map.iter().collect();
            entries.sort_by_key(|(left, _)| *left);
            Value::Object(
                entries
                    .into_iter()
                    .map(|(key, value)| (key.clone(), canonicalize_json_value(value)))
                    .collect(),
            )
        }
        _ => value.clone(),
    }
}

fn is_empty_tallies_value(value: &Value) -> bool {
    let Some(object) = value.as_object() else {
        return false;
    };
    object
        .values()
        .all(|entry| entry.as_array().is_some_and(|items| items.is_empty()))
}

fn canonical_tallies_field_string(value: &Value) -> Option<String> {
    canonical_tallies_section(value).map(|value| canonical_value_string(&value))
}

fn canonicalize_summary_section(value: &Value) -> Value {
    let Some(object) = value.as_object() else {
        return canonicalize_json_value(value);
    };

    let mut normalized = serde_json::Map::new();
    for (key, section_value) in object {
        let normalized_value = match key.as_str() {
            "other_license_expressions" => {
                canonicalize_tally_entry_array(section_value, "detected_license_expression")
            }
            "other_holders" => canonicalize_tally_entry_array(section_value, "holders"),
            "other_languages" => {
                canonicalize_tally_entry_array(section_value, "programming_language")
            }
            _ => canonicalize_json_value(section_value),
        };
        normalized.insert(key.clone(), normalized_value);
    }

    for key in [
        "other_license_expressions",
        "other_holders",
        "other_languages",
    ] {
        normalized
            .entry(key.to_string())
            .or_insert_with(|| Value::Array(Vec::new()));
    }

    Value::Object(normalized)
}

fn canonical_tallies_section(value: &Value) -> Option<Value> {
    let Some(object) = value.as_object() else {
        return Some(canonicalize_json_value(value));
    };

    let mut normalized = serde_json::Map::new();
    for key in [
        "detected_license_expression",
        "copyrights",
        "holders",
        "authors",
        "programming_language",
    ] {
        let normalized_entries = object
            .get(key)
            .map(|entries| canonicalize_tally_entry_array(entries, key))
            .unwrap_or_else(|| Value::Array(Vec::new()));
        normalized.insert(key.to_string(), normalized_entries);
    }

    let normalized_value = Value::Object(normalized);
    (!is_empty_tallies_value(&normalized_value)).then_some(normalized_value)
}

fn canonical_tallies_by_facet_section(value: &Value) -> Option<Value> {
    let Some(array) = value.as_array() else {
        return Some(canonicalize_json_value(value));
    };

    let mut normalized: Vec<Value> = array
        .iter()
        .map(|entry| {
            let facet = entry
                .get("facet")
                .and_then(Value::as_str)
                .unwrap_or_default()
                .to_string();
            let tallies = canonical_tallies_section(entry.get("tallies").unwrap_or(&Value::Null))
                .unwrap_or_else(|| Value::Object(serde_json::Map::new()));
            json!({
                "facet": facet,
                "tallies": tallies,
            })
        })
        .collect();
    normalized.sort_by_cached_key(canonical_value_string);
    Some(Value::Array(normalized))
}

fn canonicalize_tally_entry_array(value: &Value, kind: &str) -> Value {
    let Some(array) = value.as_array() else {
        return Value::Array(Vec::new());
    };

    let mut normalized: Vec<Value> = array
        .iter()
        .map(|entry| {
            let count = entry.get("count").and_then(Value::as_u64).unwrap_or(0);
            let normalized_value = entry
                .get("value")
                .and_then(Value::as_str)
                .map(|text| normalize_tally_value(kind, text));
            json!({
                "count": count,
                "value": normalized_value,
            })
        })
        .collect();
    normalized.sort_by_cached_key(canonical_value_string);
    Value::Array(normalized)
}

fn normalize_tally_value(kind: &str, value: &str) -> String {
    match kind {
        "detected_license_expression" => normalize_license_expression(value),
        "copyrights" => normalize_tally_copyright_value(value),
        "holders" | "authors" | "programming_language" => normalize_text(value),
        _ => normalize_text(value),
    }
}

fn normalize_tally_copyright_value(value: &str) -> String {
    let trimmed = value
        .trim()
        .trim_end_matches(" as indicated by the @authors tag");

    if let Some(rest) = trimmed.strip_prefix("(c) ") {
        let normalized_rest = rest.trim_start_matches(|ch: char| {
            ch.is_ascii_digit() || ch == ' ' || ch == ',' || ch == '-'
        });

        if !normalized_rest.is_empty() && normalized_rest != rest {
            return format!("(c) {}", normalized_rest.trim());
        }
    }

    if let Some(rest) = trimmed.strip_prefix("Copyright (c) ") {
        let normalized_rest = rest.trim_start_matches(|ch: char| {
            ch.is_ascii_digit() || ch == ' ' || ch == ',' || ch == '-'
        });

        if !normalized_rest.is_empty() && normalized_rest != rest {
            return format!("Copyright (c) {}", normalized_rest.trim());
        }
    }

    if let Some(rest) = trimmed.strip_prefix("Copyright ")
        && let Some((yearish, remainder)) = rest.split_once(',')
        && !yearish.is_empty()
        && yearish
            .chars()
            .all(|ch| ch.is_ascii_digit() || ch == ' ' || ch == ',' || ch == '-')
    {
        return format!("Copyright {}", remainder.trim());
    }

    if let Some(rest) = trimmed.strip_prefix("Copyright ") {
        let mut parts = rest.rsplitn(2, ' ');
        let trailing = parts.next().unwrap_or_default();
        let leading = parts.next().unwrap_or_default();
        if !leading.is_empty()
            && trailing
                .chars()
                .all(|ch| ch.is_ascii_digit() || ch == ',' || ch == '-')
        {
            return format!("Copyright {}", leading.trim());
        }
    }

    trimmed.to_string()
}

fn sample_values(values: &[String]) -> Vec<String> {
    let mut set = BTreeSet::new();
    for value in values {
        set.insert(value.clone());
    }
    set.into_iter().take(10).collect()
}

fn value_counter(values: &[String]) -> BTreeMap<String, usize> {
    let mut counts = BTreeMap::new();
    for value in values {
        *counts.entry(value.clone()).or_insert(0) += 1;
    }
    counts
}

fn subtract_counters(
    left: &BTreeMap<String, usize>,
    right: &BTreeMap<String, usize>,
) -> BTreeMap<String, usize> {
    let mut result = BTreeMap::new();
    for (key, left_count) in left {
        let right_count = right.get(key).copied().unwrap_or(0);
        if left_count > &right_count {
            result.insert(key.clone(), left_count - right_count);
        }
    }
    result
}

fn counter_entries(counter: &BTreeMap<String, usize>) -> Vec<ValueCountEntry> {
    counter
        .iter()
        .map(|(value, count)| ValueCountEntry {
            value: value.clone(),
            count: *count,
        })
        .collect()
}

fn top_level_counts(value: &Value) -> TopLevelCounts {
    let package_count = array_len(value, "packages");
    let fallback_package_count = file_package_data_count(value);
    let dependency_count = array_len(value, "dependencies");
    let fallback_dependency_count = file_package_data_dependency_count(value);

    let packages_source = if package_count == 0 && fallback_package_count > 0 {
        PACKAGE_DATA_COUNT_SOURCE
    } else {
        PACKAGES_COUNT_SOURCE
    };
    let dependencies_source = if dependency_count == 0 && fallback_dependency_count > 0 {
        PACKAGE_DATA_DEPENDENCIES_COUNT_SOURCE
    } else {
        DEPENDENCIES_COUNT_SOURCE
    };

    TopLevelCounts {
        counts: HashMap::from([
            ("files", file_entry_count(value) as i64),
            ("packages", package_count as i64),
            ("dependencies", dependency_count as i64),
            (
                "license_detections",
                array_len(value, "license_detections") as i64,
            ),
            (
                "license_references",
                array_len(value, "license_references") as i64,
            ),
            (
                "license_rule_references",
                array_len(value, "license_rule_references") as i64,
            ),
        ]),
        sources: HashMap::from([
            ("files", FILES_COUNT_SOURCE),
            ("packages", packages_source),
            ("dependencies", dependencies_source),
            ("license_detections", LICENSE_DETECTIONS_COUNT_SOURCE),
            ("license_references", LICENSE_REFERENCES_COUNT_SOURCE),
            (
                "license_rule_references",
                LICENSE_RULE_REFERENCES_COUNT_SOURCE,
            ),
        ]),
    }
}

fn file_entry_count(value: &Value) -> usize {
    value
        .get("files")
        .and_then(Value::as_array)
        .into_iter()
        .flatten()
        .filter(|entry| entry.get("type").and_then(Value::as_str) == Some("file"))
        .count()
}

fn array_len(value: &Value, key: &str) -> usize {
    value
        .get(key)
        .and_then(Value::as_array)
        .map(|values| values.len())
        .unwrap_or(0)
}

fn file_package_data_count(value: &Value) -> usize {
    value
        .get("files")
        .and_then(Value::as_array)
        .into_iter()
        .flatten()
        .map(|entry| {
            entry
                .get("package_data")
                .and_then(Value::as_array)
                .map(|package_data| package_data.len())
                .unwrap_or(0)
        })
        .sum()
}

fn file_package_data_dependency_count(value: &Value) -> usize {
    value
        .get("files")
        .and_then(Value::as_array)
        .into_iter()
        .flatten()
        .map(|entry| {
            entry
                .get("package_data")
                .and_then(Value::as_array)
                .into_iter()
                .flatten()
                .map(|package_data| {
                    package_data
                        .get("dependencies")
                        .and_then(Value::as_array)
                        .map(|dependencies| dependencies.len())
                        .unwrap_or(0)
                })
                .sum::<usize>()
        })
        .sum()
}

fn top_level_license_deltas(scancode: &Value, provenant: &Value) -> Vec<Value> {
    let mut counter = BTreeMap::new();
    for (label, value) in [("scancode", scancode), ("provenant", provenant)] {
        for item in value
            .get("license_detections")
            .and_then(Value::as_array)
            .into_iter()
            .flatten()
        {
            let key = item
                .get("license_expression_spdx")
                .or_else(|| item.get("license_expression"))
                .or_else(|| item.get("identifier"))
                .and_then(Value::as_str)
                .map(normalize_license_expression)
                .unwrap_or_else(|| "<unknown>".to_string());
            let count = item
                .get("detection_count")
                .and_then(Value::as_i64)
                .unwrap_or(1);
            let entry = counter.entry(key).or_insert((0_i64, 0_i64));
            if label == "scancode" {
                entry.0 += count;
            } else {
                entry.1 += count;
            }
        }
    }
    counter
        .into_iter()
        .filter_map(|(key, (sc, pr))| {
            (sc != pr).then_some(json!({
                "license_expression": key,
                "scancode": sc,
                "provenant": pr,
                "delta": pr - sc
            }))
        })
        .collect()
}

fn top_level_package_differences(scancode: &Value, provenant: &Value) -> Vec<ValueDifferenceEntry> {
    let sc_top = top_level_counts(scancode);
    let pr_top = top_level_counts(provenant);
    if !count_delta_is_hard_regression_comparable("packages", &sc_top, &pr_top) {
        return Vec::new();
    }

    let sc_identities = top_level_package_identities(scancode);
    let pr_identities = top_level_package_identities(provenant);
    let missing = difference_entries(&sc_identities, &pr_identities);
    let extra = difference_entries(&pr_identities, &sc_identities);
    if missing.is_empty() && extra.is_empty() {
        return Vec::new();
    }

    vec![ValueDifferenceEntry {
        path: "<top-level>".to_string(),
        scancode: sc_identities.len(),
        provenant: pr_identities.len(),
        missing_in_provenant: missing,
        extra_in_provenant: extra,
    }]
}

fn top_level_package_identities(value: &Value) -> BTreeSet<String> {
    value
        .get("packages")
        .and_then(Value::as_array)
        .into_iter()
        .flatten()
        .map(|item| {
            package_identity(item)
                .map(str::to_string)
                .or_else(|| package_fallback_identity(item))
                .unwrap_or_else(|| "<unknown>".to_string())
        })
        .collect()
}

fn top_level_dependency_differences(
    scancode: &Value,
    provenant: &Value,
) -> Vec<ValueDifferenceEntry> {
    let sc_top = top_level_counts(scancode);
    let pr_top = top_level_counts(provenant);
    if !count_delta_is_hard_regression_comparable("dependencies", &sc_top, &pr_top) {
        return Vec::new();
    }

    let sc_by_path = top_level_dependency_identities_by_path(scancode);
    let pr_by_path = top_level_dependency_identities_by_path(provenant);
    let mut paths = BTreeSet::new();
    paths.extend(sc_by_path.keys().cloned());
    paths.extend(pr_by_path.keys().cloned());
    let mut differences = Vec::new();
    for path in paths {
        let sc_identities = sc_by_path.get(&path).cloned().unwrap_or_default();
        let pr_identities = pr_by_path.get(&path).cloned().unwrap_or_default();
        let missing = difference_entries(&sc_identities, &pr_identities);
        let extra = difference_entries(&pr_identities, &sc_identities);
        if !missing.is_empty() || !extra.is_empty() {
            differences.push(ValueDifferenceEntry {
                path,
                scancode: sc_identities.len(),
                provenant: pr_identities.len(),
                missing_in_provenant: missing,
                extra_in_provenant: extra,
            });
        }
    }
    differences
}

fn top_level_dependency_identities_by_path(value: &Value) -> BTreeMap<String, BTreeSet<String>> {
    let mut output: BTreeMap<String, BTreeSet<String>> = BTreeMap::new();
    for item in value
        .get("dependencies")
        .and_then(Value::as_array)
        .into_iter()
        .flatten()
    {
        let path = item
            .get("datafile_path")
            .or_else(|| item.get("path"))
            .and_then(Value::as_str)
            .map(normalize_compare_path)
            .unwrap_or_else(|| "<unknown>".to_string());
        let identity = dependency_identity(item).unwrap_or_else(|| "<unknown>".to_string());
        output.entry(path).or_default().insert(identity);
    }
    output
}

fn raw_dependency_differences(scancode: &Value, provenant: &Value) -> Vec<ValueDifferenceEntry> {
    let sc_by_path = raw_dependency_identities_by_path(scancode);
    let pr_by_path = raw_dependency_identities_by_path(provenant);
    let mut paths = BTreeSet::new();
    paths.extend(sc_by_path.keys().cloned());
    paths.extend(pr_by_path.keys().cloned());
    let mut differences = Vec::new();
    for path in paths {
        let sc_identities = sc_by_path.get(&path).cloned().unwrap_or_default();
        let pr_identities = pr_by_path.get(&path).cloned().unwrap_or_default();
        let missing = difference_entries(&sc_identities, &pr_identities);
        let extra = difference_entries(&pr_identities, &sc_identities);
        if !missing.is_empty() || !extra.is_empty() {
            differences.push(ValueDifferenceEntry {
                path,
                scancode: sc_identities.len(),
                provenant: pr_identities.len(),
                missing_in_provenant: missing,
                extra_in_provenant: extra,
            });
        }
    }
    differences
}

fn raw_dependency_identities_by_path(value: &Value) -> BTreeMap<String, BTreeSet<String>> {
    let mut output: BTreeMap<String, BTreeSet<String>> = BTreeMap::new();
    for file in value
        .get("files")
        .and_then(Value::as_array)
        .into_iter()
        .flatten()
    {
        let file_path = file
            .get("path")
            .and_then(Value::as_str)
            .map(normalize_compare_path)
            .unwrap_or_else(|| "<unknown>".to_string());

        for package_data in file
            .get("package_data")
            .and_then(Value::as_array)
            .into_iter()
            .flatten()
        {
            for item in package_data
                .get("dependencies")
                .and_then(Value::as_array)
                .into_iter()
                .flatten()
            {
                let path = item
                    .get("datafile_path")
                    .or_else(|| item.get("path"))
                    .and_then(Value::as_str)
                    .map(normalize_compare_path)
                    .unwrap_or_else(|| file_path.clone());
                let identity = dependency_identity(item).unwrap_or_else(|| "<unknown>".to_string());
                output.entry(path).or_default().insert(identity);
            }
        }
    }
    output
}

fn difference_entries(left: &BTreeSet<String>, right: &BTreeSet<String>) -> Vec<ValueCountEntry> {
    left.difference(right)
        .map(|value| ValueCountEntry {
            value: value.clone(),
            count: 1,
        })
        .collect()
}

fn dependency_identity(item: &Value) -> Option<String> {
    for key in ["purl", "package_url", "dependency_uid"] {
        if let Some(value) = item.get(key).and_then(Value::as_str) {
            let normalized = normalize_text(value);
            if !normalized.is_empty() {
                return Some(normalized);
            }
        }
    }
    let mut parts = Vec::new();
    for key in [
        "datafile_path",
        "scope",
        "namespace",
        "name",
        "version",
        "version_requirement",
        "is_runtime",
        "is_optional",
    ] {
        if let Some(value) = item.get(key) {
            let normalized = if let Some(text) = value.as_str() {
                normalize_text(text)
            } else {
                value.to_string()
            };
            if !normalized.is_empty() {
                parts.push(format!("{key}={normalized}"));
            }
        }
    }
    if parts.is_empty() {
        None
    } else {
        Some(parts.join("|"))
    }
}

fn top_level_regressions(
    left: &TopLevelCounts,
    right: &TopLevelCounts,
    left_is_scancode: bool,
) -> BTreeMap<String, i64> {
    let mut output = BTreeMap::new();
    for key in [
        "packages",
        "dependencies",
        "license_detections",
        "license_references",
        "license_rule_references",
    ] {
        if !count_delta_is_hard_regression_comparable(key, left, right) {
            continue;
        }
        let left_value = left.count(key);
        let right_value = right.count(key);
        if left_is_scancode {
            if right_value < left_value {
                output.insert(key.to_string(), left_value - right_value);
            }
        } else if left_value > right_value {
            output.insert(key.to_string(), left_value - right_value);
        }
    }
    output
}

fn skipped_comparisons(left: &TopLevelCounts, right: &TopLevelCounts) -> BTreeMap<String, String> {
    ["packages", "dependencies"]
        .into_iter()
        .filter(|metric| !count_delta_is_hard_regression_comparable(metric, left, right))
        .map(|metric| {
            (
                metric.to_string(),
                mixed_source_skip_reason(metric, left, right),
            )
        })
        .collect()
}

fn count_delta_is_hard_regression_comparable(
    key: &str,
    left: &TopLevelCounts,
    right: &TopLevelCounts,
) -> bool {
    match key {
        "packages" => {
            left.source(key) == PACKAGES_COUNT_SOURCE && right.source(key) == PACKAGES_COUNT_SOURCE
        }
        "dependencies" => {
            left.source(key) == DEPENDENCIES_COUNT_SOURCE
                && right.source(key) == DEPENDENCIES_COUNT_SOURCE
        }
        _ => true,
    }
}

fn mixed_source_skip_reason(
    metric: &str,
    scancode: &TopLevelCounts,
    provenant: &TopLevelCounts,
) -> String {
    format!(
        "top-level {metric} comparison skipped: ScanCode {}; Provenant {}",
        scancode.source(metric),
        provenant.source(metric)
    )
}

fn top_level_count_note(
    metric: &str,
    scancode: &TopLevelCounts,
    provenant: &TopLevelCounts,
) -> String {
    if !matches!(metric, "packages" | "dependencies") {
        return "top-level count".to_string();
    }

    if count_delta_is_hard_regression_comparable(metric, scancode, provenant) {
        return "top-level count".to_string();
    }

    mixed_source_skip_reason(metric, scancode, provenant)
}

fn compare_uses_only_findings(scan_args: &[String], scancode: &Value, provenant: &Value) -> bool {
    scan_args.iter().any(|arg| arg == "--only-findings")
        || json_output_uses_only_findings(scancode)
        || json_output_uses_only_findings(provenant)
}

fn json_output_uses_only_findings(value: &Value) -> bool {
    value
        .get("headers")
        .and_then(Value::as_array)
        .into_iter()
        .flatten()
        .any(|header| {
            header
                .get("options")
                .and_then(Value::as_object)
                .is_some_and(|options| {
                    option_value_is_truthy(options.get("--only-findings"))
                        || option_value_is_truthy(options.get("only_findings"))
                })
        })
}

fn option_value_is_truthy(value: Option<&Value>) -> bool {
    matches!(value, Some(Value::Bool(true)))
        || matches!(value, Some(Value::String(text)) if text.eq_ignore_ascii_case("true"))
}

fn output_only_path_note(tool_name: &str, path_kind: &str, only_findings_active: bool) -> String {
    let mut note = format!("{path_kind} paths present only in {tool_name} final output");
    if only_findings_active {
        note.push_str(
            "; with --only-findings, the other output may have filtered these paths away after finding nothing",
        );
    }
    note
}

fn tsv_row(metric: &str, scancode: i64, provenant: i64, delta: i64, notes: &str) -> Vec<String> {
    vec![
        metric.to_string(),
        scancode.to_string(),
        provenant.to_string(),
        delta.to_string(),
        notes.to_string(),
    ]
}