// zentone 0.1.0 - Docs.rs

//! Reference-implementation parity tests.
//!
//! These tests read CSV "golden" files that were generated by standalone
//! C++ extractions of authoritative reference math (see
//! `reference-checks/` at the crate root) and verify that zentone's own
//! implementations reproduce the same outputs within a tight f32
//! tolerance.
//!
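//! Current coverage:
//! - libultrahdr `ReinhardMap` (extended Reinhard) → `zentone::curves::reinhard_extended`
//! - libultrahdr `globalTonemap` (max-channel extended Reinhard composition)
//!   → a local helper that calls `zentone::curves::reinhard_extended` the same way
//! - libplacebo BT.2390 EETF (PQ-domain and scene-linear) → `zentone::Bt2408Tonemapper`,
//!   `zentone::curves::bt2390_tonemap` and `zentone::curves::bt2390_tonemap_ext`
//! - darktable filmic spline → `zentone::CompiledFilmicSpline`
//! - libultrahdr luminance weights → `LUMA_BT709`, `LUMA_P3`, `LUMA_BT2020`
//! - libultrahdr gamut-conversion matrices → `zentone::gamut`
//! - libultrahdr HLG OOTF / inverse OOTF (exact and approx) → `zentone::hlg`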
//!
//! The `libultrahdr_apply_gain` and `libultrahdr_compute_gain` golden files
//! are committed for future use in a reference-parity test inside
//! `ultrahdr-core`, which owns the gain-map application and encode math.
//! Zentone itself does not implement those and shouldn't — it's a tone
//! mapping crate, not a gain map codec.
//!
//! See `reference-checks/README.md` for the full story.

use std::fs;
use std::path::PathBuf;

use zentone::curves::{bt2390_tonemap, bt2390_tonemap_ext, reinhard_extended};
use zentone::gamut::{
    BT709_TO_BT2020, BT709_TO_P3, BT2020_TO_BT709, BT2020_TO_P3, P3_TO_BT709, P3_TO_BT2020,
    apply_matrix,
};
use zentone::hlg::{
    hlg_inverse_ootf, hlg_inverse_ootf_approx, hlg_inverse_ootf_approx_row_simd,
    hlg_inverse_ootf_row_simd, hlg_ootf, hlg_ootf_approx, hlg_ootf_approx_row_simd,
    hlg_ootf_row_simd,
};
use zentone::{
    Bt2408Tonemapper, CompiledFilmicSpline, FilmicSplineConfig, LUMA_BT709, LUMA_BT2020, LUMA_P3,
    ToneMap,
};

/// f32 absolute-error tolerance. ~8× the ulp at 1.0, generous enough to
/// absorb a reordered multiply/divide but tight enough to catch any real
/// formula divergence.
///
/// Used with a strict `<` against `(actual - expected).abs()` by the
/// Reinhard, globalTonemap and scene-linear BT.2390 tests, and multiplied
/// by the input magnitude in the Display-P3 luminance check.
const TOLERANCE: f32 = 1e-6;

/// Wider tolerance for PQ-domain comparisons. The PQ OETF/EOTF involve
/// `powf` with fractional exponents, where libm and the C++ stdlib can
/// diverge by 1–3 ULP per call. A full EETF pipeline has 4 `powf` calls
/// (2× OETF + 2× EOTF), so the cumulative error can reach ~1e-4 at
/// extreme luminance values where PQ's slope is steep.
///
/// The scalar BT.2408 test applies this as a relative bound when the
/// expected value exceeds 1 nit and as an absolute bound otherwise.
const PQ_TOLERANCE: f32 = 5e-4;

/// Build the absolute path of a golden file:
/// `<CARGO_MANIFEST_DIR>/reference-checks/golden/<name>`.
fn golden_path(name: &str) -> PathBuf {
    // Collecting path components pushes them one by one onto an empty
    // `PathBuf`, which is the same as pushing onto the manifest dir.
    [
        env!("CARGO_MANIFEST_DIR"),
        "reference-checks",
        "golden",
        name,
    ]
    .iter()
    .collect()
}

/// Load the named golden CSV into a `String`.
///
/// Panics with a hint about `reference-checks/build.sh` when the file
/// cannot be read, since a missing golden usually means the reference
/// extraction was never run.
fn read_csv(name: &str) -> String {
    let location = golden_path(name);
    match fs::read_to_string(&location) {
        Ok(contents) => contents,
        Err(e) => panic!(
            "failed to read golden {}: {e}. Did you run reference-checks/build.sh?",
            location.display()
        ),
    }
}

/// Iterate data rows in a specific CSV section.
///
/// Every line is trimmed first. The section opens at a line that starts
/// with `header_prefix`; that header line itself is not yielded. Each
/// following line is split on `,` (fields trimmed) and yielded as a row
/// until a blank line or a `#` comment line closes the section. If a later
/// line matches `header_prefix` again, a new section opens and its rows are
/// yielded too. This handles multi-section CSV files whose sections are
/// separated by blank lines and comments.
///
/// Rows are not validated here: data rows may start with a text tag (e.g.
/// a config name), so callers check the column count themselves.
fn parse_rows<'a>(body: &'a str, header_prefix: &str) -> impl Iterator<Item = Vec<&'a str>> {
    // Own the prefix so the returned iterator borrows from `body` only,
    // exactly as the `'a` in the signature promises.
    let header_prefix = header_prefix.to_owned();
    let mut in_section = false;
    body.lines().map(str::trim).filter_map(move |line| {
        if !in_section {
            // The matching header is consumed right here, so the very next
            // line is already the first data row — no extra skip needed.
            in_section = line.starts_with(header_prefix.as_str());
            return None;
        }
        // Section boundary: blank line or comment line.
        if line.is_empty() || line.starts_with('#') {
            in_section = false;
            return None;
        }
        Some(line.split(',').map(str::trim).collect())
    })
}

/// Local re-creation of libultrahdr's `globalTonemap` (the logic at
/// `lib/src/jpegr.cpp:1823`) with zentone's `reinhard_extended` as the
/// scalar kernel.
///
/// When `is_normalized` is true the input is first multiplied by
/// `headroom`. The largest channel is run through `reinhard_extended`, and
/// every positive channel is scaled by `mapped_max / max`. Channels that
/// are not strictly positive come out as `0.0`, and a non-positive maximum
/// yields black.
fn global_tonemap(rgb_in: [f32; 3], headroom: f32, is_normalized: bool) -> [f32; 3] {
    let rgb_hdr = if is_normalized {
        rgb_in.map(|c| c * headroom)
    } else {
        rgb_in
    };

    let max_hdr = rgb_hdr[1..].iter().fold(rgb_hdr[0], |m, &c| m.max(c));
    let max_sdr = reinhard_extended(max_hdr, headroom);
    if max_hdr <= 0.0 {
        return [0.0; 3];
    }

    let scale = max_sdr / max_hdr;
    rgb_hdr.map(|c| if c > 0.0 { c * scale } else { 0.0 })
}

/// Scalar parity: each `headroom,y_hdr,y_sdr` golden row must be
/// reproduced by `reinhard_extended(y_hdr, headroom)` within `TOLERANCE`.
/// Rows that do not have exactly three columns are ignored.
#[test]
fn reinhard_map_scalar_matches_libultrahdr() {
    let golden = read_csv("libultrahdr_reinhard.csv");
    let rows = parse_rows(&golden, "headroom,y_hdr,y_sdr").filter(|row| row.len() == 3);

    let mut checked = 0;
    for row in rows {
        let number = |idx: usize| -> f32 { row[idx].parse().unwrap() };
        let (h, y, expected) = (number(0), number(1), number(2));

        let actual = reinhard_extended(y, h);
        let err = (actual - expected).abs();
        assert!(
            err < TOLERANCE,
            "ReinhardMap mismatch at headroom={h}, y={y}: zentone={actual}, libultrahdr={expected}, err={err}"
        );
        checked += 1;
    }

    // An empty section means the golden layout changed, not that we passed.
    assert!(checked > 0, "no rows parsed — CSV format regression?");
    println!("reinhard_map_scalar: checked {checked} rows");
}

/// RGB parity: each seven-column `headroom,r_in,…,b_out` golden row must be
/// reproduced per channel by the local `global_tonemap` helper (called with
/// `is_normalized = true`) within `TOLERANCE`.
#[test]
fn global_tonemap_matches_libultrahdr() {
    let golden = read_csv("libultrahdr_reinhard.csv");
    let rows = parse_rows(&golden, "headroom,r_in,g_in,b_in,r_out,g_out,b_out")
        .filter(|row| row.len() == 7);

    let mut checked = 0;
    for row in rows {
        let number = |idx: usize| -> f32 { row[idx].parse().unwrap() };
        let h = number(0);
        let rgb_in = [number(1), number(2), number(3)];
        let expected = [number(4), number(5), number(6)];

        let actual = global_tonemap(rgb_in, h, true);
        for (i, (got, want)) in actual.iter().zip(expected.iter()).enumerate() {
            let err = (got - want).abs();
            assert!(
                err < TOLERANCE,
                "globalTonemap mismatch at headroom={h}, rgb_in={rgb_in:?}, channel={i}: zentone={}, libultrahdr={}, err={err}",
                got,
                want
            );
        }
        checked += 1;
    }

    assert!(checked > 0, "no globalTonemap rows parsed");
    println!("global_tonemap: checked {checked} rows");
}

// ============================================================================
// BT.2390 / BT.2408 parity against libplacebo
// ============================================================================

/// Scalar `Bt2408Tonemapper::tonemap_nits` versus libplacebo's PQ-domain
/// BT.2390 EETF.
///
/// Only golden rows with `knee_offset` ≈ 0.5 are used (the ITU standard
/// value, which gives KS = 1.5*maxLum - 0.5, matching zentone's formula).
/// Rows whose content peak does not exceed the display peak are skipped as
/// passthrough configurations.
///
/// The bound is `PQ_TOLERANCE` because the full pipeline runs 4× `powf`
/// through the PQ transfer function.
#[test]
fn bt2408_tonemap_nits_matches_libplacebo_pq() {
    let golden = read_csv("libplacebo_bt2390.csv");
    let header = "content_nits,display_nits,knee_offset,input_nits,output_linear";

    let mut checked = 0;
    let mut max_err: f32 = 0.0;

    for row in parse_rows(&golden, header).filter(|row| row.len() == 5) {
        let number = |idx: usize| -> f32 { row[idx].parse().unwrap() };
        let content = number(0);
        let display = number(1);
        let offset = number(2);
        let input_nits = number(3);
        let expected_nits = number(4);

        // knee_offset=1.0 is libplacebo's default but a different formula;
        // zentone implements the ITU value 0.5 only.
        let other_knee = (offset - 0.5).abs() > 0.01;
        // content <= display means nothing gets compressed.
        let passthrough = content <= display;
        if other_knee || passthrough {
            continue;
        }

        let tm = Bt2408Tonemapper::with_luma(content, display, LUMA_BT709);
        let actual_nits = tm.tonemap_nits(input_nits);

        // Relative error above 1 nit, absolute error at or below it:
        // dividing by max(|expected|, 1) gives exactly that.
        let err = (actual_nits - expected_nits).abs();
        let rel = err / expected_nits.abs().max(1.0);
        max_err = max_err.max(rel);

        assert!(
            rel < PQ_TOLERANCE,
            "BT.2408 mismatch: content={content}, display={display}, \
             input={input_nits} nits: zentone={actual_nits}, \
             libplacebo={expected_nits}, rel_err={rel}"
        );
        checked += 1;
    }

    assert!(checked > 10, "too few PQ-domain rows checked: {checked}");
    println!("bt2408_tonemap_nits: checked {checked} rows, max_rel_err={max_err:.6e}");
}

/// Scene-linear BT.2390 parity against libplacebo.
///
/// Rows with a positive `min_lum` go through `bt2390_tonemap_ext` with that
/// black level; all others use plain `bt2390_tonemap`. No PQ conversion is
/// involved, so the tight `TOLERANCE` applies.
#[test]
fn bt2390_scene_linear_matches_libplacebo() {
    let golden = read_csv("libplacebo_bt2390.csv");
    let rows = parse_rows(&golden, "source_peak,target_peak,min_lum,input,output")
        .filter(|row| row.len() == 5);

    let mut checked = 0;
    let mut max_err: f32 = 0.0;

    for row in rows {
        let number = |idx: usize| -> f32 { row[idx].parse().unwrap() };
        let source = number(0);
        let target = number(1);
        let min_lum = number(2);
        let input = number(3);
        let expected = number(4);

        // `Some` only for a strictly positive black level.
        let black_level = Some(min_lum).filter(|lum| *lum > 0.0);
        let actual = match black_level {
            Some(lum) => bt2390_tonemap_ext(input, source, target, Some(lum)),
            None => bt2390_tonemap(input, source, target),
        };

        let err = (actual - expected).abs();
        max_err = max_err.max(err);

        assert!(
            err < TOLERANCE,
            "bt2390 scene-linear mismatch: source={source}, target={target}, \
             min_lum={min_lum}, input={input}: zentone={actual}, \
             libplacebo={expected}, err={err}"
        );
        checked += 1;
    }

    assert!(checked > 20, "too few scene-linear rows checked: {checked}");
    println!("bt2390_scene_linear: checked {checked} rows, max_err={max_err:.6e}");
}

// ============================================================================
// darktable filmic spline parity
// ============================================================================

/// Translate a config tag from the darktable golden CSV into the matching
/// `FilmicSplineConfig`.
///
/// Every preset starts from `FilmicSplineConfig::default()` and overrides
/// only the fields listed for it. An unrecognized tag panics.
fn filmic_config(name: &str) -> FilmicSplineConfig {
    let mut preset = FilmicSplineConfig::default();
    match name {
        // zentone's own defaults: nothing to override.
        "zentone_defaults" => (),
        "high_contrast" => preset.contrast = 2.5,
        "with_saturation" => preset.saturation = 50.0,
        "darktable_defaults" => {
            preset.contrast = 1.0;
            preset.white_point_source = 4.0;
            preset.latitude = 0.01;
            preset.output_power = 4.0;
        }
        _ => panic!("unknown config: {name}"),
    }
    preset
}

/// Per-pixel parity of `CompiledFilmicSpline::map_rgb` with darktable's
/// filmic_rgb goldens, across every named config and RGB input in the
/// `config,r_in,…,b_out` section. Each channel must agree within 1e-4.
#[test]
fn filmic_spline_matches_darktable_rgb() {
    let golden = read_csv("darktable_filmic.csv");
    let rows = parse_rows(&golden, "config,r_in,g_in,b_in,r_out,g_out,b_out")
        .filter(|row| row.len() == 7);

    let mut checked = 0;
    let mut max_err: f32 = 0.0;

    for row in rows {
        let number = |idx: usize| -> f32 { row[idx].parse().unwrap() };
        let config_name = row[0];
        let rgb_in = [number(1), number(2), number(3)];
        let expected = [number(4), number(5), number(6)];

        let cfg = filmic_config(config_name);
        let actual = CompiledFilmicSpline::new(&cfg).map_rgb(rgb_in);

        for (i, want) in expected.iter().enumerate() {
            let err = (actual[i] - want).abs();
            max_err = max_err.max(err);
            assert!(
                err < 1e-4,
                "filmic {config_name} at {rgb_in:?}[{i}]: zentone={}, darktable={}, err={err}",
                actual[i],
                want
            );
        }
        checked += 1;
    }

    assert!(checked > 20, "too few filmic RGB rows: {checked}");
    println!("filmic_spline_rgb: checked {checked} rows, max_err={max_err:.6e}");
}

/// Strip-form companion: drive the SIMD `map_strip_simd` path over the same
/// darktable goldens as the per-pixel `map_rgb` test.
///
/// Golden rows are grouped by config so each config is mapped as one strip
/// in a single `map_strip_simd` call. The tolerance is 5e-4, wider than the
/// per-pixel test's 1e-4, to absorb the SIMD log/exp helpers (`log2_midp`,
/// `exp2_midp` are ~3 ULP each); the desat exp2 path can shift the final
/// mapped value by a few units in the fourth decimal.
#[test]
fn filmic_spline_strip_matches_darktable_rgb() {
    use std::collections::BTreeMap;
    type Row = ([f32; 3], [f32; 3]);

    let csv = read_csv("darktable_filmic.csv");

    // config name -> (input, expected) pairs, in golden-file order.
    let mut by_cfg: BTreeMap<String, Vec<Row>> = BTreeMap::new();
    for cols in parse_rows(&csv, "config,r_in,g_in,b_in,r_out,g_out,b_out") {
        if cols.len() != 7 {
            continue;
        }
        let config_name = cols[0].to_string();
        let rgb_in = [
            cols[1].parse::<f32>().unwrap(),
            cols[2].parse::<f32>().unwrap(),
            cols[3].parse::<f32>().unwrap(),
        ];
        let expected = [
            cols[4].parse::<f32>().unwrap(),
            cols[5].parse::<f32>().unwrap(),
            cols[6].parse::<f32>().unwrap(),
        ];
        by_cfg
            .entry(config_name)
            .or_default()
            .push((rgb_in, expected));
    }

    let mut checked = 0;
    let mut max_err: f32 = 0.0;
    for (cfg_name, rows) in &by_cfg {
        let cfg = filmic_config(cfg_name);
        let spline = CompiledFilmicSpline::new(&cfg);

        // Build the strip straight from the inputs; it is mapped in place.
        let mut strip: Vec<[f32; 3]> = rows.iter().map(|(rgb_in, _)| *rgb_in).collect();
        spline.map_strip_simd(&mut strip);
        assert_eq!(
            strip.len(),
            rows.len(),
            "map_strip_simd changed the strip length"
        );

        for ((rgb_in, expected), mapped) in rows.iter().zip(&strip) {
            for c in 0..3 {
                let err = (mapped[c] - expected[c]).abs();
                max_err = max_err.max(err);
                assert!(
                    err < 5e-4,
                    "filmic_strip {cfg_name} at {rgb_in:?}[{c}]: simd={}, darktable={}, err={err}",
                    mapped[c],
                    expected[c]
                );
            }
            // Count golden rows, not channel comparisons, so the number
            // reported below really is a row count.
            checked += 1;
        }
    }

    // Same floor as before: more than 20 rows is more than 60 comparisons.
    assert!(checked > 20, "too few filmic strip rows: {checked}");
    println!("filmic_spline_strip: checked {checked} rows, max_err={max_err:.6e}");
}

/// Strip-form companion to `bt2408_tonemap_nits_matches_libplacebo_pq`.
///
/// The libplacebo golden is a 1D nits→nits function. For every
/// (content, display) pair we build a neutral-gray strip with all three
/// channels set to `input_nits / content`, run `map_strip_simd` on it,
/// multiply the first output channel by `display` to get back to nits, and
/// compare that directly against the libplacebo golden value. This pins the
/// SIMD strip kernel to the same golden data as the scalar test.
///
/// Row filtering matches the scalar test: only `knee_offset` ≈ 0.5 and only
/// configurations where the content peak exceeds the display peak.
#[test]
fn bt2408_strip_matches_libplacebo_neutral_gray() {
    use std::collections::BTreeMap;

    let csv = read_csv("libplacebo_bt2390.csv");

    // Group rows by (content, display) so each test config gets its own
    // strip. The f32 bit patterns serve as keys because f32 is not `Ord`.
    let mut by_pair: BTreeMap<(u32, u32), Vec<(f32, f32)>> = BTreeMap::new();
    for cols in parse_rows(
        &csv,
        "content_nits,display_nits,knee_offset,input_nits,output_linear",
    ) {
        if cols.len() != 5 {
            continue;
        }
        let content: f32 = cols[0].parse().unwrap();
        let display: f32 = cols[1].parse().unwrap();
        let offset: f32 = cols[2].parse().unwrap();
        let input_nits: f32 = cols[3].parse().unwrap();
        let expected_nits: f32 = cols[4].parse().unwrap();
        if (offset - 0.5).abs() > 0.01 || content <= display {
            continue;
        }
        by_pair
            .entry((content.to_bits(), display.to_bits()))
            .or_default()
            .push((input_nits, expected_nits));
    }

    let mut total_checked = 0;
    let mut max_rel: f32 = 0.0;
    for ((c_bits, d_bits), rows) in &by_pair {
        let content = f32::from_bits(*c_bits);
        let display = f32::from_bits(*d_bits);
        let tm = Bt2408Tonemapper::with_luma(content, display, LUMA_BT709);

        // Neutral gray in content-normalized linear: every channel carries
        // `input_nits / content`. The strip is mapped in place.
        let mut strip: Vec<[f32; 3]> = rows
            .iter()
            .map(|(input_nits, _)| {
                let v = input_nits / content;
                [v, v, v]
            })
            .collect();
        tm.map_strip_simd(&mut strip);
        assert_eq!(
            strip.len(),
            rows.len(),
            "map_strip_simd changed the strip length"
        );

        for ((input_nits, expected_nits), mapped) in rows.iter().zip(&strip) {
            // Strip output is display-normalized linear; convert back to nits.
            let actual_nits = mapped[0] * display;
            let err = (actual_nits - expected_nits).abs();
            // Relative error above 1 nit, absolute error at or below it.
            let rel = if expected_nits.abs() > 1.0 {
                err / expected_nits.abs()
            } else {
                err
            };
            max_rel = max_rel.max(rel);
            // Numerically the same bound as the scalar test's PQ_TOLERANCE
            // (5e-4). It is kept as its own literal so the SIMD budget,
            // which also has to absorb `log2_midp`/`exp2_midp` precision
            // loss, can be tuned without touching the scalar test.
            assert!(
                rel < 5e-4,
                "BT.2408 strip mismatch: content={content}, display={display}, \
                 input={input_nits} nits: simd={actual_nits}, libplacebo={expected_nits}, \
                 rel_err={rel}"
            );
            total_checked += 1;
        }
    }
    assert!(
        total_checked > 10,
        "too few BT.2408 strip rows: {total_checked}"
    );
    println!("bt2408_strip_libplacebo: checked {total_checked} rows, max_rel_err={max_rel:.6e}");
}

/// Raw spline evaluation parity: `CompiledFilmicSpline::apply_spline(x)`
/// must match darktable's filmic_spline output within 1e-4 for every row of
/// the `config,x,spline_y` golden section.
#[test]
fn filmic_spline_eval_matches_darktable() {
    let golden = read_csv("darktable_filmic.csv");
    let rows = parse_rows(&golden, "config,x,spline_y").filter(|row| row.len() == 3);

    let mut checked = 0;
    let mut max_err: f32 = 0.0;

    for row in rows {
        let config_name = row[0];
        let x: f32 = row[1].parse().unwrap();
        let expected: f32 = row[2].parse().unwrap();

        let cfg = filmic_config(config_name);
        let actual = CompiledFilmicSpline::new(&cfg).apply_spline(x);

        let err = (actual - expected).abs();
        max_err = max_err.max(err);
        assert!(
            err < 1e-4,
            "filmic spline {config_name} at x={x}: zentone={actual}, darktable={expected}, err={err}"
        );
        checked += 1;
    }

    assert!(checked > 20, "too few spline eval rows: {checked}");
    println!("filmic_spline_eval: checked {checked} rows, max_err={max_err:.6e}");
}

// ============================================================================
// libultrahdr per-gamut luminance constants
// ============================================================================

/// Luminance dot products for all three gamuts versus libultrahdr.
///
/// - sRGB / BT.709: libultrahdr's `srgbLuminance` uses the IEC 61966-2-1 /
///   BT.709 weights `(0.212639, 0.715169, 0.072192)`; zentone's `LUMA_BT709`
///   is `(0.2126, 0.7152, 0.0722)`, the same values at four decimals. The
///   tolerance absorbs that rounding plus multiply-add reordering.
/// - Display-P3: libultrahdr's `p3Luminance` uses the SMPTE EG 432-1
///   weights `(0.2289746, 0.6917385, 0.0792869)`, bit-identical to
///   zentone's `LUMA_P3`, so the tight `TOLERANCE` applies.
/// - BT.2100: libultrahdr's `bt2100Luminance` uses ITU-R BT.2100-2 weights
///   `(0.2627, 0.677998, 0.059302)`; zentone's `LUMA_BT2020` (BT.2020 ==
///   BT.2100 primaries) uses `(0.2627, 0.6780, 0.0593)`. Same canonical
///   weights at different precisions, hence the looser 5e-4 bound.
///
/// Every bound is multiplied by `max(|r| + |g| + |b|, 1)` so that rounding
/// error, which grows linearly with the input, does not false-trip on
/// inputs such as `r=g=b=8.0`.
#[test]
fn luminance_dot_products_match_libultrahdr() {
    let golden = read_csv("libultrahdr_luminance.csv");
    let rows =
        parse_rows(&golden, "r,g,b,srgb_luma,p3_luma,bt2100_luma").filter(|row| row.len() == 6);

    let mut checked = 0;
    let mut max_err_srgb: f32 = 0.0;
    let mut max_err_p3: f32 = 0.0;
    let mut max_err_2100: f32 = 0.0;

    for row in rows {
        let number = |idx: usize| -> f32 { row[idx].parse().unwrap() };
        let (r, g, b) = (number(0), number(1), number(2));
        let (exp_srgb, exp_p3, exp_2100) = (number(3), number(4), number(5));

        let luma = |w: [f32; 3]| w[0] * r + w[1] * g + w[2] * b;
        let act_srgb = luma(LUMA_BT709);
        let act_p3 = luma(LUMA_P3);
        let act_2100 = luma(LUMA_BT2020);

        // Input-magnitude scale shared by all three bounds.
        let scale = (r.abs() + g.abs() + b.abs()).max(1.0);

        // sRGB / BT.709: 4-decimal vs 6-decimal weights.
        let tol_srgb = 5e-4 * scale;
        let err_srgb = (act_srgb - exp_srgb).abs();
        max_err_srgb = max_err_srgb.max(err_srgb);
        assert!(
            err_srgb < tol_srgb,
            "sRGB luminance mismatch at ({r},{g},{b}): zentone={act_srgb}, \
             libultrahdr={exp_srgb}, err={err_srgb}, tol={tol_srgb}"
        );

        // Display-P3: identical weights, so only float noise is allowed.
        let err_p3 = (act_p3 - exp_p3).abs();
        max_err_p3 = max_err_p3.max(err_p3);
        assert!(
            err_p3 < TOLERANCE * scale,
            "P3 luminance mismatch at ({r},{g},{b}): zentone={act_p3}, \
             libultrahdr={exp_p3}, err={err_p3}"
        );

        // BT.2100 / BT.2020: 4-decimal vs 6-decimal weights again.
        let tol_2100 = 5e-4 * scale;
        let err_2100 = (act_2100 - exp_2100).abs();
        max_err_2100 = max_err_2100.max(err_2100);
        assert!(
            err_2100 < tol_2100,
            "BT.2100 luminance mismatch at ({r},{g},{b}): zentone={act_2100}, \
             libultrahdr={exp_2100}, err={err_2100}, tol={tol_2100}"
        );

        checked += 1;
    }

    assert!(checked > 100, "too few luminance rows: {checked}");
    println!(
        "luminance: checked {checked} rows, max_err sRGB={max_err_srgb:.3e} \
         P3={max_err_p3:.3e} BT.2100={max_err_2100:.3e}"
    );
}

// ============================================================================
// libultrahdr gamut-conversion matrices
// ============================================================================

/// Run every row of the libultrahdr gamut golden through the matching
/// zentone matrix and record, per conversion tag, the largest per-channel
/// absolute error divided by `max(max |input channel|, 1)`.
///
/// The returned map is keyed by the CSV's `conv` tag. Callers use it to
/// separate the "tight" matches from the "documented divergences". An
/// unknown tag panics.
fn collect_gamut_errors() -> std::collections::BTreeMap<String, f32> {
    let golden = read_csv("libultrahdr_gamut.csv");
    let rows = parse_rows(&golden, "conv,r_in,g_in,b_in,r_out,g_out,b_out")
        .filter(|row| row.len() == 7);

    let mut worst = std::collections::BTreeMap::<String, f32>::new();
    for row in rows {
        let number = |idx: usize| -> f32 { row[idx].parse().unwrap() };
        let tag = row[0];
        let rgb_in = [number(1), number(2), number(3)];
        let expected = [number(4), number(5), number(6)];

        // libultrahdr's "bt2100" tags map onto zentone's BT2020 matrices.
        let actual = match tag {
            "bt709_to_p3" => apply_matrix(&BT709_TO_P3, rgb_in),
            "bt709_to_bt2100" => apply_matrix(&BT709_TO_BT2020, rgb_in),
            "p3_to_bt709" => apply_matrix(&P3_TO_BT709, rgb_in),
            "p3_to_bt2100" => apply_matrix(&P3_TO_BT2020, rgb_in),
            "bt2100_to_bt709" => apply_matrix(&BT2020_TO_BT709, rgb_in),
            "bt2100_to_p3" => apply_matrix(&BT2020_TO_P3, rgb_in),
            other => panic!("unknown conv tag: {other}"),
        };

        // Largest input magnitude, floored at 1 so tiny inputs do not
        // inflate the normalized error.
        let scale = rgb_in.iter().map(|x| x.abs()).fold(1.0_f32, f32::max);

        let slot = worst.entry(tag.to_string()).or_insert(0.0);
        *slot = (0..3)
            .map(|i| (actual[i] - expected[i]).abs() / scale)
            .fold(*slot, f32::max);
    }
    worst
}

/// Strict gamut parity for the four BT.709-related conversions.
///
/// libultrahdr's gamut matrices are quoted to six decimal places (e.g.
/// `kBt709ToP3 = {0.822462, 0.177537, ...}`); zentone's are quoted to four
/// (e.g. `BT709_TO_P3 = [0.8225, 0.1774, ...]`). For BT.709↔P3 and
/// BT.709↔BT.2020 the rounding agrees within ~1.5e-3 per channel per unit
/// input. The BT.2020↔P3 pair has a real coefficient divergence and is
/// therefore covered by `gamut_p3_bt2020_documented_divergence` instead.
///
/// Note: libultrahdr's `bt2100*` paths use BT.2100 primaries, which are
/// identical to BT.2020 primaries (ITU-R BT.2100 Table 2), so libultrahdr's
/// `*Bt2100*` is compared against zentone's `*BT2020*`.
#[test]
fn gamut_conversions_match_libultrahdr_within_rounding() {
    let errors = collect_gamut_errors();

    // These four matrices must agree within 1.5e-3 per unit input: both
    // sides come from the standard BT.709 + D65 derivation and differ only
    // in how many decimals are quoted.
    let tight = [
        "bt709_to_p3",
        "p3_to_bt709",
        "bt709_to_bt2100",
        "bt2100_to_bt709",
    ];
    for conv in tight.iter() {
        // A conversion with no golden rows counts as an infinite error.
        let err = match errors.get(*conv) {
            Some(worst) => *worst,
            None => f32::INFINITY,
        };
        assert!(
            err < 1.5e-3,
            "{conv} max relative error {err:.6e} exceeds 1.5e-3 tolerance"
        );
    }
    println!("gamut tight: {errors:?}");
}

/// Documented divergence: zentone's `BT2020_TO_P3` and `P3_TO_BT2020`
/// matrices do not numerically match libultrahdr's `kBt2100ToP3` and
/// `kP3ToBt2100`.
///
/// Specifically, zentone's `BT2020_TO_P3` row 2 is
/// `[-0.0028, -0.0196, 1.0219]`, while libultrahdr quotes
/// `[0.002822, -0.019598, 1.016777]`. The first coefficient has the
/// opposite sign (libultrahdr says +0.0028 for the R contribution to B,
/// zentone says -0.0028) and the diagonal is 1.0219 vs 1.016777. Net
/// effect: at unit blue input the two matrices' B output differs by about
/// 0.005, well beyond the 1.5e-3 rounding budget of the strict test.
///
/// This is not a zentone bug — both matrices roundtrip cleanly with their
/// own inverse — but it means a zentone BT.2020→P3 rendering will be
/// numerically different from libultrahdr's. Anyone needing bit parity
/// with libultrahdr's gain-map applier should compose conversions
/// through BT.709 (BT.2020 → BT.709 → P3) instead, since the BT.709
/// matrices DO match libultrahdr.
///
/// The test asserts three things: both directions have golden rows at
/// all, each divergence stays below 1e-2 (so a regression that made it
/// worse still trips), and at least one direction really exceeds the
/// strict 1.5e-3 budget (otherwise this documentation is stale).
#[test]
fn gamut_p3_bt2020_documented_divergence() {
    let errors = collect_gamut_errors();

    // A direction with no golden rows must fail loudly. Defaulting it to
    // 0.0 would let it pass the upper bound without checking anything.
    let max_err_for = |tag: &str| -> f32 {
        errors.get(tag).copied().unwrap_or_else(|| {
            panic!("no `{tag}` rows in the gamut golden — cannot check documented divergence")
        })
    };
    let bt2020_to_p3 = max_err_for("bt2100_to_p3");
    let p3_to_bt2020 = max_err_for("p3_to_bt2100");

    // The divergence is real but bounded; if it ever grew past 1e-2 the
    // matrices likely got reauthored and this test is a tripwire.
    assert!(
        bt2020_to_p3 < 1e-2,
        "BT.2020→P3 divergence {bt2020_to_p3:.6e} ≥ 1e-2 (matrix rewrite?)"
    );
    assert!(
        p3_to_bt2020 < 1e-2,
        "P3→BT.2020 divergence {p3_to_bt2020:.6e} ≥ 1e-2 (matrix rewrite?)"
    );
    // And the divergence is genuinely there — assert it's NOT below the
    // tight tolerance, otherwise this documentation is stale.
    assert!(
        bt2020_to_p3 > 1.5e-3 || p3_to_bt2020 > 1.5e-3,
        "BT.2020↔P3 now agrees within tight tolerance — \
         move these into the strict test and remove this divergence doc"
    );
    println!(
        "gamut documented divergence: BT.2020→P3 max_err_per_unit={bt2020_to_p3:.6e}, \
         P3→BT.2020 max_err_per_unit={p3_to_bt2020:.6e}"
    );
}

// ============================================================================
// libultrahdr HLG OOTF / inverse OOTF
// ============================================================================

/// Exact (luminance-based) HLG OOTF parity with libultrahdr.
///
/// libultrahdr's `hlgOotf(e, luminance) = e * pow(Y, gamma-1)` and
/// `hlgInverseOotf(e, luminance) = e * pow(Y, (1/gamma)-1)` take a
/// caller-supplied luminance functor (`bt2100Luminance` for the BT.2100
/// path) and a hardcoded `kOotfGamma = 1.2`. The golden CSV varies gamma
/// across `hlg_system_gamma` values (1.0, 1.033, 1.2, 1.453, 1.5) so the
/// parameterized zentone API gets exercised.
///
/// The only expected divergence is BT.2100 luminance precision (zentone
/// uses `0.2627, 0.6780, 0.0593` vs libultrahdr's
/// `0.2627, 0.677998, 0.059302`). At gamma=1.0 the OOTF is the identity;
/// for gamma≠1 the precision difference propagates through
/// `pow(Y, gamma-1)` and grows with `|gamma-1|`.
///
/// Rows tagged `ootf_approx` / `inverse_ootf_approx` are skipped here; they
/// belong to `hlg_ootf_approx_matches_libultrahdr`.
#[test]
fn hlg_ootf_matches_libultrahdr() {
    let golden = read_csv("libultrahdr_hlg_ootf.csv");
    let rows = parse_rows(&golden, "dir,gamma,r_in,g_in,b_in,r_out,g_out,b_out")
        .filter(|row| row.len() == 8);

    let mut checked = 0;
    let mut max_err: f32 = 0.0;

    for row in rows {
        let number = |idx: usize| -> f32 { row[idx].parse().unwrap() };
        let dir = row[0];
        let gamma = number(1);
        let rgb_in = [number(2), number(3), number(4)];
        let expected = [number(5), number(6), number(7)];

        let actual = match dir {
            "ootf" => hlg_ootf(rgb_in, gamma),
            "inverse_ootf" => hlg_inverse_ootf(rgb_in, gamma),
            // Approx rows are checked by `hlg_ootf_approx_matches_libultrahdr`.
            "ootf_approx" | "inverse_ootf_approx" => continue,
            other => panic!("unknown dir tag: {other}"),
        };

        // Bound: 5e-4 × max(1, largest |expected| channel). The luminance
        // weight difference is tiny, and through pow(y, gamma-1) at
        // |gamma-1| ≤ 0.5 it stays below this budget on the golden grid.
        let mag = expected.iter().map(|x| x.abs()).fold(1.0_f32, f32::max);
        let tol = 5e-4 * mag;

        for (i, want) in expected.iter().enumerate() {
            let err = (actual[i] - want).abs();
            max_err = max_err.max(err);
            assert!(
                err < tol,
                "{dir} gamma={gamma} mismatch at {rgb_in:?}[{i}]: zentone={}, \
                 libultrahdr={}, err={err}, tol={tol}",
                actual[i],
                want
            );
        }
        checked += 1;
    }

    assert!(checked > 100, "too few HLG OOTF rows: {checked}");
    println!("hlg_ootf: checked {checked} rows, max_err={max_err:.6e}");
}

/// Golden-file check of the per-channel HLG OOTF approximations.
///
/// Replays the `ootf_approx` and `inverse_ootf_approx` rows of
/// `libultrahdr_hlg_ootf.csv` through `hlg_ootf_approx` and
/// `hlg_inverse_ootf_approx`, comparing each channel against the values
/// recorded from libultrahdr's `hlgOotfApprox` / `hlgInverseOotfApprox`
/// (gainmapmath.cpp:299/309).
///
/// The approximations are a plain per-channel `pow(c, γ)` with no luminance
/// term, so the only expected divergence is float `pow` rounding, which sits
/// far inside the 5e-4 × output band asserted here.
#[test]
fn hlg_ootf_approx_matches_libultrahdr() {
    const HEADER: &str = "dir,gamma,r_in,g_in,b_in,r_out,g_out,b_out";

    let csv = read_csv("libultrahdr_hlg_ootf.csv");
    let mut checked = 0;
    let mut max_err = 0.0_f32;

    for cols in parse_rows(&csv, HEADER) {
        // Rows that do not carry exactly eight fields are ignored.
        if cols.len() != 8 {
            continue;
        }

        // Only the approx directions belong to this test; every other tag
        // (the chromaticity-preserving rows) is the responsibility of
        // `hlg_ootf_matches_libultrahdr`.
        let dir = cols[0];
        let inverse = match dir {
            "ootf_approx" => false,
            "inverse_ootf_approx" => true,
            _ => continue,
        };

        // Numeric fields are parsed left to right; a malformed number panics.
        let field = |idx: usize| cols[idx].parse::<f32>().unwrap();
        let gamma = field(1);
        let rgb_in = [field(2), field(3), field(4)];
        let expected = [field(5), field(6), field(7)];

        let actual = if inverse {
            hlg_inverse_ootf_approx(rgb_in, gamma)
        } else {
            hlg_ootf_approx(rgb_in, gamma)
        };

        // Same band as the exact OOTF test: 5e-4 times the largest expected
        // channel magnitude, with that magnitude floored at 1.0 so that
        // near-zero outputs still get an absolute 5e-4 allowance.
        let tol = 5e-4 * expected.iter().map(|v| v.abs()).fold(1.0_f32, f32::max);

        for (i, &want) in expected.iter().enumerate() {
            let err = (actual[i] - want).abs();
            max_err = max_err.max(err);
            assert!(
                err < tol,
                "{dir} gamma={gamma} mismatch at {rgb_in:?}[{i}]: zentone={}, \
                 libultrahdr={}, err={err}, tol={tol}",
                actual[i],
                want
            );
        }
        checked += 1;
    }

    // Guard against a truncated or mis-tagged golden file passing vacuously.
    assert!(checked > 100, "too few HLG OOTF approx rows: {checked}");
    println!("hlg_ootf_approx: checked {checked} rows, max_err={max_err:.6e}");
}

/// Strip-kernel counterpart of `hlg_ootf_matches_libultrahdr`.
///
/// Feeds the libultrahdr golden CSV through the four row kernels
/// (`hlg_ootf_row_simd`, `hlg_inverse_ootf_row_simd`,
/// `hlg_ootf_approx_row_simd`, `hlg_inverse_ootf_approx_row_simd`) and holds
/// them to the same 5e-4 × magnitude band as the per-pixel tests. Rows are
/// bucketed by `(dir, gamma)` first so that every kernel call receives a
/// multi-pixel strip, which is meant to reach the vectorized path rather
/// than only the scalar tail.
#[test]
fn hlg_ootf_row_matches_libultrahdr() {
    use std::collections::BTreeMap;

    // One strip: input pixels and, index for index, their golden outputs.
    type Strip = (Vec<[f32; 3]>, Vec<[f32; 3]>);

    let csv = read_csv("libultrahdr_hlg_ootf.csv");

    // Bucket rows per (dir, gamma). Gamma is keyed by its bit pattern
    // because `f32` is not `Ord` and so cannot be part of a BTreeMap key.
    let mut groups: BTreeMap<(String, u32), Strip> = BTreeMap::new();
    for cols in parse_rows(&csv, "dir,gamma,r_in,g_in,b_in,r_out,g_out,b_out") {
        // Rows that do not carry exactly eight fields are ignored.
        if cols.len() != 8 {
            continue;
        }
        // Numeric fields are parsed left to right; a malformed number panics.
        let field = |idx: usize| cols[idx].parse::<f32>().unwrap();
        let gamma = field(1);
        let rgb_in = [field(2), field(3), field(4)];
        let rgb_out = [field(5), field(6), field(7)];

        let (inputs, golden) = groups
            .entry((cols[0].to_owned(), gamma.to_bits()))
            .or_default();
        inputs.push(rgb_in);
        golden.push(rgb_out);
    }

    let mut max_err = 0.0_f32;
    let mut total_checked = 0;
    for ((dir, gamma_bits), (inputs, golden)) in &groups {
        let gamma = f32::from_bits(*gamma_bits);

        // The kernels work in place, so run them on a scratch copy and keep
        // the pristine inputs around for the failure message.
        let mut row = inputs.clone();
        match dir.as_str() {
            "ootf" => hlg_ootf_row_simd(&mut row, gamma),
            "inverse_ootf" => hlg_inverse_ootf_row_simd(&mut row, gamma),
            "ootf_approx" => hlg_ootf_approx_row_simd(&mut row, gamma),
            "inverse_ootf_approx" => hlg_inverse_ootf_approx_row_simd(&mut row, gamma),
            other => panic!("unknown dir tag: {other}"),
        }

        for (px, (rgb_in, expected)) in inputs.iter().zip(golden.iter()).enumerate() {
            let actual = row[px];
            // 5e-4 times the largest expected channel magnitude, with that
            // magnitude floored at 1.0.
            let tol = 5e-4 * expected.iter().map(|v| v.abs()).fold(1.0_f32, f32::max);

            for (c, &want) in expected.iter().enumerate() {
                let err = (actual[c] - want).abs();
                max_err = max_err.max(err);
                assert!(
                    err < tol,
                    "{dir} (row form) gamma={gamma} in={rgb_in:?}[{c}]: \
                     simd={} libultrahdr={} err={err:.3e} tol={tol:.3e}",
                    actual[c],
                    want
                );
                // Counted per channel comparison, not per pixel.
                total_checked += 1;
            }
        }
    }

    // Guard against a truncated golden file passing vacuously.
    assert!(
        total_checked > 100,
        "too few HLG OOTF row-form comparisons: {total_checked}"
    );
    println!("hlg_ootf row form: {total_checked} comparisons, max_err={max_err:.6e}");
}