zenpixels-convert 0.2.9

//! Built-in profile registry: hand-coded converters for well-known ICC profiles.
//!
//! Most ICC profiles need a general-purpose CMS (moxcms, lcms2) to parse the
//! profile and build a transform. For a handful of profiles that show up
//! everywhere in the wild — e.g. jpegli's fixed XYB ICC — we can skip the
//! parser entirely by comparing the profile bytes against a known blob
//! (with a marker-substring fallback) and dispatching to a hand-written
//! transform.
//!
//! That saves both code size (for consumers who never need a full CMS) and
//! runtime cost (no profile parsing per decode).
//!
//! # Current entries
//!
//! | Profile | Source | Direction |
//! |---------|--------|-----------|
//! | [`BuiltinProfile::XybScaled`] | jpegli / libjxl (720-byte fixed blob) | XYB scaled u8 → sRGB u8 |
//!
//! # Internal use only
//!
//! This module is `pub(crate)` per the YAGNI policy in the repo root
//! CLAUDE.md. Callers inside `zenpixels-convert` reach it via
//! `crate::builtin_profiles::{recognize, maybe_convert_via_builtin, ...}`.
//! If an external consumer ever needs this behavior, promote specific
//! items to `pub` at the same time as wiring up that consumer — not
//! speculatively.
//!
//! # Why not make recognition opinionated?
//!
//! The registry explicitly does NOT own the full decode pipeline — it just
//! gives fast conversions for *known* source profiles. The general
//! "arbitrary ICC → sRGB" path belongs in the [`cms`](crate::cms) module.
//! Think of this as a targeted peephole optimization, not a replacement
//! for a CMS.
//!
//! # Fallback behavior
//!
//! [`maybe_convert_via_builtin`] returns `false` for any (profile, target)
//! pair that isn't in the table. Callers must be prepared to run the
//! general CMS path when it returns `false`.

extern crate alloc;

use zenpixels::Cicp;

use crate::error::ConvertError;

// =========================================================================
// Public types
// =========================================================================

/// A well-known ICC profile the registry recognizes and can convert
/// without a general-purpose CMS.
///
/// Values are produced by [`recognize`] and consumed by
/// [`maybe_convert_via_builtin`].
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
// `non_exhaustive` only affects other crates; for a `pub(crate)` enum it is
// documentation of intent (more variants are expected), not an enforced rule.
#[non_exhaustive]
pub(crate) enum BuiltinProfile {
    /// The jpegli/libjxl XYB profile.
    ///
    /// [`recognize`] returns this both for the canonical 720-byte blob
    /// ([`XYB_ICC_BYTES`]) and for any blob of at least 128 bytes that
    /// carries an "XYB" marker, so holding this value does not prove the
    /// profile was byte-identical to the canonical one.
    ///
    /// Pixel data tagged with this profile is in **scaled XYB** form, as
    /// produced by jpegli's encoder (after opsin matrix → cube root → final
    /// X/Y/B mixing → scale/offset). The inverse transform undoes all of
    /// that and produces sRGB in BT.709 primaries with the sRGB transfer
    /// function.
    XybScaled,
}

/// The canonical 720-byte XYB ICC profile.
///
/// Copied verbatim from zenjpeg's `foundation::consts::XYB_ICC_PROFILE`,
/// which was generated by the C++ jpegli encoder (libjxl). jpegli and
/// zenjpeg both embed this exact blob in every XYB-encoded JPEG so a
/// standard CMS decoder can round-trip back to sRGB.
///
/// jpegli / libjxl is Apache-2.0 / BSD-3 licensed; this blob is a pure
/// data constant and is distributed under the same dual license as the
/// rest of zenpixels-convert.
#[rustfmt::skip]
pub(crate) const XYB_ICC_BYTES: &[u8; 720] = &[
    0x00, 0x00, 0x02, 0xd0, 0x6a, 0x78, 0x6c, 0x20, 0x04, 0x40, 0x00, 0x00, 0x73, 0x63, 0x6e, 0x72,
    0x52, 0x47, 0x42, 0x20, 0x58, 0x59, 0x5a, 0x20, 0x07, 0xe3, 0x00, 0x0c, 0x00, 0x01, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x61, 0x63, 0x73, 0x70, 0x41, 0x50, 0x50, 0x4c, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf6, 0xd6, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0xd3, 0x2d,
    0x6a, 0x78, 0x6c, 0x20, 0x55, 0x8e, 0xdd, 0x94, 0x0f, 0x32, 0x04, 0x06, 0x99, 0xc6, 0x8a, 0x17,
    0xb4, 0x0d, 0x3f, 0x7b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x06, 0x64, 0x65, 0x73, 0x63, 0x00, 0x00, 0x00, 0xcc, 0x00, 0x00, 0x00, 0x2c,
    0x63, 0x70, 0x72, 0x74, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x00, 0x00, 0x24, 0x77, 0x74, 0x70, 0x74,
    0x00, 0x00, 0x01, 0x1c, 0x00, 0x00, 0x00, 0x14, 0x63, 0x68, 0x61, 0x64, 0x00, 0x00, 0x01, 0x30,
    0x00, 0x00, 0x00, 0x2c, 0x41, 0x32, 0x42, 0x30, 0x00, 0x00, 0x01, 0x5c, 0x00, 0x00, 0x01, 0x24,
    0x42, 0x32, 0x41, 0x30, 0x00, 0x00, 0x02, 0x80, 0x00, 0x00, 0x00, 0x50, 0x6d, 0x6c, 0x75, 0x63,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0c, 0x65, 0x6e, 0x55, 0x53,
    0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x58, 0x00, 0x59, 0x00, 0x42, 0x00, 0x5f,
    0x00, 0x50, 0x00, 0x65, 0x00, 0x72, 0x00, 0x00, 0x6d, 0x6c, 0x75, 0x63, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0c, 0x65, 0x6e, 0x55, 0x53, 0x00, 0x00, 0x00, 0x06,
    0x00, 0x00, 0x00, 0x1c, 0x00, 0x43, 0x00, 0x43, 0x00, 0x30, 0x00, 0x00, 0x58, 0x59, 0x5a, 0x20,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf6, 0xd6, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0xd3, 0x2d,
    0x73, 0x66, 0x33, 0x32, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x0c, 0x40, 0x00, 0x00, 0x05, 0xdd,
    0xff, 0xff, 0xf3, 0x29, 0x00, 0x00, 0x07, 0x92, 0x00, 0x00, 0xfd, 0x90, 0xff, 0xff, 0xfb, 0xa2,
    0xff, 0xff, 0xfd, 0xa2, 0x00, 0x00, 0x03, 0xdb, 0x00, 0x00, 0xc0, 0x81, 0x6d, 0x41, 0x42, 0x20,
    0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0xf4,
    0x00, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x00, 0x20, 0x70, 0x61, 0x72, 0x61,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x70, 0x61, 0x72, 0x61,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x70, 0x61, 0x72, 0x61,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x02, 0x02, 0x02, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x0c, 0x86, 0x00, 0x00, 0x00, 0x00, 0x0c, 0x86, 0x70, 0xc9, 0xf3, 0x79, 0xff, 0xff,
    0x8f, 0x36, 0xf3, 0x79, 0xff, 0xff, 0xff, 0xff, 0x0c, 0x86, 0x00, 0x00, 0x00, 0x00, 0x0c, 0x86,
    0x00, 0x00, 0x70, 0xc9, 0xff, 0xff, 0xf3, 0x79, 0x8f, 0x36, 0xff, 0xff, 0xf3, 0x79, 0xff, 0xff,
    0x70, 0x61, 0x72, 0x61, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00,
    0x00, 0x00, 0xe3, 0x88, 0x00, 0x00, 0x23, 0xfc, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x70, 0x61, 0x72, 0x61, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00,
    0x00, 0x00, 0xe3, 0x88, 0x00, 0x00, 0x20, 0xbb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x70, 0x61, 0x72, 0x61, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00,
    0x00, 0x01, 0x82, 0xd3, 0xff, 0xff, 0xe0, 0xd5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x14, 0xa1,
    0x00, 0x01, 0x84, 0x5b, 0xff, 0xfe, 0xe4, 0xbb, 0x00, 0x00, 0x12, 0x56, 0xff, 0xff, 0xf3, 0x32,
    0x00, 0x00, 0x91, 0x80, 0xff, 0xff, 0xfb, 0x4e, 0xff, 0xfe, 0x9c, 0xc1, 0x00, 0x01, 0x1d, 0x54,
    0x00, 0x00, 0xaf, 0x8c, 0xff, 0xff, 0xff, 0x88, 0xff, 0xff, 0xff, 0x84, 0xff, 0xff, 0xff, 0x99,
    0x6d, 0x42, 0x41, 0x20, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x70, 0x61, 0x72, 0x61, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
    0x70, 0x61, 0x72, 0x61, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
    0x70, 0x61, 0x72, 0x61, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
];

/// ASCII "XYB" marker used by the [`recognize`] fallback.
///
/// Intended to catch XYB profiles produced by older or slightly
/// different generators (anything that isn't exactly the canonical 720
/// bytes). Note that [`recognize`] scans the *entire* blob for these
/// three bytes, not just the description tag.
const XYB_ASCII_MARKER: &[u8] = b"XYB";
/// UTF-16BE encoding of "XYB" as it appears in ICC `mluc` description
/// tags — same fallback reason as above. The canonical blob itself
/// stores its description ("XYB_Per") in this encoding.
const XYB_UTF16BE_MARKER: [u8; 6] = [0, b'X', 0, b'Y', 0, b'B'];

// =========================================================================
// Recognition
// =========================================================================

/// Look up an ICC blob in the built-in registry.
///
/// Returns `Some(BuiltinProfile)` when the blob is accepted as a known
/// profile and `None` otherwise, in which case callers should fall back
/// to a general-purpose CMS.
///
/// # Matching policy
///
/// 1. **Exact match**: the blob is byte-for-byte equal to
///    [`XYB_ICC_BYTES`] → `XybScaled`.
/// 2. **Marker fallback**: the blob is at least 128 bytes long (the size
///    of an ICC header) and contains "XYB" *anywhere*, either as ASCII or
///    as UTF-16BE → `XybScaled`. This is meant to catch profiles written
///    by earlier jpegli versions or generators that re-emit the metadata
///    with padding differences; the inverse transform is fixed, so every
///    such profile is decoded with the same constants.
///
/// NOTE(review): the fallback does not look at the ICC structure at all,
/// so any sufficiently long blob that happens to contain those bytes
/// (e.g. inside LUT data or a copyright string) is also accepted.
#[must_use]
pub(crate) fn recognize(icc: &[u8]) -> Option<BuiltinProfile> {
    // Shortest blob for which the marker scan is attempted.
    const MIN_FALLBACK_LEN: usize = 128;

    // Fast path: identical to the canonical blob.
    if icc == XYB_ICC_BYTES.as_slice() {
        return Some(BuiltinProfile::XybScaled);
    }

    // Anything shorter is never recognized, marker or not.
    if icc.len() < MIN_FALLBACK_LEN {
        return None;
    }

    let contains = |needle: &[u8]| icc.windows(needle.len()).any(|window| window == needle);
    let has_marker = contains(XYB_ASCII_MARKER) || contains(&XYB_UTF16BE_MARKER[..]);

    has_marker.then_some(BuiltinProfile::XybScaled)
}

// =========================================================================
// Constants — XYB transform (mirrors jpegli / libjxl)
// =========================================================================

/// Per-channel offset (indexed X, Y, B) that the encoder's `scale_xyb`
/// adds before scaling; the inverse subtracts it after dividing by
/// [`SCALED_XYB_SCALE`].
#[rustfmt::skip]
#[allow(clippy::inconsistent_digit_grouping, clippy::excessive_precision)]
const SCALED_XYB_OFFSET: [f32; 3] = [0.015_386_134, 0.0, 0.277_704_59];
/// Per-channel multiplier (indexed X, Y, B) applied in `scale_xyb`; we
/// divide by it in the inverse.
#[rustfmt::skip]
#[allow(clippy::inconsistent_digit_grouping, clippy::excessive_precision)]
const SCALED_XYB_SCALE: [f32; 3] = [22.995_788_804, 1.183_000_077, 1.502_141_333];

/// Inverse opsin absorbance matrix (LMS'-like → linear RGB), stored
/// row-major: row `i` produces output channel `i`.
/// Identical to zenjpeg's `INV_OPSIN`.
#[rustfmt::skip]
#[allow(clippy::inconsistent_digit_grouping, clippy::excessive_precision)]
const INV_OPSIN: [[f32; 3]; 3] = [
    [ 11.031_567, -9.866_944, -0.164_623],
    [ -3.254_147,  4.418_770, -0.164_623],
    [ -3.658_851,  2.712_923,  1.945_928],
];

/// Opsin absorbance bias (subtracted from the cubed LMS' values before
/// applying the inverse matrix).
///
/// All three entries are equal. The SIMD kernel splats only element `[0]`
/// for every channel, so they must stay equal.
const OPSIN_BIAS: [f32; 3] = [0.003_793_073_3, 0.003_793_073_3, 0.003_793_073_3];

/// `-cbrt(bias)`: the *negated* cube root of [`OPSIN_BIAS`]. The encoder
/// adds this (negative) value to each cube-rooted channel; the inverse
/// subtracts it, which restores the original cube root.
///
/// As with [`OPSIN_BIAS`], the SIMD kernel reads only element `[0]`.
const NEG_BIAS_CBRT: [f32; 3] = [-0.155_954_12, -0.155_954_12, -0.155_954_12];

// =========================================================================
// Scalar inverse XYB (reference implementation)
// =========================================================================

/// Sign-preserving cube: returns `|v|³` carrying the sign of `v`.
///
/// This is the inverse of the signed cube root taken on the encode side.
#[inline]
fn mixed_cube(v: f32) -> f32 {
    // Cube the magnitude, then stamp the input's sign back on.
    v.abs().powi(3).copysign(v)
}

/// Undo the encoder's per-channel scale/offset, returning `(x, y, b)`.
///
/// Each channel is divided by its [`SCALED_XYB_SCALE`] entry and then has
/// its [`SCALED_XYB_OFFSET`] entry subtracted. B additionally gets the
/// recovered Y added back.
#[inline]
fn unscale_xyb(scaled_x: f32, scaled_y: f32, scaled_b: f32) -> (f32, f32, f32) {
    let [scale_x, scale_y, scale_b] = SCALED_XYB_SCALE;
    let [offset_x, offset_y, offset_b] = SCALED_XYB_OFFSET;

    let x = scaled_x / scale_x - offset_x;
    let y = scaled_y / scale_y - offset_y;
    // jpegli's `scale_xyb` stored B as (b - y + off) * scale, so Y has to
    // be added back after the offset is removed.
    let b = (scaled_b / scale_b - offset_b) + y;

    (x, y, b)
}

#[inline]
fn xyb_to_linear_rgb(x: f32, y: f32, b: f32) -> (f32, f32, f32) {
    let cbrt_r = y + x - NEG_BIAS_CBRT[0];
    let cbrt_g = y - x - NEG_BIAS_CBRT[1];
    let cbrt_b = b - NEG_BIAS_CBRT[2];

    let opsin_r = mixed_cube(cbrt_r) - OPSIN_BIAS[0];
    let opsin_g = mixed_cube(cbrt_g) - OPSIN_BIAS[1];
    let opsin_b = mixed_cube(cbrt_b) - OPSIN_BIAS[2];

    let r = INV_OPSIN[0][0].mul_add(
        opsin_r,
        INV_OPSIN[0][1].mul_add(opsin_g, INV_OPSIN[0][2] * opsin_b),
    );
    let g = INV_OPSIN[1][0].mul_add(
        opsin_r,
        INV_OPSIN[1][1].mul_add(opsin_g, INV_OPSIN[1][2] * opsin_b),
    );
    let b_out = INV_OPSIN[2][0].mul_add(
        opsin_r,
        INV_OPSIN[2][1].mul_add(opsin_g, INV_OPSIN[2][2] * opsin_b),
    );

    (r, g, b_out)
}

/// Encode one linear-light channel as an sRGB byte.
///
/// The input is clamped to `[0, 1]`, run through
/// `linear_srgb::tf::linear_to_srgb`, scaled to 0..=255 and rounded to
/// nearest by adding 0.5 before the truncating cast.
#[inline]
fn linear_to_srgb_u8(v: f32) -> u8 {
    let srgb = linear_srgb::tf::linear_to_srgb(v.clamp(0.0, 1.0));
    // Float → u8 `as` casts saturate, so a slight overshoot cannot wrap.
    (srgb * 255.0 + 0.5) as u8
}

/// JPEG level shift — scaled XYB f32 gets +128 on encode to land in
/// unsigned u8 range, so we subtract it on the inverse.
///
/// Shared by the scalar path and the SIMD kernel.
///
/// NOTE(review): the decode paths subtract this and apply no other
/// normalization, so one u8 step equals one full unit of scaled XYB
/// (a sample of 129 unscales to X ≈ 1/22.996 − 0.0154 ≈ 0.028).
/// Confirm against the encoder that samples are not additionally scaled
/// (e.g. by 255) around the level shift.
const JPEG_LEVEL_SHIFT: f32 = 128.0;

/// Convert a single scaled-XYB pixel (three u8 samples) to sRGB u8.
///
/// The three inputs are the raw 8-bit samples a standard JPEG decoder
/// yields for a jpegli XYB-encoded stream when it does *not* apply the
/// embedded ICC profile. Despite their names, `r`, `g` and `b` carry the
/// scaled X, Y and B channels respectively.
///
/// Pipeline: remove the JPEG level shift ([`JPEG_LEVEL_SHIFT`]), undo
/// the scale/offset, invert XYB to linear RGB, then encode each channel
/// as an sRGB byte (clamping to the displayable range).
#[inline]
#[must_use]
pub(crate) fn xyb_scaled_u8_pixel_to_srgb(r: u8, g: u8, b: u8) -> (u8, u8, u8) {
    // Samples are stored level-shifted; bring them back around zero.
    let [sx, sy, sb] = [r, g, b].map(|sample| f32::from(sample) - JPEG_LEVEL_SHIFT);

    let (x, y, b_xyb) = unscale_xyb(sx, sy, sb);
    let (lin_r, lin_g, lin_b) = xyb_to_linear_rgb(x, y, b_xyb);

    (
        linear_to_srgb_u8(lin_r),
        linear_to_srgb_u8(lin_g),
        linear_to_srgb_u8(lin_b),
    )
}

/// Scalar fallback: convert an RGB u8 image from scaled XYB to sRGB.
///
/// `rgb_in` and `rgb_out` are tightly-packed 3-bytes-per-pixel buffers of
/// equal length. Because one is a shared and the other a mutable borrow,
/// they are necessarily distinct buffers; in-place conversion is not
/// possible through this signature.
///
/// This is the portable reference implementation.
/// [`convert_xyb_scaled_to_srgb_u8`] uses it directly when no SIMD token
/// is available, and the SIMD kernel uses it for the trailing pixels that
/// do not fill a full 8-pixel group.
///
/// # Panics
///
/// Panics if the buffers differ in length or if the length is not a
/// multiple of 3. Both checks run before any byte is written.
pub(crate) fn convert_xyb_scaled_to_srgb_u8_scalar(rgb_in: &[u8], rgb_out: &mut [u8]) {
    assert_eq!(rgb_in.len(), rgb_out.len(), "buffers must be equal length");
    assert!(
        rgb_in.len().is_multiple_of(3),
        "buffer length must be multiple of 3"
    );

    // Walk both buffers one pixel at a time. `chunks_exact` hands out
    // fixed 3-byte windows, so there is no index arithmetic to get wrong
    // and no per-access bounds check against the whole buffer.
    for (src, dst) in rgb_in.chunks_exact(3).zip(rgb_out.chunks_exact_mut(3)) {
        let (r, g, b) = xyb_scaled_u8_pixel_to_srgb(src[0], src[1], src[2]);
        dst.copy_from_slice(&[r, g, b]);
    }
}

// =========================================================================
// SIMD inverse XYB (AVX2 via magetypes)
// =========================================================================

// 8-pixel-at-a-time inverse XYB kernel for x86_64. Mirrors the scalar
// pipeline (level shift → unscale → un-mix → cube → bias → matrix → sRGB),
// but is not bit-identical to it: it multiplies by precomputed reciprocals
// where the scalar code divides, cubes with `v * v * v` where the scalar
// code uses `powi(3)`, and uses a different sRGB encoder entry point.
#[cfg(target_arch = "x86_64")]
mod simd {
    use super::*;
    use archmage::prelude::*;
    use linear_srgb::tokens::x8 as trc_x8;
    use magetypes::simd::f32x8 as mt_f32x8;

    /// Convert exactly 8 packed scaled-XYB pixels (24 bytes) to sRGB u8.
    ///
    /// `token` is the capability token required by the magetypes and
    /// linear-srgb vector operations used below.
    #[rite]
    fn invert_8px(token: X64V3Token, rgb_in: &[u8; 24], rgb_out: &mut [u8; 24]) {
        // Deinterleave 8 RGB triples into three lanes of f32x8.
        let mut r_in = [0.0f32; 8];
        let mut g_in = [0.0f32; 8];
        let mut b_in = [0.0f32; 8];
        for i in 0..8 {
            r_in[i] = rgb_in[i * 3] as f32;
            g_in[i] = rgb_in[i * 3 + 1] as f32;
            b_in[i] = rgb_in[i * 3 + 2] as f32;
        }

        // Undo the JPEG level shift (+128) applied during encode.
        let shift = mt_f32x8::splat(token, super::JPEG_LEVEL_SHIFT);
        let sx = mt_f32x8::from_array(token, r_in) - shift;
        let sy = mt_f32x8::from_array(token, g_in) - shift;
        let sb = mt_f32x8::from_array(token, b_in) - shift;

        // Step 1: unscale. Uses reciprocal multiplies; the scalar path
        // divides instead, so results can differ in the last bit.
        let inv_scale_x = mt_f32x8::splat(token, 1.0 / SCALED_XYB_SCALE[0]);
        let inv_scale_y = mt_f32x8::splat(token, 1.0 / SCALED_XYB_SCALE[1]);
        let inv_scale_b = mt_f32x8::splat(token, 1.0 / SCALED_XYB_SCALE[2]);
        let off_x = mt_f32x8::splat(token, SCALED_XYB_OFFSET[0]);
        // off_y is 0.0, skip.
        let off_b = mt_f32x8::splat(token, SCALED_XYB_OFFSET[2]);

        let y = sy * inv_scale_y; // - 0
        let x = sx * inv_scale_x - off_x;
        let b_xyb = sb * inv_scale_b - off_b + y;

        // Step 2: recover cbrt_* values. Only element [0] of
        // NEG_BIAS_CBRT is read; all three entries are identical.
        let neg_bias = mt_f32x8::splat(token, NEG_BIAS_CBRT[0]);
        let cbrt_r = y + x - neg_bias;
        let cbrt_g = y - x - neg_bias;
        let cbrt_b = b_xyb - neg_bias;

        // Step 3: inverse cube (sign-preserving).
        // An odd power keeps the sign of its input, so a plain
        // `v * v * v` is enough; no separate sign handling is needed.
        let opsin_r = cube_signed(cbrt_r);
        let opsin_g = cube_signed(cbrt_g);
        let opsin_b = cube_signed(cbrt_b);

        // Only element [0] of OPSIN_BIAS is read (entries are identical).
        let bias = mt_f32x8::splat(token, OPSIN_BIAS[0]);
        let opsin_r = opsin_r - bias;
        let opsin_g = opsin_g - bias;
        let opsin_b = opsin_b - bias;

        // Step 4: inverse opsin matrix (3x3).
        let m00 = mt_f32x8::splat(token, INV_OPSIN[0][0]);
        let m01 = mt_f32x8::splat(token, INV_OPSIN[0][1]);
        let m02 = mt_f32x8::splat(token, INV_OPSIN[0][2]);
        let m10 = mt_f32x8::splat(token, INV_OPSIN[1][0]);
        let m11 = mt_f32x8::splat(token, INV_OPSIN[1][1]);
        let m12 = mt_f32x8::splat(token, INV_OPSIN[1][2]);
        let m20 = mt_f32x8::splat(token, INV_OPSIN[2][0]);
        let m21 = mt_f32x8::splat(token, INV_OPSIN[2][1]);
        let m22 = mt_f32x8::splat(token, INV_OPSIN[2][2]);

        // Same nesting order as the scalar `mul_add` chain.
        let lin_r = m00.mul_add(opsin_r, m01.mul_add(opsin_g, m02 * opsin_b));
        let lin_g = m10.mul_add(opsin_r, m11.mul_add(opsin_g, m12 * opsin_b));
        let lin_b = m20.mul_add(opsin_r, m21.mul_add(opsin_g, m22 * opsin_b));

        // Step 5: clamp to [0, 1], then OETF via linear-srgb's 8-wide
        // SIMD sRGB encoder.
        let zero = mt_f32x8::splat(token, 0.0);
        let one = mt_f32x8::splat(token, 1.0);
        let lin_r = lin_r.clamp(zero, one);
        let lin_g = lin_g.clamp(zero, one);
        let lin_b = lin_b.clamp(zero, one);

        let sr = trc_x8::linear_to_srgb_v3(token, lin_r.to_array());
        let sg = trc_x8::linear_to_srgb_v3(token, lin_g.to_array());
        let sb_o = trc_x8::linear_to_srgb_v3(token, lin_b.to_array());

        // Re-interleave and round to u8.
        for i in 0..8 {
            rgb_out[i * 3] = f32_to_u8_round(sr[i]);
            rgb_out[i * 3 + 1] = f32_to_u8_round(sg[i]);
            rgb_out[i * 3 + 2] = f32_to_u8_round(sb_o[i]);
        }
    }

    /// Lane-wise `v³`; the sign of each lane is preserved.
    #[inline(always)]
    fn cube_signed(v: mt_f32x8) -> mt_f32x8 {
        // v^3 preserves the sign of v because sign(v^3) = sign(v).
        // Implemented as v * v * v; magetypes has no fused cube op.
        v * v * v
    }

    /// Scale a `[0, 1]` float to `0..=255`, rounding to nearest.
    #[inline(always)]
    fn f32_to_u8_round(v: f32) -> u8 {
        // The encoder output may overshoot [0, 1] slightly near the
        // endpoints, so re-clamp before scaling. This is about keeping
        // the rounding well-defined, not about soundness: a Rust
        // float → integer `as` cast saturates (and maps NaN to 0), it is
        // never undefined behaviour.
        let c = v.clamp(0.0, 1.0);
        (c * 255.0 + 0.5) as u8
    }

    /// Convert a packed RGB buffer from scaled XYB to sRGB: whole groups
    /// of 8 pixels go through [`invert_8px`], the remaining 0–7 pixels
    /// through the scalar implementation.
    ///
    /// This function does not validate the buffers itself. `split_at_mut`
    /// panics if `rgb_out` is shorter than the SIMD-processed prefix, and
    /// the scalar tail call asserts that the remainders have equal length
    /// and a multiple-of-3 size — but only after the bulk has already been
    /// written. Callers are expected to validate up front, as
    /// `convert_xyb_scaled_to_srgb_u8` does.
    #[arcane]
    pub(crate) fn convert_rgb_xyb_to_srgb_v3(token: X64V3Token, rgb_in: &[u8], rgb_out: &mut [u8]) {
        let pixels = rgb_in.len() / 3;
        // Round down to a whole number of 8-pixel groups.
        let simd_pixels = pixels / 8 * 8;
        let simd_bytes = simd_pixels * 3;

        let (in_bulk, in_tail) = rgb_in.split_at(simd_bytes);
        let (out_bulk, out_tail) = rgb_out.split_at_mut(simd_bytes);

        for (in_chunk, out_chunk) in in_bulk.chunks_exact(24).zip(out_bulk.chunks_exact_mut(24)) {
            // `chunks_exact(24)` only yields 24-byte slices, so these
            // conversions cannot fail.
            let in_arr: &[u8; 24] = in_chunk.try_into().expect("24 bytes");
            let out_arr: &mut [u8; 24] = out_chunk.try_into().expect("24 bytes");
            invert_8px(token, in_arr, out_arr);
        }

        // Scalar tail.
        super::convert_xyb_scaled_to_srgb_u8_scalar(in_tail, out_tail);
    }
}

// =========================================================================
// Public dispatch
// =========================================================================

/// Convert a packed RGB u8 buffer from scaled XYB to sRGB.
///
/// On x86_64 this asks `X64V3Token::summon()` for a SIMD capability token
/// and, if one is returned, uses the 8-pixel kernel in the `simd` module
/// (presumably a runtime CPU-feature check — see the archmage docs). In
/// every other case the scalar implementation runs.
///
/// # Panics
///
/// Panics if the two buffers differ in length or if the length is not a
/// multiple of 3. Both checks happen before any conversion work.
///
/// # Example (crate-internal)
///
/// ```rust,ignore
/// use crate::builtin_profiles;
///
/// let src: Vec<u8> = vec![128, 128, 128, /* ... */]; // level-shifted zero in all three channels
/// let mut dst = vec![0u8; src.len()];
/// builtin_profiles::convert_xyb_scaled_to_srgb_u8(&src, &mut dst);
/// ```
pub(crate) fn convert_xyb_scaled_to_srgb_u8(rgb_in: &[u8], rgb_out: &mut [u8]) {
    let byte_len = rgb_in.len();
    assert_eq!(byte_len, rgb_out.len(), "buffers must be equal length");
    assert!(
        byte_len.is_multiple_of(3),
        "buffer length must be multiple of 3"
    );

    // Vector path: compiled only for x86_64, taken only when a token is
    // actually available.
    #[cfg(target_arch = "x86_64")]
    {
        use archmage::prelude::*;
        if let Some(token) = X64V3Token::summon() {
            simd::convert_rgb_xyb_to_srgb_v3(token, rgb_in, rgb_out);
            return;
        }
    }

    // Portable path.
    convert_xyb_scaled_to_srgb_u8_scalar(rgb_in, rgb_out);
}

/// Try to convert through the built-in registry.
///
/// Returns `true` when `icc` is recognized *and* `target` is a supported
/// output for that profile; `rgb_out` then holds the converted pixels.
/// Returns `false` otherwise, leaving `rgb_out` untouched, so the caller
/// can fall through to a general CMS path.
///
/// Currently supports: `XybScaled` → `Cicp::SRGB`.
///
/// # Panics
///
/// When a conversion is actually attempted, panics if the buffers differ
/// in length or the length is not a multiple of 3. The buffers are not
/// inspected on the `false` paths.
#[must_use]
pub(crate) fn maybe_convert_via_builtin(
    icc: &[u8],
    target: Cicp,
    rgb_in: &[u8],
    rgb_out: &mut [u8],
) -> bool {
    let Some(profile) = recognize(icc) else {
        return false;
    };

    // Listing every variant (rather than `_`) makes adding a profile a
    // compile error here until its supported targets are decided.
    match profile {
        BuiltinProfile::XybScaled if target == Cicp::SRGB => {
            convert_xyb_scaled_to_srgb_u8(rgb_in, rgb_out);
            true
        }
        BuiltinProfile::XybScaled => false,
    }
}

/// `Result`-returning wrapper around [`maybe_convert_via_builtin`].
///
/// For callers that require the built-in path and would rather get an
/// error than a silent `false`.
///
/// # Errors
///
/// Returns [`ConvertError::CmsError`] when the profile isn't recognized
/// or the target isn't supported. The message includes the profile length
/// and the requested target.
///
/// # Panics
///
/// Same buffer-shape panics as [`maybe_convert_via_builtin`].
pub(crate) fn convert_via_builtin(
    icc: &[u8],
    target: Cicp,
    rgb_in: &[u8],
    rgb_out: &mut [u8],
) -> Result<(), ConvertError> {
    if maybe_convert_via_builtin(icc, target, rgb_in, rgb_out) {
        return Ok(());
    }

    let message = alloc::format!(
        "no built-in profile for icc ({} bytes) → {:?}",
        icc.len(),
        target
    );
    Err(ConvertError::CmsError(message))
}

// =========================================================================
// Unit tests (small; the former tests/builtin_xyb.rs suite follows in `integration_tests`)
// =========================================================================

#[cfg(test)]
mod tests {
    use super::*;

    /// The canonical blob has the documented size.
    #[test]
    fn xyb_icc_bytes_is_720_bytes() {
        let blob: &[u8] = XYB_ICC_BYTES;
        assert_eq!(blob.len(), 720);
    }

    /// The canonical blob maps to `XybScaled`.
    #[test]
    fn recognize_canonical() {
        let found = recognize(XYB_ICC_BYTES);
        assert_eq!(found, Some(BuiltinProfile::XybScaled));
    }

    /// Empty and very short inputs are never recognized.
    #[test]
    fn recognize_rejects_short_buffer() {
        let cases: [&[u8]; 2] = [&[], &[0u8; 10]];
        for blob in cases {
            assert_eq!(recognize(blob), None);
        }
    }

    /// An unknown profile makes the dispatch helper report `false`.
    #[test]
    fn unrecognized_returns_false() {
        let fake_icc = [0u8; 32];
        let src = [0u8; 30];
        let mut dst = [0u8; 30];
        let handled = maybe_convert_via_builtin(&fake_icc, Cicp::SRGB, &src, &mut dst);
        assert!(!handled);
    }
}

// =========================================================================
// Integration tests (relocated from tests/builtin_xyb.rs)
//
// This module is pub(crate) per zenpixels CLAUDE.md YAGNI policy, so
// integration tests cannot import from outside the crate — they live
// here instead.
// =========================================================================

#[cfg(test)]
mod integration_tests {
    use super::*;

    // =========================================================================
    // Recognition
    // =========================================================================

    /// Exact-match path: the embedded canonical blob is recognized.
    #[test]
    fn recognize_canonical_xyb_bytes() {
        let recognized = recognize(XYB_ICC_BYTES);
        assert_eq!(
            recognized,
            Some(BuiltinProfile::XybScaled),
            "canonical 720-byte XYB profile must be recognized"
        );
    }

    /// A long-enough blob that names itself "sRGB" must not be mistaken
    /// for XYB.
    #[test]
    fn recognize_rejects_srgb_profile() {
        // Not a real sRGB profile: a zeroed 512-byte buffer carrying the
        // text "sRGB" once as ASCII and once as UTF-16BE. It is long
        // enough to reach the marker fallback, and neither XYB marker
        // occurs in it.
        let mut fake_srgb = [0u8; 512];
        fake_srgb[128..132].copy_from_slice(b"sRGB");
        for (pair, ch) in fake_srgb[200..208].chunks_exact_mut(2).zip(b"sRGB") {
            pair.copy_from_slice(&[0, *ch]);
        }

        assert_eq!(recognize(&fake_srgb), None);
    }

    /// Blobs shorter than the 128-byte fallback threshold are rejected
    /// even when they contain the "XYB" substring.
    #[test]
    fn recognize_short_buffers_are_unknown() {
        let too_short: [&[u8]; 2] = [b"XYB", b"random bytes including XYB somewhere"];
        for blob in too_short {
            assert_eq!(recognize(blob), None);
        }
    }

    /// Fallback path: a 256-byte blob with a UTF-16BE "XYB" in the middle
    /// is accepted even though it is not the canonical profile.
    #[test]
    fn recognize_fallback_for_utf16_marker() {
        const MARKER_UTF16BE: [u8; 6] = [0, b'X', 0, b'Y', 0, b'B'];

        let mut blob = [0u8; 256];
        blob[128..128 + MARKER_UTF16BE.len()].copy_from_slice(&MARKER_UTF16BE);

        assert_eq!(recognize(&blob), Some(BuiltinProfile::XybScaled));
    }

    // =========================================================================
    // SIMD vs scalar parity
    // =========================================================================

    /// Packed RGB test grid: every combination of six evenly spaced
    /// levels (0, 51, …, 255) across the three channels, i.e. 6³ = 216
    /// pixels / 648 bytes. The first channel varies slowest, the last
    /// fastest.
    fn grid_samples() -> Vec<u8> {
        const LEVELS: [u8; 6] = [0, 51, 102, 153, 204, 255];
        const EXPECTED_BYTES: usize = 216 * 3;

        let mut bytes = Vec::with_capacity(EXPECTED_BYTES);
        for r in LEVELS {
            for g in LEVELS {
                for b in LEVELS {
                    bytes.extend_from_slice(&[r, g, b]);
                }
            }
        }
        assert_eq!(bytes.len(), 216 * 3);
        bytes
    }

    /// The dispatching entry point (SIMD where available) must agree with
    /// the scalar reference to within one count per output byte.
    #[test]
    fn convert_xyb_scaled_simd_matches_scalar() {
        let src = grid_samples();
        let mut out_simd = vec![0u8; src.len()];
        let mut out_scalar = vec![0u8; src.len()];

        convert_xyb_scaled_to_srgb_u8(&src, &mut out_simd);
        convert_xyb_scaled_to_srgb_u8_scalar(&src, &mut out_scalar);

        // Tolerance: ≤1 per channel. Both paths use fused multiply-adds
        // for the matrix step, but they are not bit-identical elsewhere:
        // the SIMD kernel multiplies by reciprocals where the scalar code
        // divides, cubes with `v * v * v` where the scalar code calls
        // `powi(3)`, and goes through a different sRGB encoder entry
        // point. Those last-bit differences can flip the final u8
        // rounding by one.
        //
        // ≥2-byte diffs would indicate a real bug (constant mismatch,
        // wrong matrix row, etc.), so that stays hard-asserted.
        let mut max_diff: i32 = 0;
        let mut over_one = 0usize;
        for (i, (&s, &v)) in out_simd.iter().zip(&out_scalar).enumerate() {
            let d = (i32::from(s) - i32::from(v)).abs();
            max_diff = max_diff.max(d);
            if d <= 1 {
                continue;
            }
            // Log only the first ten offenders to keep the output short.
            if over_one < 10 {
                let px = i / 3 * 3;
                eprintln!(
                    "pixel byte {i}: simd={s}, scalar={v}, src={:?}",
                    &src[px..px + 3]
                );
            }
            over_one += 1;
        }
        assert_eq!(
            over_one, 0,
            "SIMD diverges from scalar by >1 at {over_one} byte positions \
             (max diff {max_diff}). ±1 is tolerated for FMA rounding; \
             ≥2 indicates a real kernel bug."
        );
    }

    /// The per-pixel function and the scalar buffer function must produce
    /// identical bytes for every grid sample.
    #[test]
    fn convert_xyb_scaled_single_pixel_vs_buffer() {
        let src = grid_samples();
        let mut out = vec![0u8; src.len()];
        convert_xyb_scaled_to_srgb_u8_scalar(&src, &mut out);

        let pixel_pairs = src.chunks_exact(3).zip(out.chunks_exact(3));
        for (i, (px_in, px_out)) in pixel_pairs.enumerate() {
            let per_pixel = xyb_scaled_u8_pixel_to_srgb(px_in[0], px_in[1], px_in[2]);
            assert_eq!(
                per_pixel,
                (px_out[0], px_out[1], px_out[2]),
                "pixel {i}: per-pixel API diverges from buffer API"
            );
        }
    }

    // =========================================================================
    // Dispatch helper
    // =========================================================================

    /// Canonical profile + sRGB target is handled, and something is
    /// actually written to the output.
    #[test]
    fn maybe_convert_returns_true_on_recognized_srgb_target() {
        let src = grid_samples();
        let mut out = vec![0u8; src.len()];

        assert!(
            maybe_convert_via_builtin(XYB_ICC_BYTES, Cicp::SRGB, &src, &mut out),
            "built-in dispatch must handle (XybScaled, sRGB target)"
        );

        // The buffer started zeroed, so at least one non-zero byte shows
        // that the conversion wrote to it.
        let all_zero = out.iter().all(|&byte| byte == 0);
        assert!(!all_zero);
    }

    /// An unknown profile is declined and the output buffer is left
    /// exactly as it was.
    #[test]
    fn maybe_convert_returns_false_on_unrecognized_profile() {
        let fake_icc = [0u8; 64];
        let src = [128u8; 30];
        let mut out = [0u8; 30];

        assert!(!maybe_convert_via_builtin(
            &fake_icc,
            Cicp::SRGB,
            &src,
            &mut out
        ));
        // Still all zeros: nothing was written.
        assert_eq!(out, [0u8; 30]);
    }

    /// A recognized profile with a target other than sRGB (here BT.2100
    /// PQ) is declined.
    #[test]
    fn maybe_convert_returns_false_on_unsupported_target() {
        let src = grid_samples();
        let mut out = vec![0u8; src.len()];

        let handled = maybe_convert_via_builtin(XYB_ICC_BYTES, Cicp::BT2100_PQ, &src, &mut out);

        assert!(
            !handled,
            "only sRGB target is supported today — PQ must fall through"
        );
    }

    // =========================================================================
    // Roundtrip via a locally-reimplemented encode side
    // =========================================================================

    // We don't want to depend on zenjpeg here (would introduce a cyclic-ish
    // dev-dep), so we reimplement the scalar encode side of the XYB
    // transform in the test. These constants must match
    // `zenjpeg::color::xyb` exactly — see `docs/XYB_ICC_HANDLING.md`.

    /// Row-major 3×3 mixing matrix used by `linear_rgb_to_xyb`: each row
    /// holds the (r, g, b) weights for one opsin channel. `rustfmt::skip`
    /// keeps the three rows visually aligned.
    #[rustfmt::skip]
    const OPSIN_MATRIX: [f32; 9] = [
        0.30,          0.622,         0.078,
        0.23,          0.692,         0.078,
        0.243_422_69,  0.204_767_44,  0.551_809_87,
    ];
    /// Bias added to every opsin channel before the cube root; its cube
    /// root is subtracted again afterwards (see `linear_rgb_to_xyb`).
    const OPSIN_BIAS: f32 = 0.003_793_073_3;
    /// Per-channel offsets added before scaling in `scale_xyb`.
    // NOTE(review): the clippy allows presumably exist so the literals can
    // be kept digit-for-digit as in the upstream reference — confirm.
    #[allow(clippy::inconsistent_digit_grouping, clippy::excessive_precision)]
    const SCALED_XYB_OFFSET: [f32; 3] = [0.015_386_134, 0.0, 0.277_704_59];
    /// Per-channel multipliers applied after the offsets in `scale_xyb`.
    // NOTE(review): same assumption about the clippy allows as above.
    #[allow(clippy::inconsistent_digit_grouping, clippy::excessive_precision)]
    const SCALED_XYB_SCALE: [f32; 3] = [22.995_788_804, 1.183_000_077, 1.502_141_333];

    /// Reference cube root for the test-side encoder.
    ///
    /// Delegates straight to std's `f32::cbrt`. Per the original note, this
    /// is more precise than the ~6-ULP jpegli-style approximation, which is
    /// acceptable for a test-only reference.
    fn cbrtf_ref(x: f32) -> f32 {
        f32::cbrt(x)
    }

    /// Decodes an 8-bit sRGB code value to linear light.
    ///
    /// The code value is normalised to `[0, 1]` and run through the
    /// piecewise curve: `x / 12.92` up to and including the 0.04045 knee,
    /// `((x + 0.055) / 1.055)^2.4` above it.
    fn srgb_u8_to_linear_exact(v: u8) -> f32 {
        let encoded = v as f32 / 255.0;
        if encoded > 0.040_45 {
            let base = (encoded + 0.055) / 1.055;
            base.powf(2.4)
        } else {
            encoded / 12.92
        }
    }

    /// Forward transform from linear RGB to (unscaled) XYB.
    ///
    /// Each opsin channel is a row of `OPSIN_MATRIX` dotted with (r, g, b),
    /// plus `OPSIN_BIAS`, floored at zero. The cube root of each channel is
    /// taken and the cube root of the bias subtracted. The first two
    /// channels are then combined as half-difference (X) and half-sum (Y);
    /// the third is returned unchanged as B.
    fn linear_rgb_to_xyb(r: f32, g: f32, b: f32) -> (f32, f32, f32) {
        let bias_root = cbrtf_ref(OPSIN_BIAS);

        // Mix one matrix row with the input, add the bias, clamp at zero,
        // then apply the bias-compensated cube root.
        let channel = |row: usize| -> f32 {
            let w = &OPSIN_MATRIX[row * 3..row * 3 + 3];
            let mixed = (w[0] * r + w[1] * g + w[2] * b + OPSIN_BIAS).max(0.0);
            cbrtf_ref(mixed) - bias_root
        };

        let first = channel(0);
        let second = channel(1);
        let third = channel(2);

        (0.5 * (first - second), 0.5 * (first + second), third)
    }

    /// Applies the per-channel offset-then-scale step to an XYB triple.
    ///
    /// X and Y are shifted and scaled directly; the third output is built
    /// from `b - y` rather than from `b` alone.
    fn scale_xyb(x: f32, y: f32, b: f32) -> (f32, f32, f32) {
        let [off_x, off_y, off_b] = SCALED_XYB_OFFSET;
        let [gain_x, gain_y, gain_b] = SCALED_XYB_SCALE;
        (
            (x + off_x) * gain_x,
            (y + off_y) * gain_y,
            (b - y + off_b) * gain_b,
        )
    }

    /// Test-side encoder: sRGB u8 → scaled XYB u8.
    ///
    /// Pipeline: sRGB decode per channel → `linear_rgb_to_xyb` →
    /// `scale_xyb` → add 128, clamp to `[0, 255]`, round, cast to `u8`.
    // NOTE(review): the original comment says this mirrors zenjpeg's encoder
    // (a JPEG-style +128 level shift); that cannot be confirmed from this
    // file — verify against zenjpeg.
    fn encode_srgb_to_scaled_xyb_u8(r: u8, g: u8, b: u8) -> (u8, u8, u8) {
        // Level shift, saturate, then round to the nearest code value.
        fn quantize(v: f32) -> u8 {
            (v + 128.0).clamp(0.0, 255.0).round() as u8
        }

        let lin_r = srgb_u8_to_linear_exact(r);
        let lin_g = srgb_u8_to_linear_exact(g);
        let lin_b = srgb_u8_to_linear_exact(b);

        let (x, y, bx) = linear_rgb_to_xyb(lin_r, lin_g, lin_b);
        let (scaled_x, scaled_y, scaled_b) = scale_xyb(x, y, bx);

        (quantize(scaled_x), quantize(scaled_y), quantize(scaled_b))
    }

    /// Smoke test for the inverse kernel on encoder-produced inputs.
    ///
    /// Feeds a 4×4×4 sRGB grid through the local test encoder, decodes it
    /// with `convert_xyb_scaled_to_srgb_u8`, and checks only coarse
    /// properties of the output (dynamic range and non-constancy).
    #[test]
    fn sanity_decode_produces_reasonable_output() {
        // Scaled XYB at 8-bit precision is lossy by construction — the
        // encoder produces f32 scaled values clustered around 0 with a
        // magnitude < 1 u8 step for neutral grays, so a u8 round-trip
        // cannot recover the original sRGB. That precision only survives
        // in the f32 pipeline inside a full codec.
        //
        // What we CAN check: the inverse produces sensible non-degenerate
        // output when fed scaled XYB u8 values from a realistic range.
        // Concretely, the output must reach both bright and dark values
        // and must not be a single constant byte.
        let steps = [80u8, 128, 176, 224];

        // One RGB triple per grid point: steps³ pixels × 3 bytes.
        let mut src = Vec::with_capacity(steps.len().pow(3) * 3);
        for &r in &steps {
            for &g in &steps {
                for &b in &steps {
                    let (sx, sy, sb) = encode_srgb_to_scaled_xyb_u8(r, g, b);
                    src.extend_from_slice(&[sx, sy, sb]);
                }
            }
        }
        let mut out = vec![0u8; src.len()];
        convert_xyb_scaled_to_srgb_u8(&src, &mut out);

        // Some pixels must exercise both ends of the u8 range.
        assert!(out.iter().any(|&v| v > 200), "saturated highlights missing");
        assert!(out.iter().any(|&v| v < 50), "dark tones missing");

        // No stuck values: output should exhibit channel-level variation
        // across the input sweep (not all pixels identical).
        let first = out[0];
        assert!(
            out.iter().any(|&v| v != first),
            "output is uniformly constant, inverse is likely broken"
        );

        // Range validity needs no runtime check: `out` is a `Vec<u8>`, so
        // every element is in 0..=255 by construction.
    }

    // =========================================================================
    // Deterministic regression table (cross-platform byte-exact)
    // =========================================================================

    /// One 8-bit, three-channel pixel. Used for both halves of each
    /// `XYB_SCALAR_GOLD` entry: the scaled-XYB input and the expected sRGB
    /// output.
    type Rgb = (u8, u8, u8);

    /// Golden-reference outputs for the SCALAR inverse XYB kernel.
    ///
    /// Each entry is `(scaled-XYB u8 input, expected sRGB u8 output)`.
    ///
    /// Locks in the exact bytes produced by `xyb_scaled_u8_pixel_to_srgb`
    /// on representative scaled-XYB u8 inputs. The scalar kernel uses plain
    /// f32 arithmetic (no FMA fusion, no vector-width ordering), so these
    /// numbers are deterministic across x86_64 / aarch64 / wasm32 / CI
    /// runners. If any constant in the inverse pipeline drifts — opsin
    /// matrix, scale/offset, sRGB OETF — this table catches it immediately.
    ///
    /// Inputs are sampled from:
    ///   - the level-shift center and both u8 extremes,
    ///   - each of the three channel axes driven to its extreme in isolation,
    ///   - a handful of arbitrary interior points.
    ///
    /// The scaled-XYB representation is intrinsically lossy at u8 (see
    /// `sanity_decode_produces_reasonable_output` for the narrative), so
    /// these outputs are NOT an approximation of any sRGB source — they
    /// are whatever the inverse happens to produce for each input. That's
    /// fine: the test's only job is "detect any drift in the kernel".
    #[rustfmt::skip]
    const XYB_SCALAR_GOLD: &[(Rgb, Rgb)] = &[
        ((128, 128, 128), (0,   25,  0)),
        ((0,   0,   0),   (0,   255, 0)),
        ((255, 255, 255), (255, 0,   255)),
        ((128, 255, 128), (255, 255, 255)),
        ((128, 0,   128), (0,   0,   0)),
        ((255, 128, 128), (255, 0,   0)),
        ((0,   128, 128), (0,   255, 255)),
        ((128, 128, 255), (0,   0,   255)),
        ((128, 128, 0),   (255, 255, 0)),
        ((200, 140, 150), (255, 0,   255)),
        ((50,  100, 80),  (0,   255, 0)),
        ((160, 128, 200), (0,   0,   255)),
        ((100, 200, 60),  (0,   255, 0)),
    ];

    /// Byte-exact regression check of both scalar entry points against
    /// `XYB_SCALAR_GOLD`.
    #[test]
    fn convert_xyb_scaled_scalar_regression_table() {
        // Pass 1 — per-pixel API. Every gold entry must reproduce exactly.
        for &(input, expected) in XYB_SCALAR_GOLD {
            let (ir, ig, ib) = input;
            let (er, eg, eb) = expected;
            let got = xyb_scaled_u8_pixel_to_srgb(ir, ig, ib);
            assert_eq!(
                got,
                expected,
                "scalar xyb_scaled_u8_pixel_to_srgb({ir},{ig},{ib}) drifted: \
                 got {got:?} expected ({er},{eg},{eb}). Update the gold table \
                 only if this change is intentional — per-channel drift here \
                 indicates the XYB inverse pipeline constants moved."
            );
        }

        // Pass 2 — buffer API, fed one pixel at a time so that a failure
        // names the exact input that broke instead of an offset into a
        // larger buffer.
        for &((ir, ig, ib), expected) in XYB_SCALAR_GOLD {
            let mut out = [0u8; 3];
            convert_xyb_scaled_to_srgb_u8_scalar(&[ir, ig, ib], &mut out);
            let [or, og, ob] = out;
            assert_eq!(
                (or, og, ob),
                expected,
                "buffer scalar diverges from gold at input ({ir},{ig},{ib})"
            );
        }
    }

    /// Checks the dispatching entry point `convert_xyb_scaled_to_srgb_u8`
    /// against `XYB_SCALAR_GOLD`, allowing a small per-channel deviation.
    #[test]
    fn convert_xyb_scaled_dispatch_regression_table() {
        // Dispatch path (per the original note: SIMD on x86_64 with
        // AVX2/FMA, scalar elsewhere).
        //
        // Tolerance is ±1 per channel relative to the scalar gold table.
        // FMA fusion in a SIMD kernel can produce ±1 ULP differences in the
        // f32 intermediate that round to a neighbouring u8 byte, so this
        // test deliberately does not demand byte-exactness. The byte-exact
        // guarantee is enforced for the scalar kernel by
        // `convert_xyb_scaled_scalar_regression_table`.
        //
        // If the dispatch path is ever proven byte-identical on every
        // supported target, tighten MAX_CHANNEL_DELTA to 0.
        const MAX_CHANNEL_DELTA: i32 = 1;

        for &((ir, ig, ib), (er, eg, eb)) in XYB_SCALAR_GOLD {
            let src = [ir, ig, ib];
            let mut out = [0u8; 3];
            convert_xyb_scaled_to_srgb_u8(&src, &mut out);

            // Absolute per-channel difference, widened to i32 so the
            // subtraction cannot wrap.
            let dr = (out[0] as i32 - er as i32).abs();
            let dg = (out[1] as i32 - eg as i32).abs();
            let db = (out[2] as i32 - eb as i32).abs();
            assert!(
                dr <= MAX_CHANNEL_DELTA && dg <= MAX_CHANNEL_DELTA && db <= MAX_CHANNEL_DELTA,
                "dispatch path diverges from gold at ({ir},{ig},{ib}): \
                 got ({}, {}, {}) expected ({er},{eg},{eb}), per-channel \
                 |Δ|=({dr},{dg},{db})",
                out[0],
                out[1],
                out[2]
            );
        }
    }
}