// zenwebp 0.4.5 - Docs.rs

//! Closed-loop target-zensim adaptive encoder.
//!
//! Mirrors the design of zenjpeg's `target_zq` module (`src/encode/zq.rs`):
//! the encoder iteratively encodes the image, decodes it, measures the
//! resulting zensim score against the source, and either ships the result
//! (within the tolerance band) or adjusts global VP8 quality and tries again.
//!
//! # Convergence
//!
//! Three mechanisms keep most encodes to one or two passes:
//!
//! 1. **Per-bucket starting-q calibration** — the first-pass quality is
//!    chosen by interpolating in a per-content-type anchor table. Photo,
//!    Drawing, and Icon get their own tables (3 buckets vs zenjpeg's 5);
//!    these mirror zenjpeg's PHOTO_DETAILED / SCREEN_CONTENT shapes.
//! 2. **Asymmetric tolerance band** — after any pass we ship if achieved
//!    is in `[target - max_undershoot_ship, target + max_overshoot]`,
//!    even when more passes are available. `max_undershoot=None`
//!    (default) is best-effort; setting `Some(t)` makes a final
//!    achieved < target-t a hard error.
//! 3. **Secant step** — when off-band, the next q is computed via a
//!    one-pair secant fitted to the most recent two (q, achieved) probes.
//!    Falls back to a fixed step on the first iteration.
//!
//! # Calibration recipe
//!
//! The per-bucket anchors below come from `dev/zensim_calibrate.rs`. To
//! re-fit (e.g. after touching the encoder's RD path or upgrading zensim):
//!
//! ```text
//! cargo run --release --features target-zensim --example zensim_calibrate -- <corpus>
//! ```
//!
//! The tool emits a TSV plus a Rust-formatted const block. Replace the
//! `PHOTO`, `DRAWING`, `ICON` arrays below with the harness output. Anchors
//! are `(target_zensim, starting_q)` pairs ordered by ascending target.

use super::analysis::ImageContentType;

/// Explicit target-perceptual-quality specification.
///
/// Default: target=80, max_overshoot=Some(1.5),
/// max_undershoot_ship=Some(0.5), max_undershoot=None, max_passes=2 —
/// best-effort behavior tuned to land in band on pass 1 for typical
/// photo content and never iterate more than once.
///
/// # Pixel layout
///
/// `target_zensim` supports `PixelLayout::Rgb8` and
/// `PixelLayout::Rgba8`. RGBA inputs are encoded as alpha-bearing
/// WebP and measured against the source via zensim's
/// deterministic-noise compositing — the same composite is applied to
/// source and reconstruction, so alpha quality is fully defined.
/// Other layouts (BGR, BGRA, ARGB, L8, LA8, YUV420) with
/// `target_zensim` set return
/// [`EncodeError::TargetZensimUnsupportedLayout`](super::api::EncodeError::TargetZensimUnsupportedLayout).
///
/// # Tolerance bands
///
/// The encoder uses an asymmetric tolerance band built from three
/// fields:
///
/// - **Ship band** — the range of `achieved_score` values that count
///   as "close enough" to ship without iterating further. After any
///   pass, if the result is in `[target - max_undershoot_ship,
///   target + max_overshoot]`, the encoder ships immediately and the
///   loop exits. With the defaults above, that's `[79.5, 81.5]` for
///   `target = 80`.
/// - **Fail band** — the range of `achieved_score` values below
///   `target - max_undershoot` (when `max_undershoot.is_some()`). On
///   the final pass, if the result lands in this band the encoder
///   returns `Err` instead of shipping. With `max_undershoot = None`
///   (the default), the encoder never errors on undershoot — it
///   always ships its best attempt.
///
/// `max_undershoot_ship` (the SHIP threshold) is intentionally
/// distinct from `max_undershoot` (the FAILURE threshold). The ship
/// threshold says "calibration was good enough — don't risk
/// overshooting on the next pass"; the failure threshold says "this
/// undershoot is too big to ship at all".
#[non_exhaustive]
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct ZensimTarget {
    /// Ideal zensim score. The encoder iterates to reach or exceed this
    /// within the [`Self::max_passes`] budget.
    pub target: f32,

    /// Distance ABOVE target the encoder will accept without further
    /// iteration. `None` = ship the first feasible result (single pass
    /// once target is met). `Some(t)` = if `achieved > target + t`,
    /// claw back bytes by loosening quality.
    ///
    /// Default: `Some(1.5)`.
    pub max_overshoot: Option<f32>,

    /// Distance BELOW target the encoder will accept as a successful
    /// pass-0 ship, without iterating. Asymmetric with
    /// [`Self::max_overshoot`]: undershoot up to this much is treated
    /// as "close enough — calibration was already good, don't risk
    /// overshooting on the next pass". `None` (or `Some(0.0)`) =
    /// strict at-or-above-target behavior; any undershoot triggers
    /// another pass when budget allows.
    ///
    /// Distinct from [`Self::max_undershoot`]: that field is the
    /// FAILURE threshold (final-pass error), this is the SHIP
    /// threshold (early-exit acceptance).
    ///
    /// Default: `Some(0.5)`.
    pub max_undershoot_ship: Option<f32>,

    /// Distance BELOW target the encoder will accept as a SUCCESSFUL
    /// encode. `None` = best-effort, never error (default; encoder ships
    /// whatever it managed within `max_passes`). `Some(t)` = if final
    /// `achieved < target - t` after exhausting `max_passes`, the encoder
    /// returns `Err`.
    ///
    /// Set this when you NEED a strictness guarantee (archival, SLA-bound
    /// serving). Permissive callers leave it `None` and inspect
    /// [`ZensimEncodeMetrics::achieved_score`] themselves.
    ///
    /// Distinct from [`Self::max_undershoot_ship`]: that field is the
    /// SHIP threshold (early-exit acceptance), this is the FAILURE
    /// threshold (final-pass error).
    ///
    /// Default: `None`.
    pub max_undershoot: Option<f32>,

    /// Iteration budget. `1` = single-pass (no correction; behaves like a
    /// regular encode at the calibrated starting q). With a single-pass
    /// budget the pass-0 bytes are returned without being measured, so
    /// the reported `achieved_score` is `f32::NAN`. `2` = default; one
    /// initial encode plus one correction pass.
    ///
    /// `0` is treated as `1` by the iteration loop.
    pub max_passes: u8,
}

impl Default for ZensimTarget {
    fn default() -> Self {
        Self {
            target: 80.0,
            max_overshoot: Some(1.5),
            max_undershoot_ship: Some(0.5),
            max_undershoot: None,
            max_passes: 2,
        }
    }
}

impl ZensimTarget {
    /// Build a `ZensimTarget` aiming at `target`, with every tolerance
    /// and the pass budget left at their [`Default`] values. Same as
    /// writing `ZensimTarget { target, ..Default::default() }`.
    #[must_use]
    pub fn new(target: f32) -> Self {
        let defaults = Self::default();
        Self { target, ..defaults }
    }
}

impl From<f32> for ZensimTarget {
    /// Promote a bare target score to a full `ZensimTarget` with default
    /// tolerances / passes, so that [`LossyConfig::with_target_zensim`]
    /// can take either an `f32` or a fully-built `ZensimTarget`.
    fn from(target: f32) -> Self {
        // Same construction as `ZensimTarget::new(target)`.
        Self {
            target,
            ..Self::default()
        }
    }
}

impl ZensimTarget {
    /// Return a copy of `self` with [`Self::max_overshoot`] set to `v`.
    #[must_use]
    pub fn with_max_overshoot(self, v: Option<f32>) -> Self {
        Self {
            max_overshoot: v,
            ..self
        }
    }

    /// Return a copy of `self` with [`Self::max_undershoot`] set to `v`.
    #[must_use]
    pub fn with_max_undershoot(self, v: Option<f32>) -> Self {
        Self {
            max_undershoot: v,
            ..self
        }
    }

    /// Return a copy of `self` with [`Self::max_undershoot_ship`] set to
    /// `v`.
    ///
    /// This is the ship threshold (early-exit acceptance); the
    /// final-pass error threshold is set separately through
    /// [`Self::with_max_undershoot`].
    #[must_use]
    pub fn with_max_undershoot_ship(self, v: Option<f32>) -> Self {
        Self {
            max_undershoot_ship: v,
            ..self
        }
    }

    /// Return a copy of `self` with [`Self::max_passes`] set to `n`.
    #[must_use]
    pub fn with_max_passes(self, n: u8) -> Self {
        Self {
            max_passes: n,
            ..self
        }
    }
}

/// Outcome of a target-zensim encode, returned alongside the WebP bytes
/// from [`EncodeRequest::encode_with_metrics`](super::api::EncodeRequest::encode_with_metrics).
///
/// `targets_met` is `false` when:
/// - the iteration ran (target_zensim was set with the feature enabled), AND
/// - `achieved_score < target.target`, AND
/// - `max_undershoot.is_some()` AND `target - achieved > max_undershoot.unwrap()`
///
/// In all other cases — including configs without `target_zensim`, configs
/// where the feature is compiled out, and best-effort runs that landed
/// below target — `targets_met` is `true`.
#[non_exhaustive]
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct ZensimEncodeMetrics {
    /// Final achieved zensim score. `f32::NAN` whenever no measurement
    /// was done: non-target-zensim configs, builds with the feature
    /// disabled, and target-zensim runs whose pass budget is a single
    /// pass (the pass-0 bytes are returned unmeasured).
    ///
    /// Because `NaN != NaN`, two metrics values with an unmeasured score
    /// never compare equal under the derived `PartialEq`.
    pub achieved_score: f32,

    /// Number of encode passes performed (including the initial pass).
    /// `1` for non-target-zensim configs.
    pub passes_used: u8,

    /// Encoded WebP byte count.
    pub bytes: usize,

    /// Whether the strictness contract was honored. See struct docs.
    pub targets_met: bool,
}

impl ZensimEncodeMetrics {
    /// Construct metrics for a non-target-zensim encode. `bytes` is the
    /// final output size.
    pub(crate) fn no_target(bytes: usize) -> Self {
        Self {
            achieved_score: f32::NAN,
            passes_used: 1,
            bytes,
            targets_met: true,
        }
    }
}

/// Look up the first-pass VP8 quality for a zensim `target` in the
/// calibration table belonging to `bucket`.
///
/// Every table row is a `(target_zensim, starting_q)` anchor: the
/// quality that landed at-or-just-above that score on the calibration
/// corpus. Targets between two anchors are linearly interpolated;
/// targets outside the table are clamped to the lowest/highest anchor
/// quality.
///
/// `Text` content shares the `DRAWING` table.
///
/// # Anchor table source
///
/// PHOTO and DRAWING are fitted by `dev/zensim_calibrate.rs` against
/// 20 images of CID22 validation (median smallest-q-meeting-target per
/// content bucket). ICON is hand-distilled and conservative — re-fit
/// when an icon corpus is added (the CID22 subset has no ≤128px
/// images). To re-fit, see this module's top-level comment.
///
/// Fit data (CID22 validation, 20 images, 2026-04-26):
///   Photo   n=12, p25/p75 spread ≤ 7q across all 7 anchors.
///   Drawing n=8,  p25/p75 spread ≤ 5q across all 7 anchors.
#[must_use]
pub(crate) fn zensim_to_starting_q_for_bucket(target: f32, bucket: ImageContentType) -> f32 {
    // Natural photographs from CID22.
    const PHOTO: &[(f32, f32)] = &[
        (60.0, 30.0),
        (70.0, 60.0),
        (75.0, 75.0),
        (80.0, 85.0),
        (85.0, 90.0),
        (90.0, 98.0),
        (95.0, 100.0),
    ];
    // Low-uniformity / complex texture content from CID22 (the
    // classifier puts mixed photo+UI here).
    const DRAWING: &[(f32, f32)] = &[
        (60.0, 30.0),
        (70.0, 60.0),
        (75.0, 72.5),
        (80.0, 85.0),
        (85.0, 90.0),
        (90.0, 100.0),
        (95.0, 100.0),
    ];
    // ≤128px images. Every coefficient counts at this size, so q is
    // pushed higher to preserve detail. Hand-distilled (no fit corpus
    // yet).
    const ICON: &[(f32, f32)] = &[
        (60.0, 65.0),
        (70.0, 78.0),
        (75.0, 85.0),
        (80.0, 90.0),
        (85.0, 95.0),
        (90.0, 98.0),
        (95.0, 100.0),
    ];

    interpolate_anchors(
        target,
        match bucket {
            ImageContentType::Icon => ICON,
            ImageContentType::Photo => PHOTO,
            ImageContentType::Text | ImageContentType::Drawing => DRAWING,
        },
    )
}

/// Continuous-blend starting-q calibration via zenanalyze likelihoods.
///
/// Replaces the discrete 3-bucket lookup with a soft mix of the Photo
/// and Drawing anchor tables, weighted by `NaturalLikelihood` and
/// `ScreenContentLikelihood + TextLikelihood`. The Icon path is
/// unchanged: small images (BOTH sides ≤128 px) still snap to the
/// `ICON` anchor table without running the analyzer.
///
/// Returns `None` (caller uses the discrete bucket table) when the
/// analyzer can't run:
///
/// - either side is shorter than 8 px (checked before the Icon path, so
///   such images never get the `ICON` ramp from this function),
/// - `layout` is neither `Rgb8` nor `Rgba8`,
/// - `pixels` holds fewer than `width * height * bytes_per_pixel` bytes,
/// - that byte count does not fit in `usize`.
///
/// The blend formula:
///
/// ```text
/// w_screen = clamp01(screen + text)
/// w_photo  = clamp01(natural)
/// total    = w_screen + w_photo
/// q        = (w_screen * q_drawing + w_photo * q_photo) / total
/// ```
///
/// `total > 0` is guaranteed in practice because zenanalyze normalises
/// the three soft scores so they don't all collapse to zero on real
/// content. If `total` is (effectively) zero, we fall back to the Photo
/// anchor — same default as the discrete-bucket path.
#[cfg(feature = "analyzer")]
// `dead_code`: the only caller lives in the `target-zensim` iteration
// loop, which is compiled out when that feature is off.
#[allow(clippy::needless_pass_by_value, dead_code)]
fn starting_q_via_likelihoods(
    target: f32,
    pixels: &[u8],
    layout: crate::PixelLayout,
    width: u32,
    height: u32,
) -> Option<f32> {
    use crate::PixelLayout;
    use crate::encoder::analysis::ZenanalyzeDiag;
    use crate::encoder::analysis::classifier::{classify_image_type_rgb8_diag, rgba8_to_rgb8};
    let w = width as usize;
    let h = height as usize;
    if w < 8 || h < 8 {
        return None;
    }
    if width <= 128 && height <= 128 {
        // Tiny — keep the hand-distilled ICON ramp. The analyzer's
        // likelihoods are noisy on sub-128px input.
        return Some(zensim_to_starting_q_for_bucket(
            target,
            ImageContentType::Icon,
        ));
    }

    // Buffer sizes are computed with checked arithmetic: the dimensions
    // are caller-supplied `u32`s, so `w * h * bpp` can exceed `usize`.
    // An unchecked product would panic in debug builds and wrap in
    // release builds, letting a too-short buffer pass the length test.
    let pixel_count = w.checked_mul(h)?;
    let (_bucket, diag): (ImageContentType, ZenanalyzeDiag) = match layout {
        PixelLayout::Rgb8 => {
            let rgb = pixels.get(..pixel_count.checked_mul(3)?)?;
            classify_image_type_rgb8_diag(rgb, width, height)
        }
        PixelLayout::Rgba8 => {
            let rgba = pixels.get(..pixel_count.checked_mul(4)?)?;
            // The classifier only understands RGB; drop alpha first.
            let rgb = rgba8_to_rgb8(rgba);
            classify_image_type_rgb8_diag(&rgb, width, height)
        }
        _ => return None,
    };

    let q_photo = zensim_to_starting_q_for_bucket(target, ImageContentType::Photo);
    let q_drawing = zensim_to_starting_q_for_bucket(target, ImageContentType::Drawing);

    let w_screen = (diag.screen_content + diag.text_likelihood).clamp(0.0, 1.0);
    let w_photo = diag.natural_likelihood.clamp(0.0, 1.0);
    let total = w_screen + w_photo;
    if total <= f32::EPSILON {
        // Degenerate weights: avoid dividing by ~0 and default to Photo.
        return Some(q_photo);
    }
    Some((w_screen * q_drawing + w_photo * q_photo) / total)
}

/// Piecewise-linear lookup in a `(target, q)` anchor table sorted by
/// ascending target.
///
/// - Empty table: returns `target` unchanged.
/// - `target` at or below the first anchor: returns the first anchor's q.
/// - `target` at or above the last anchor: returns the last anchor's q,
///   capped at `100.0`.
/// - Otherwise: interpolates linearly inside the first adjacent pair of
///   anchors that brackets `target`.
///
/// If no pair brackets `target` (e.g. `target` is NaN), `target` itself
/// is returned.
fn interpolate_anchors(target: f32, anchors: &[(f32, f32)]) -> f32 {
    let (head, tail) = match (anchors.first(), anchors.last()) {
        (Some(&head), Some(&tail)) => (head, tail),
        _ => return target,
    };
    if target <= head.0 {
        return head.1;
    }
    if target >= tail.0 {
        return tail.1.min(100.0);
    }
    anchors
        .windows(2)
        .find(|pair| pair[0].0 <= target && target <= pair[1].0)
        .map_or(target, |pair| {
            let ((x0, q0), (x1, q1)) = (pair[0], pair[1]);
            let frac = (target - x0) / (x1 - x0);
            q0 + frac * (q1 - q0)
        })
}

// ============================================================================
// Ablation toggles (dev-only, gated on `ablation` feature)
// ============================================================================
//
// The PR shipping target_zensim (Phases 1/2/3) is a stack of distinct
// mechanisms. The `ablation` feature exposes per-chunk disable toggles
// via thread-local cells set explicitly by `dev/zensim_*.rs` measurement
// binaries. Production builds (the `target-zensim` feature alone, no
// `ablation`) get const `false` for every toggle — the compiler folds
// them away entirely. **No environment variables are read in either
// configuration**: ablation is off-by-default, and turning it on requires
// a typed call to [`ablation::set_toggles`].
//
// None of these toggles touches the encoded bytestream — they only
// change the iteration loop's decisions (which q to start at, whether
// to take a per-segment vs global-q correction step, etc.).

/// Dev-only ablation toggles for the closed-loop `target_zensim`
/// iteration. **Gated on the unstable `ablation` Cargo feature** —
/// production callers do not see this type and the iteration loop uses
/// constant defaults.
///
/// All fields are off by default (matching production behavior).
/// Set them via [`set_toggles`] to disable individual phases of the
/// adaptive encoder for measurement binaries (`dev/zensim_*.rs`).
///
/// This API is **not** part of the stable zenwebp surface — it is
/// dev-only by design and may change without a major-version bump.
#[cfg(feature = "ablation")]
#[derive(Debug, Default, Clone, Copy, PartialEq)]
pub struct AblationToggles {
    /// Force the global-q fallback path even when segments are active
    /// (Phase 3 disabled).
    pub disable_phase3: bool,
    /// Emit `PHASE3_TRACE` lines to stderr per pass.
    pub trace_phase3: bool,
    /// Skip the bucket classifier and use a single linear ramp.
    pub naive_starting_q: bool,
    /// Don't force `multi_pass_stats=true` inside the iteration loop.
    pub no_multi_pass_stats: bool,
    /// Restore the pre-Phase-2 hand-distilled anchors.
    pub pre_phase2_anchors: bool,
    /// Use a fixed proportional Δq instead of the secant.
    pub no_secant: bool,
    /// Phase 3 aggregator: use the 2x2 spatial-quadrant proxy instead of
    /// the encoder's real k-means `segment_map`.
    pub use_quadrant_proxy: bool,
    /// Threshold (`|target - achieved|`) below which Phase 3 per-segment
    /// correction is preferred over the global-q secant. `None` keeps
    /// the production default (0.5).
    ///
    /// `set_toggles` stores `None` as an `f32::NAN` sentinel, so passing
    /// `Some(f32::NAN)` is indistinguishable from passing `None`.
    pub phase3_fine_gap: Option<f32>,
}

// Thread-local backing storage for the ablation toggles: one `Cell`
// per `AblationToggles` field, all initialised to "off". Written by
// `set_toggles`; the `iteration` module's accessor functions read the
// cells back.
#[cfg(feature = "ablation")]
mod ablation_runtime {
    use core::cell::Cell;
    use std::thread_local;

    thread_local! {
        // `const { .. }` initialisers: every cell starts from a
        // compile-time constant value.
        pub(crate) static DISABLE_PHASE3: Cell<bool> = const { Cell::new(false) };
        pub(crate) static TRACE_PHASE3: Cell<bool> = const { Cell::new(false) };
        pub(crate) static NAIVE_STARTING_Q: Cell<bool> = const { Cell::new(false) };
        pub(crate) static NO_MULTI_PASS_STATS: Cell<bool> = const { Cell::new(false) };
        pub(crate) static PRE_PHASE2_ANCHORS: Cell<bool> = const { Cell::new(false) };
        pub(crate) static NO_SECANT: Cell<bool> = const { Cell::new(false) };
        pub(crate) static USE_QUADRANT_PROXY: Cell<bool> = const { Cell::new(false) };
        // f32::NAN means "use the production default (0.5)".
        pub(crate) static PHASE3_FINE_GAP: Cell<f32> = const { Cell::new(f32::NAN) };
    }
}

/// Install `t` as the ablation toggle state of the **current thread**.
/// Dev-only — exposed only when the `ablation` Cargo feature is
/// enabled.
///
/// The call overwrites every toggle: a field left at its default in `t`
/// resets that toggle to its default. The state is per-thread and is
/// NOT inherited by threads spawned afterwards.
///
/// The `target_zensim` iteration loop reads these values once at the
/// start of each phase decision; toggling mid-encode is allowed but the
/// loop won't observe the change until its next polling point.
#[cfg(feature = "ablation")]
pub fn set_toggles(t: AblationToggles) {
    use ablation_runtime as rt;

    // Exhaustive destructuring: adding a field to `AblationToggles`
    // without storing it here becomes a compile error.
    let AblationToggles {
        disable_phase3,
        trace_phase3,
        naive_starting_q,
        no_multi_pass_stats,
        pre_phase2_anchors,
        no_secant,
        use_quadrant_proxy,
        phase3_fine_gap,
    } = t;

    rt::DISABLE_PHASE3.with(|cell| cell.set(disable_phase3));
    rt::TRACE_PHASE3.with(|cell| cell.set(trace_phase3));
    rt::NAIVE_STARTING_Q.with(|cell| cell.set(naive_starting_q));
    rt::NO_MULTI_PASS_STATS.with(|cell| cell.set(no_multi_pass_stats));
    rt::PRE_PHASE2_ANCHORS.with(|cell| cell.set(pre_phase2_anchors));
    rt::NO_SECANT.with(|cell| cell.set(no_secant));
    rt::USE_QUADRANT_PROXY.with(|cell| cell.set(use_quadrant_proxy));

    // `None` is stored as the NaN sentinel, meaning "use production
    // default 0.5".
    let fine_gap = match phase3_fine_gap {
        Some(gap) => gap,
        None => f32::NAN,
    };
    rt::PHASE3_FINE_GAP.with(|cell| cell.set(fine_gap));
}

/// Production default for the Phase 3 fine-gap threshold: the
/// `|target - achieved|` gap below which per-segment correction is
/// preferred over the global-q secant. See the long-form comment on the
/// iteration loop for why 0.5.
///
/// Under the `ablation` feature, `AblationToggles::phase3_fine_gap` is
/// documented as the override for this value.
#[cfg(feature = "target-zensim")]
const PHASE3_FINE_GAP_DEFAULT: f32 = 0.5;

// ============================================================================
// Iteration loop (gated on the `target-zensim` feature)
// ============================================================================

#[cfg(feature = "target-zensim")]
pub(crate) mod iteration {
    use super::*;
    use crate::PixelLayout;
    use crate::encoder::api::{EncodeDiagnostics, EncodeError};
    use crate::encoder::config::LossyConfig;
    use alloc::format;
    use alloc::vec::Vec;

    /// Result of running the closed-loop iteration: bytes + metrics, or
    /// an error if a hard constraint was violated.
    ///
    /// `Ok` carries the encoded WebP bytes together with the
    /// [`ZensimEncodeMetrics`] describing how the run went; `Err` carries
    /// the [`EncodeError`] (for example an unsupported pixel layout).
    pub(crate) type IterationResult = Result<(Vec<u8>, ZensimEncodeMetrics), EncodeError>;

    /// Chunk A ablation switch: turns off Phase 3 per-segment
    /// correction. While it is on, every pass after pass 0 takes the
    /// global-q secant step (or fallback step) rather than deriving
    /// per-segment quant overrides from the diffmap — i.e. "Phase 1
    /// (+ Phase 2 anchors) only".
    ///
    /// Reads the current thread's toggle.
    #[cfg(feature = "ablation")]
    fn ablate_disable_phase3() -> bool {
        super::ablation_runtime::DISABLE_PHASE3.with(core::cell::Cell::get)
    }
    /// Without the `ablation` feature the switch is a constant `false`.
    #[cfg(not(feature = "ablation"))]
    const fn ablate_disable_phase3() -> bool {
        false
    }

    /// Whether the per-pass investigation trace should be written to
    /// stderr. Trace format (one line per pass, prefix `PHASE3_TRACE`):
    ///   PHASE3_TRACE pass=<n> target=<t> q=<q> mps=<bool>
    ///     achieved=<f> bytes=<u> num_segs=<n>
    ///     means=[s0,s1,s2,s3] counts=[n0,n1,n2,n3]
    ///     cum_overrides=[d0,d1,d2,d3] decision=<reason>
    ///
    /// Reads the current thread's toggle.
    #[cfg(feature = "ablation")]
    fn trace_phase3() -> bool {
        super::ablation_runtime::TRACE_PHASE3.with(core::cell::Cell::get)
    }
    /// Without the `ablation` feature tracing is a constant `false`.
    #[cfg(not(feature = "ablation"))]
    const fn trace_phase3() -> bool {
        false
    }

    /// Chunk C ablation switch: bypass the bucket classifier and take
    /// the starting q from one naive anchor table for every image,
    /// whatever its content type. This removes Phase 1's per-bucket
    /// calibration AND Phase 2's refit (both Photo and Drawing are
    /// replaced with the naive identity-ish ramp).
    ///
    /// Reads the current thread's toggle.
    #[cfg(feature = "ablation")]
    fn ablate_naive_starting_q() -> bool {
        super::ablation_runtime::NAIVE_STARTING_Q.with(core::cell::Cell::get)
    }
    /// Without the `ablation` feature the switch is a constant `false`.
    #[cfg(not(feature = "ablation"))]
    const fn ablate_naive_starting_q() -> bool {
        false
    }

    /// Ablation switch: stop the iteration loop from forcing
    /// `multi_pass_stats=true` on pass 1+, so the user's own
    /// `LossyConfig.multi_pass_stats` value (typically `false`) stays in
    /// effect.
    ///
    /// Reads the current thread's toggle.
    #[cfg(feature = "ablation")]
    fn ablate_no_multi_pass_stats() -> bool {
        super::ablation_runtime::NO_MULTI_PASS_STATS.with(core::cell::Cell::get)
    }
    /// Without the `ablation` feature the switch is a constant `false`.
    #[cfg(not(feature = "ablation"))]
    const fn ablate_no_multi_pass_stats() -> bool {
        false
    }

    /// Chunk E ablation switch: go back to the pre-Phase-2
    /// hand-distilled anchors for Photo/Drawing (Icon was hand-distilled
    /// in both versions and is unchanged). Measures whether the Phase 2
    /// calibration tool's refit is actually pulling its weight against
    /// the original judgement-based table.
    ///
    /// Reads the current thread's toggle.
    #[cfg(feature = "ablation")]
    fn ablate_pre_phase2_anchors() -> bool {
        super::ablation_runtime::PRE_PHASE2_ANCHORS.with(core::cell::Cell::get)
    }
    /// Without the `ablation` feature the switch is a constant `false`.
    #[cfg(not(feature = "ablation"))]
    const fn ablate_pre_phase2_anchors() -> bool {
        false
    }

    /// Ablation switch: replace the secant step with a fixed
    /// proportional Δq. Only meaningful when Phase 3 is also disabled
    /// (the global-q fallback path).
    ///
    /// Reads the current thread's toggle.
    #[cfg(feature = "ablation")]
    fn ablate_no_secant() -> bool {
        super::ablation_runtime::NO_SECANT.with(core::cell::Cell::get)
    }
    /// Without the `ablation` feature the switch is a constant `false`.
    #[cfg(not(feature = "ablation"))]
    const fn ablate_no_secant() -> bool {
        false
    }

    /// Starting q from the naive, content-agnostic anchor table: one
    /// `(target, q)` ramp used when the bucket classifier is bypassed.
    /// Its shape mirrors zenjpeg's `zq_to_starting_jpegli_q`: a gentle
    /// ramp that crosses target=q at the high end and starts
    /// conservatively at the low end.
    ///
    /// Only reached under the `ablation` feature when
    /// [`AblationToggles::naive_starting_q`] is set.
    #[cfg_attr(not(feature = "ablation"), allow(dead_code))]
    fn naive_starting_q(target: f32) -> f32 {
        const RAMP: &[(f32, f32)] = &[
            (60.0, 50.0),
            (70.0, 65.0),
            (75.0, 75.0),
            (80.0, 82.0),
            (85.0, 90.0),
            (90.0, 96.0),
            (95.0, 100.0),
        ];

        let ramp = RAMP;
        interpolate_anchors(target, ramp)
    }

    /// Starting q from the pre-Phase-2 hand-distilled anchor tables
    /// (recovered from commit `17b9c9a^:src/encoder/zensim_target.rs`).
    /// `Text` content shares the Drawing table.
    ///
    /// Used only when the `ablation` feature is on AND
    /// [`AblationToggles::pre_phase2_anchors`] is set (chunk E ablation).
    #[cfg_attr(not(feature = "ablation"), allow(dead_code))]
    fn pre_phase2_starting_q(target: f32, bucket: ImageContentType) -> f32 {
        const PHOTO_HAND: &[(f32, f32)] = &[
            (60.0, 50.0),
            (70.0, 65.0),
            (75.0, 72.0),
            (80.0, 80.0),
            (85.0, 88.0),
            (90.0, 94.0),
            (95.0, 98.0),
        ];
        const DRAWING_HAND: &[(f32, f32)] = &[
            (60.0, 55.0),
            (70.0, 70.0),
            (75.0, 78.0),
            (80.0, 84.0),
            (85.0, 90.0),
            (90.0, 96.0),
            (95.0, 100.0),
        ];
        const ICON_HAND: &[(f32, f32)] = &[
            (60.0, 65.0),
            (70.0, 78.0),
            (75.0, 85.0),
            (80.0, 90.0),
            (85.0, 95.0),
            (90.0, 98.0),
            (95.0, 100.0),
        ];

        interpolate_anchors(
            target,
            match bucket {
                ImageContentType::Icon => ICON_HAND,
                ImageContentType::Photo => PHOTO_HAND,
                ImageContentType::Text | ImageContentType::Drawing => DRAWING_HAND,
            },
        )
    }

    /// Maximum |Δq| applied per secant step. Prevents oscillation when
    /// the metric is locally non-linear. Expressed in quality units (the
    /// same 0–100 scale the starting q is clamped to).
    const MAX_DELTA_Q: f32 = 10.0;
    /// Default sensitivity (Δq per zensim-unit gap) for the FIRST step,
    /// before we have a (q, score) pair to fit a secant against.
    /// E.g. a 2-point score gap suggests a first step of 3 q.
    const DEFAULT_SENSITIVITY: f32 = 1.5;
    /// Minimum |Δq| applied per step (avoids no-op when sensitivity ×
    /// gap rounds to ~0). Expressed in quality units.
    const MIN_DELTA_Q: f32 = 0.5;

    /// Run the closed loop. Pass 0 is global-q at the calibrated start;
    /// pass 1+ uses per-segment diffmap-driven correction when segments
    /// are active (Phase 3), falling back to a global-q secant step when
    /// segments are disabled (`num_segments == 1`) or unavailable.
    pub(crate) fn run(
        cfg: &LossyConfig,
        target: ZensimTarget,
        pixels: &[u8],
        layout: PixelLayout,
        width: u32,
        height: u32,
    ) -> IterationResult {
        // Layout gate: Rgb8 / Rgba8 only. The public API gate
        // (`EncodeRequest::try_encode_target_zensim_with_metrics`) already
        // enforces this; the redundant check here keeps the iteration
        // entry honest if an internal caller ever skips that gate.
        match layout {
            PixelLayout::Rgb8 | PixelLayout::Rgba8 => {}
            other => {
                return Err(EncodeError::TargetZensimUnsupportedLayout(other));
            }
        }
        // 1. Detect bucket from a quick classifier pass on the source.
        // Skipped entirely when the naive-starting-q ablation is on.
        let bucket = if ablate_naive_starting_q() {
            None
        } else {
            detect_bucket(pixels, layout, width, height)
        };

        // 2. Resolve the starting q.
        let mut q = if ablate_naive_starting_q() {
            // Chunk C ablation: skip per-bucket lookup, use a single ramp.
            naive_starting_q(target.target)
        } else if ablate_pre_phase2_anchors() {
            // Chunk E ablation: pre-Phase-2 hand-distilled anchors.
            match bucket {
                Some(b) => pre_phase2_starting_q(target.target, b),
                None => pre_phase2_starting_q(target.target, ImageContentType::Photo),
            }
        } else {
            // With the `analyzer` feature on: try the continuous blend
            // first, falling back to the discrete bucket-anchor table
            // when the analyzer signals are unavailable. With the
            // feature off, this reduces to the original lookup.
            #[cfg(feature = "analyzer")]
            {
                match starting_q_via_likelihoods(target.target, pixels, layout, width, height) {
                    Some(q) => q,
                    None => match bucket {
                        Some(b) => zensim_to_starting_q_for_bucket(target.target, b),
                        None => {
                            zensim_to_starting_q_for_bucket(target.target, ImageContentType::Photo)
                        }
                    },
                }
            }
            #[cfg(not(feature = "analyzer"))]
            {
                match bucket {
                    Some(b) => zensim_to_starting_q_for_bucket(target.target, b),
                    None => zensim_to_starting_q_for_bucket(target.target, ImageContentType::Photo),
                }
            }
        };
        q = q.clamp(0.0, 100.0);

        // 3. Encode pass 0 (global-q, no per-segment overrides). Pass 0
        // also captures the encoder's per-MB segment_map via the internal
        // `EncodeDiagnostics` struct so Phase 3 can aggregate the diffmap
        // per real k-means segment instead of a 2x2 spatial proxy.
        let (bytes0, diag0) =
            encode_at_with_diagnostics(cfg, q, false, None, pixels, layout, width, height)?;
        let max_passes = target.max_passes.max(1);

        if trace_phase3() {
            eprintln!(
                "PHASE3_TRACE pass=0 target={:.3} q={:.3} mps=false bytes={} num_segs={} mb={}x{}",
                target.target,
                q,
                bytes0.len(),
                diag0.num_segments,
                diag0.mb_width,
                diag0.mb_height
            );
        }

        if max_passes <= 1 {
            return Ok((
                bytes0.clone(),
                ZensimEncodeMetrics {
                    achieved_score: f32::NAN,
                    passes_used: 1,
                    bytes: bytes0.len(),
                    targets_met: true,
                },
            ));
        }

        // 4. Measure pass 0 with per-pixel diffmap (used by Phase 3 if
        // segments are active). The reference and the decoded probe
        // both go through the same layout-aware path so the pair is
        // measured consistently (RGB→RGB, RGBA→RGBA with deterministic
        // noise compositing).
        let z = zensim::Zensim::new(zensim::ZensimProfile::latest());
        let pre = build_source_reference(&z, pixels, layout, width, height).ok_or_else(|| {
            EncodeError::InvalidBufferSize(
                "zensim precompute_reference failed (image too small?)".into(),
            )
        })?;
        let (score0, dm0) = measure_score_and_diffmap(&z, &pre, &bytes0, layout, width, height)?;

        if trace_phase3() {
            eprintln!(
                "PHASE3_TRACE pass=0_measured target={:.3} q={:.3} achieved={:.4} bytes={} gap={:.4}",
                target.target,
                q,
                score0,
                bytes0.len(),
                target.target - score0
            );
        }

        let mut best = Candidate {
            bytes: bytes0,
            score: score0,
            q,
            seg_overrides: None,
        };

        // Already in band? Ship pass 0.
        if in_band(score0, &target) {
            return finalize(best, 1, &target);
        }

        // Per-segment correction setup. Phase 3 uses the encoder's actual
        // k-means `segment_map` (one segment_id per macroblock, threaded
        // through via `pub(crate) EncodeDiagnostics` — see
        // `encode_inner_with_diagnostics`). When the encoder reported
        // `num_segments == 1` (segmentation disabled or simplified down
        // to a single segment) or the segment_map is empty, we fall back
        // to global-q correction.
        //
        // Chunk A ablation: when [`AblationToggles::disable_phase3`] is
        // set, force the global-q fallback path even when segments are
        // active.
        let per_segment_enabled = !ablate_disable_phase3()
            && diag0.num_segments > 1
            && !diag0.segment_map.is_empty()
            && diag0.mb_width > 0
            && diag0.mb_height > 0;
        let num_segments = diag0.num_segments as usize;
        // The active diagnostics — segment_map and grid — used for
        // per-segment aggregation. Updated each pass with the most recent
        // probe's actual segment assignment (which can shift slightly
        // across passes since segments depend on quality-affected
        // segmentation thresholds).
        let mut last_diag = diag0;

        // prev_probe holds the SECOND-most-recent (q, score) so the
        // secant fits a slope between it and the current (q, last_score).
        let mut prev_probe: Option<(f32, f32)> = None;
        let mut last_q = q;
        let mut last_score = score0;
        let mut last_dm = dm0;
        // Cumulative per-segment overrides (applied across passes).
        let mut cum_overrides: [i8; 4] = [0; 4];

        // 5. Iterate.
        //
        // Phase 3 dispatch policy (fix for the previously net-negative
        // always-on per-segment correction; see traces in
        // `/mnt/v/output/zenwebp/zensim-investigate/`):
        //
        // Per-segment override moves global zensim by ~0.1-0.4 per pass
        // (one segment, small MB fraction). The global-q secant moves it
        // by 2-4 per pass. When `|gap| > PHASE3_FINE_GAP`, the global-q
        // secant is the right tool; per-segment only fires when:
        //   (a) we're already close to band ("final mile"), AND
        //   (b) we haven't started a global-q secant trajectory yet
        //       (`prev_probe.is_none()`).
        // Once a secant has two anchor points it converges faster than
        // per-segment ever could; switching mid-trajectory throws away
        // that information and tends to produce out-of-band finals.
        //
        // Under the `ablation` feature the
        // [`AblationToggles::phase3_fine_gap`] field (float, default 0.5)
        // overrides the threshold for tuning experiments. Set it to a
        // very large value (e.g. 1000) to recover the pre-fix always-on
        // Phase 3 behavior for A/B testing. Production builds use the
        // constant default.
        #[cfg(feature = "ablation")]
        let phase3_fine_gap: f32 = {
            let v = super::ablation_runtime::PHASE3_FINE_GAP.with(|c| c.get());
            if v.is_nan() {
                super::PHASE3_FINE_GAP_DEFAULT
            } else {
                v
            }
        };
        #[cfg(not(feature = "ablation"))]
        let phase3_fine_gap: f32 = super::PHASE3_FINE_GAP_DEFAULT;
        for pass in 1..max_passes {
            let abs_gap = (target.target - last_score).abs();
            let use_per_segment =
                per_segment_enabled && abs_gap <= phase3_fine_gap && prev_probe.is_none();

            let (next_q, next_overrides) = if use_per_segment {
                // Phase 3: aggregate the per-pixel diffmap into per-MB
                // means, then accumulate per real k-means segment using
                // `last_diag.segment_map`. Tighten the worst-mean segment
                // or loosen the best-mean segment in the claw-back case.
                let dec = next_segment_overrides(
                    cum_overrides,
                    &last_dm,
                    width,
                    height,
                    &last_diag,
                    last_score,
                    &target,
                );
                if trace_phase3() {
                    eprintln!(
                        "PHASE3_TRACE pass={}_decide use_per_segment=true gap={:.4} \
                        means=[{:.4},{:.4},{:.4},{:.4}] counts=[{},{},{},{}] \
                        cum_before=[{},{},{},{}] cum_after=[{},{},{},{}] picked_seg={:?} dir={}",
                        pass,
                        target.target - last_score,
                        dec.means[0],
                        dec.means[1],
                        dec.means[2],
                        dec.means[3],
                        dec.counts[0],
                        dec.counts[1],
                        dec.counts[2],
                        dec.counts[3],
                        cum_overrides[0],
                        cum_overrides[1],
                        cum_overrides[2],
                        cum_overrides[3],
                        dec.overrides[0],
                        dec.overrides[1],
                        dec.overrides[2],
                        dec.overrides[3],
                        dec.picked_seg,
                        dec.direction,
                    );
                }
                // Keep q the same when doing per-segment correction.
                (last_q, Some(dec.overrides))
            } else {
                let nq = compute_next_q(last_q, last_score, prev_probe, &target);
                if trace_phase3() {
                    eprintln!(
                        "PHASE3_TRACE pass={}_decide use_per_segment=false gap={:.4} \
                        last_q={:.3} next_q={:.3} prev_probe={:?}",
                        pass,
                        target.target - last_score,
                        last_q,
                        nq.clamp(0.0, 100.0),
                        prev_probe,
                    );
                }
                (nq.clamp(0.0, 100.0), None)
            };

            // If neither q changed nor overrides moved, bail.
            let q_moved = (next_q - last_q).abs() >= 0.05;
            let overrides_moved = match next_overrides {
                Some(o) => o != cum_overrides,
                None => false,
            };
            if !q_moved && !overrides_moved {
                break;
            }

            // multi_pass_stats=true on probe encodes — small size win
            // amortizes across passes. Chunk D ablation: leave it at the
            // user's LossyConfig value when
            // [`AblationToggles::no_multi_pass_stats`] is set.
            let mps = !ablate_no_multi_pass_stats();
            let (bytes_n, diag_n) = encode_at_with_diagnostics(
                cfg,
                next_q,
                mps,
                next_overrides,
                pixels,
                layout,
                width,
                height,
            )?;
            let (score_n, dm_n) =
                measure_score_and_diffmap(&z, &pre, &bytes_n, layout, width, height)?;
            let passes_used = pass + 1;

            if trace_phase3() {
                eprintln!(
                    "PHASE3_TRACE pass={}_measured target={:.3} q={:.3} achieved={:.4} bytes={} \
                    overrides={:?} num_segs={} delta_score={:.4}",
                    pass,
                    target.target,
                    next_q,
                    score_n,
                    bytes_n.len(),
                    next_overrides,
                    diag_n.num_segments,
                    score_n - last_score,
                );
            }

            best = pick_best(
                best,
                Candidate {
                    bytes: bytes_n,
                    score: score_n,
                    q: next_q,
                    seg_overrides: next_overrides,
                },
                &target,
            );

            if in_band(score_n, &target) {
                return finalize(best, passes_used, &target);
            }

            // Update state for next iteration.
            if let Some(ov) = next_overrides {
                cum_overrides = ov;
            }
            prev_probe = Some((last_q, last_score));
            last_q = next_q;
            last_score = score_n;
            last_dm = dm_n;
            // The encoder may re-segment differently when overrides shift
            // the per-segment quants — refresh from the latest probe.
            // (Only meaningful for the diffmap aggregation loop; if the
            // new diag has fewer segments we just operate on the smaller
            // index range, no re-init needed.)
            // Discard the old diag so the next aggregation uses the
            // assignment that produced `dm_n`.
            // (Per-segment fallback flag stays whatever pass 0 decided.)
            let _ = num_segments; // keep the per-segment count from pass 0
            last_diag = diag_n;
        }

        finalize(best, max_passes, &target)
    }

    /// One encoded probe kept by the closed loop: the WebP bytes plus the
    /// zensim score that was measured for them. `pick_best` compares two
    /// of these and `finalize` turns the winner into the returned result.
    struct Candidate {
        /// Encoded WebP output of this probe.
        bytes: Vec<u8>,
        /// Zensim score measured for `bytes`.
        score: f32,
        // q and seg_overrides are populated for diagnostic tracing /
        // debugging (the trace lines reference them). pick_best only
        // reads bytes/score, so these are otherwise unread.
        /// Global quality the probe was encoded at.
        #[allow(dead_code)]
        q: f32,
        /// Per-segment overrides handed to the probe encode, if any.
        #[allow(dead_code)]
        seg_overrides: Option<[i8; 4]>,
    }

    /// Decide which of two probes stays on as the running best.
    ///
    /// A probe is "feasible" when its score is at or above
    /// `target.target`. The rules, in order:
    ///
    /// * exactly one feasible probe → that probe wins;
    /// * both feasible → `cand` wins only if it is strictly smaller in
    ///   bytes (ties keep `prev`);
    /// * neither feasible → `cand` wins only if it scores strictly higher
    ///   (ties, and any comparison involving NaN, keep `prev`).
    fn pick_best(prev: Candidate, cand: Candidate, target: &ZensimTarget) -> Candidate {
        let floor = target.target;
        let prev_ok = prev.score >= floor;
        let cand_ok = cand.score >= floor;

        let take_cand = if prev_ok != cand_ok {
            // Feasibility differs: the feasible one is the answer.
            cand_ok
        } else if cand_ok {
            // Both meet the target: prefer the smaller file.
            cand.bytes.len() < prev.bytes.len()
        } else {
            // Both miss the target: prefer the closer (higher) score.
            cand.score > prev.score
        };

        if take_cand { cand } else { prev }
    }

    /// Ship-band test: reports whether `score` lies inside the asymmetric,
    /// inclusive band
    ///   `[target - max_undershoot_ship.unwrap_or(0.0),
    ///     target + max_overshoot.unwrap_or(∞)]`.
    ///
    /// `max_undershoot_ship` lets a caller treat a small undershoot as
    /// "close enough" so no further pass is triggered — useful when the
    /// calibrator landed near the band on pass 0 and a correction pass
    /// would risk swinging into overshoot. A NaN `score` is never in band.
    fn in_band(score: f32, target: &ZensimTarget) -> bool {
        let undershoot_slack = target.max_undershoot_ship.unwrap_or(0.0);
        let overshoot_slack = target.max_overshoot.unwrap_or(f32::INFINITY);
        let floor = target.target - undershoot_slack;
        let ceiling = target.target + overshoot_slack;
        (floor..=ceiling).contains(&score)
    }

    /// Pick the global quality for the next probe.
    ///
    /// `q` / `last_score` describe the most recent probe and `prev` is the
    /// probe before it (if any). With two usable probes the step is a
    /// secant estimate `gap / slope`; otherwise `fallback_step` supplies a
    /// fixed-sensitivity step. Chunk F ablation: when
    /// [`AblationToggles::no_secant`] is set, a fixed proportional step
    /// `(target - achieved) * 0.5` is always used instead.
    ///
    /// Every path clamps the step to `[-MAX_DELTA_Q, MAX_DELTA_Q]` and
    /// then enforces a minimum magnitude of `MIN_DELTA_Q`. The returned
    /// value is NOT clamped to the valid quality range; callers do that.
    fn compute_next_q(
        q: f32,
        last_score: f32,
        prev: Option<(f32, f32)>,
        target: &ZensimTarget,
    ) -> f32 {
        let gap = target.target - last_score;

        if ablate_no_secant() {
            let proportional = (gap * 0.5).clamp(-MAX_DELTA_Q, MAX_DELTA_Q);
            let step = if proportional.abs() < MIN_DELTA_Q {
                // Too small to matter: take the minimum step in the gap's
                // direction (upwards when the gap is exactly zero).
                let sign_source = if gap == 0.0 { 1.0 } else { gap };
                MIN_DELTA_Q.copysign(sign_source)
            } else {
                proportional
            };
            return q + step;
        }

        // Secant: estimate dscore/dq between the latest probe and the one
        // before it. The two probes must be far enough apart in both q
        // and score for the slope to be meaningful.
        let delta = match prev {
            Some((q_prev, s_prev))
                if (q - q_prev).abs() > 0.1 && (last_score - s_prev).abs() > 0.05 =>
            {
                // zensim units per q point; floored at 0.05 so a flat or
                // negative measured slope cannot flip or explode the step
                // before it is clamped.
                let slope = (last_score - s_prev) / (q - q_prev);
                let raw = (gap / slope.max(0.05)).clamp(-MAX_DELTA_Q, MAX_DELTA_Q);
                if raw.abs() < MIN_DELTA_Q {
                    MIN_DELTA_Q.copysign(raw)
                } else {
                    raw
                }
            }
            _ => fallback_step(gap),
        };
        q + delta
    }

    /// Fixed-sensitivity q step used when no usable secant slope exists.
    ///
    /// Scales `gap` by `DEFAULT_SENSITIVITY`, clamps the result to
    /// `[-MAX_DELTA_Q, MAX_DELTA_Q]`, and, if what remains is smaller in
    /// magnitude than `MIN_DELTA_Q`, returns `MIN_DELTA_Q` carrying the
    /// sign of `gap` (positive when `gap` is exactly zero).
    fn fallback_step(gap: f32) -> f32 {
        let scaled = (gap * DEFAULT_SENSITIVITY).clamp(-MAX_DELTA_Q, MAX_DELTA_Q);
        if scaled.abs() < MIN_DELTA_Q {
            let sign_source = if gap == 0.0 { 1.0 } else { gap };
            MIN_DELTA_Q.copysign(sign_source)
        } else {
            scaled
        }
    }

    fn finalize(best: Candidate, passes_used: u8, target: &ZensimTarget) -> IterationResult {
        // Strict-mode failure check: if max_undershoot is set and we
        // missed by more than that, return an error.
        if let Some(slack) = target.max_undershoot
            && best.score < target.target - slack
        {
            return Err(EncodeError::InvalidBufferSize(format!(
                "target_zensim: achieved {:.3} below floor {:.3} (max_undershoot {:.3}) after {} passes",
                best.score, target.target, slack, passes_used,
            )));
        }
        let targets_met = best.score >= target.target
            || target
                .max_undershoot
                .is_none_or(|t| (target.target - best.score) <= t);
        let bytes_len = best.bytes.len();
        Ok((
            best.bytes,
            ZensimEncodeMetrics {
                achieved_score: best.score,
                passes_used,
                bytes: bytes_len,
                targets_met,
            },
        ))
    }

    /// Run one probe encode of RGB or RGBA pixels at quality `q`, with
    /// optional per-segment quant-index overrides, and return the WebP
    /// bytes together with the encoder's internal [`EncodeDiagnostics`]
    /// (which carries the per-MB `segment_map`). On-wire bytes are meant
    /// to be byte-identical to the regular [`crate::EncodeRequest::encode`]
    /// path for the same inputs.
    ///
    /// The probe works on a clone of `cfg` with these fields replaced:
    ///
    /// * `quality` — `q` clamped to `[0, 100]`;
    /// * `multi_pass_stats` — `enable_multi_pass`, a small size-saving
    ///   option that amortizes across passes;
    /// * `segment_quant_overrides` — `seg_overrides`;
    /// * `target_size`, `target_psnr`, `target_zensim` — all switched off
    ///   so the probe cannot recurse into another target-driven loop.
    ///
    /// `layout` is propagated through to the encoder; for `Rgba8` the
    /// resulting WebP is alpha-bearing (VP8 + VP8L alpha), and the closed
    /// loop's measurement decodes back to the same layout to keep the
    /// comparison consistent.
    ///
    /// # Errors
    ///
    /// Returns the `EncodeError` extracted from the encoder's error via
    /// `decompose()`.
    fn encode_at_with_diagnostics(
        cfg: &LossyConfig,
        q: f32,
        enable_multi_pass: bool,
        seg_overrides: Option<[i8; 4]>,
        pixels: &[u8],
        layout: PixelLayout,
        width: u32,
        height: u32,
    ) -> Result<(Vec<u8>, EncodeDiagnostics), EncodeError> {
        let mut probe_cfg = cfg.clone();

        // Iteration must NOT recurse — every target-driven mode is turned
        // off on the probe config.
        probe_cfg.target_zensim = None;
        probe_cfg.target_psnr = 0.0;
        probe_cfg.target_size = 0;

        // Probe-specific knobs.
        probe_cfg.segment_quant_overrides = seg_overrides;
        probe_cfg.multi_pass_stats = enable_multi_pass;
        probe_cfg.quality = q.clamp(0.0, 100.0);

        crate::encoder::api::EncodeRequest::lossy(&probe_cfg, pixels, layout, width, height)
            .encode_inner_with_diagnostics()
            .map(|(bytes, _stats, diag)| (bytes, diag))
            .map_err(|at_err| at_err.decompose().0)
    }

    /// Precompute the zensim reference for the source image.
    ///
    /// `pixels` holds interleaved RGB (`Rgb8`) or RGBA (`Rgba8`) bytes;
    /// only the first `width * height * bpp` bytes are used. RGBA goes
    /// through `zensim::RgbaSlice`, which composites over a deterministic
    /// noise background internally — the same composite is applied to the
    /// source and to the decoded probe so the pair is comparable.
    ///
    /// Returns `None` when `pixels` is too short for the stated
    /// dimensions, when zensim rejects the image, or for any other layout.
    fn build_source_reference(
        z: &zensim::Zensim,
        pixels: &[u8],
        layout: PixelLayout,
        width: u32,
        height: u32,
    ) -> Option<zensim::PrecomputedReference> {
        let (cols, rows) = (width as usize, height as usize);
        match layout {
            PixelLayout::Rgb8 => {
                // `get` yields None when the buffer is shorter than needed.
                let packed = pixels.get(..cols * rows * 3)?;
                let texels: &[[u8; 3]] = bytemuck::cast_slice(packed);
                let source = zensim::RgbSlice::new(texels, cols, rows);
                z.precompute_reference(&source).ok()
            }
            PixelLayout::Rgba8 => {
                let packed = pixels.get(..cols * rows * 4)?;
                let texels: &[[u8; 4]] = bytemuck::cast_slice(packed);
                let source = zensim::RgbaSlice::new(texels, cols, rows);
                z.precompute_reference(&source).ok()
            }
            // Unreachable: the gate in iteration::run rejects everything else.
            _ => None,
        }
    }

    /// Decode `webp` and score it against the precomputed reference,
    /// returning `(zensim score, per-pixel diffmap)`.
    ///
    /// The decode is layout-aware (RGB→`decode_rgb`, RGBA→`decode_rgba`)
    /// so the distorted side reaches zensim in the same
    /// composite-vs-RGB shape as the reference. The diffmap is a
    /// `Vec<f32>` of length `width * height` in row-major order — used by
    /// Phase 3 per-segment aggregation.
    ///
    /// # Errors
    ///
    /// `EncodeError::TargetZensimUnsupportedLayout` for any layout other
    /// than `Rgb8` / `Rgba8`; otherwise whatever the per-layout measure
    /// function reports.
    fn measure_score_and_diffmap(
        z: &zensim::Zensim,
        pre: &zensim::PrecomputedReference,
        webp: &[u8],
        layout: PixelLayout,
        width: u32,
        height: u32,
    ) -> Result<(f32, Vec<f32>), EncodeError> {
        if matches!(layout, PixelLayout::Rgb8) {
            return measure_rgb(z, pre, webp, width, height);
        }
        if matches!(layout, PixelLayout::Rgba8) {
            return measure_rgba(z, pre, webp, width, height);
        }
        Err(EncodeError::TargetZensimUnsupportedLayout(layout))
    }

    /// RGB leg of `measure_score_and_diffmap`: decode `webp` to RGB,
    /// check it against the expected dimensions, and score it against
    /// `pre`.
    ///
    /// Returns the zensim score (narrowed to `f32`) and a copy of the
    /// diffmap zensim produced.
    ///
    /// # Errors
    ///
    /// `EncodeError::InvalidBufferSize` when the decode fails, when the
    /// decoded dimensions differ from `width` x `height`, when the decoded
    /// buffer is shorter than `width * height * 3`, or when zensim itself
    /// fails.
    fn measure_rgb(
        z: &zensim::Zensim,
        pre: &zensim::PrecomputedReference,
        webp: &[u8],
        width: u32,
        height: u32,
    ) -> Result<(f32, Vec<f32>), EncodeError> {
        let (decoded, dec_w, dec_h) = match crate::oneshot::decode_rgb(webp) {
            Ok(frame) => frame,
            Err(e) => {
                return Err(EncodeError::InvalidBufferSize(format!(
                    "target_zensim: decode for measurement failed: {:?}",
                    e.decompose().0,
                )));
            }
        };

        if dec_w != width || dec_h != height {
            return Err(EncodeError::InvalidBufferSize(format!(
                "target_zensim: decoded dims {}x{} != source {}x{}",
                dec_w, dec_h, width, height,
            )));
        }

        let (cols, rows) = (dec_w as usize, dec_h as usize);
        let needed = cols * rows * 3;
        if decoded.len() < needed {
            return Err(EncodeError::InvalidBufferSize(
                "target_zensim: short decoded buffer".into(),
            ));
        }

        // Reinterpret the interleaved bytes as one `[u8; 3]` per pixel.
        let texels: &[[u8; 3]] = bytemuck::cast_slice(&decoded[..needed]);
        let distorted = zensim::RgbSlice::new(texels, cols, rows);
        match z.compute_with_ref_and_diffmap(pre, &distorted, zensim::DiffmapWeighting::Trained) {
            Ok(result) => {
                let score = result.score() as f32;
                Ok((score, result.diffmap().to_vec()))
            }
            Err(e) => Err(EncodeError::InvalidBufferSize(format!(
                "zensim compute_with_ref_and_diffmap failed: {:?}",
                e
            ))),
        }
    }

    /// RGBA leg of `measure_score_and_diffmap`: decode `webp` to RGBA,
    /// check it against the expected dimensions, and score it against
    /// `pre` through `zensim::RgbaSlice`.
    ///
    /// Returns the zensim score (narrowed to `f32`) and a copy of the
    /// diffmap zensim produced.
    ///
    /// # Errors
    ///
    /// `EncodeError::InvalidBufferSize` when the decode fails, when the
    /// decoded dimensions differ from `width` x `height`, when the decoded
    /// buffer is shorter than `width * height * 4`, or when zensim itself
    /// fails.
    fn measure_rgba(
        z: &zensim::Zensim,
        pre: &zensim::PrecomputedReference,
        webp: &[u8],
        width: u32,
        height: u32,
    ) -> Result<(f32, Vec<f32>), EncodeError> {
        let (decoded, dec_w, dec_h) = match crate::oneshot::decode_rgba(webp) {
            Ok(frame) => frame,
            Err(e) => {
                return Err(EncodeError::InvalidBufferSize(format!(
                    "target_zensim: rgba decode for measurement failed: {:?}",
                    e.decompose().0,
                )));
            }
        };

        if dec_w != width || dec_h != height {
            return Err(EncodeError::InvalidBufferSize(format!(
                "target_zensim: decoded dims {}x{} != source {}x{}",
                dec_w, dec_h, width, height,
            )));
        }

        let (cols, rows) = (dec_w as usize, dec_h as usize);
        let needed = cols * rows * 4;
        if decoded.len() < needed {
            return Err(EncodeError::InvalidBufferSize(
                "target_zensim: short decoded rgba buffer".into(),
            ));
        }

        // Reinterpret the interleaved bytes as one `[u8; 4]` per pixel.
        let texels: &[[u8; 4]] = bytemuck::cast_slice(&decoded[..needed]);
        let distorted = zensim::RgbaSlice::new(texels, cols, rows);
        match z.compute_with_ref_and_diffmap(pre, &distorted, zensim::DiffmapWeighting::Trained) {
            Ok(result) => {
                let score = result.score() as f32;
                Ok((score, result.diffmap().to_vec()))
            }
            Err(e) => Err(EncodeError::InvalidBufferSize(format!(
                "zensim compute_with_ref_and_diffmap failed: {:?}",
                e
            ))),
        }
    }

    /// Outcome of one Phase 3 per-segment correction step, as returned by
    /// `next_segment_overrides`: the new cumulative per-segment overrides
    /// plus the per-segment statistics the policy looked at, so the trace
    /// can show what the policy saw.
    ///
    /// How the decision is produced: the per-pixel diffmap is walked one
    /// MB at a time (16x16 pixel block) and each block's diffmap values
    /// are accumulated into the segment that the encoder's actual k-means
    /// `segment_map` assigns to that MB. Edge MBs that overlap the
    /// right/bottom image border are handled by clipping the read region.
    /// The policy then tightens the worst-mean segment, or loosens the
    /// best-mean segment in the claw-back case.
    ///
    /// Phase 3 used to do this against a 2x2 spatial quadrant proxy
    /// because exposing the segment_map on `EncodeStats` would have been
    /// a SemVer break. The real assignment is now threaded through the
    /// `pub(crate) EncodeDiagnostics` companion struct
    /// (`encode_inner_with_diagnostics`), so the policy operates on the
    /// encoder's actual per-MB clustering — important because zenwebp's
    /// segments are k-means in alpha-space, NOT spatial: an MB in the
    /// top-left can share a segment with a same-alpha MB in the
    /// bottom-right and the quadrant proxy would have lumped it elsewhere.
    ///
    /// On the quadrant-proxy path and on the defensive no-op path the
    /// statistics are not computed: `means` and `counts` are zeroed,
    /// `picked_seg` is `None` and `direction` is 0.
    pub(crate) struct OverrideDecision {
        /// The NEW cumulative overrides (not deltas). Each entry is kept
        /// within `[-16, 16]` to stay in sensible VP8 quantizer range.
        pub overrides: [i8; 4],
        /// Mean diffmap value per segment (0.0 for a segment with nothing
        /// accumulated).
        pub means: [f32; 4],
        /// Number of diffmap pixels accumulated into each segment.
        pub counts: [u64; 4],
        /// Segment index that was tightened/loosened, or `None` if
        /// nothing was changed.
        pub picked_seg: Option<usize>,
        /// Which way the picked segment was moved.
        pub direction: i8, // -1 tightened worst, +1 loosened best, 0 no-op
    }

    /// Phase 3 policy step: derive the next cumulative per-segment
    /// quantizer overrides from the latest probe's diffmap.
    ///
    /// * `cum` — cumulative overrides applied so far; returned unchanged
    ///   on every no-op path.
    /// * `diffmap` — per-pixel distortion, row-major, expected to hold at
    ///   least `width * height` values.
    /// * `diag` — diagnostics of the probe that produced `diffmap`; its
    ///   `segment_map` has one segment id per macroblock on an
    ///   `mb_width` x `mb_height` grid.
    /// * `score` / `target` — achieved score and the target being chased.
    ///
    /// The diffmap is walked one 16x16 macroblock at a time (edge blocks
    /// are clipped to the image) and accumulated per segment. When the
    /// score is below target, the segment with the highest mean diffmap is
    /// tightened by 1-3 steps; when the score exceeds
    /// `target + max_overshoot`, the segment with the lowest mean is
    /// loosened by 1-3 steps. Each override is clamped to `[-16, 16]`.
    ///
    /// Never panics on malformed input: a diffmap shorter than
    /// `width * height`, or a `segment_map` whose length does not match
    /// the MB grid, yields a no-op decision.
    fn next_segment_overrides(
        cum: [i8; 4],
        diffmap: &[f32],
        width: u32,
        height: u32,
        diag: &EncodeDiagnostics,
        score: f32,
        target: &ZensimTarget,
    ) -> OverrideDecision {
        let w = width as usize;
        let h = height as usize;

        // Defensive: both aggregation paths below slice `diffmap` row by
        // row with stride `w`; a buffer shorter than `w * h` would panic
        // there. Treat it like the segment_map shape mismatch: no-op.
        if diffmap.len() < w.saturating_mul(h) {
            return OverrideDecision {
                overrides: cum,
                means: [0.0; 4],
                counts: [0; 4],
                picked_seg: None,
                direction: 0,
            };
        }

        // Content-type-dependent fallback hatch: when the
        // [`AblationToggles::use_quadrant_proxy`] flag is set on the
        // current thread, fall back to the 2x2 spatial-quadrant proxy
        // that this code shipped with originally.
        //
        // A/B run (CID22 + gb82 + gb82-sc, 76 images x 3 targets =
        // 228 cells, target ∈ {75, 80, 85}, max_overshoot=1.5,
        // max_passes=3, m4) shows mixed behavior:
        //   - gb82-sc (screen content): real segment_map slightly
        //     tighter (avg |achieved-target| 5.678 vs 5.754 for quad
        //     across all 30 cells; -0.134 across the 17 divergent
        //     cells). Hypothesis directionally validated.
        //   - CID22 + gb82 (photos): quadrant proxy slightly tighter
        //     (avg |achieved-target| +0.02 to +0.04 for seg across
        //     ~114 divergent photo cells).
        // Both legs hit targets_met = 228/228; seg has 6 fewer
        // undershoots overall (54 vs 60). Median bytes within ~2% of
        // each other.
        //
        // The default (real `segment_map`) is correct in spirit —
        // it operates on the encoder's actual k-means assignment —
        // but on photo content the quadrant proxy occasionally lands
        // closer to target. Under the `ablation` feature the
        // `dev/zensim_ab_quadrant_vs_segmap.rs` binary toggles this
        // via [`AblationToggles::use_quadrant_proxy`] to re-run the
        // comparison. Production builds always use the real segment_map.
        #[cfg(feature = "ablation")]
        let use_quadrant = super::ablation_runtime::USE_QUADRANT_PROXY.with(|c| c.get());
        #[cfg(not(feature = "ablation"))]
        let use_quadrant = false;
        if use_quadrant {
            let q_overrides =
                next_segment_overrides_quadrant_proxy(cum, diffmap, width, height, score, target);
            // The proxy does not report statistics, so the trace fields
            // stay zeroed on this path.
            return OverrideDecision {
                overrides: q_overrides,
                means: [0.0; 4],
                counts: [0; 4],
                picked_seg: None,
                direction: 0,
            };
        }

        let n = (diag.num_segments as usize).clamp(2, 4);
        let mb_w = diag.mb_width as usize;
        let mb_h = diag.mb_height as usize;
        let expected = mb_w.saturating_mul(mb_h);
        // Defensive: if the segment_map shape doesn't match the encoder
        // grid (shouldn't happen, but the diag is plumbed through enough
        // layers that it's worth guarding), fall back to no-op overrides.
        if diag.segment_map.len() != expected || expected == 0 {
            return OverrideDecision {
                overrides: cum,
                means: [0.0; 4],
                counts: [0; 4],
                picked_seg: None,
                direction: 0,
            };
        }

        let mut sum = [0.0f64; 4];
        let mut counts = [0u64; 4];

        // Walk the MB grid; each MB's diffmap pixels are added to the
        // accumulator of the segment the encoder assigned to that MB.
        for mb_y in 0..mb_h {
            let py0 = mb_y * 16;
            if py0 >= h {
                break;
            }
            let py1 = (py0 + 16).min(h);
            for mb_x in 0..mb_w {
                let px0 = mb_x * 16;
                if px0 >= w {
                    // Every later MB in this row starts even further
                    // right, so the rest of the row is outside the image.
                    break;
                }
                let px1 = (px0 + 16).min(w);
                let seg = diag.segment_map[mb_y * mb_w + mb_x] as usize;
                if seg >= n {
                    continue;
                }
                let mut block_sum = 0.0f64;
                let mut block_count = 0u64;
                for py in py0..py1 {
                    let row = &diffmap[py * w + px0..py * w + px1];
                    for &v in row {
                        block_sum += f64::from(v);
                    }
                    block_count += row.len() as u64;
                }
                if block_count > 0 {
                    sum[seg] += block_sum;
                    counts[seg] += block_count;
                }
            }
        }

        let mut means = [0.0f32; 4];
        for s in 0..n {
            if counts[s] > 0 {
                means[s] = (sum[s] / counts[s] as f64) as f32;
            }
        }

        // Find worst (highest mean diffmap = most distorted) and best
        // (lowest mean diffmap = highest fidelity headroom) segments,
        // ignoring segments with no pixels assigned. Ties keep the
        // lowest segment index.
        let mut worst = 0usize;
        let mut best = 0usize;
        let mut found = false;
        for s in 0..n {
            if counts[s] == 0 {
                continue;
            }
            if !found || means[s] > means[worst] {
                worst = s;
            }
            if !found || means[s] < means[best] {
                best = s;
            }
            found = true;
        }
        if !found {
            return OverrideDecision {
                overrides: cum,
                means,
                counts,
                picked_seg: None,
                direction: 0,
            };
        }

        // Step size grows with how far off we are: >4 → 3, >2 → 2, else 1.
        let step_for = |excess: f32| -> i32 {
            if excess > 4.0 {
                3
            } else if excess > 2.0 {
                2
            } else {
                1
            }
        };

        let mut out = cum;
        let mut picked: Option<usize> = None;
        let mut direction: i8 = 0;
        let gap = target.target - score;
        if gap > 0.0 {
            // Below target: tighten (negative delta) the worst segment.
            let step = -step_for(gap);
            out[worst] = (i32::from(out[worst]) + step).clamp(-16, 16) as i8;
            picked = Some(worst);
            direction = -1;
        } else if let Some(t) = target.max_overshoot {
            if (score - target.target) > t {
                // Claw-back: beyond the overshoot band, loosen the
                // segment with the most fidelity headroom.
                let step = step_for(score - target.target - t);
                out[best] = (i32::from(out[best]) + step).clamp(-16, 16) as i8;
                picked = Some(best);
                direction = 1;
            }
        }
        OverrideDecision {
            overrides: out,
            means,
            counts,
            picked_seg: picked,
            direction,
        }
    }

    /// Pre-deviation aggregation: 2x2 spatial-quadrant proxy. Kept for
    /// A/B comparison under the `ablation` feature via
    /// [`AblationToggles::use_quadrant_proxy`]. Production code uses
    /// the real `segment_map` via [`next_segment_overrides`].
    #[cfg_attr(not(feature = "ablation"), allow(dead_code))]
    fn next_segment_overrides_quadrant_proxy(
        cum: [i8; 4],
        diffmap: &[f32],
        width: u32,
        height: u32,
        score: f32,
        target: &ZensimTarget,
    ) -> [i8; 4] {
        let n: usize = 4;
        let mut sum = [0.0f64; 4];
        let mut count = [0u64; 4];
        let w = width as usize;
        let h = height as usize;
        let half_w = w / 2;
        let half_h = h / 2;
        for y in 0..h {
            let row = &diffmap[y * w..y * w + w];
            for (x, &v) in row.iter().enumerate() {
                let qx = usize::from(x >= half_w);
                let qy = usize::from(y >= half_h);
                let q = qy * 2 + qx;
                sum[q] += v as f64;
                count[q] += 1;
            }
        }
        let mut means = [0.0f32; 4];
        for s in 0..n {
            if count[s] > 0 {
                means[s] = (sum[s] / count[s] as f64) as f32;
            }
        }
        let mut worst = 0usize;
        let mut best = 0usize;
        let mut found = false;
        for s in 0..n {
            if count[s] == 0 {
                continue;
            }
            if !found || means[s] > means[worst] {
                worst = s;
            }
            if !found || means[s] < means[best] {
                best = s;
            }
            found = true;
        }
        let mut out = cum;
        let gap = target.target - score;
        if gap > 0.0 {
            let step = if gap > 4.0 {
                -3
            } else if gap > 2.0 {
                -2
            } else {
                -1
            };
            out[worst] = (i32::from(out[worst]) + step).clamp(-16, 16) as i8;
        } else if let Some(t) = target.max_overshoot
            && (score - target.target) > t
        {
            let overshoot = score - target.target - t;
            let step = if overshoot > 4.0 {
                3
            } else if overshoot > 2.0 {
                2
            } else {
                1
            };
            out[best] = (i32::from(out[best]) + step).clamp(-16, 16) as i8;
        }
        out
    }

    /// Classify the RGB or RGBA source into a content bucket.
    ///
    /// Returns `None` (no bucket decision) when:
    /// - `layout` is neither `PixelLayout::Rgb8` nor `PixelLayout::Rgba8`,
    /// - either dimension is smaller than 8 pixels,
    /// - `width * height * bytes_per_pixel` does not fit in `usize`, or
    /// - `pixels` holds fewer bytes than that product.
    ///
    /// Only the first `width * height * bytes_per_pixel` bytes of `pixels`
    /// are inspected; trailing bytes are ignored.
    ///
    /// With the `analyzer` feature enabled the decision is delegated to
    /// `classify_image_type_rgb8`; RGBA input is first passed through
    /// `rgba8_to_rgb8` (presumably dropping alpha — confirm against that
    /// helper).
    ///
    /// Without the feature we need a Y plane — a quick luma approximation
    /// is derived from RGB rather than running the full encoder analyzer
    /// (which would re-encode an extra time). For RGBA inputs the alpha
    /// channel is fed into the classifier's alpha histogram (its primary
    /// signal for the Icon bucket); for RGB the histogram falls back to
    /// the Y values themselves — the classifier primarily uses bimodality
    /// + edge density + uniformity, all of which are tolerant of
    /// histogram shape.
    fn detect_bucket(
        pixels: &[u8],
        layout: PixelLayout,
        width: u32,
        height: u32,
    ) -> Option<ImageContentType> {
        let w = width as usize;
        let h = height as usize;
        let bpp = match layout {
            PixelLayout::Rgb8 => 3usize,
            PixelLayout::Rgba8 => 4usize,
            _ => return None,
        };
        if w < 8 || h < 8 {
            return None;
        }

        // Checked arithmetic: on 32-bit targets `w * h * bpp` can exceed
        // `usize`. A wrapped product would let an undersized buffer pass
        // the length guard, so treat overflow as "cannot classify".
        let px_count = w.checked_mul(h)?;
        let needed = px_count.checked_mul(bpp)?;
        // `get` doubles as the length guard: `None` when `pixels` is short.
        let src = pixels.get(..needed)?;

        // When the `analyzer` feature is on, route through zenanalyze
        // so the bucket decision uses the same shared signal source as
        // the encoder's Auto-preset path. See `classify_image_type_rgb8`.
        #[cfg(feature = "analyzer")]
        {
            let bucket = match layout {
                PixelLayout::Rgb8 => {
                    crate::encoder::analysis::classify_image_type_rgb8(src, width, height)
                }
                PixelLayout::Rgba8 => {
                    let rgb = crate::encoder::analysis::rgba8_to_rgb8(src);
                    crate::encoder::analysis::classify_image_type_rgb8(&rgb, width, height)
                }
                // Already rejected above; kept so the match stays exhaustive.
                _ => return None,
            };
            Some(bucket)
        }

        // Fallback (no `analyzer` feature): the original Y-plane +
        // alpha-histogram heuristic. BT.601 Y = 0.299R + 0.587G + 0.114B,
        // approximated with integer weights 76/150/30 (sum 256) and a
        // shift by 8, so the result always fits in a `u8`.
        #[cfg(not(feature = "analyzer"))]
        {
            let has_alpha = matches!(layout, PixelLayout::Rgba8);
            let mut y_plane: Vec<u8> = Vec::with_capacity(px_count);
            let mut alpha_hist = [0u32; 256];
            // `src` is exactly `px_count * bpp` bytes, so this yields one
            // chunk per pixel with no remainder.
            for px in src.chunks_exact(bpp) {
                let y = ((u32::from(px[0]) * 76
                    + u32::from(px[1]) * 150
                    + u32::from(px[2]) * 30)
                    >> 8) as u8;
                y_plane.push(y);
                // RGBA: histogram the real alpha; RGB: histogram the luma.
                let bin = if has_alpha { px[3] } else { y };
                alpha_hist[usize::from(bin)] += 1;
            }
            let bucket =
                crate::encoder::analysis::classify_image_type(&y_plane, w, h, w, &alpha_hist);
            Some(bucket)
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// `ZensimTarget::default()` must expose the documented default knobs.
    #[test]
    fn target_default() {
        let defaults = ZensimTarget::default();
        assert_eq!(defaults.target, 80.0);
        assert_eq!(defaults.max_overshoot, Some(1.5));
        assert_eq!(defaults.max_undershoot_ship, Some(0.5));
        assert_eq!(defaults.max_undershoot, None);
        assert_eq!(defaults.max_passes, 2);
    }

    /// Every `with_*` builder call must land in the matching field.
    #[test]
    fn target_builder() {
        let base = ZensimTarget::new(85.0);
        let tuned = base
            .with_max_overshoot(Some(0.5))
            .with_max_undershoot_ship(Some(0.25))
            .with_max_undershoot(Some(2.0))
            .with_max_passes(3);
        assert_eq!(tuned.target, 85.0);
        assert_eq!(tuned.max_overshoot, Some(0.5));
        assert_eq!(tuned.max_undershoot_ship, Some(0.25));
        assert_eq!(tuned.max_undershoot, Some(2.0));
        assert_eq!(tuned.max_passes, 3);
    }

    /// The "no target" metrics record: NaN score, one pass, the given
    /// byte count, and `targets_met` set.
    #[test]
    fn metrics_no_target() {
        let metrics = ZensimEncodeMetrics::no_target(1234);
        assert!(metrics.achieved_score.is_nan());
        assert_eq!(metrics.passes_used, 1);
        assert_eq!(metrics.bytes, 1234);
        assert!(metrics.targets_met);
    }

    /// Sweeping the target from 60 to 95 in steps of 5, the starting q
    /// must never decrease and must stay within 1..=100 for every bucket.
    #[test]
    fn calibration_monotonic_per_bucket() {
        let buckets = [
            ImageContentType::Photo,
            ImageContentType::Drawing,
            ImageContentType::Icon,
        ];
        for b in buckets.iter().copied() {
            let mut prev = 0.0f32;
            let mut t = 60;
            while t <= 95 {
                let q = zensim_to_starting_q_for_bucket(t as f32, b);
                assert!(
                    q >= prev,
                    "non-monotonic at {b:?} target {t}: {prev} -> {q}"
                );
                assert!((1.0..=100.0).contains(&q), "{b:?} target {t} q={q}");
                prev = q;
                t += 5;
            }
        }
    }

    /// Targets outside the anchor table clamp to the table's end values.
    #[test]
    fn calibration_clamps_at_endpoints() {
        // (target, expected starting q): the first row sits below the
        // lowest anchor, the second above the highest anchor.
        let cases: [(f32, f32); 2] = [(40.0, 30.0), (99.0, 100.0)];
        for (target, expected_q) in cases {
            let q = zensim_to_starting_q_for_bucket(target, ImageContentType::Photo);
            assert_eq!(q, expected_q);
        }
    }
}