// gam 0.3.115 - Docs.rs

// Cross-block identifiability canonicalisation.
//
// The pre-fit `audit_identifiability` (see `identifiability_audit.rs`)
// runs a joint RRQR on `[X_block_0 | X_block_1 | ...]` and reports per-
// block (block_idx, local_col) drops attributing each demoted joint
// column back to its origin. This module **previously** converted that
// report into a concrete coordinate transform that reduced the inner
// solve to a (p_raw → r_reduced) subspace via selection-T.
//
// That reduction is unsafe under the current `CustomFamily` contract.
// Blockwise families capture their per-block designs at construction
// time (e.g. `SurvivalMarginalSlopeFamily::marginal_design`,
// `::logslope_design`) and the family's `evaluate_blockwise_exact_newton`
// row-Hessian assembly uses `DesignMatrix::syr_row_into_view` /
// `::row_outer_into_view`, which assert that the target slice's column
// count equals the captured design's column count. Substituting a
// column-reduced `ParameterBlockSpec` under such a family produces
// `DesignMatrix::syr_row_into shape mismatch` (matrix.rs:6529), which
// blockwise inner-solve callers unwrap via `.expect(...)` — a panic
// later in the pipeline, masking the audit's diagnostic.
//
// This module has two safe behaviors:
//   - plain single-channel dense blocks may be exactly orthogonalized with a
//     per-block transform, then lifted back to raw coordinates after fitting;
//   - callback-owned blocks keep raw block widths even when the audit attributes
//     weak columns, because their effective geometry is owned by the family
//     callback rather than by the placeholder `ParameterBlockSpec.design`.
//
// Fatal audit results still fail closed with an immediate
// `CustomFamilyError::IdentifiabilityFailure`, naming the offending blocks and a
// reparameterization hint in milliseconds rather than after a singular Newton
// solve. Clean or safely canonicalized results carry explicit per-block
// transforms so `lift_block_states_to_raw` and `lift_fit_geometry_to_raw` map
// the fit back to the raw coordinate system.

use std::sync::Arc;

use ndarray::{Array1, Array2, Array3};

use crate::families::custom_family::{
    BlockEffectiveJacobian, CustomFamilyError, FamilyLinearizationState, ParameterBlockSpec,
    PenaltyMatrix,
};
use crate::families::identifiability_compiler::{
    IdentityRowHessian, RowJacobianOperator, orthogonalize_design_blocks, symmetric_sqrt_into,
};
use crate::linalg::faer_ndarray::{default_rrqr_rank_alpha, rrqr_with_permutation};
use crate::linalg::matrix::{CoefficientTransformOperator, DenseDesignMatrix, DesignMatrix};
use crate::solver::gauge::Gauge;
use crate::solver::identifiability_audit::{
    IdentifiabilityAudit, audit_identifiability, audit_identifiability_channel_aware,
};

/// Where a `BlockJacobianAsRowOp` reads its Jacobian rows from.
enum BlockJacobianSource {
    /// Rows are produced on demand by the block's
    /// `BlockEffectiveJacobian::effective_jacobian_rows` callback.
    Callback(Arc<dyn BlockEffectiveJacobian>),
    /// Rows are read from the block's design via
    /// `DesignMatrix::row_chunk_into` and treated as a single channel.
    FlatDesign(DesignMatrix),
}

/// A lazy [`RowJacobianOperator`] for identifiability audit blocks.
///
/// Callback blocks stream `BlockEffectiveJacobian::effective_jacobian_rows`;
/// plain blocks stream `DesignMatrix::row_chunk_into` and embed the flat rows in
/// channel 0. No `(n, p, K)` tensor is held across blocks.
struct BlockJacobianAsRowOp {
    /// Row provider: either a family callback or a flat design matrix.
    source: BlockJacobianSource,
    /// Number of observations (rows) the operator reports via `nrows()`.
    n: usize,
    /// Number of columns of the block (reported via `ncols()`).
    p: usize,
    /// Channels actually supplied by the source: the callback's `n_outputs()`,
    /// or `1` for a flat design.
    k_block: usize,
    /// Channels of the emitted rows (reported via `k()`); channels at index
    /// `k_block` and above are left at zero.
    k_target: usize,
    /// Block name, used only to label error messages.
    block_name: String,
}

impl BlockJacobianAsRowOp {
    /// Build a lazy adapter around a `BlockEffectiveJacobian` callback.
    ///
    /// `n_rows` is the number of training observations, `p_block` the block's
    /// column count, and `k_target` the channel count of the rows this adapter
    /// emits. A callback whose `n_outputs()` is smaller than `k_target` is
    /// embedded in the leading channels; the trailing channels are left zero by
    /// the `RowJacobianOperator` methods. Nothing is evaluated here: the
    /// callback is only stored, and is invoked later, one row range at a time,
    /// through `stacked_rows` — so no `(n, p, k)` tensor is materialized at
    /// construction time (#979).
    ///
    /// # Errors
    ///
    /// Returns a message naming `block_name` when the callback reports
    /// `n_outputs() == 0`, or more outputs than `k_target`.
    fn from_callback(
        cb: Arc<dyn BlockEffectiveJacobian>,
        n_rows: usize,
        p_block: usize,
        k_target: usize,
        block_name: &str,
    ) -> Result<Self, String> {
        let k = cb.n_outputs();
        if k == 0 {
            return Err(format!(
                "BlockJacobianAsRowOp block '{block_name}': n_outputs=0 is invalid"
            ));
        }
        if k > k_target {
            return Err(format!(
                "BlockJacobianAsRowOp block '{block_name}': n_outputs({k}) exceeds the \
                 audit channel count k_target({k_target})"
            ));
        }
        Ok(Self {
            source: BlockJacobianSource::Callback(cb),
            n: n_rows,
            p: p_block,
            k_block: k,
            k_target,
            block_name: block_name.to_string(),
        })
    }

    /// Build a lazy adapter around a plain design matrix.
    ///
    /// The block is treated as single-channel (`k_block = 1`); its column count
    /// is taken from `design.ncols()`. `k_target` is the channel count of the
    /// rows this adapter emits.
    fn from_flat_design(
        design: DesignMatrix,
        n_rows: usize,
        k_target: usize,
        block_name: &str,
    ) -> Self {
        let p = design.ncols();
        Self {
            source: BlockJacobianSource::FlatDesign(design),
            n: n_rows,
            p,
            k_block: 1,
            k_target,
            block_name: block_name.to_string(),
        }
    }

    /// Linearization state handed to callbacks during the audit: no family
    /// scalars, no channel Hessian, unit probit frailty scale.
    ///
    /// NOTE(review): `beta` is an EMPTY slice here, whereas the other probes in
    /// this file pass a zero vector of the block's width. Confirm that no
    /// `BlockEffectiveJacobian` implementation indexes `state.beta` or checks
    /// its length.
    fn zero_state() -> FamilyLinearizationState<'static> {
        FamilyLinearizationState {
            beta: &[],
            family_scalars: None,
            channel_hessian: None,
            probit_frailty_scale: 1.0,
        }
    }

    /// Read the Jacobian rows for observations `start..end` from the source.
    ///
    /// For a callback source the result has `k_block * (end - start)` rows and
    /// `p` columns; consumers in this file read observation `i` of channel `ch`
    /// at row `ch * (end - start) + i`. For a flat design the result has
    /// `end - start` rows and `p` columns.
    ///
    /// # Errors
    ///
    /// Returns a message naming the block when the range is reversed or runs
    /// past `n`, when the source fails to produce the rows, or when a callback
    /// returns a matrix of unexpected shape.
    fn stacked_rows(&self, start: usize, end: usize) -> Result<Array2<f64>, String> {
        // Reject bad ranges up front: `end - start` below would otherwise
        // underflow on a reversed range instead of reporting through `Err`.
        if start > end || end > self.n {
            return Err(format!(
                "BlockJacobianAsRowOp block '{}': row range {start}..{end} is invalid for \
                 {} rows",
                self.block_name, self.n,
            ));
        }
        let chunk = end - start;
        match &self.source {
            BlockJacobianSource::Callback(cb) => {
                let state = Self::zero_state();
                let stacked = cb
                    .effective_jacobian_rows(&state, start..end)
                    .map_err(|e| {
                        format!("BlockJacobianAsRowOp block '{}': {e}", self.block_name)
                    })?;
                // Guard the layout every consumer relies on before handing the
                // matrix out.
                if stacked.nrows() != self.k_block * chunk || stacked.ncols() != self.p {
                    return Err(format!(
                        "BlockJacobianAsRowOp block '{}': effective_jacobian_rows returned \
                         shape {:?}, expected [{}, {}]",
                        self.block_name,
                        stacked.shape(),
                        self.k_block * chunk,
                        self.p,
                    ));
                }
                Ok(stacked)
            }
            BlockJacobianSource::FlatDesign(design) => {
                let mut out = Array2::<f64>::zeros((chunk, self.p));
                design
                    .row_chunk_into(start..end, out.view_mut())
                    .map_err(|e| {
                        format!(
                            "BlockJacobianAsRowOp block '{}': flat design row chunk failed: {e}",
                            self.block_name
                        )
                    })?;
                Ok(out)
            }
        }
    }
}

impl RowJacobianOperator for BlockJacobianAsRowOp {
    /// Channel count of the emitted rows (`k_target`).
    fn k(&self) -> usize {
        self.k_target
    }

    /// Column count of the block.
    fn ncols(&self) -> usize {
        self.p
    }

    /// Number of observations.
    fn nrows(&self) -> usize {
        self.n
    }

    /// Write `J_row · delta_beta` into `out`, one entry per channel.
    ///
    /// Channels at index `k_block` and above stay zero. Panics if the slice
    /// lengths do not match `k()` / `ncols()` or if the row cannot be read.
    fn apply_row(&self, row: usize, delta_beta: &[f64], out: &mut [f64]) {
        let channels = self.k();
        assert_eq!(out.len(), channels);
        assert_eq!(delta_beta.len(), self.ncols());
        out.fill(0.0);
        let jac_row = self
            .stacked_rows(row, row + 1)
            .expect("BlockJacobianAsRowOp::apply_row failed to read row");
        // A one-row read has exactly one matrix row per supplied channel.
        for ch in 0..self.k_block {
            let coefs = jac_row.row(ch);
            for (&coef, &b) in coefs.iter().zip(delta_beta) {
                out[ch] += coef * b;
            }
        }
    }

    /// Materialise the full `(n, p, k)` tensor, reading the source in chunks.
    ///
    /// Panics when the tensor would exceed the entry cap below, or when a
    /// chunk cannot be read.
    fn evaluate_full(&self) -> Array3<f64> {
        const ROW_CHUNK: usize = 4096;
        const MAX_EVALUATE_FULL_ENTRIES: usize = 10_000_000;
        let entries = self.n.saturating_mul(self.p).saturating_mul(self.k_target);
        assert!(
            entries <= MAX_EVALUATE_FULL_ENTRIES,
            "BlockJacobianAsRowOp::evaluate_full refused to materialize {entries} entries"
        );
        let mut tensor = Array3::<f64>::zeros((self.n, self.p, self.k_target));
        let mut start = 0usize;
        while start < self.n {
            let end = (start + ROW_CHUNK).min(self.n);
            let block = self
                .stacked_rows(start, end)
                .expect("BlockJacobianAsRowOp::evaluate_full failed to read row chunk");
            let len = end - start;
            for local_i in 0..len {
                let obs = start + local_i;
                for ch in 0..self.k_block {
                    // Source rows are grouped by channel within the chunk.
                    let src = block.row(ch * len + local_i);
                    for (col, &value) in src.iter().enumerate() {
                        tensor[[obs, col, ch]] = value;
                    }
                }
            }
            start = end;
        }
        tensor
    }

    /// Return the `(n * k, p)` matrix whose rows for observation `i` are
    /// `sqrt(H_i)` applied to that observation's `k` channel rows.
    ///
    /// `h_full` must have shape `[n, k, k]`; the square root is taken with
    /// `symmetric_sqrt_into`.
    fn scaled_design_by_sqrt_h(&self, h_full: &Array3<f64>) -> Array2<f64> {
        const ROW_CHUNK: usize = 4096;
        let (n, p, k) = (self.nrows(), self.ncols(), self.k());
        assert_eq!(h_full.shape(), &[n, k, k]);
        let mut scaled = Array2::<f64>::zeros((n * k, p));
        let mut root = Array2::<f64>::zeros((k, k));
        let mut h_obs = Array2::<f64>::zeros((k, k));
        for start in (0..n).step_by(ROW_CHUNK) {
            let end = (start + ROW_CHUNK).min(n);
            let len = end - start;
            let mut flat = Array2::<f64>::zeros((len * k, p));
            self.channel_flattened_rows(start..end, &mut flat);
            for local_i in 0..len {
                let obs = start + local_i;
                // Copy this observation's k×k Hessian slab into the scratch.
                for ((a, b), slot) in h_obs.indexed_iter_mut() {
                    *slot = h_full[[obs, a, b]];
                }
                symmetric_sqrt_into(&h_obs, &mut root);
                for ch in 0..k {
                    let dst = obs * k + ch;
                    for col in 0..p {
                        scaled[[dst, col]] = (0..k).fold(0.0_f64, |acc, cp| {
                            acc + root[[ch, cp]] * flat[[local_i * k + cp, col]]
                        });
                    }
                }
            }
        }
        scaled
    }

    /// Write column `col` of the channel-flattened `(n * k, p)` Jacobian into
    /// `out`, with observation `i`, channel `ch` at index `i * k + ch`.
    fn channel_flattened_column(&self, col: usize, out: &mut [f64]) {
        const ROW_CHUNK: usize = 4096;
        let n = self.nrows();
        let k = self.k();
        assert!(
            col < self.ncols(),
            "BlockJacobianAsRowOp::channel_flattened_column col {col} out of range {}",
            self.ncols()
        );
        assert_eq!(out.len(), n * k);
        for start in (0..n).step_by(ROW_CHUNK) {
            let end = (start + ROW_CHUNK).min(n);
            let len = end - start;
            let mut flat = Array2::<f64>::zeros((len * k, self.p));
            self.channel_flattened_rows(start..end, &mut flat);
            // The chunk's flattened rows map one-to-one onto this slice of `out`.
            let dest = &mut out[start * k..end * k];
            for (slot, &value) in dest.iter_mut().zip(flat.column(col).iter()) {
                *slot = value;
            }
        }
    }

    /// Fill `out` with the channel-flattened rows for `rows` (clamped to `n`):
    /// observation `i`, channel `ch` lands at row `i * k_target + ch`.
    ///
    /// `out` must have shape `[chunk * k_target, p]`; channels the source does
    /// not supply are zero.
    fn channel_flattened_rows(&self, rows: std::ops::Range<usize>, out: &mut Array2<f64>) {
        let start = rows.start.min(self.n);
        let end = rows.end.min(self.n);
        let len = end - start;
        assert_eq!(out.shape(), &[len * self.k_target, self.p]);
        out.fill(0.0);
        let block = self
            .stacked_rows(start, end)
            .expect("BlockJacobianAsRowOp::channel_flattened_rows failed to read rows");
        for ch in 0..self.k_block {
            for local_i in 0..len {
                out.row_mut(local_i * self.k_target + ch)
                    .assign(&block.row(ch * len + local_i));
            }
        }
    }
}

/// Specs after pre-fit cross-block identifiability canonicalisation.
///
/// `reduced_specs[i]` carries an `r_i`-column design wrapping the raw
/// `p_i`-column design via `CoefficientTransformOperator`. Penalties
/// are pulled back as `T_iᵀ S_k T_i`. `gauge` is the block-diagonal
/// [`Gauge`] whose block `i` slab is the raw-to-reduced transform
/// `T_i` of shape `(p_i_raw, r_i)`; it owns every lift back to raw
/// coordinates (`lift_block_betas` for β, `lift_covariance` for the
/// joint covariance / penalized Hessian).
///
/// `used_channel_aware_audit` is `true` when the multi-channel path was
/// taken (i.e. at least one block declared `n_outputs > 1` via its
/// `jacobian_callback`).  Tests that assert routing correctness inspect
/// this field directly.
#[derive(Debug)]
pub struct CanonicalSpecs {
    /// The specs the fit should run on. When no block is reduced (empty input,
    /// or the width-preserving callback-owned path) these are the raw specs
    /// unchanged.
    pub reduced_specs: Vec<ParameterBlockSpec>,
    /// Raw-to-reduced transform used to lift results back to raw coordinates;
    /// `Gauge::identity` over the raw block widths when nothing was reduced.
    pub gauge: Gauge,
    /// The audit report this canonicalisation was derived from.
    pub audit: IdentifiabilityAudit,
    /// `true` iff the audit was routed through `audit_identifiability_channel_aware`
    /// (multi-channel families such as survival marginal-slope).
    pub used_channel_aware_audit: bool,
}

/// Run the pre-fit cross-block identifiability canonicalisation.
///
/// This is the public entry point; it delegates to
/// `canonicalize_for_identifiability_inner` with the orthogonalisation pass
/// enabled. The audit acts as a fail-closed safety gate (see module docs).
///
/// # Behaviour
///
///   - The exact orthogonalisation pass (`try_orthogonalize_blocks`) is tried
///     first; when it produces a result, that result is returned directly.
///   - Otherwise the identifiability audit runs on the raw specs. A `fatal`
///     audit refuses the fit with `CustomFamilyError::IdentifiabilityFailure`,
///     which carries the audit, giving the caller an immediate diagnostic
///     instead of a downstream `syr_row_into shape mismatch` panic when the
///     family captures raw-width designs.
///   - A non-fatal audit with no dropped columns leaves every block at its raw
///     width.
///   - A non-fatal audit that attributes dropped columns keeps raw widths (with
///     an identity gauge) when any block carries a `jacobian_callback`;
///     otherwise each block is reduced by a column-selection transform that
///     removes the attributed columns.
///
/// # Multi-channel routing
///
/// Routing is decided from the spec-level `n_outputs()` of each block's
/// `jacobian_callback` (blocks without a callback count as one output). When
/// the maximum exceeds one, the audit goes through
/// [`audit_identifiability_channel_aware`], with one lazy
/// [`BlockJacobianAsRowOp`] per block — built from the callback where present,
/// and from the flat design otherwise. When every block is single-output, the
/// flat [`audit_identifiability`] is used.
///
/// # Errors
///
///   - `CustomFamilyError::IdentifiabilityFailure` when the audit is fatal.
///   - `CustomFamilyError::DimensionMismatch` when the audit itself fails to
///     run, or when building a block's row operator fails.
///   - Any error raised by the post-transform checks performed inside
///     `canonicalize_for_identifiability_inner`.
pub fn canonicalize_for_identifiability(
    specs: &[ParameterBlockSpec],
) -> Result<CanonicalSpecs, CustomFamilyError> {
    // Always attempt the orthogonalisation pass from the public entry point.
    // Per the inner worker's contract the pass declines (and the audit runs on
    // the unmodified specs) whenever it has nothing it can express as a
    // per-block transform, so enabling it unconditionally is safe.
    let attempt_orthogonalisation = true;
    canonicalize_for_identifiability_inner(specs, attempt_orthogonalisation)
}

/// Core canonicalisation worker.
///
/// `orthogonalize` is an INTERNAL recursion-control flag (NOT a user knob): the
/// public entry passes `true` to attempt the exact orthogonalisation pass; the
/// post-orthogonalisation recursion below passes `false` so the already-reduced
/// specs are audited without re-orthogonalising.
///
/// When orthogonalisation runs, a general exact W-metric pass reparameterises
/// overlapping design blocks (e.g. a logslope surface confounded with the
/// marginal surface) so the lower-priority block's overlap with the
/// higher-priority anchor is removed exactly, rather than being penalised by a
/// hand-tuned ridge. The reparam `V_b` is folded into each block's design via
/// [`CoefficientTransformOperator`], penalties are pulled back as `V_bᵀ S V_b`,
/// and the block-diagonal [`Gauge`] carries `V_b` so the shared
/// [`Gauge::lift_block_betas`] / [`Gauge::lift_covariance`] machinery maps the
/// reduced fit back to raw coordinates unchanged (the lift is `β_raw = V_b · θ`,
/// already supported for dense transforms).
fn canonicalize_for_identifiability_inner(
    specs: &[ParameterBlockSpec],
    orthogonalize: bool,
) -> Result<CanonicalSpecs, CustomFamilyError> {
    // Exact orthogonalisation of structural confounds. Runs only on the top-
    // level entry AND only where the design is single-channel dense (the general
    // multi-channel coupled path is handled by the Tier-B joint-Newton Jeffreys
    // term, not by a per-block design reparam). On any structural condition that
    // the orthogonaliser cannot express as a per-block transform, it falls
    // through to the unmodified audit gate below — never worse than today.
    if orthogonalize {
        if let Some(canon) = try_orthogonalize_blocks(specs)? {
            return Ok(canon);
        }
    }
    if specs.is_empty() {
        return Ok(CanonicalSpecs {
            reduced_specs: Vec::new(),
            gauge: Gauge::identity(&[]),
            audit: audit_identifiability(specs).map_err(|r| {
                CustomFamilyError::DimensionMismatch {
                    reason: format!("pre-fit identifiability audit failed: {r}"),
                }
            })?,
            used_channel_aware_audit: false,
        });
    }

    // `design.nrows() == n_obs` is a struct invariant for every
    // ParameterBlockSpec: the canonical n-row operator is what the audit
    // and shape policy read.  Multi-channel survival LS stacked operators
    // live in `stacked_design` and are deliberately invisible here.
    let n_rows = specs[0].design.nrows();

    // ── Multi-channel routing decision ───────────────────────────────────
    //
    // Probe each spec's effective Jacobian at beta=0 to detect
    // multi-output blocks.  A block is multi-output when the returned
    // matrix has nrows > n_rows — i.e. nrows == n_rows * k for k > 1.
    // We use the spec-level n_outputs() shortcut when a jacobian_callback
    // is present; otherwise the block is always single-output (k=1).
    let max_n_outputs = specs
        .iter()
        .map(|s| {
            s.jacobian_callback
                .as_ref()
                .map(|cb| cb.n_outputs())
                .unwrap_or(1)
        })
        .max()
        .unwrap_or(1);
    let use_channel_aware = max_n_outputs > 1;

    log::debug!(
        "[CANON] canonicalize_for_identifiability: blocks={} n_rows={} \
         max_n_outputs={} route={}",
        specs.len(),
        n_rows,
        max_n_outputs,
        if use_channel_aware {
            "channel-aware"
        } else {
            "flat"
        },
    );

    // ── Per-block Jacobian Frobenius-norm logging (instrumentation) ──────
    //
    // Log the Frobenius norm and row-count of each block's effective
    // Jacobian before the audit so discrepancies between pilot and outer-fit
    // audits are visible in the log stream.
    //
    // This is purely diagnostic: the only consumer of `frob_sq` is the
    // `log::debug!` below.  A full `effective_jacobian_at` probe materialises
    // the block's entire `(n·k, p)` effective Jacobian — an `(n, p, k)`-class
    // transient that at biobank scale is hundreds of MiB per block, paid every
    // canonicalisation even when debug logging is OFF (#979).  Gate the whole
    // loop behind the log level so production fits (info/warn) pay nothing, and
    // when it does run, accumulate the Frobenius norm by streaming 4096-row
    // chunks instead of holding the full Jacobian.
    if log::log_enabled!(log::Level::Debug) {
        const FROB_CHUNK: usize = 4096;
        for spec in specs.iter() {
            let k = spec
                .jacobian_callback
                .as_ref()
                .map(|cb| cb.n_outputs())
                .unwrap_or(1);
            let jac_nrows = if use_channel_aware {
                n_rows * k
            } else {
                n_rows
            };
            let p = spec.design.ncols();
            let zeros = vec![0.0f64; p];
            let state = FamilyLinearizationState {
                beta: &zeros,
                family_scalars: None,
                channel_hessian: None,
                probit_frailty_scale: 1.0,
            };
            let mut frob_sq = 0.0_f64;
            let mut probe_err: Option<String> = None;
            for start in (0..n_rows).step_by(FROB_CHUNK) {
                let end = (start + FROB_CHUNK).min(n_rows);
                let chunk = match spec.jacobian_callback.as_ref() {
                    Some(cb) => cb.effective_jacobian_rows(&state, start..end),
                    None => {
                        let mut out = Array2::<f64>::zeros((end - start, p));
                        spec.design
                            .row_chunk_into(start..end, out.view_mut())
                            .map(|()| out)
                            .map_err(|e| e.to_string())
                    }
                };
                match chunk {
                    Ok(rows) => frob_sq += rows.iter().map(|v| v * v).sum::<f64>(),
                    Err(e) => {
                        probe_err = Some(e);
                        break;
                    }
                }
            }
            match probe_err {
                Some(e) => log::debug!(
                    "[CANON]   block '{}': effective_jacobian probe failed: {e}",
                    spec.name,
                ),
                None => log::debug!(
                    "[CANON]   block '{}': p={} jac_nrows={} frob_norm={:.4e}",
                    spec.name,
                    p,
                    jac_nrows,
                    frob_sq.sqrt(),
                ),
            }
        }
    }

    // ── Run the audit ─────────────────────────────────────────────────────
    let audit = if use_channel_aware {
        // Determine the common k (all blocks must agree on the channel count;
        // blocks without a jacobian_callback get a single-channel identity
        // adapter at k = max_n_outputs).
        let k = max_n_outputs;
        let mut operators: Vec<Arc<dyn RowJacobianOperator>> = Vec::with_capacity(specs.len());
        for spec in specs.iter() {
            let op: Arc<dyn RowJacobianOperator> = match spec.jacobian_callback.as_ref() {
                Some(cb) => {
                    // `from_callback` zero-pads the trailing channels for
                    // blocks with fewer outputs than the audit's common k,
                    // building the padded tensor directly.
                    let row_op = BlockJacobianAsRowOp::from_callback(
                        Arc::clone(cb),
                        n_rows,
                        spec.design.ncols(),
                        k,
                        &spec.name,
                    )
                    .map_err(|e| CustomFamilyError::DimensionMismatch {
                        reason: format!(
                            "canonicalize_for_identifiability: build \
                                         BlockJacobianAsRowOp for block '{}': {e}",
                            spec.name,
                        ),
                    })?;
                    Arc::new(row_op)
                }
                None => Arc::new(BlockJacobianAsRowOp::from_flat_design(
                    spec.design.clone(),
                    n_rows,
                    k,
                    &spec.name,
                )),
            };
            operators.push(op);
        }
        let row_hess = IdentityRowHessian::new(n_rows, k);
        let audit_result = audit_identifiability_channel_aware(specs, &operators, &row_hess)
            .map_err(|reason| CustomFamilyError::DimensionMismatch {
                reason: format!("pre-fit channel-aware identifiability audit failed: {reason}"),
            })?;

        log::info!(
            "[CANON] channel-aware audit: {} blocks, joint_rank={}/{} (flat audit NOT used)",
            specs.len(),
            audit_result
                .blocks
                .iter()
                .map(|b| b.effective_dim)
                .sum::<usize>(),
            specs.iter().map(|s| s.design.ncols()).sum::<usize>(),
        );

        // NOTE: the flat audit (`audit_identifiability`) must NOT be run on
        // multi-output blocks. It re-materialises each block's effective
        // Jacobian — which the channel-aware path has already proven is
        // `(n·k)`-row — and then the surrounding reduced-design / post-T
        // reconstruction conflates those `(n·k)`-row operators with the
        // `n`-row placeholder designs, broadcasting `(n·k, p)` into `(n, p)`
        // and panicking inside ndarray. The channel-aware `audit_result` is
        // the authoritative verdict (consumed below); no comparison audit is
        // sound or needed here.
        audit_result
    } else {
        let audit_result = audit_identifiability(specs).map_err(|reason| {
            CustomFamilyError::DimensionMismatch {
                reason: format!("pre-fit identifiability audit failed: {reason}"),
            }
        })?;
        log::debug!(
            "[CANON] flat audit: {} blocks, joint_rank={}",
            specs.len(),
            audit_result
                .blocks
                .iter()
                .map(|b| b.effective_dim)
                .sum::<usize>(),
        );
        audit_result
    };

    if audit.fatal {
        return Err(CustomFamilyError::IdentifiabilityFailure { audit });
    }

    let family_owned_geometry = specs.iter().any(|spec| spec.jacobian_callback.is_some());
    if family_owned_geometry && !audit.dropped_columns.is_empty() {
        let raw_widths: Vec<usize> = specs.iter().map(|spec| spec.design.ncols()).collect();
        let dropped_summary = audit
            .dropped_columns
            .iter()
            .map(|drop| format!("{}[{}]", drop.block, drop.column))
            .collect::<Vec<_>>()
            .join(", ");
        log::info!(
            "[CANON] width-preserving callback-owned geometry path: audit attributed \
             dropped columns [{dropped_summary}], but at least one block owns its \
             effective geometry via jacobian_callback; keeping raw block widths and \
             deferring curvature on the weak directions to the robust/Firth path"
        );
        return Ok(CanonicalSpecs {
            reduced_specs: specs.to_vec(),
            gauge: Gauge::identity(&raw_widths),
            audit,
            used_channel_aware_audit: use_channel_aware,
        });
    }

    let mut per_block_transform: Vec<Array2<f64>> = Vec::with_capacity(specs.len());
    let mut reduced_specs: Vec<ParameterBlockSpec> = Vec::with_capacity(specs.len());

    for spec in specs.iter() {
        let p_raw = spec.design.ncols();
        let dropped_locals: Vec<usize> = audit
            .dropped_columns
            .iter()
            .filter(|drop| drop.block == spec.name)
            .map(|drop| drop.column)
            .collect();
        let mut dropped_sorted = dropped_locals.clone();
        dropped_sorted.sort_unstable();
        dropped_sorted.dedup();
        for &col in &dropped_sorted {
            if col >= p_raw {
                crate::bail_dim_custom!(
                    "canonicalize_for_identifiability: audit reported dropped column \
                         {col} for block '{}' which has only {} columns",
                    spec.name,
                    p_raw,
                );
            }
        }
        let kept: Vec<usize> = (0..p_raw)
            .filter(|c| dropped_sorted.binary_search(c).is_err())
            .collect();
        let r_block = kept.len();

        let mut t_i = Array2::<f64>::zeros((p_raw, r_block));
        for (col_out, &raw_col) in kept.iter().enumerate() {
            t_i[[raw_col, col_out]] = 1.0;
        }

        let reduced_design = if dropped_sorted.is_empty() {
            spec.design.clone()
        } else {
            build_reduced_design(&spec.design, &kept, &spec.name, &t_i)?
        };

        // Column-reduce the optional stacked solver operator alongside
        // `design` so the post-canonical β still indexes the same
        // surviving columns in both views.
        let reduced_stacked_design: Option<DesignMatrix> = match spec.stacked_design.as_ref() {
            Some(stacked) if !dropped_sorted.is_empty() => {
                Some(build_reduced_design(stacked, &kept, &spec.name, &t_i)?)
            }
            Some(stacked) => Some(stacked.clone()),
            None => None,
        };
        let reduced_stacked_offset = spec.stacked_offset.clone();

        let reduced_penalties: Vec<PenaltyMatrix> = spec
            .penalties
            .iter()
            .map(|p| pull_back_penalty(p, &kept))
            .collect();

        let reduced_initial_beta = match &spec.initial_beta {
            Some(beta_raw) => {
                if beta_raw.len() != p_raw {
                    crate::bail_dim_custom!(
                        "canonicalize_for_identifiability: block '{}' initial_beta \
                             length {} != design ncols {}",
                        spec.name,
                        beta_raw.len(),
                        p_raw,
                    );
                }
                let mut theta = Array1::<f64>::zeros(r_block);
                for (out_idx, &raw_col) in kept.iter().enumerate() {
                    theta[out_idx] = beta_raw[raw_col];
                }
                Some(theta)
            }
            None => None,
        };

        reduced_specs.push(ParameterBlockSpec {
            name: spec.name.clone(),
            design: reduced_design,
            offset: spec.offset.clone(),
            penalties: reduced_penalties,
            // Pulled-back penalties may carry an enlarged structural
            // nullspace (a column dropped from a smooth's pure-span
            // basis adds that direction to the penalty kernel).
            // Falling back to eigenvalue-based rank detection in the
            // pseudo-logdet path is the safe choice when the
            // selection-T pullback changes the kernel structurally.
            nullspace_dims: Vec::new(),
            initial_log_lambdas: spec.initial_log_lambdas.clone(),
            initial_beta: reduced_initial_beta,
            gauge_priority: spec.gauge_priority,
            // The jacobian_callback (if any) is forwarded: the callback
            // internally uses the raw design width, which the column-
            // selection T_i accounts for by selecting surviving columns.
            jacobian_callback: spec.jacobian_callback.clone(),
            stacked_design: reduced_stacked_design,
            stacked_offset: reduced_stacked_offset,
        });
        per_block_transform.push(t_i);
    }

    // ── Post-T invariant check + MAP uniqueness check ────────────────────
    //
    // Materialise the joint post-T Jacobian J_can = J · T_full where
    // J is the (n*k × p_total) stacked Jacobian and T_full is block-diagonal
    // of the per-block T_i.  Assert rank(J_can) == rank(J_pre_T).
    //
    // After confirming the rank invariant, run the MAP uniqueness check:
    //   ker(J^T W J) ∩ ker(S) = {0}
    // where S = blockdiag of the reduced-spec joint penalty.  If any null
    // direction of J^T W J also lies in ker(S), the MAP is non-unique —
    // refuse with MapUniquenessFailure naming the dominant block.
    {
        let p_total_raw: usize = specs.iter().map(|s| s.design.ncols()).sum();
        let p_total_red: usize = per_block_transform.iter().map(|t| t.ncols()).sum();
        let k = if use_channel_aware { max_n_outputs } else { 1 };
        let nk = n_rows * k;

        // Build J_pre_T: (nk, p_total_raw) by row-stacking per-block Jacobians.
        let mut j_pre = Array2::<f64>::zeros((nk, p_total_raw));
        let mut col_off = 0usize;
        for spec in specs.iter() {
            let p_b = spec.design.ncols();
            let zeros = vec![0.0f64; p_b];
            let state = FamilyLinearizationState {
                beta: &zeros,
                family_scalars: None,
                channel_hessian: None,
                probit_frailty_scale: 1.0,
            };
            match spec.effective_jacobian_at("canonicalize_rank_check", &state) {
                Ok(j_b) => {
                    // j_b is channel-major (k_b·n_rows, p_b): row `r·n_rows + i`
                    // carries observation `i`'s channel-`r` row Jacobian
                    // (the layout produced by every BlockEffectiveJacobian
                    // impl — see `BlockJacobianAsRowOp`).  j_pre is built in
                    // the audit-compiler's interleaved layout (row `i*k + r`
                    // for observation `i`, channel `r`), so this is a
                    // channel-major → interleaved transpose.
                    let k_b = j_b.nrows() / n_rows;
                    let r_max = k_b.min(k);
                    for r in 0..r_max {
                        let src_row_base = r * n_rows;
                        for i in 0..n_rows {
                            let dst_row = i * k + r;
                            let src_row = src_row_base + i;
                            for j in 0..p_b {
                                j_pre[[dst_row, col_off + j]] = j_b[[src_row, j]];
                            }
                        }
                    }
                }
                Err(_) => {
                    // Fall back: embed flat design as channel 0.
                    if let Ok(flat) = spec
                        .design
                        .try_to_dense_arc("canonicalize_rank_check")
                        .map(|a| a.as_ref().clone())
                    {
                        for i in 0..n_rows.min(flat.nrows()) {
                            for j in 0..p_b.min(flat.ncols()) {
                                j_pre[[i * k, col_off + j]] = flat[[i, j]];
                            }
                        }
                    }
                }
            }
            col_off += p_b;
        }

        // Build J_can = J_pre · T_full where T_full = blockdiag(T_i).
        let mut j_can = Array2::<f64>::zeros((nk, p_total_red));
        let mut raw_col_off = 0usize;
        let mut red_col_off = 0usize;
        for t_i in per_block_transform.iter() {
            let p_i = t_i.nrows();
            let r_i = t_i.ncols();
            if p_i > 0 && r_i > 0 {
                // J_can[:, red_col_off .. red_col_off+r_i]
                //   = J_pre[:, raw_col_off .. raw_col_off+p_i] · T_i
                for row in 0..nk {
                    for out_col in 0..r_i {
                        let mut acc = 0.0_f64;
                        for in_col in 0..p_i {
                            acc += j_pre[[row, raw_col_off + in_col]] * t_i[[in_col, out_col]];
                        }
                        j_can[[row, red_col_off + out_col]] = acc;
                    }
                }
            }
            raw_col_off += p_i;
            red_col_off += r_i;
        }

        // RRQR rank on J_pre and J_can.
        let rank_j_pre = rrqr_with_permutation(&j_pre, default_rrqr_rank_alpha())
            .map(|r| r.rank)
            .unwrap_or(0);
        let rank_j_can = rrqr_with_permutation(&j_can, default_rrqr_rank_alpha())
            .map(|r| r.rank)
            .unwrap_or(0);

        log::info!(
            "[CANON] post-T invariant: rank(J)={rank_j_pre} rank(J_can)={rank_j_can} \
             (p_raw={p_total_raw} p_red={p_total_red} k={k})",
        );

        if rank_j_pre != rank_j_can {
            let block_shapes: Vec<String> = per_block_transform
                .iter()
                .zip(specs.iter())
                .map(|(t, s)| format!("{}:({},{})", s.name, t.nrows(), t.ncols()))
                .collect();
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "canonicalize_for_identifiability: post-T rank invariant violated — \
                     rank(J)={rank_j_pre} but rank(J_can)={rank_j_can} \
                     (p_raw={p_total_raw} p_red={p_total_red} k={k}); \
                     this is a bug in T construction; per-block T shapes: [{}]",
                    block_shapes.join(", "),
                ),
            });
        }

        // ── MAP uniqueness check ──────────────────────────────────────────
        //
        // Build the joint penalty S = blockdiag(sum_of_reduced_penalties_block_i)
        // in the reduced parameter space (p_total_red × p_total_red).
        // Each block's total penalty is the sum of its per-lambda penalty
        // matrices (all with equal weight 1.0 — the uniqueness condition is
        // independent of the specific λ values since we only need to know
        // whether ANY penalty covers the direction, not the magnitude).
        if p_total_red > 0 {
            let mut s_joint = Array2::<f64>::zeros((p_total_red, p_total_red));
            let mut red_off = 0usize;
            for spec in reduced_specs.iter() {
                let r_i = spec.design.ncols();
                for pen in spec.penalties.iter() {
                    let s_dense = pen.as_dense_cow();
                    // s_dense is (r_i, r_i).  Add it into the diagonal block.
                    if s_dense.nrows() == r_i && s_dense.ncols() == r_i {
                        for ii in 0..r_i {
                            for jj in 0..r_i {
                                s_joint[[red_off + ii, red_off + jj]] += s_dense[[ii, jj]];
                            }
                        }
                    }
                }
                red_off += r_i;
            }

            // Build col_offsets for the reduced specs.
            let mut red_col_offsets: Vec<usize> = Vec::with_capacity(reduced_specs.len() + 1);
            red_col_offsets.push(0);
            for spec in reduced_specs.iter() {
                let prev = *red_col_offsets.last().unwrap();
                red_col_offsets.push(prev + spec.design.ncols());
            }

            // The MAP uniqueness check operates on the flat (n_rows, p_total_red)
            // view of J_can.  For multi-channel families (k > 1), the channel
            // stacking increases the effective row count but the penalty still
            // lives in p_total_red dimensions.  We use the (nk, p_total_red)
            // J_can directly: the additional channel rows only help — if J^T W J
            // (with J being the full nk-row matrix) already has a non-trivial
            // null space, those extra rows could only shrink it relative to the
            // flat view.  Using the full J_can gives the tightest (most
            // conservative) null-space detection.
            crate::solver::identifiability_audit::check_map_uniqueness(
                &j_can,
                &[],
                &s_joint,
                &reduced_specs,
                &red_col_offsets,
            )
            .map_err(|error| {
                log::warn!("[CANON] MAP uniqueness check failed: {}", error.message,);
                CustomFamilyError::MapUniquenessFailure { error }
            })?;

            log::debug!(
                "[CANON] MAP uniqueness check passed \
                 (p_red={p_total_red} penalty_blocks={})",
                reduced_specs
                    .iter()
                    .map(|s| s.penalties.len())
                    .sum::<usize>(),
            );
        }
    }

    Ok(CanonicalSpecs {
        reduced_specs,
        gauge: Gauge::from_block_transforms(&per_block_transform),
        audit,
        used_channel_aware_audit: use_channel_aware,
    })
}

/// Flag-gated exact orthogonalisation of structural confounds across blocks.
///
/// Densifies every block design, asks `orthogonalize_design_blocks` for
/// per-block transforms `V_b` under uniform row weights, rebuilds each block
/// in the transformed coordinates (`design ← X_b · V_b`,
/// `penalties ← V_bᵀ S V_b`, `initial_beta ← V_bᵀ β₀`), and then runs
/// `canonicalize_for_identifiability_inner` on the rebuilt specs. The gauge of
/// the returned value is built from the composed per-block lift
/// `V_b · T_inner`.
///
/// # Returns
///
/// `Ok(Some(canon))` when an overlap was found and removed. `Ok(None)` — so
/// the caller falls through to the unmodified audit gate — when any of the
/// following holds:
///   - fewer than two blocks are supplied;
///   - any block carries a `jacobian_callback` (family-owned geometry);
///   - block row counts disagree, or a block design cannot be densified;
///   - `orthogonalize_design_blocks` reports no dropped directions (a clean
///     design is therefore a byte-identical fall-through);
///   - a block with absorbed directions shares its `gauge_priority` with a
///     block visited earlier in the descending-priority order.
///
/// # Errors
///
/// Returns `CustomFamilyError::DimensionMismatch` when
/// `orthogonalize_design_blocks` fails, when a transform's row count or an
/// `initial_beta` length disagrees with the block's column count, when a
/// sparse design cannot be densified, when `CoefficientTransformOperator::new`
/// fails, or when `V_b` and `T_inner` have incompatible shapes. Errors from
/// `canonicalize_for_identifiability_inner` are propagated unchanged.
fn try_orthogonalize_blocks(
    specs: &[ParameterBlockSpec],
) -> Result<Option<CanonicalSpecs>, CustomFamilyError> {
    // With fewer than two blocks there is no cross-block overlap to remove.
    if specs.len() < 2 {
        return Ok(None);
    }
    // Families whose blocks carry a `jacobian_callback` (BMS marginal-slope,
    // survival LS, …) own their effective geometry: the family reconstructs the
    // additive predictor from internal full-width designs (e.g. BMS reads its
    // own `marginal_design`/`logslope_design` per row), and the block `design`
    // here is only the *raw* basis the callback consumes. A per-block reparam
    // `X_b · V_b` that drops the callback does NOT change what the family
    // computes, but it shrinks the block coefficient width below the family's
    // internal design width — leaving the inner solve's reduced β (e.g. 8) out
    // of sync with the family's full design (e.g. 12) and tripping the family's
    // own shape validation. Such families are robustified by the Tier-B
    // joint-Newton Jeffreys/Firth term, which adds curvature on the
    // under-identified span WITHOUT any design surgery and keeps every block β at
    // full width. Defer here so they take that path. (Single-channel, plain-design
    // blocks — `jacobian_callback: None` — are reparam'd here; a clean design with
    // no overlap to remove falls through to the audit gate byte-identically.)
    //
    // NOTE(review): this guard inspects `jacobian_callback` only. The rebuilt
    // specs below also discard `stacked_design` / `stacked_offset`; confirm
    // that a block never carries those without a callback, otherwise they
    // would be dropped silently rather than deferred.
    let family_owned_geometry = specs.iter().any(|s| s.jacobian_callback.is_some());
    if family_owned_geometry {
        return Ok(None);
    }

    // Densify every block design. Any non-densifiable (large/lazy operator)
    // block makes the per-block reparam non-representable here → defer.
    // Blocks whose row count differs from the first block's are deferred too.
    let n_rows = specs[0].design.nrows();
    let mut block_designs: Vec<Array2<f64>> = Vec::with_capacity(specs.len());
    for spec in specs.iter() {
        if spec.design.nrows() != n_rows {
            return Ok(None);
        }
        let dense = match spec
            .design
            .try_to_dense_arc("orthogonalize_design_blocks densify")
        {
            // Owned copy: `orthogonalize_design_blocks` is handed a slice of
            // owned arrays below.
            Ok(arc) => arc.as_ref().clone(),
            Err(_) => return Ok(None),
        };
        block_designs.push(dense);
    }

    // Pilot W-metric: the released structural audit runs in the unweighted
    // (Euclidean) row metric, and structural rank-overlap removal is exact in
    // that metric. Use uniform weights so the reparam matches the audit's
    // geometry; the family-curvature W-metric refinement belongs to the
    // Tier-B coupled path.
    let weight = vec![1.0_f64; n_rows];
    // NOTE(review): `as u32` assumes `gauge_priority` is a non-negative
    // integer that fits in `u32` — confirm against the field's declared type.
    let priority: Vec<u32> = specs.iter().map(|s| s.gauge_priority as u32).collect();

    let ortho = orthogonalize_design_blocks(&block_designs, &priority, &weight).map_err(|e| {
        CustomFamilyError::DimensionMismatch {
            reason: format!("orthogonalize_design_blocks failed: {e}"),
        }
    })?;

    // No overlap removed ⇒ nothing to do; fall through to the standard gate so
    // behaviour is byte-identical to today on clean / square-rotation designs.
    if ortho.dropped.is_empty() {
        return Ok(None);
    }

    // Equal-priority gauge-ambiguity guard. Orthogonalisation removes a block's
    // overlap by residualising it against the cumulative anchor of all
    // higher-or-equal-priority blocks already visited (descending-priority,
    // stable-on-ties order — identical to `orthogonalize_design_blocks`). When a
    // direction is absorbed *into an equal-priority anchor block* there is NO
    // gauge ordering to decide which block loses the shared column: the inner
    // KKT system is structurally rank-deficient regardless of penalty, exactly
    // the contract the flat audit gate encodes (`all_priorities_equal` forces
    // `gauge_resolves_rank_deficiency = false`, so the alias is fatal). Reducing
    // here would silently drop the later block's column instead of refusing.
    // Defer to the audit gate, which raises `IdentifiabilityFailure`, whenever
    // an absorbed block shares its priority with any earlier-visited block.
    //
    // `visit_order` lists block indices by descending priority (`sort_by` is
    // stable, so ties keep their original order); `visit_rank[b]` is block
    // `b`'s position in that order.
    let mut visit_order: Vec<usize> = (0..specs.len()).collect();
    visit_order.sort_by(|&a, &b| priority[b].cmp(&priority[a]));
    let visit_rank: Vec<usize> = {
        let mut rank = vec![0usize; specs.len()];
        for (r, &b) in visit_order.iter().enumerate() {
            rank[b] = r;
        }
        rank
    };
    // Only annotations that actually absorbed directions are inspected.
    for annotation in ortho
        .direction_annotations
        .iter()
        .filter(|annotation| annotation.absorbed_width > 0)
    {
        let absorbed = annotation.block_idx;
        let equal_priority_anchor_exists = (0..specs.len()).any(|other| {
            other != absorbed
                && visit_rank[other] < visit_rank[absorbed]
                && priority[other] == priority[absorbed]
        });
        if equal_priority_anchor_exists {
            log::info!(
                "[CANON] orthogonalisation declined: block {} (priority {}) was absorbed into an \
                 equal-priority anchor — exact alias has no gauge ordering; deferring to the fatal \
                 audit gate instead of arbitrarily dropping the later block's column",
                absorbed,
                priority[absorbed],
            );
            return Ok(None);
        }
    }

    // Second pass over the same annotations, reached only when the guard
    // above did not decline: log every block that shed directions.
    for annotation in ortho
        .direction_annotations
        .iter()
        .filter(|annotation| annotation.absorbed_width > 0)
    {
        log::info!(
            "[IDENT] structural direction annotation: block={} raw_width={} kept_width={} absorbed_width={} kind={:?}",
            annotation.block_idx,
            annotation.raw_width,
            annotation.kept_width,
            annotation.absorbed_width,
            annotation.kind,
        );
    }

    // Build orthogonalised specs: design ← X_b · V_b (via
    // CoefficientTransformOperator), penalties ← V_bᵀ S V_b, initial_beta ←
    // V_bᵀ β₀ (least-squares image; V_b has orthonormal columns so V_bᵀ is the
    // pseudo-inverse), and remember V_b for the round-trip composition.
    let mut ortho_specs: Vec<ParameterBlockSpec> = Vec::with_capacity(specs.len());
    for (spec, v_b) in specs.iter().zip(ortho.block_transforms.iter()) {
        let p_b = spec.design.ncols();
        // V_b must act on the block's raw coefficient width.
        if v_b.nrows() != p_b {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "orthogonalize: block '{}' transform has {} rows but design has {p_b} columns",
                    spec.name,
                    v_b.nrows(),
                ),
            });
        }
        // Dense designs are cloned as-is; sparse designs are densified again
        // here. The densify pass near the top of this function has presumably
        // already succeeded for this block, but a failure is still reported
        // as an error rather than ignored.
        let inner_dense = match &spec.design {
            DesignMatrix::Dense(d) => d.clone(),
            DesignMatrix::Sparse(_) => {
                let dense = spec
                    .design
                    .try_to_dense_arc("orthogonalize reduced-design densify")
                    .map_err(|reason| CustomFamilyError::DimensionMismatch {
                        reason: format!(
                            "orthogonalize: densify block '{}' failed: {reason}",
                            spec.name,
                        ),
                    })?;
                DenseDesignMatrix::from(dense)
            }
        };
        let op = CoefficientTransformOperator::new(inner_dense, v_b.clone()).map_err(|reason| {
            CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "orthogonalize: build CoefficientTransformOperator for block '{}': {reason}",
                    spec.name,
                ),
            }
        })?;
        let reduced_design = DesignMatrix::Dense(DenseDesignMatrix::from(Arc::new(op)));

        // Each penalty is pulled back through the same V_b as the design.
        let reduced_penalties: Vec<PenaltyMatrix> = spec
            .penalties
            .iter()
            .map(|p| pull_back_penalty_dense(p, v_b))
            .collect();

        // A supplied warm start must match the raw block width before it is
        // mapped into the reduced coordinates.
        let reduced_initial_beta = match &spec.initial_beta {
            Some(beta_raw) => {
                if beta_raw.len() != p_b {
                    return Err(CustomFamilyError::DimensionMismatch {
                        reason: format!(
                            "orthogonalize: block '{}' initial_beta length {} != design ncols {p_b}",
                            spec.name,
                            beta_raw.len(),
                        ),
                    });
                }
                Some(v_b.t().dot(beta_raw))
            }
            None => None,
        };

        // The orthogonalised design no longer matches the raw-width
        // jacobian_callback / stacked_design, so both are cleared. Blocks with
        // a `jacobian_callback` were deferred by the guard near the top of
        // this function.
        // NOTE(review): `stacked_design` / `stacked_offset` are not covered by
        // that guard — confirm plain blocks never carry them.
        // `nullspace_dims` is left empty rather than copied from the raw spec.
        ortho_specs.push(ParameterBlockSpec {
            name: spec.name.clone(),
            design: reduced_design,
            offset: spec.offset.clone(),
            penalties: reduced_penalties,
            nullspace_dims: Vec::new(),
            initial_log_lambdas: spec.initial_log_lambdas.clone(),
            initial_beta: reduced_initial_beta,
            gauge_priority: spec.gauge_priority,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        });
    }

    // Run the standard (non-orthogonalising) audit + post-T invariant + MAP
    // uniqueness checks on the orthogonalised specs. With the overlap removed,
    // this should produce a clean (identity-T) verdict; if a *residual* rank
    // deficiency survives orthogonalisation, the fail-closed gate still
    // refuses with an actionable diagnostic.
    let inner = canonicalize_for_identifiability_inner(&ortho_specs, false)?;

    // Compose the round-trip transform: β_raw = V_b · (T_inner · θ).
    // `inner.gauge.block_transform(b)` is T_inner (selection/identity from
    // the audit gate); the full raw lift is `V_b · T_inner`.
    let mut composed_transform: Vec<Array2<f64>> = Vec::with_capacity(specs.len());
    for (b, v_b) in ortho.block_transforms.iter().enumerate() {
        let t_inner = inner.gauge.block_transform(b);
        // The product V_b · T_inner needs matching inner dimensions.
        if v_b.ncols() != t_inner.nrows() {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "orthogonalize: transform composition shape mismatch — V_b is {:?}, \
                     T_inner is {:?}",
                    v_b.dim(),
                    t_inner.dim(),
                ),
            });
        }
        composed_transform.push(v_b.dot(&t_inner));
    }

    log::info!(
        "[CANON] orthogonalisation applied: {} block(s) shed overlap directions {:?}; \
         p_raw={} → p_reduced={}",
        ortho.dropped.len(),
        ortho.dropped,
        specs.iter().map(|s| s.design.ncols()).sum::<usize>(),
        composed_transform.iter().map(|t| t.ncols()).sum::<usize>(),
    );

    // Reduced specs, audit and channel flag come from the inner pass; only
    // the gauge is replaced by the composed raw lift.
    Ok(Some(CanonicalSpecs {
        reduced_specs: inner.reduced_specs,
        gauge: Gauge::from_block_transforms(&composed_transform),
        audit: inner.audit,
        used_channel_aware_audit: inner.used_channel_aware_audit,
    }))
}

/// Congruence-transform a penalty through a dense reparameterisation `V`,
/// producing the dense penalty `Vᵀ S V`. Any precision label and fixed-λ pin
/// on the input are copied onto the result. This is the dense-transform
/// counterpart of [`pull_back_penalty`], which restricts a penalty to a
/// column *selection* instead.
fn pull_back_penalty_dense(penalty: &PenaltyMatrix, v: &Array2<f64>) -> PenaltyMatrix {
    let precision_label = penalty.precision_label().map(|s| s.to_string());
    let pinned_log_lambda = penalty.fixed_log_lambda();
    let s_raw = penalty.as_dense_cow();
    // Right-multiply first (S · V), then left-multiply by Vᵀ.
    let pulled = PenaltyMatrix::Dense(v.t().dot(&s_raw.dot(v)));
    let labelled = match precision_label {
        Some(lbl) => pulled.with_precision_label(lbl),
        None => pulled,
    };
    match pinned_log_lambda {
        Some(value) => labelled.with_fixed_log_lambda(value),
        None => labelled,
    }
}

/// Build the column-reduced design for one block.
///
/// `kept` lists the raw column indices to retain and `t_i` is the transform
/// handed to `CoefficientTransformOperator` when the backing design is not a
/// materialised dense array. Sparse inputs are densified first.
///
/// Returns `CustomFamilyError::DimensionMismatch` when densification or
/// operator construction fails.
fn build_reduced_design(
    raw: &DesignMatrix,
    kept: &[usize],
    block_name: &str,
    t_i: &Array2<f64>,
) -> Result<DesignMatrix, CustomFamilyError> {
    let backing = match raw {
        DesignMatrix::Sparse(_) => {
            let context =
                format!("canonicalize_for_identifiability sparse->dense block '{block_name}'");
            let densified = raw.try_to_dense_by_chunks(&context).map_err(|reason| {
                CustomFamilyError::DimensionMismatch {
                    reason: format!(
                        "canonicalize_for_identifiability: densify sparse block '{block_name}' \
                         failed: {reason}"
                    ),
                }
            })?;
            DenseDesignMatrix::from(densified)
        }
        DesignMatrix::Dense(dense) => dense.clone(),
    };

    // Materialised dense backing: copy out just the kept columns so the
    // reduced design does not drag the full raw width along.
    if let Some(materialised) = backing.as_dense_ref() {
        let n_rows = materialised.nrows();
        let mut sliced = Array2::<f64>::zeros((n_rows, kept.len()));
        for (dst_col, &src_col) in kept.iter().enumerate() {
            for row in 0..n_rows {
                sliced[[row, dst_col]] = materialised[[row, src_col]];
            }
        }
        return Ok(DesignMatrix::Dense(DenseDesignMatrix::from(sliced)));
    }

    // Operator-backed design: keep the operator and compose it with the
    // selection transform instead of materialising.
    match CoefficientTransformOperator::new(backing, t_i.clone()) {
        Ok(op) => Ok(DesignMatrix::Dense(DenseDesignMatrix::from(Arc::new(op)))),
        Err(reason) => Err(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "canonicalize_for_identifiability: build CoefficientTransformOperator \
                 for block '{block_name}': {reason}"
            ),
        }),
    }
}

/// Restrict a penalty to the surviving coordinates `kept`: entry `(i, j)` of
/// the result is entry `(kept[i], kept[j])` of the dense input penalty. Any
/// precision label and fixed-λ pin on the input are carried over.
fn pull_back_penalty(penalty: &PenaltyMatrix, kept: &[usize]) -> PenaltyMatrix {
    let precision_label = penalty.precision_label().map(|s| s.to_string());
    let pinned_log_lambda = penalty.fixed_log_lambda();
    let s_raw = penalty.as_dense_cow();

    let r = kept.len();
    let mut sub = Array2::<f64>::zeros((r, r));
    for (i, &src_row) in kept.iter().enumerate() {
        for (j, &src_col) in kept.iter().enumerate() {
            sub[[i, j]] = s_raw[[src_row, src_col]];
        }
    }

    let selected = PenaltyMatrix::Dense(sub);
    let labelled = match precision_label {
        Some(lbl) => selected.with_precision_label(lbl),
        None => selected,
    };
    match pinned_log_lambda {
        Some(value) => labelled.with_fixed_log_lambda(value),
        None => labelled,
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::families::custom_family::AdditiveBlockJacobian;
    use crate::linalg::matrix::DenseDesignMatrix;
    use ndarray::Array2;

    /// Test fixture: wrap a dense array as a block with no penalties, no
    /// callback or stacked design, a zero offset, no initial β, and
    /// `gauge_priority` 100.
    fn spec_from_dense(name: &str, design: Array2<f64>) -> ParameterBlockSpec {
        let zero_offset = Array1::<f64>::zeros(design.nrows());
        let no_log_lambdas = Array1::<f64>::zeros(0);
        ParameterBlockSpec {
            name: name.to_owned(),
            design: DesignMatrix::Dense(DenseDesignMatrix::from(design)),
            offset: zero_offset,
            penalties: vec![],
            nullspace_dims: vec![],
            initial_log_lambdas: no_log_lambdas,
            initial_beta: None,
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        }
    }

    /// Evenly spaced grid of `n` points from -1 to 1. Degenerate sizes
    /// (`n <= 1`) yield a zero vector of length `max(n, 1)`.
    fn linspace(n: usize) -> Array1<f64> {
        match n {
            0 | 1 => Array1::<f64>::zeros(n.max(1)),
            _ => {
                let step = 2.0 / (n as f64 - 1.0);
                (0..n).map(|i| -1.0 + step * i as f64).collect()
            }
        }
    }

    /// Two blocks with no shared direction — {1, x} and {x², x³} — must
    /// canonicalise with full-width (2 × 2) transforms, and lifting reduced
    /// coefficients must return them unchanged.
    #[test]
    fn canonical_clean_specs_identity_transform() {
        let n = 32;
        let grid = linspace(n);
        let parametric =
            Array2::<f64>::from_shape_fn((n, 2), |(i, j)| if j == 0 { 1.0 } else { grid[i] });
        let smooth = Array2::<f64>::from_shape_fn((n, 2), |(i, j)| {
            if j == 0 {
                grid[i] * grid[i]
            } else {
                grid[i] * grid[i] * grid[i]
            }
        });
        let specs = [spec_from_dense("p", parametric), spec_from_dense("s", smooth)];

        let canon = canonicalize_for_identifiability(&specs).expect("clean canonical must succeed");
        assert_eq!(canon.reduced_specs.len(), 2);
        for block in 0..2 {
            assert_eq!(canon.gauge.block_transform(block).dim(), (2, 2));
        }

        let theta = vec![Array1::from(vec![0.5, -0.25]), Array1::from(vec![1.0, 2.0])];
        let lifted = canon.gauge.lift_block_betas(&theta);
        assert_eq!(lifted[0].as_slice().unwrap(), &[0.5, -0.25]);
        assert_eq!(lifted[1].as_slice().unwrap(), &[1.0, 2.0]);
    }

    /// Fail-closed contract. A smooth block whose first column is constant,
    /// paired with an equal-priority intercept block, is an exact alias.
    /// Canonicalisation must answer with
    /// `CustomFamilyError::IdentifiabilityFailure` carrying a fatal audit that
    /// names both blocks — never with a quietly column-reduced spec list.
    /// Handing a column-reduced `ParameterBlockSpec` to a family that captured
    /// raw-width designs (the current `CustomFamily` contract) panics inside
    /// `DesignMatrix::syr_row_into_view`, so an immediate, actionable refusal
    /// is the safe outcome.
    #[test]
    fn canonical_refuses_aliased_smooth_constant_with_intercept() {
        let n = 64;
        let grid = linspace(n);
        let intercept = Array2::<f64>::from_elem((n, 1), 1.0);
        let smooth = Array2::<f64>::from_shape_fn((n, 3), |(i, j)| match j {
            0 => 1.0,
            1 => grid[i] * grid[i],
            _ => grid[i] * grid[i] * grid[i],
        });
        let specs = [
            spec_from_dense("intercept", intercept),
            spec_from_dense("smooth_with_const", smooth),
        ];

        let refusal = canonicalize_for_identifiability(&specs)
            .expect_err("aliased smooth-constant + intercept must refuse, not reduce");
        match refusal {
            CustomFamilyError::IdentifiabilityFailure { audit } => {
                assert!(
                    audit.fatal,
                    "audit attached to IdentifiabilityFailure must be fatal; got {}",
                    audit.summary,
                );
                let names_both_blocks = ["intercept", "smooth_with_const"]
                    .iter()
                    .all(|name| audit.summary.contains(*name));
                assert!(
                    names_both_blocks,
                    "refusal summary must name both offending blocks; got {:?}",
                    audit.summary,
                );
            }
            other => panic!("expected IdentifiabilityFailure, got {other:?}"),
        }
    }

    /// Five-block aliasing repro. Every block contributes a constant column,
    /// and `gauge_priority` follows the survival marginal-slope ownership
    /// policy: time (200) > marginal (150) > logslope (120) > score_warp (80)
    /// > link_dev (60). The five constants leave a 4-D null space in the
    /// 13-column joint design.
    ///
    /// Because the priorities are all distinct, the gauge-aware contract
    /// expects canonicalisation to SUCCEED rather than refuse: the ordering
    /// says unambiguously which columns to demote. The test checks that the
    /// audit is non-fatal, that the reported joint rank and the reduced column
    /// total are both 9, that no drop is attributed to `time_surface`, and
    /// that `time_surface` keeps all three of its columns.
    #[test]
    fn canonical_five_block_gauge_ownership_succeeds_with_attribution() {
        let n = 96;
        let grid = linspace(n);

        // Column 0 of every block is the shared constant; the remaining
        // columns hold block-specific content.
        let time = Array2::<f64>::from_shape_fn((n, 3), |(i, j)| match j {
            0 => 1.0,
            1 => grid[i],
            _ => grid[i] * grid[i] * grid[i],
        });
        let marginal = Array2::<f64>::from_shape_fn((n, 3), |(i, j)| match j {
            0 => 1.0,
            1 => grid[i] * grid[i],
            _ => grid[i].sin(),
        });
        let logslope = Array2::<f64>::from_shape_fn((n, 3), |(i, j)| match j {
            0 => 1.0,
            1 => (3.0 * grid[i]).sin(),
            _ => (6.0 * grid[i]).sin(),
        });
        let score_warp = Array2::<f64>::from_shape_fn((n, 2), |(i, j)| {
            if j == 0 { 1.0 } else { (5.0 * grid[i]).cos() }
        });
        let link_dev = Array2::<f64>::from_shape_fn((n, 2), |(i, j)| {
            if j == 0 { 1.0 } else { (7.0 * grid[i]).tanh() }
        });

        // Blocks in ownership order, then their priorities assigned pairwise.
        let mut specs = [
            spec_from_dense("time_surface", time),
            spec_from_dense("marginal_surface", marginal),
            spec_from_dense("logslope_surface", logslope),
            spec_from_dense("score_warp_dev", score_warp),
            spec_from_dense("link_dev", link_dev),
        ];
        for (spec, priority) in specs.iter_mut().zip([200, 150, 120, 80, 60]) {
            spec.gauge_priority = priority;
        }

        // Distinct priorities make the rank deficiency gauge-resolvable, so
        // the call is expected to return Ok with the reductions applied.
        let canon = canonicalize_for_identifiability(&specs).expect(
            "five-block aliased joint with distinct gauge_priority must succeed (gauge-resolved)",
        );

        // The stored audit must not be flagged fatal.
        assert!(
            !canon.audit.fatal,
            "audit must be non-fatal when gauge_priority is non-trivial and \
             all drops are attributed to lower-priority blocks; got {}",
            canon.audit.summary,
        );

        // Raw width is 3+3+3+2+2 = 13; minus the 4-D null space gives 9.
        let per_block_dims: Vec<_> = canon
            .audit
            .blocks
            .iter()
            .map(|b| (b.block_name.clone(), b.effective_dim))
            .collect();
        let total_kept: usize = canon.audit.blocks.iter().map(|b| b.effective_dim).sum();
        assert_eq!(
            total_kept,
            9,
            "expected joint rank = 13 − 4 = 9 reported by audit; got {total_kept} \
             (per-block effective_dim {:?})",
            per_block_dims,
        );

        // The highest-priority block must never be named as a drop origin.
        for drop in canon.audit.dropped_columns.iter() {
            assert_ne!(
                drop.block, "time_surface",
                "highest-priority block must never be the attributed drop \
                 origin under priority-aware RRQR; got drop on time_surface \
                 ({drop:?})",
            );
        }

        // Reduced specs together carry exactly rank-many columns.
        let reduced_total: usize = canon
            .reduced_specs
            .iter()
            .fold(0usize, |acc, s| acc + s.design.ncols());
        assert_eq!(
            reduced_total, 9,
            "reduced specs must have joint rank = 9 total columns; got {reduced_total}",
        );

        // Block 0 is time_surface; it must keep its full width.
        assert_eq!(
            canon.reduced_specs[0].design.ncols(),
            3,
            "time_surface must retain all 3 columns after gauge canonicalisation",
        );
    }

    /// A clean two-block configuration that carries a non-trivial dense
    /// penalty must canonicalise successfully without changing the width of
    /// the penalised block.
    ///
    /// The assertions pin three facts about the `smooth_only` block: it still
    /// carries exactly one penalty, that penalty's dense form is 2×2 (the raw
    /// design width), and its gauge transform is exactly the 2×2 identity.
    /// The penalty *entries* are not compared against the raw matrix here.
    #[test]
    fn canonical_clean_specs_with_penalty_round_trip() {
        let n = 32;
        let x = linspace(n);
        let parametric = Array2::<f64>::from_elem((n, 1), 1.0);
        // Smooth WITHOUT a duplicate constant column — clean joint design.
        let smooth = Array2::<f64>::from_shape_fn((n, 2), |(row, col)| {
            let xi = x[row];
            if col == 0 {
                xi * xi
            } else {
                xi * xi * xi
            }
        });
        let mut smooth_spec = spec_from_dense("smooth_only", smooth);
        // Symmetric, non-diagonal penalty. It is moved into the spec: nothing
        // below reads the raw matrix again, so no clone is required.
        let penalty = Array2::<f64>::from_shape_fn((2, 2), |(i, j)| match (i, j) {
            (0, 0) => 4.0,
            (1, 1) => 9.0,
            _ => 1.5,
        });
        smooth_spec.penalties = vec![PenaltyMatrix::Dense(penalty)];
        smooth_spec.initial_log_lambdas = Array1::from(vec![0.0]);
        let specs = [spec_from_dense("intercept", parametric), smooth_spec];
        let canon = canonicalize_for_identifiability(&specs)
            .expect("clean canonical must succeed with identity transforms");
        let smooth_reduced = &canon.reduced_specs[1];
        assert_eq!(smooth_reduced.penalties.len(), 1);
        let dense_red = smooth_reduced.penalties[0].as_dense_cow().into_owned();
        // Identity pullback: penalty dimensions equal raw design width.
        assert_eq!(dense_red.dim(), (2, 2));
        // Identity transform: per-block transform is the 2×2 identity.
        let t_smooth = canon.gauge.block_transform(1);
        assert_eq!(t_smooth.dim(), (2, 2));
        for i in 0..2 {
            for j in 0..2 {
                let expected = if i == j { 1.0 } else { 0.0 };
                assert_eq!(t_smooth[[i, j]], expected, "T_smooth must be identity");
            }
        }
    }

    /// Test helper: builds a block spec from a dense design via
    /// `spec_from_dense`, then overrides its `gauge_priority` with the given
    /// value.
    fn spec_from_dense_with_priority(
        name: &str,
        design: Array2<f64>,
        gauge_priority: u8,
    ) -> ParameterBlockSpec {
        let mut spec = spec_from_dense(name, design);
        spec.gauge_priority = gauge_priority;
        spec
    }

    /// A block that carries a `jacobian_callback` must keep its raw design
    /// width even when the audit attributes a dropped column to it.
    ///
    /// Setup: `marginal_surface` = `[1, x]` (priority 150) and
    /// `logslope_surface` = `[x, x²]` (priority 120) share the `x` column; the
    /// latter also has an `AdditiveBlockJacobian` installed. The test requires
    /// a non-fatal audit with at least one drop attributed to
    /// `logslope_surface`, 2-column reduced designs for both blocks, and an
    /// exact 2×2 identity transform for every block in the gauge.
    #[test]
    fn callback_owned_geometry_keeps_raw_width_after_audit_drop() {
        let n = 48;
        let x = linspace(n);
        // Anchor columns: [1, x].
        let anchor = Array2::<f64>::from_shape_fn((n, 2), |(row, col)| {
            if col == 0 {
                1.0
            } else {
                x[row]
            }
        });
        // Callback-owned columns: [x, x²]; the first one aliases the anchor's x.
        let callback_owned = Array2::<f64>::from_shape_fn((n, 2), |(row, col)| {
            if col == 0 {
                x[row]
            } else {
                x[row] * x[row]
            }
        });

        let anchor_spec = spec_from_dense_with_priority("marginal_surface", anchor, 150);
        let mut callback_spec =
            spec_from_dense_with_priority("logslope_surface", callback_owned.clone(), 120);
        let jacobian = AdditiveBlockJacobian {
            design: callback_owned,
            own_output: 0,
            n_family_outputs: 1,
        };
        callback_spec.jacobian_callback = Some(Arc::new(jacobian));
        let specs = [anchor_spec, callback_spec];

        let canon = canonicalize_for_identifiability(&specs).expect(
            "callback-owned overlap should be audit-attributed but width-preserving (#772)",
        );

        assert!(
            !canon.audit.fatal,
            "priority-owned overlap should be non-fatal; got {}",
            canon.audit.summary,
        );

        // The scenario is only meaningful if the audit really blamed the
        // callback-owned block for a column.
        let logslope_was_attributed = canon
            .audit
            .dropped_columns
            .iter()
            .any(|dropped| dropped.block == "logslope_surface");
        assert!(
            logslope_was_attributed,
            "test must exercise an attributed logslope drop; got {:?}",
            canon.audit.dropped_columns,
        );

        // Neither block's design loses a column.
        assert_eq!(
            canon.reduced_specs[0].design.ncols(),
            2,
            "anchor block keeps raw width"
        );
        assert_eq!(
            canon.reduced_specs[1].design.ncols(),
            2,
            "callback-owned block keeps raw width instead of applying design-column surgery"
        );

        // Every per-block transform is exactly the 2×2 identity.
        for block in 0..canon.gauge.n_blocks() {
            let transform = canon.gauge.block_transform(block);
            assert_eq!(
                transform.dim(),
                (2, 2),
                "block {block} transform must be raw-width identity"
            );
            let cells = (0..2usize).flat_map(|row| (0..2usize).map(move |col| (row, col)));
            for (row, col) in cells {
                let expected = if row == col { 1.0 } else { 0.0 };
                assert_eq!(
                    transform[[row, col]],
                    expected,
                    "block {block} transform must be identity"
                );
            }
        }
    }

    /// Exact cross-block overlap between two plain dense blocks is resolved by
    /// canonicalisation rather than refused.
    ///
    /// Block `anchor` is `[1, x]` (priority 150) and block `overlap` is
    /// `[x, x²]` (priority 120), so the `x` direction is shared. The test
    /// checks that the lower-priority block's transform is 2×1 (one direction
    /// shed), that the anchor's transform keeps 2 columns, and that lifting a
    /// reduced coefficient vector back to raw coordinates gives the same
    /// row-wise prediction (to 1e-9) as evaluating it through the reduced
    /// designs.
    #[test]
    fn orthogonalize_removes_exact_cross_block_overlap_and_round_trips() {
        let n = 48;
        let x = linspace(n);
        // Block a (high priority): [1, x].
        let a = Array2::<f64>::from_shape_fn((n, 2), |(row, col)| {
            if col == 0 {
                1.0
            } else {
                x[row]
            }
        });
        // Block b (low priority): [x, x²]  → its first column aliases a's x.
        let b = Array2::<f64>::from_shape_fn((n, 2), |(row, col)| {
            if col == 0 {
                x[row]
            } else {
                x[row] * x[row]
            }
        });
        let specs = [
            spec_from_dense_with_priority("anchor", a.clone(), 150),
            spec_from_dense_with_priority("overlap", b.clone(), 120),
        ];
        let canon = canonicalize_for_identifiability(&specs)
            .expect("orthogonalisation must resolve the overlap, not refuse");

        // Block b shed exactly one direction (the x alias): V_b is 2×1.
        let v_b = canon.gauge.block_transform(1);
        assert_eq!(
            v_b.ncols(),
            1,
            "overlap block must keep exactly one direction"
        );
        assert_eq!(
            v_b.nrows(),
            2,
            "overlap block transform maps from raw width 2"
        );
        // Anchor block keeps both directions.
        let v_a = canon.gauge.block_transform(0);
        assert_eq!(v_a.ncols(), 2);

        // Lift an arbitrary reduced fit θ to raw coordinates.
        let theta = vec![Array1::from(vec![0.7, -0.3]), Array1::from(vec![1.4])];
        let raw = canon.gauge.lift_block_betas(&theta);
        assert_eq!(raw[0].len(), 2);
        assert_eq!(raw[1].len(), 2);

        // Raw prediction = a·β_a + b·β_b.
        let raw_prediction = &a.dot(&raw[0]) + &b.dot(&raw[1]);
        // Reduced prediction = (a·V_a)·θ_a + (b·V_b)·θ_b must equal it.
        let reduced_prediction = &a.dot(&v_a).dot(&theta[0]) + &b.dot(&v_b).dot(&theta[1]);

        let paired = raw_prediction.iter().zip(reduced_prediction.iter());
        for (i, (raw_pred, red_pred)) in paired.enumerate() {
            assert!(
                (raw_pred - red_pred).abs() < 1e-9,
                "row {i}: raw prediction {raw_pred} != reduced prediction {red_pred}",
            );
        }
    }

    /// A two-block design with no shared columns (`p` = `[1, x]` at priority
    /// 150, `s` = `[x², x³]` at priority 120) must canonicalise successfully
    /// and leave both per-block transforms at raw width 2×2.
    ///
    /// NOTE(review): only the *shape* of each transform is asserted here; the
    /// entries are not compared against the identity matrix, so a square
    /// non-identity transform would also pass. Confirm whether an entry-wise
    /// identity check is intended for this configuration.
    #[test]
    fn orthogonalize_clean_design_yields_identity_transforms() {
        let n = 32;
        let x = linspace(n);
        // Parametric block: [1, x].
        let p = Array2::<f64>::from_shape_fn((n, 2), |(row, col)| {
            if col == 0 {
                1.0
            } else {
                x[row]
            }
        });
        // Smooth block: [x², x³].
        let s = Array2::<f64>::from_shape_fn((n, 2), |(row, col)| {
            let xi = x[row];
            if col == 0 {
                xi * xi
            } else {
                xi * xi * xi
            }
        });
        let specs = [
            spec_from_dense_with_priority("p", p, 150),
            spec_from_dense_with_priority("s", s, 120),
        ];
        let canon = canonicalize_for_identifiability(&specs).expect("clean design canonicalises");
        // Nothing to orthogonalise: both transforms stay at raw width.
        for block in 0..2 {
            assert_eq!(canon.gauge.block_transform(block).dim(), (2, 2));
        }
    }

    /// Direct unit test of the compiler primitive
    /// `orthogonalize_design_blocks` on a partial-overlap configuration.
    ///
    /// The anchor block is `[1, x]` (priority 150). The lower-priority block
    /// is `[x, x³]` (priority 120): its first column exactly duplicates the
    /// anchor's `x`, its second column is new. With unit row weights the
    /// anchor's transform must keep 2 columns, the overlapping block's
    /// transform must keep 1, and `dropped` must equal `[(1, 1)]`
    /// (presumably `(block index, number of directions dropped)` — confirm
    /// against `orthogonalize_design_blocks`).
    #[test]
    fn orthogonalize_design_blocks_drops_only_overlap() {
        use crate::families::identifiability_compiler::orthogonalize_design_blocks;
        let n = 40;
        let x = linspace(n);
        let mut anchor = Array2::<f64>::zeros((n, 2));
        let mut overlap = Array2::<f64>::zeros((n, 2));
        for i in 0..n {
            anchor[[i, 0]] = 1.0;
            anchor[[i, 1]] = x[i];
            // First column is exactly anchor's x; second is genuinely new.
            overlap[[i, 0]] = x[i];
            overlap[[i, 1]] = x[i] * x[i] * x[i];
        }
        let weight = vec![1.0_f64; n];
        // The designs are not read again after this call, so they are moved
        // into the temporary array instead of being cloned.
        let res = orthogonalize_design_blocks(&[anchor, overlap], &[150, 120], &weight)
            .expect("orthogonalisation must succeed");
        assert_eq!(
            res.block_transforms[0].ncols(),
            2,
            "anchor keeps full width"
        );
        assert_eq!(
            res.block_transforms[1].ncols(),
            1,
            "overlap block sheds exactly the aliased direction",
        );
        assert_eq!(
            res.dropped,
            vec![(1, 1)],
            "one direction dropped from block 1"
        );
    }
}