gam 0.3.64 - Docs.rs

//! Stage 3.3 GPU PIRLS-loop dispatch wire-in.
//!
//! `try_gpu_pirls_loop_dispatch` is the single entry the CPU PIRLS driver
//! ([`crate::solver::pirls::fit_model_for_fixed_rho_with_adaptive_kkt`])
//! calls before falling through to the host LM loop. Returns
//! `Some(Ok((PirlsResult, WorkingModelPirlsResult)))` when the
//! device-resident loop fully completed and assembled the
//! CPU-oracle-equivalent surface; returns `Some(Err(message))` when the
//! device call failed after admission, so the host can retry on CPU;
//! returns `None` when admission denied dispatch or the workload is shaped
//! in a way the GPU loop does not cover yet (sparse-native, Kronecker,
//! diagonal-penalty, constraints, Firth).
//!
//! Linux-only — `pirls_loop_on_stream` is gated behind `target_os =
//! "linux"`, and so is the entire wire surface. Non-Linux builds expose a
//! no-op stub.

#[cfg(target_os = "linux")]
mod linux_impl {
    use ndarray::{Array1, ArrayView1, ArrayView2};

    use crate::construction::ReparamResult;
    use crate::gpu::pirls_row::{CurvatureMode, PirlsRowFamily};
    use crate::gpu::policy::{PirlsLoopAdmission, PirlsLoopCurvatureKind, PirlsLoopFamilyKind};
    use crate::gpu::runtime::GpuRuntime;
    use crate::linalg::matrix::SymmetricMatrix;
    use crate::matrix::DesignMatrix;
    use crate::solver::active_set::{
        LinearInequalityConstraints, compute_constraint_kkt_diagnostics,
    };
    use crate::solver::gpu::cuda_selected;
    use crate::solver::gpu::pirls_dispatch::admission_for;
    use crate::solver::gpu::pirls_gpu::{self, cuda};
    use crate::solver::pirls::{
        ExportedLaplaceCurvature, FirthDiagnostics, HessianCurvatureKind, PirlsCoordinateFrame,
        PirlsResult, PirlsStatus, WorkingModelPirlsResult, WorkingState,
        calculate_loglikelihood_omitting_constants, compute_observed_hessian_curvature_arrays,
        computeworkingweight_derivatives_from_eta,
    };
    use crate::types::{Coefficients, GlmLikelihoodSpec, InverseLink, LinearPredictor};

    /// All inputs needed to run the GPU PIRLS loop end-to-end.
    ///
    /// Built by the CPU PIRLS driver right before it would invoke
    /// `runworking_model_pirls`. The penalty, shift, initial-β and constraint
    /// fields are in transformed coordinates; `x_original` is the design
    /// *before* reparameterization, and `qs` is uploaded alongside it.
    pub struct GpuPirlsDispatchInput<'a> {
        /// Likelihood spec; its `.spec` is what `admission_for` inspects, and
        /// the whole value is cloned onto the assembled `PirlsResult`.
        pub likelihood: &'a GlmLikelihoodSpec,
        /// Inverse link the row kernel was driven by. Also used for the
        /// host-side derivative / observed-curvature recomputation.
        pub inverse_link: &'a InverseLink,
        /// Original dense design `X_original` (before reparameterization), shape `n × p`, row-major.
        /// Uploaded once to device-resident shared model cache; Qs is uploaded separately per ρ/σ point.
        pub x_original: ArrayView2<'a, f64>,
        /// Dense penalty `S_λ` in transformed coordinates, shape `p × p`.
        pub s_transformed: ArrayView2<'a, f64>,
        /// Linear shift `b` of the penalty `βᵀSβ − 2βᵀb + c`, length `p`.
        /// Mirrors `PirlsPenalty::linear_shift()` in the CPU oracle.
        pub linear_shift: ArrayView1<'a, f64>,
        /// Constant shift `c` of the penalty `βᵀSβ − 2βᵀb + c`.
        pub constant_shift: f64,
        /// Response vector `y`, length `n`.
        pub y: ArrayView1<'a, f64>,
        /// Prior weights, length `n`.
        pub priorweights: ArrayView1<'a, f64>,
        /// Observation offset, length `n`. Callers without an offset must
        /// still pass an `n`-sized vector of zeros.
        pub offset: ArrayView1<'a, f64>,
        /// Initial β guess in *transformed* coordinates.
        pub initial_beta: ArrayView1<'a, f64>,
        /// LM ridge to seed the loop with. Mirrors
        /// `WorkingModelPirlsOptions::initial_lm_lambda`; `None` falls back
        /// to `1e-6` at the dispatch site.
        pub initial_lm_lambda: Option<f64>,
        /// Outer iteration cap.
        pub max_iterations: usize,
        /// Convergence tolerance (deviance-relative; the loop's stop test
        /// is `|Δdev| < tol · max(1, |dev|)`).
        pub convergence_tolerance: f64,
        /// Linear inequality constraints in transformed coordinates.
        /// Borrowed by the loop's extras and then moved onto
        /// `PirlsResult.linear_constraints_transformed`.
        pub linear_constraints: Option<LinearInequalityConstraints>,
        /// Reparameterisation `qs` (transformed → original). Passed to
        /// the loop's postpass to populate `beta_transformed`. `None`
        /// makes the dispatcher upload an identity Qs instead.
        pub qs: Option<ArrayView2<'a, f64>>,
        /// Full `ReparamResult` for `PirlsResult.reparam_result`.
        pub reparam_result: ReparamResult,
        /// Cached `x_transformed` as `DesignMatrix` (for
        /// `PirlsResult.x_transformed`).
        pub x_transformed_design: DesignMatrix,
        /// Coordinate frame label propagated onto the result.
        pub coordinate_frame: PirlsCoordinateFrame,
        /// EDF computed host-side from the penalty root + Hessian and
        /// forwarded to the loop's extras. A non-finite EDF coming back
        /// from the loop is stamped as NaN on the result (presumably
        /// patched by REML downstream — confirm against the caller).
        pub edf: Option<f64>,
        /// Whether the outer caller wants the observed-information
        /// curvature exported. Drives the postpass and the
        /// `exported_laplace_curvature` label.
        pub exported_curvature: HessianCurvatureKind,
    }

    fn family_to_row(family: PirlsLoopFamilyKind) -> PirlsRowFamily {
        match family {
            PirlsLoopFamilyKind::BernoulliLogit => PirlsRowFamily::BernoulliLogit,
            PirlsLoopFamilyKind::BernoulliProbit => PirlsRowFamily::BernoulliProbit,
            PirlsLoopFamilyKind::BernoulliCLogLog => PirlsRowFamily::BernoulliCLogLog,
            PirlsLoopFamilyKind::PoissonLog => PirlsRowFamily::PoissonLog,
            PirlsLoopFamilyKind::GaussianIdentity => PirlsRowFamily::GaussianIdentity,
            PirlsLoopFamilyKind::GammaLog => PirlsRowFamily::GammaLog,
        }
    }

    fn curvature_to_row(curvature: PirlsLoopCurvatureKind) -> CurvatureMode {
        match curvature {
            PirlsLoopCurvatureKind::Fisher => CurvatureMode::Fisher,
            PirlsLoopCurvatureKind::Observed => CurvatureMode::Observed,
        }
    }

    fn exported_to_loop(kind: HessianCurvatureKind) -> PirlsLoopCurvatureKind {
        match kind {
            HessianCurvatureKind::Fisher => PirlsLoopCurvatureKind::Fisher,
            HessianCurvatureKind::Observed => PirlsLoopCurvatureKind::Observed,
        }
    }

    /// Cheap pre-materialization admission gate. Returns `true` only when
    /// all of the following hold without touching any O(N·p) work:
    ///
    /// - The active solver `Device` is `Cuda` (`cuda_selected()`).
    /// - The likelihood is *not* Gaussian-identity. The dispatch entry
    ///   (`try_gpu_pirls_loop_dispatch`) unconditionally declines that
    ///   family, so admitting it here would only make the caller pay for a
    ///   transformed design the dispatch then throws away.
    /// - The (family, curvature) pair is in the JIT-cached set
    ///   (`admission_for` succeeds).
    /// - A live `GpuRuntime` is present.
    /// - The runtime policy accepts the (n, p) shape
    ///   (`should_use_gpu_pirls_loop`).
    /// - The admission carries a concrete row-kernel family; the dispatch
    ///   entry returns `None` when `admission.family` is `None`.
    ///
    /// The caller should test this **before** materializing `X·Qs` or any
    /// other transformed design so that CPU-default / no-runtime /
    /// policy-rejected paths pay zero `fast_ab` cost.
    pub fn try_gpu_pirls_loop_admit(
        likelihood: &crate::types::GlmLikelihoodSpec,
        n: usize,
        p: usize,
    ) -> bool {
        if !cuda_selected() {
            return false;
        }
        // Keep this gate in lock-step with the dispatch entry: anything it
        // rejects up front must be rejected here too.
        if likelihood.spec.is_gaussian_identity() {
            return false;
        }
        let Some(admission) = admission_for(&likelihood.spec, n, p) else {
            return false;
        };
        let Some(runtime) = GpuRuntime::global() else {
            return false;
        };
        runtime.policy().should_use_gpu_pirls_loop(admission) && admission.family.is_some()
    }

    /// Attempt to run the Stage 3.3 device-resident PIRLS loop for the
    /// dispatch input.
    ///
    /// - `None`: the workload was not admitted (CPU device selected,
    ///   Gaussian-identity likelihood, `admission_for` declined, no live
    ///   runtime, policy rejection, or no row-kernel family on the admission).
    /// - `Some(Ok(..))`: the loop ran end-to-end and the full CPU-oracle
    ///   surface was assembled.
    /// - `Some(Err(..))`: the workload was admitted but `run_gpu_pirls_loop`
    ///   reported a failure.
    pub fn try_gpu_pirls_loop_dispatch(
        input: GpuPirlsDispatchInput<'_>,
    ) -> Option<Result<(PirlsResult, WorkingModelPirlsResult), String>> {
        // Two unconditional vetoes, checked in this order:
        //  1. An explicit Device::Cpu selection is always honoured.
        //  2. Gaussian-identity fits own an exact GPU PLS path (issue #272,
        //     `try_gpu_gaussian_pls_dispatch`) that fires earlier in
        //     `fit_model_for_fixed_rho_with_adaptive_kkt`; they must never be
        //     re-routed through the row-kernel loop. Tests that want the row
        //     kernel for that family call `pirls_loop_on_stream` directly.
        if !cuda_selected() || input.likelihood.spec.is_gaussian_identity() {
            return None;
        }

        // Engine-level admission: shape + family + curvature + runtime probe.
        let (n, p) = input.x_original.dim();
        let admission = admission_for(&input.likelihood.spec, n, p)?;
        let runtime = GpuRuntime::global()?;
        let policy_accepts = runtime.policy().should_use_gpu_pirls_loop(admission);
        if !policy_accepts {
            return None;
        }

        let row_family = family_to_row(admission.family?);
        let row_curvature = curvature_to_row(admission.curvature);
        let outcome = run_gpu_pirls_loop(input, admission, row_family, row_curvature, n, p);
        Some(outcome)
    }

    fn run_gpu_pirls_loop(
        input: GpuPirlsDispatchInput<'_>,
        admission: PirlsLoopAdmission,
        family: PirlsRowFamily,
        curvature: CurvatureMode,
        n: usize,
        p: usize,
    ) -> Result<(PirlsResult, WorkingModelPirlsResult), String> {
        assert_eq!(admission.n, n);
        assert_eq!(admission.p, p);
        // --- Device upload + workspace allocation -----------------------
        // Upload X, y, prior_w, and offset (#258) once; shared by all iters.
        let shared = pirls_gpu::upload_shared_pirls_gpu(
            input.x_original,
            input.y,
            input.priorweights,
            input.offset,
        )?;
        // Upload Qs for this ρ/σ point. Identity when no reparameterization.
        let mut ws = pirls_gpu::allocate_sigma_pirls_workspace(&shared)?;
        if let Some(qs) = input.qs {
            pirls_gpu::upload_qs_pirls(&mut ws, qs)?;
        } else {
            pirls_gpu::upload_qs_identity_pirls(&mut ws)?;
        }
        let mut loop_ws = pirls_gpu::allocate_pirls_loop_workspace(&shared, &ws)?;

        let lm_ridge = input.initial_lm_lambda.unwrap_or(1e-6);
        // Forward the active Gamma shape so the GammaLog kernel uses the
        // correct dispersion. Defaults to 1.0 (unit-shape Gamma / Poisson
        // analogue) when the spec does not carry an explicit shape — this
        // matches the CPU PIRLS path's `gamma_shape().unwrap_or(1.0)` fallback.
        let gamma_shape = input.likelihood.gamma_shape().unwrap_or(1.0);
        let qs_view = input.qs;
        let firth_default = FirthDiagnostics::Inactive;
        // Sanity-check that the host-side enum maps round-trip; if a future
        // change to PirlsLoopCurvatureKind / HessianCurvatureKind drops a
        // case this assertion will catch the gap at the dispatch boundary.
        assert!(matches!(
            exported_to_loop(input.exported_curvature),
            PirlsLoopCurvatureKind::Fisher | PirlsLoopCurvatureKind::Observed
        ));

        let extra = cuda::PirlsLoopExtra {
            likelihood: input.likelihood,
            inverse_link: input.inverse_link,
            y: input.y,
            priorweights: input.priorweights,
            offset: input.offset,
            linear_constraints: input.linear_constraints.as_ref(),
            exported_curvature: input.exported_curvature,
            ridge_passport: None,
            firth: Some(firth_default.clone()),
            qs: qs_view,
            edf: input.edf,
        };
        // step_lm_lambda = lm_ridge (temporary Newton stabilization only).
        // objective_ridge = 0.0: the model's ridge is already baked into
        // s_transformed by the outer REML loop; no separate identity ridge
        // enters the exported Hessian / EDF / RidgePassport here.
        let outcome = pirls_gpu::pirls_loop_on_stream(
            &shared,
            &mut ws,
            &mut loop_ws,
            family,
            curvature,
            gamma_shape,
            input.initial_beta,
            input.s_transformed,
            input.linear_shift,
            input.constant_shift,
            lm_ridge,
            0.0,
            input.max_iterations,
            input.convergence_tolerance,
            Some(&extra),
        )?;

        // --- Assemble PirlsResult + WorkingModelPirlsResult ------------
        let cuda::PirlsLoopOutcome {
            beta,
            penalized_hessian,
            logdet,
            deviance,
            iterations,
            converged,
            final_eta,
            final_mu,
            final_grad_eta,
            final_w_hessian,
            final_w_solver,
            final_offset,
            beta_transformed,
            finalweights,
            solveweights,
            solve_dmu_deta,
            solve_d2mu_deta2,
            solve_d3mu_deta3,
            solve_c_array,
            solve_d_array,
            derivatives_unsupported,
            status,
            ridge_passport,
            firth,
            constraint_kkt,
            edf,
            last_deviance_change,
            last_step_halving,
            last_step_size,
            final_lm_lambda,
            min_deviance,
            max_abs_eta,
            per_row_status_or,
        } = outcome;
        // per_row_status_or already drives `status` (Unstable when forbidden
        // bits are set) via build_loop_outcome. Enforce the invariant here so
        // a future regression that breaks the loop's classification is caught
        // at the dispatch boundary rather than silently passing a corrupt
        // iterate to the outer REML loop.
        {
            const FORBIDDEN_ROW: u32 = crate::gpu::pirls_row::status_flags::INVALID_RESPONSE
                | crate::gpu::pirls_row::status_flags::ZERO_PRIOR_WEIGHT;
            if (per_row_status_or & FORBIDDEN_ROW) != 0 && !matches!(status, PirlsStatus::Unstable)
            {
                return Err(format!(
                    "GPU PIRLS: per_row_status_or={per_row_status_or:#010x} has forbidden row \
                     status bits but outcome status is {status:?} — expected Unstable"
                ));
            }
        }

        // `logdet` corresponds to log|H_penalized| at the converged β; it is
        // not on the CPU oracle's `PirlsResult` surface (REML recomputes it
        // from the assembled Hessian), but cross-checking finiteness here
        // catches a non-PD final factorisation before downstream code
        // touches the Hessian.
        if !logdet.is_finite() {
            return Err(format!(
                "GPU PIRLS loop returned non-finite log|H| = {logdet}"
            ));
        }
        // `converged` already feeds `status` (Converged / Unstable /
        // MaxIterationsReached); make the relationship explicit so a future
        // refactor that breaks the invariant trips here rather than silently
        // mis-stamping `PirlsResult.status`.
        assert_eq!(
            converged,
            matches!(
                status,
                PirlsStatus::Converged | PirlsStatus::StalledAtValidMinimum
            ),
            "GPU outcome converged flag inconsistent with status",
        );

        // working response z_i = eta_i + (y - mu) / dmu/deta (0 on zero deriv).
        let finalz = {
            let mut z = final_eta.clone();
            for i in 0..n {
                let d = solve_dmu_deta.get(i).copied().unwrap_or(0.0);
                let resid = input.y[i] - final_mu[i];
                if d.is_finite() && d.abs() > 0.0 {
                    z[i] += resid / d;
                }
            }
            z
        };

        // If the outcome lacks derivative arrays (extra was None on a
        // mis-wired call), recompute host-side so PirlsResult is whole.
        let (final_dmu_deta, final_d2mu_deta2, final_d3mu_deta3, final_c, final_d) =
            if derivatives_unsupported
                || solve_dmu_deta.is_empty()
                || solve_d2mu_deta2.is_empty()
                || solve_d3mu_deta3.is_empty()
            {
                let (sc, sd, sdmu, sd2, sd3) = computeworkingweight_derivatives_from_eta(
                    input.likelihood,
                    input.inverse_link,
                    &final_eta,
                    input.priorweights,
                )
                .map_err(|e| format!("derivative recompute failed: {e:?}"))?;
                (sdmu, sd2, sd3, sc, sd)
            } else {
                (
                    solve_dmu_deta.clone(),
                    solve_d2mu_deta2.clone(),
                    solve_d3mu_deta3.clone(),
                    solve_c_array.clone(),
                    solve_d_array.clone(),
                )
            };

        // Observed-curvature finalisation if the outer caller requested
        // it and the GPU loop did not already promote (i.e. ran Fisher).
        let (finalweights_arr, final_c_arr, final_d_arr) =
            if matches!(input.exported_curvature, HessianCurvatureKind::Observed)
                && curvature == CurvatureMode::Fisher
            {
                compute_observed_hessian_curvature_arrays(
                    input.likelihood,
                    input.inverse_link,
                    &final_eta,
                    input.y,
                    &final_w_solver,
                    input.priorweights,
                )
                .map_err(|e| format!("observed-curvature finalisation failed: {e:?}"))?
            } else {
                (finalweights.clone(), final_c.clone(), final_d.clone())
            };
        // Echo through whichever finalweights array we ended with for use below.
        let finalweights_for_state = if finalweights_arr.is_empty() {
            final_w_hessian.clone()
        } else {
            finalweights_arr.clone()
        };

        // Stabilised Hessian = penalized_hessian + δI per ridge_passport.
        let delta = ridge_passport.delta;
        let mut stab = penalized_hessian.clone();
        if delta > 0.0 {
            for i in 0..p {
                stab[[i, i]] += delta;
            }
        }
        let penalized_hessian_sym = SymmetricMatrix::Dense(penalized_hessian.clone());
        let stabilizedhessian_sym = SymmetricMatrix::Dense(stab);

        // max_abs_eta — recompute from the actual eta if outcome's was zero
        // (older GPU outcomes pre-dating the field surface stamp 0.0).
        let max_abs_eta_used = if max_abs_eta > 0.0 {
            max_abs_eta
        } else {
            final_eta.iter().fold(0.0_f64, |a, &x| a.max(x.abs()))
        };

        // Gradient in transformed coordinates: Qsᵀ (X_originalᵀ · score_eta).
        // X_originalᵀ · score_eta is p-vector; then project through Qsᵀ.
        let xt_grad_eta = {
            let xo = input.x_original;
            let mut xo_score = Array1::<f64>::zeros(p);
            for j in 0..p {
                let mut acc = 0.0_f64;
                for i in 0..n {
                    acc += xo[[i, j]] * final_grad_eta[i];
                }
                xo_score[j] = acc;
            }
            // Project through Qsᵀ: xt_grad_eta = Qsᵀ · xo_score.
            if let Some(qs) = input.qs {
                qs.t().dot(&xo_score)
            } else {
                xo_score
            }
        };
        let s_beta = {
            let mut acc = Array1::<f64>::zeros(p);
            for i in 0..p {
                let mut s = 0.0_f64;
                for j in 0..p {
                    s += input.s_transformed[[i, j]] * beta[j];
                }
                acc[i] = s;
            }
            acc
        };
        // gradient = S·β − linear_shift − Xᵀ·score_eta
        let mut gradient_total = s_beta.clone();
        gradient_total -= &input.linear_shift;
        gradient_total -= &xt_grad_eta;
        let lastgradient_norm = gradient_total.dot(&gradient_total).sqrt();
        let score_norm = xt_grad_eta.dot(&xt_grad_eta).sqrt();
        let s_beta_norm = s_beta.dot(&s_beta).sqrt();
        let ridge_grad_norm = if delta > 0.0 {
            delta * beta.dot(&beta).sqrt()
        } else {
            0.0
        };
        let gradient_natural_scale = score_norm + s_beta_norm + ridge_grad_norm;

        // Penalty term = βᵀSβ + δ‖β‖².
        let penalty_term = beta.dot(&s_beta) + delta * beta.dot(&beta);
        let min_penalized_deviance = {
            let cand = min_deviance + penalty_term;
            if cand.is_finite() {
                cand
            } else {
                f64::INFINITY
            }
        };

        let coefficients = Coefficients::new(beta.clone());
        let beta_transformed_coef = Coefficients::new(beta_transformed.clone());

        let constraint_kkt_final = if constraint_kkt.is_some() {
            constraint_kkt.clone()
        } else if let Some(lin) = input.linear_constraints.as_ref() {
            Some(compute_constraint_kkt_diagnostics(
                &beta,
                &gradient_total,
                lin,
            ))
        } else {
            None
        };

        let exported_label = match (input.exported_curvature, derivatives_unsupported) {
            (HessianCurvatureKind::Observed, false) => ExportedLaplaceCurvature::ObservedExact,
            _ => ExportedLaplaceCurvature::ExpectedInformationSurrogate,
        };

        let working_state = WorkingState {
            eta: LinearPredictor::new(final_eta.clone()),
            gradient: gradient_total.clone(),
            hessian: penalized_hessian_sym.clone(),
            log_likelihood: calculate_loglikelihood_omitting_constants(
                input.y,
                &final_mu,
                input.likelihood,
                input.priorweights,
            ),
            deviance,
            penalty_term,
            firth: firth.clone(),
            ridge_used: delta,
            hessian_curvature: match curvature {
                CurvatureMode::Fisher => HessianCurvatureKind::Fisher,
                CurvatureMode::Observed => HessianCurvatureKind::Observed,
            },
            gradient_natural_scale,
        };

        let working_summary = WorkingModelPirlsResult {
            beta: coefficients.clone(),
            state: working_state,
            status,
            iterations,
            lastgradient_norm,
            last_deviance_change,
            last_step_size,
            last_step_halving,
            max_abs_eta: max_abs_eta_used,
            constraint_kkt: constraint_kkt_final.clone(),
            final_lm_lambda,
            final_accept_rho: None,
            min_penalized_deviance,
            exported_laplace_curvature: exported_label.clone(),
        };

        let edf_final = if edf.is_finite() { edf } else { f64::NAN };

        // final_offset is `n` zeros when the loop did not echo offset
        // through. Use the caller-supplied offset in that case.
        let final_offset_arr = if final_offset.len() == n {
            final_offset
        } else {
            input.offset.to_owned()
        };

        let pirls_result = PirlsResult {
            likelihood: input.likelihood.clone(),
            beta_transformed: beta_transformed_coef,
            penalized_hessian_transformed: penalized_hessian_sym,
            stabilizedhessian_transformed: stabilizedhessian_sym,
            ridge_passport,
            ridge_used: delta,
            deviance,
            edf: edf_final,
            stable_penalty_term: penalty_term,
            firth,
            finalweights: finalweights_for_state,
            final_offset: final_offset_arr,
            final_eta: final_eta.clone(),
            finalmu: final_mu.clone(),
            solveweights: if solveweights.is_empty() {
                final_w_solver.clone()
            } else {
                solveweights.clone()
            },
            solveworking_response: finalz,
            solvemu: final_mu.clone(),
            solve_dmu_deta: final_dmu_deta,
            solve_d2mu_deta2: final_d2mu_deta2,
            solve_d3mu_deta3: final_d3mu_deta3,
            solve_c_array: final_c_arr,
            solve_d_array: final_d_arr,
            derivatives_unsupported,
            status,
            iteration: iterations,
            max_abs_eta: max_abs_eta_used,
            lastgradient_norm,
            gradient_natural_scale,
            last_deviance_change,
            last_step_halving,
            hessian_curvature: match curvature {
                CurvatureMode::Fisher => HessianCurvatureKind::Fisher,
                CurvatureMode::Observed => HessianCurvatureKind::Observed,
            },
            exported_laplace_curvature: exported_label,
            final_lm_lambda,
            final_accept_rho: None,
            constraint_kkt: constraint_kkt_final,
            linear_constraints_transformed: input.linear_constraints,
            reparam_result: input.reparam_result,
            x_transformed: input.x_transformed_design,
            coordinate_frame: input.coordinate_frame,
            cache_compacted: false,
            min_penalized_deviance,
        };

        // Hessian-side weights are kept on the working_summary surface for
        // outer LM consumers; if the loop did not stamp a separate
        // `finalweights`, fall back to `final_w_hessian` so REML's
        // `H = XᵀW_HX + S_λ` reconstruction has the curvature it expects.
        assert_eq!(final_w_hessian.len(), n);
        assert_eq!(final_grad_eta.len(), n);

        Ok((pirls_result, working_summary))
    }

    /// All inputs needed for the GPU Gaussian-identity exact PLS dispatch.
    ///
    /// Built by the CPU PIRLS driver immediately before the CPU
    /// `solve_penalized_least_squares_implicit` fast-path, so the GPU path
    /// fires first when available. The cross-products are supplied in
    /// original (pre-Qs) coordinates, while the penalty and its shifts are in
    /// transformed (post-Qs) coordinates.
    pub struct GpuGaussianPlsInput<'a> {
        /// Precomputed `XᵀWX` in original (pre-Qs) coordinates, p×p.
        pub xtwx_orig: ArrayView2<'a, f64>,
        /// Precomputed `XᵀW(y − offset)` in original coordinates, length p.
        pub xtwy_orig: ArrayView1<'a, f64>,
        /// Penalty `Σλₖ Sₖ` in transformed (post-Qs) coordinates, p×p.
        pub s_transformed: ArrayView2<'a, f64>,
        /// Additive RHS correction in transformed coordinates, length p.
        /// Also the linear shift `b` in the host-side penalty term
        /// `βᵀSβ − 2βᵀb + c`.
        pub linear_shift: ArrayView1<'a, f64>,
        /// Prior-mean vector for Tikhonov RHS, length p. Forwarded to the
        /// device solve.
        pub prior_mean_target: ArrayView1<'a, f64>,
        /// Constant term of the shifted penalty quadratic (for penalty_term).
        pub constant_shift: f64,
        /// Reparameterisation matrix Qs (p×p).  `None` = identity transform.
        pub qs: Option<ArrayView2<'a, f64>>,
        /// Stabilisation ridge δ. Forwarded to the device solve; when
        /// positive it is also added to the penalty term, the gradient and
        /// the diagonal of the stabilised Hessian on the host.
        pub ridge: f64,
        /// GLM likelihood spec.
        pub likelihood: &'a crate::types::GlmLikelihoodSpec,
        /// Inverse link.
        pub inverse_link: &'a crate::types::InverseLink,
        /// Original design X for computing η = offset + X·Qs·β.
        pub x_original: &'a DesignMatrix,
        /// Response y, length n.
        pub y: ArrayView1<'a, f64>,
        /// Prior weights, length n.
        pub priorweights: ArrayView1<'a, f64>,
        /// Observation offset, length n.
        pub offset: ArrayView1<'a, f64>,
        /// Full reparameterisation result for `PirlsResult::reparam_result`.
        pub reparam_result: ReparamResult,
        /// Design matrix for `PirlsResult::x_transformed`.
        pub x_transformed_design: DesignMatrix,
        /// Coordinate frame for `PirlsResult::coordinate_frame`.
        pub coordinate_frame: PirlsCoordinateFrame,
        /// Linear constraints (None when cache_eligible was true).
        pub linear_constraints: Option<LinearInequalityConstraints>,
    }

    /// Cheap admission gate for the GPU Gaussian-identity exact PLS path.
    /// Returns `true` iff cuda_selected(), runtime available, and the likelihood
    /// is Gaussian-identity.
    pub fn try_gpu_gaussian_pls_admit(likelihood: &crate::types::GlmLikelihoodSpec) -> bool {
        if !crate::solver::gpu::cuda_selected() {
            return false;
        }
        if crate::gpu::runtime::GpuRuntime::global().is_none() {
            return false;
        }
        likelihood.spec.is_gaussian_identity()
    }

    /// Attempt to run the exact GPU PLS for Gaussian-identity.
    ///
    /// - `None`: `try_gpu_gaussian_pls_admit` denied admission; nothing was
    ///   run.
    /// - `Some(Ok(..))`: the device solve completed and the full CPU-oracle
    ///   surface was assembled.
    /// - `Some(Err(..))`: device failure (caller logs and falls through to
    ///   CPU).
    pub fn try_gpu_gaussian_pls_dispatch(
        input: GpuGaussianPlsInput<'_>,
    ) -> Option<Result<(PirlsResult, WorkingModelPirlsResult), String>> {
        let admitted = try_gpu_gaussian_pls_admit(input.likelihood);
        admitted.then(|| run_gpu_gaussian_pls(input))
    }

    fn run_gpu_gaussian_pls(
        input: GpuGaussianPlsInput<'_>,
    ) -> Result<(PirlsResult, WorkingModelPirlsResult), String> {
        use crate::linalg::utils::inf_norm;
        use crate::matrix::LinearOperator;
        use crate::solver::pirls::{
            array1_l2_norm, calculate_deviance, calculate_loglikelihood_omitting_constants,
            computeworkingweight_derivatives_from_eta,
        };
        use crate::types::{RidgePassport, RidgePolicy};
        use ndarray::Array1;

        let pls = pirls_gpu::solve_gaussian_pls_gpu(
            input.xtwx_orig,
            input.xtwy_orig,
            input.s_transformed,
            input.linear_shift,
            input.prior_mean_target,
            input.ridge,
            input.qs,
        )?;

        if !pls.logdet.is_finite() {
            return Err(format!(
                "GPU Gaussian PLS returned non-finite log|H| = {}",
                pls.logdet
            ));
        }

        let beta = pls.beta.clone();
        let penalized_hessian = pls.penalized_hessian;
        let p = beta.len();
        // eta = offset + X · (Qs · beta), or offset + X · beta if no Qs.
        let qbeta: Array1<f64> = if let Some(qs_v) = input.qs {
            qs_v.dot(&beta)
        } else {
            beta.clone()
        };
        let mut eta = input.offset.to_owned();
        eta += &input.x_original.apply(&qbeta);
        let finalmu = eta.clone();
        let finalz = input.y.to_owned();

        // gradient_data = QsᵀXᵀ W (mu - y) in transformed coordinates.
        let mut weighted_residual = finalmu.clone();
        weighted_residual -= &finalz;
        weighted_residual *= &input.priorweights;
        // Xᵀ W r (in original coords) via DesignMatrix::transpose_vector_multiply.
        let xt_wr_orig = input
            .x_original
            .transpose_vector_multiply(&weighted_residual);
        // Rotate to transformed coords: QsᵀXᵀWr.
        let gradient_data: Array1<f64> = if let Some(qs_v) = input.qs {
            qs_v.t().dot(&xt_wr_orig)
        } else {
            xt_wr_orig
        };
        let score_norm = array1_l2_norm(&gradient_data);

        // s_beta = S·β − linear_shift.
        let mut s_beta: Array1<f64> = Array1::zeros(p);
        for i in 0..p {
            let mut acc = 0.0_f64;
            for j in 0..p {
                acc += input.s_transformed[[i, j]] * beta[j];
            }
            s_beta[i] = acc - input.linear_shift[i];
        }
        let s_beta_norm = array1_l2_norm(&s_beta);

        let mut gradient = gradient_data.clone();
        gradient += &s_beta;

        // penalty_term = betaᵀ·S·β − 2·betaᵀ·linear_shift + constant_shift.
        let mut penalty_term: f64 = input.constant_shift;
        for i in 0..p {
            let mut s_row_b = 0.0_f64;
            for j in 0..p {
                s_row_b += input.s_transformed[[i, j]] * beta[j];
            }
            penalty_term += beta[i] * s_row_b;
            penalty_term -= 2.0 * beta[i] * input.linear_shift[i];
        }

        let ridge_used = input.ridge;
        let mut ridge_grad_norm = 0.0_f64;
        if ridge_used > 0.0 {
            let beta_sq: f64 = beta.dot(&beta);
            penalty_term += ridge_used * beta_sq;
            let ridge_contrib = beta.mapv(|v| ridge_used * v);
            gradient += &ridge_contrib;
            ridge_grad_norm = ridge_used * array1_l2_norm(&beta);
        }

        let gradient_norm = array1_l2_norm(&gradient);
        let max_abs_eta = inf_norm(finalmu.iter().copied());

        let deviance = calculate_deviance(input.y, &finalmu, input.likelihood, input.priorweights);
        let log_likelihood = calculate_loglikelihood_omitting_constants(
            input.y,
            &finalmu,
            input.likelihood,
            input.priorweights,
        );

        // Stabilised Hessian = penalized_hessian + ridge_used·I.
        let mut stab = penalized_hessian.clone();
        if ridge_used > 0.0 {
            for i in 0..p {
                stab[[i, i]] += ridge_used;
            }
        }
        let penalized_hessian_sym = SymmetricMatrix::Dense(penalized_hessian.clone());
        let stabilizedhessian_sym = SymmetricMatrix::Dense(stab);

        let priorweights_owned = input.priorweights.to_owned();
        let beta_coef = Coefficients::new(beta.clone());

        let zero_iter_penalized = deviance + penalty_term;

        let working_state = WorkingState {
            eta: LinearPredictor::new(finalmu.clone()),
            gradient: gradient.clone(),
            hessian: penalized_hessian_sym.clone(),
            log_likelihood,
            deviance,
            penalty_term,
            firth: FirthDiagnostics::Inactive,
            ridge_used,
            hessian_curvature: HessianCurvatureKind::Fisher,
            gradient_natural_scale: score_norm + s_beta_norm + ridge_grad_norm,
        };

        let constraint_kkt_val = if let Some(lin) = input.linear_constraints.as_ref() {
            Some(compute_constraint_kkt_diagnostics(&beta, &gradient, lin))
        } else {
            None
        };

        let working_summary = WorkingModelPirlsResult {
            beta: beta_coef.clone(),
            state: working_state,
            status: PirlsStatus::Converged,
            iterations: 1,
            lastgradient_norm: gradient_norm,
            last_deviance_change: 0.0,
            last_step_size: 1.0,
            last_step_halving: 0,
            max_abs_eta,
            constraint_kkt: constraint_kkt_val.clone(),
            min_penalized_deviance: if zero_iter_penalized.is_finite() {
                zero_iter_penalized
            } else {
                f64::INFINITY
            },
            final_lm_lambda: 1e-6,
            final_accept_rho: None,
            exported_laplace_curvature: ExportedLaplaceCurvature::ExpectedInformationSurrogate,
        };

        let (solve_c_array, solve_d_array, solve_dmu_deta, solve_d2mu_deta2, solve_d3mu_deta3) =
            computeworkingweight_derivatives_from_eta(
                input.likelihood,
                input.inverse_link,
                &eta,
                input.priorweights,
            )
            .map_err(|e| format!("derivative computation failed: {e:?}"))?;

        let pirls_result = PirlsResult {
            likelihood: input.likelihood.clone(),
            beta_transformed: beta_coef.clone(),
            penalized_hessian_transformed: penalized_hessian_sym,
            stabilizedhessian_transformed: stabilizedhessian_sym,
            ridge_passport: RidgePassport::scaled_identity(
                ridge_used,
                RidgePolicy::explicit_stabilization_full(),
            ),
            ridge_used,
            deviance,
            edf: f64::NAN, // recomputed by outer REML from penalized_hessian + e_transformed
            stable_penalty_term: penalty_term,
            firth: FirthDiagnostics::Inactive,
            finalweights: priorweights_owned.clone(),
            final_offset: input.offset.to_owned(),
            final_eta: eta.clone(),
            finalmu: finalmu.clone(),
            solveweights: priorweights_owned,
            solveworking_response: finalz.clone(),
            solvemu: finalmu.clone(),
            solve_dmu_deta,
            solve_d2mu_deta2,
            solve_d3mu_deta3,
            solve_c_array,
            solve_d_array,
            derivatives_unsupported: false,
            status: PirlsStatus::Converged,
            iteration: 1,
            max_abs_eta,
            lastgradient_norm: gradient_norm,
            gradient_natural_scale: score_norm + s_beta_norm + ridge_grad_norm,
            last_deviance_change: 0.0,
            last_step_halving: 0,
            hessian_curvature: HessianCurvatureKind::Fisher,
            exported_laplace_curvature: ExportedLaplaceCurvature::ExpectedInformationSurrogate,
            final_lm_lambda: 1e-6,
            final_accept_rho: None,
            constraint_kkt: constraint_kkt_val,
            linear_constraints_transformed: input.linear_constraints,
            reparam_result: input.reparam_result,
            x_transformed: input.x_transformed_design,
            coordinate_frame: input.coordinate_frame,
            cache_compacted: false,
            min_penalized_deviance: working_summary.min_penalized_deviance,
        };

        Ok((pirls_result, working_summary))
    }
}

// Linux-only public surface of this file: re-export the six items listed
// below from the private `linux_impl` module so callers can name them without
// the `linux_impl::` path segment. The `cfg` gate matches the one on
// `linux_impl` itself, so these names exist only when compiling for
// `target_os = "linux"`.
// NOTE(review): the module-level docs say non-Linux builds get a no-op stub;
// confirm that the stub keeps the same names and signatures as this list.
#[cfg(target_os = "linux")]
pub use linux_impl::{
    GpuGaussianPlsInput, GpuPirlsDispatchInput, try_gpu_gaussian_pls_admit,
    try_gpu_gaussian_pls_dispatch, try_gpu_pirls_loop_admit, try_gpu_pirls_loop_dispatch,
};