Skip to main content

gam_models/gamlss/gaussian/
location_scale.rs

1// Real concern-organized submodule of the gamlss family stack.
2// Cross-module items are re-exported flat through the parent (`gamlss.rs`),
3// so `use super::*;` makes the sibling-concern symbols this module references
4// resolve through the parent namespace.
5use super::*;
6
7pub struct GaussianLocationScaleFamily {
8    pub y: Array1<f64>,
9    pub weights: Array1<f64>,
10    pub mu_design: Option<DesignMatrix>,
11    pub log_sigma_design: Option<DesignMatrix>,
12    /// Resource policy threaded into PsiDesignMap construction (and any other
13    /// per-call materialization decision) made during exact-Newton joint psi
14    /// derivative evaluation. Defaults to `ResourcePolicy::default_library()`
15    /// when the family is built without an explicit policy.
16    pub policy: gam_runtime::resource::ResourcePolicy,
17    /// Cached per-observation row scalars keyed by the FULL `(η_μ, η_logσ)`
18    /// predictor pair the scalars were computed at. The row scalars are a
19    /// deterministic function of `(η_μ, η_logσ)` (plus the fixed `y`/`weights`),
20    /// so a hit is only valid when both eta vectors match bit-for-bit element by
21    /// element — a lossy 3-point fingerprint could collide two genuinely
22    /// different predictors and serve STALE scalars, so the key is the whole
23    /// vectors. The compare is O(n), far cheaper than the O(n) transcendental
24    /// recompute it guards, and is hit K+ times per REML gradient/Hessian
25    /// evaluation under the same predictors.
26    pub cached_row_scalars:
27        std::sync::RwLock<Option<(Array1<f64>, Array1<f64>, Arc<GaussianJointRowScalars>)>>,
28}
29
30impl Clone for GaussianLocationScaleFamily {
31    fn clone(&self) -> Self {
32        Self {
33            y: self.y.clone(),
34            weights: self.weights.clone(),
35            mu_design: self.mu_design.clone(),
36            log_sigma_design: self.log_sigma_design.clone(),
37            policy: self.policy.clone(),
38            cached_row_scalars: std::sync::RwLock::new(
39                self.cached_row_scalars
40                    .read()
41                    .expect("lock poisoned")
42                    .clone(),
43            ),
44        }
45    }
46}
47
48impl GaussianLocationScaleFamily {
49    pub const BLOCK_MU: usize = 0;
50    pub const BLOCK_LOG_SIGMA: usize = 1;
51
52    /// Bit-exact equality of two η vectors as a cache key. Within one outer REML
53    /// evaluation η_μ and η_logσ are the fixed inner-converged predictors, so
54    /// every exact-joint consumer (Hessian / directional / second-directional /
55    /// ψ paths) is handed the identical pair; matching the full vectors lets us
56    /// recompute the O(n) transcendental row-scalar stack
57    /// (`gaussian_jointrow_scalars`: one `sigma_link` jet + `exp`/reciprocal per
58    /// row) ONCE and share an `Arc` across all of them — without ever serving a
59    /// stale cache hit to a different predictor. Comparison is on the raw bit
60    /// patterns so a stored entry whose key contains any `NaN` (e.g. a degenerate
61    /// predictor) never spuriously matches a fresh `NaN`-free key and vice versa,
62    /// and `±0.0` are kept distinct; the underlying constructor handles n = 0.
63    #[inline]
64    fn eta_keys_match(stored: &Array1<f64>, query: &Array1<f64>) -> bool {
65        stored.len() == query.len()
66            && stored
67                .iter()
68                .zip(query.iter())
69                .all(|(a, b)| a.to_bits() == b.to_bits())
70    }
71
72    pub(crate) fn get_or_compute_row_scalars(
73        &self,
74        etamu: &Array1<f64>,
75        eta_ls: &Array1<f64>,
76    ) -> Result<Arc<GaussianJointRowScalars>, String> {
77        // Fast path: full-key hit under a shared read lock. A cached entry is
78        // only reused when BOTH eta vectors match the query bit-for-bit, so a
79        // distinct predictor can never be served stale scalars.
80        if let Ok(guard) = self.cached_row_scalars.read() {
81            if let Some((cmu, cls, rows)) = guard.as_ref() {
82                if Self::eta_keys_match(cmu, etamu) && Self::eta_keys_match(cls, eta_ls) {
83                    return Ok(Arc::clone(rows));
84                }
85            }
86        }
87        // Miss: compute once and publish under the write lock. A concurrent
88        // race recomputing the same (η_μ, η_logσ) is harmless — identical
89        // inputs yield bit-identical scalars — so last-writer-wins is safe and
90        // every reader observes equal contents.
91        let rows = Arc::new(gaussian_jointrow_scalars(
92            &self.y,
93            etamu,
94            eta_ls,
95            &self.weights,
96        )?);
97        if let Ok(mut guard) = self.cached_row_scalars.write() {
98            *guard = Some((etamu.clone(), eta_ls.clone(), Arc::clone(&rows)));
99        }
100        Ok(rows)
101    }
102
103    pub fn parameternames() -> &'static [&'static str] {
104        &["mu", "log_sigma"]
105    }
106
107    pub fn parameter_links() -> &'static [ParameterLink] {
108        &[ParameterLink::Identity, ParameterLink::Log]
109    }
110
111    pub fn metadata() -> FamilyMetadata {
112        FamilyMetadata {
113            name: "gaussian_location_scale",
114            parameternames: Self::parameternames(),
115            parameter_links: Self::parameter_links(),
116        }
117    }
118
119    pub(crate) fn exact_joint_supported(&self) -> bool {
120        self.mu_design.is_some() && self.log_sigma_design.is_some()
121    }
122
123    pub(crate) fn exact_block_designs(
124        &self,
125    ) -> Result<(DenseOrOperator<'_>, DenseOrOperator<'_>), String> {
126        let mu_design = self.mu_design.as_ref().ok_or_else(|| {
127            "GaussianLocationScaleFamily exact path is missing mu design".to_string()
128        })?;
129        let log_sigma_design = self.log_sigma_design.as_ref().ok_or_else(|| {
130            "GaussianLocationScaleFamily exact path is missing log-sigma design".to_string()
131        })?;
132        let planned = dense_blocks_planned_budget(&[mu_design, log_sigma_design]);
133        let xmu = dense_block_or_operator(
134            mu_design,
135            mu_design.nrows(),
136            mu_design.ncols(),
137            planned[0],
138            &self.policy,
139        );
140        let x_ls = dense_block_or_operator(
141            log_sigma_design,
142            log_sigma_design.nrows(),
143            log_sigma_design.ncols(),
144            planned[1],
145            &self.policy,
146        );
147        Ok((xmu, x_ls))
148    }
149
150    pub(crate) fn exact_block_designs_fromspecs<'a>(
151        &self,
152        specs: &'a [ParameterBlockSpec],
153    ) -> Result<(DenseOrOperator<'a>, DenseOrOperator<'a>), String> {
154        if specs.len() != 2 {
155            return Err(GamlssError::DimensionMismatch {
156                reason: format!(
157                    "GaussianLocationScaleFamily spec-aware exact path expects 2 specs, got {}",
158                    specs.len()
159                ),
160            }
161            .into());
162        }
163        let mu_design = &specs[Self::BLOCK_MU].design;
164        let log_sigma_design = &specs[Self::BLOCK_LOG_SIGMA].design;
165        let planned = dense_blocks_planned_budget(&[mu_design, log_sigma_design]);
166        let xmu = dense_block_or_operator(
167            mu_design,
168            mu_design.nrows(),
169            mu_design.ncols(),
170            planned[0],
171            &self.policy,
172        );
173        let x_ls = dense_block_or_operator(
174            log_sigma_design,
175            log_sigma_design.nrows(),
176            log_sigma_design.ncols(),
177            planned[1],
178            &self.policy,
179        );
180        Ok((xmu, x_ls))
181    }
182
183    pub(crate) fn exact_joint_block_designs<'a>(
184        &'a self,
185        specs: Option<&'a [ParameterBlockSpec]>,
186    ) -> Result<Option<(DenseOrOperator<'a>, DenseOrOperator<'a>)>, String> {
187        // #1504: prefer the identifiability-CONSTRAINED block designs carried by
188        // `specs` whenever they are provided. The inner joint-Newton solve sizes
189        // its coefficient vector — and the consumer's dense-Hessian `total` — from
190        // these post-audit specs, which can be NARROWER than the family's stored
191        // pre-audit designs: a by-group smooth in BOTH the mean and log-σ blocks
192        // has aliased columns the identifiability audit drops. Reaching for the
193        // stored (unconstrained) designs here regardless of `specs` sized the joint
194        // Hessian to the wider pre-audit width and tripped the dense-Hessian shape
195        // check ("got 36x36, expected 32x32") on every such fit. When no audit
196        // reduction occurred the specs designs equal the stored ones, so ordinary
197        // (non-by-group) gaulss fits are byte-identical; the analytic location-scale
198        // Hessian is design-agnostic, so using the constrained designs loses no
199        // exactness. Fall back to the stored designs only when no specs are given.
200        if let Some(specs) = specs {
201            return self.exact_block_designs_fromspecs(specs).map(Some);
202        }
203        if self.exact_joint_supported() {
204            return self.exact_block_designs().map(Some);
205        }
206        Ok(None)
207    }
208
209    pub(crate) fn exact_joint_dense_block_designs<'a>(
210        &'a self,
211        specs: Option<&'a [ParameterBlockSpec]>,
212    ) -> Result<Option<(Cow<'a, Array2<f64>>, Cow<'a, Array2<f64>>)>, String> {
213        let Some((xmu, x_ls)) = self.exact_joint_block_designs(specs)? else {
214            return Ok(None);
215        };
216        let xmu = match xmu {
217            DenseOrOperator::Borrowed(dense) => Cow::Borrowed(dense),
218            DenseOrOperator::Owned(dense) => Cow::Owned(dense),
219            DenseOrOperator::Operator(_) => {
220                return Err(
221                    "GaussianLocationScaleFamily exact psi path requires chunked operator support for oversized designs"
222                        .to_string(),
223                );
224            }
225        };
226        let x_ls = match x_ls {
227            DenseOrOperator::Borrowed(dense) => Cow::Borrowed(dense),
228            DenseOrOperator::Owned(dense) => Cow::Owned(dense),
229            DenseOrOperator::Operator(_) => {
230                return Err(
231                    "GaussianLocationScaleFamily exact psi path requires chunked operator support for oversized designs"
232                        .to_string(),
233                );
234            }
235        };
236        Ok(Some((xmu, x_ls)))
237    }
238
239    pub(crate) fn exact_newton_joint_hessian_for_specs(
240        &self,
241        block_states: &[ParameterBlockState],
242        specs: Option<&[ParameterBlockSpec]>,
243    ) -> Result<Option<Array2<f64>>, String> {
244        let Some((xmu, x_ls)) = self.exact_joint_block_designs(specs)? else {
245            return Ok(None);
246        };
247        self.exact_newton_joint_hessian_from_designs(block_states, &xmu, &x_ls)
248    }
249
250    pub(crate) fn exact_newton_joint_hessian_directional_derivative_for_specs(
251        &self,
252        block_states: &[ParameterBlockState],
253        specs: Option<&[ParameterBlockSpec]>,
254        d_beta_flat: &Array1<f64>,
255    ) -> Result<Option<Array2<f64>>, String> {
256        let Some((xmu, x_ls)) = self.exact_joint_block_designs(specs)? else {
257            return Ok(None);
258        };
259        self.exact_newton_joint_hessian_directional_derivative_from_designs(
260            block_states,
261            &xmu,
262            &x_ls,
263            d_beta_flat,
264        )
265    }
266
267    pub(crate) fn exact_newton_joint_hessian_second_directional_derivative_for_specs(
268        &self,
269        block_states: &[ParameterBlockState],
270        specs: Option<&[ParameterBlockSpec]>,
271        d_beta_u_flat: &Array1<f64>,
272        d_betav_flat: &Array1<f64>,
273    ) -> Result<Option<Array2<f64>>, String> {
274        let Some((xmu, x_ls)) = self.exact_joint_block_designs(specs)? else {
275            return Ok(None);
276        };
277        self.exact_newton_joint_hessiansecond_directional_derivative_from_designs(
278            block_states,
279            &xmu,
280            &x_ls,
281            d_beta_u_flat,
282            d_betav_flat,
283        )
284    }
285
286    pub(crate) fn exact_newton_joint_psi_terms_for_specs(
287        &self,
288        block_states: &[ParameterBlockState],
289        specs: &[ParameterBlockSpec],
290        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
291        psi_index: usize,
292    ) -> Result<Option<crate::custom_family::ExactNewtonJointPsiTerms>, String> {
293        let Some((xmu, x_ls)) = self.exact_joint_dense_block_designs(Some(specs))? else {
294            return Ok(None);
295        };
296        self.exact_newton_joint_psi_terms_from_designs(
297            block_states,
298            specs,
299            derivative_blocks,
300            psi_index,
301            &xmu,
302            &x_ls,
303        )
304    }
305
306    pub(crate) fn exact_newton_joint_psisecond_order_terms_for_specs(
307        &self,
308        block_states: &[ParameterBlockState],
309        specs: &[ParameterBlockSpec],
310        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
311        psi_i: usize,
312        psi_j: usize,
313    ) -> Result<Option<crate::custom_family::ExactNewtonJointPsiSecondOrderTerms>, String> {
314        let Some((xmu, x_ls)) = self.exact_joint_dense_block_designs(Some(specs))? else {
315            return Ok(None);
316        };
317        self.exact_newton_joint_psisecond_order_terms_from_designs(
318            block_states,
319            derivative_blocks,
320            psi_i,
321            psi_j,
322            &xmu,
323            &x_ls,
324        )
325    }
326
327    pub(crate) fn exact_newton_joint_psihessian_directional_derivative_for_specs(
328        &self,
329        block_states: &[ParameterBlockState],
330        specs: &[ParameterBlockSpec],
331        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
332        psi_index: usize,
333        d_beta_flat: &Array1<f64>,
334    ) -> Result<Option<Array2<f64>>, String> {
335        let Some((xmu, x_ls)) = self.exact_joint_dense_block_designs(Some(specs))? else {
336            return Ok(None);
337        };
338        self.exact_newton_joint_psihessian_directional_derivative_from_designs(
339            block_states,
340            derivative_blocks,
341            psi_index,
342            d_beta_flat,
343            &xmu,
344            &x_ls,
345        )
346    }
347
348    pub(crate) fn exact_newton_joint_hessian_from_designs(
349        &self,
350        block_states: &[ParameterBlockState],
351        xmu: &DenseOrOperator<'_>,
352        x_ls: &DenseOrOperator<'_>,
353    ) -> Result<Option<Array2<f64>>, String> {
354        validate_block_count::<GamlssError>("GaussianLocationScaleFamily", 2, block_states.len())?;
355        let n = self.y.len();
356        let etamu = &block_states[Self::BLOCK_MU].eta;
357        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
358        if etamu.len() != n || eta_ls.len() != n || self.weights.len() != n {
359            return Err(GamlssError::DimensionMismatch {
360                reason: "GaussianLocationScaleFamily input size mismatch".to_string(),
361            }
362            .into());
363        }
364
365        let rows = self.get_or_compute_row_scalars(etamu, eta_ls)?;
366        // Block-diagonal Gaussian Fisher curvature (μ ⊥ σ ⇒ cross = 0, #684;
367        // (ls,ls) = 2κ²a, #566), built from the shared single-source-of-truth
368        // constructor so this dense path and the matrix-free workspace can never
369        // disagree on the cross block. See `gaussian_locscale_fisher_joint_row_coeffs`.
370        let (mm, cross, scale) = gaussian_locscale_fisher_joint_row_coeffs(&rows);
371        Ok(Some(gaussian_joint_hessian_from_designs(
372            xmu, x_ls, &mm, &cross, &scale,
373        )?))
374    }
375
376    pub(crate) fn exact_newton_joint_hessian_directional_derivative_from_designs(
377        &self,
378        block_states: &[ParameterBlockState],
379        xmu: &DenseOrOperator<'_>,
380        x_ls: &DenseOrOperator<'_>,
381        d_beta_flat: &Array1<f64>,
382    ) -> Result<Option<Array2<f64>>, String> {
383        validate_block_count::<GamlssError>("GaussianLocationScaleFamily", 2, block_states.len())?;
384        let n = self.y.len();
385        let etamu = &block_states[Self::BLOCK_MU].eta;
386        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
387        if etamu.len() != n || eta_ls.len() != n || self.weights.len() != n {
388            return Err(GamlssError::DimensionMismatch {
389                reason: "GaussianLocationScaleFamily input size mismatch".to_string(),
390            }
391            .into());
392        }
393
394        let pmu = xmu.ncols();
395        let p_ls = x_ls.ncols();
396        let total = pmu + p_ls;
397        if d_beta_flat.len() != total {
398            return Err(GamlssError::DimensionMismatch {
399                reason: format!(
400                    "GaussianLocationScaleFamily joint d_beta length mismatch: got {}, expected {}",
401                    d_beta_flat.len(),
402                    total
403                ),
404            }
405            .into());
406        }
407        let ximu = xmu.dot(d_beta_flat.slice(s![0..pmu]));
408        let xi_ls = x_ls.dot(d_beta_flat.slice(s![pmu..pmu + p_ls]));
409        let rows = self.get_or_compute_row_scalars(etamu, eta_ls)?;
410        let directional = gaussian_joint_first_directionalweights(&rows, &ximu, &xi_ls);
411        let dhmumu = directional.0;
412        let dh_ls_ls = directional.2;
413        // Fisher cross block E[H_{μ,ls}] ≡ 0 (μ ⊥ σ; see
414        // exact_newton_joint_hessian_from_designs / #684), so its directional
415        // derivative is identically 0 — keep the Hessian's curvature object the
416        // block-diagonal Gaussian Fisher information at every order. The
417        // observed-cross directional weight (`directional.1`) is therefore not
418        // assembled.
419        let dhmu_ls = Array1::<f64>::zeros(dhmumu.len());
420
421        Ok(Some(gaussian_joint_hessian_from_designs(
422            xmu, x_ls, &dhmumu, &dhmu_ls, &dh_ls_ls,
423        )?))
424    }
425
426    pub(crate) fn exact_newton_joint_hessiansecond_directional_derivative_from_designs(
427        &self,
428        block_states: &[ParameterBlockState],
429        xmu: &DenseOrOperator<'_>,
430        x_ls: &DenseOrOperator<'_>,
431        d_beta_u_flat: &Array1<f64>,
432        d_betav_flat: &Array1<f64>,
433    ) -> Result<Option<Array2<f64>>, String> {
434        validate_block_count::<GamlssError>("GaussianLocationScaleFamily", 2, block_states.len())?;
435        let n = self.y.len();
436        let etamu = &block_states[Self::BLOCK_MU].eta;
437        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
438        if etamu.len() != n || eta_ls.len() != n || self.weights.len() != n {
439            return Err(GamlssError::DimensionMismatch {
440                reason: "GaussianLocationScaleFamily input size mismatch".to_string(),
441            }
442            .into());
443        }
444
445        let pmu = xmu.ncols();
446        let p_ls = x_ls.ncols();
447        let total = pmu + p_ls;
448        if d_beta_u_flat.len() != total || d_betav_flat.len() != total {
449            return Err(GamlssError::DimensionMismatch { reason: format!(
450                "GaussianLocationScaleFamily joint second directional derivative length mismatch: got {} and {}, expected {}",
451                d_beta_u_flat.len(),
452                d_betav_flat.len(),
453                total
454            ) }.into());
455        }
456        let ximu_u = xmu.dot(d_beta_u_flat.slice(s![0..pmu]));
457        let xi_ls_u = x_ls.dot(d_beta_u_flat.slice(s![pmu..pmu + p_ls]));
458        let ximuv = xmu.dot(d_betav_flat.slice(s![0..pmu]));
459        let xi_lsv = x_ls.dot(d_betav_flat.slice(s![pmu..pmu + p_ls]));
460        let rows = self.get_or_compute_row_scalars(etamu, eta_ls)?;
461        let second =
462            gaussian_jointsecond_directionalweights(&rows, &ximu_u, &xi_ls_u, &ximuv, &xi_lsv);
463        let d2hmumu = second.0;
464        let d2h_ls_ls = second.2;
465        // Fisher cross block E[H_{μ,ls}] ≡ 0 (μ ⊥ σ; #684), so its second
466        // directional derivative is identically 0; `second.1` (observed) is not
467        // assembled, keeping the curvature object block-diagonal Fisher.
468        let d2hmu_ls = Array1::<f64>::zeros(d2hmumu.len());
469
470        Ok(Some(gaussian_joint_hessian_from_designs(
471            xmu, x_ls, &d2hmumu, &d2hmu_ls, &d2h_ls_ls,
472        )?))
473    }
474
475    pub(crate) fn exact_newton_joint_psi_direction(
476        &self,
477        block_states: &[ParameterBlockState],
478        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
479        psi_index: usize,
480        xmu: &Array2<f64>,
481        x_ls: &Array2<f64>,
482        policy: &gam_runtime::resource::ResourcePolicy,
483    ) -> Result<Option<LocationScaleJointPsiDirection>, String> {
484        let Some(parts) = locscale_joint_psi_direction_parts(
485            block_states,
486            derivative_blocks,
487            psi_index,
488            self.y.len(),
489            xmu.ncols(),
490            x_ls.ncols(),
491            Self::BLOCK_MU,
492            Self::BLOCK_LOG_SIGMA,
493            2,
494            "GaussianLocationScaleFamily",
495            "mu",
496            policy,
497        )?
498        else {
499            return Ok(None);
500        };
501        Ok(Some(LocationScaleJointPsiDirection {
502            block_idx: parts.block_idx,
503            local_idx: parts.local_idx,
504            z_primary_psi: parts.primary_z,
505            z_ls_psi: parts.log_sigma_z,
506            x_primary_psi: parts.primary_psi,
507            x_ls_psi: parts.log_sigma_psi,
508        }))
509    }
510
511    pub(crate) fn exact_newton_joint_psisecond_design_drifts(
512        &self,
513        block_states: &[ParameterBlockState],
514        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
515        psi_a: &LocationScaleJointPsiDirection,
516        psi_b: &LocationScaleJointPsiDirection,
517        xmu: &Array2<f64>,
518        x_ls: &Array2<f64>,
519    ) -> Result<LocationScaleJointPsiSecondDrifts, String> {
520        locscale_joint_psisecond_design_drifts(
521            block_states,
522            derivative_blocks,
523            psi_a,
524            psi_b,
525            LocScalePsiDriftConfig {
526                n: self.y.len(),
527                p_primary: xmu.ncols(),
528                p_log_sigma: x_ls.ncols(),
529                primary_block_idx: Self::BLOCK_MU,
530                log_sigma_block_idx: Self::BLOCK_LOG_SIGMA,
531                family_name: "GaussianLocationScaleFamily",
532                primary_label: "mu",
533                policy: &self.policy,
534            },
535        )
536    }
537
538    pub(crate) fn exact_newton_joint_psi_terms_from_designs(
539        &self,
540        block_states: &[ParameterBlockState],
541        specs: &[ParameterBlockSpec],
542        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
543        psi_index: usize,
544        xmu: &Array2<f64>,
545        x_ls: &Array2<f64>,
546    ) -> Result<Option<crate::custom_family::ExactNewtonJointPsiTerms>, String> {
547        validate_block_count::<GamlssError>("GaussianLocationScaleFamily", 2, block_states.len())?;
548        if specs.len() != 2 || derivative_blocks.len() != 2 {
549            return Err(GamlssError::DimensionMismatch { reason: format!(
550                "GaussianLocationScaleFamily joint psi terms expect 2 specs and 2 derivative blocks, got {} and {}",
551                specs.len(),
552                derivative_blocks.len()
553            ) }.into());
554        }
555        let Some(dir_a) = self.exact_newton_joint_psi_direction(
556            block_states,
557            derivative_blocks,
558            psi_index,
559            xmu,
560            x_ls,
561            &self.policy,
562        )?
563        else {
564            return Ok(None);
565        };
566        // Gaussian 2-block location-scale family in the unified flattened
567        // coefficient space beta = [betamu; beta_sigma]:
568        //
569        //   mu_i = z_i^T betamu,
570        //   ell_i = x_i^T beta_sigma,
571        //   s_i = exp(ell_i),
572        //   r_i = y_i - mu_i,
573        //   q_i = r_i / s_i,
574        //   w_i = s_i^{-2},
575        //   alpha_i = r_i s_i^{-2},
576        //   b_i = q_i^2.
577        //
578        // The first fixed-beta psi object returned here is likelihood-only:
579        //
580        //   D_a         = -alpha^T m_a + (1 - b)^T ell_a
581        //   D_{beta a}  = [ -Xmu^T alpha_a - X_{mu,a}^T alpha ;
582        //                   -X_sigma^T b_a + X_{sigma,a}^T (1-b) ]
583        //   D_{bb a}    = [ Xmu^T W_a Xmu + X_{mu,a}^T W Xmu + Xmu^T W X_{mu,a},
584        //                   2( Xmu^T A_a X_sigma + X_{mu,a}^T A X_sigma + Xmu^T A X_{sigma,a} );
585        //                   sym,
586        //                   2( X_sigma^T B_a X_sigma + X_{sigma,a}^T B X_sigma + X_sigma^T B X_{sigma,a} ) ]
587        //
588        // with m_a = X_{mu,a} betamu, ell_a = X_{sigma,a} beta_sigma and
589        // rowwise scalar drifts
590        //
591        //   w_a     = -2 w * ell_a
592        //   alpha_a = -w * m_a - 2 alpha * ell_a
593        //   b_a     = -2 alpha * m_a - 2 b * ell_a.
594        //
595        // Generic code in custom_family.rs promotes these likelihood-only
596        // objects to the full fixed-beta V_a / g_a / H_a by adding S_a.
597        let etamu = &block_states[Self::BLOCK_MU].eta;
598        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
599        let rows = self.get_or_compute_row_scalars(etamu, eta_ls)?;
600        let weights_a =
601            gaussian_joint_psi_firstweights(&rows, &dir_a.z_primary_psi, &dir_a.z_ls_psi);
602        let objective_psi = weights_a.objective_psirow.sum();
603        let xmu_map = dir_a.x_primary_psi.as_linear_map_ref();
604        let x_ls_map = dir_a.x_ls_psi.as_linear_map_ref();
605        let score_mu =
606            xmu_map.transpose_mul(weights_a.scoremu.view()) + fast_atv(xmu, &weights_a.dscoremu);
607        let score_ls = x_ls_map.transpose_mul(weights_a.score_ls.view())
608            + fast_atv(x_ls, &weights_a.dscore_ls);
609        let score_psi = gaussian_pack_joint_score(&score_mu, &score_ls);
610        let hessian_psi_operator = build_two_block_custom_family_joint_psi_operator_from_actions(
611            dir_a.x_primary_psi.cloned_first_action(),
612            dir_a.x_ls_psi.cloned_first_action(),
613            0..xmu.ncols(),
614            xmu.ncols()..xmu.ncols() + x_ls.ncols(),
615            xmu,
616            x_ls,
617            &weights_a.hmumu,
618            &weights_a.hmu_ls,
619            &weights_a.h_ls_ls,
620            &weights_a.dhmumu,
621            &weights_a.dhmu_ls,
622            &weights_a.dh_ls_ls,
623        )?;
624        let hessian_psi = if hessian_psi_operator.is_some() {
625            Array2::zeros((0, 0))
626        } else {
627            gaussian_joint_psihessian_fromweights(xmu, x_ls, xmu_map, x_ls_map, &weights_a)?
628        };
629
630        Ok(Some(crate::custom_family::ExactNewtonJointPsiTerms {
631            objective_psi,
632            score_psi,
633            hessian_psi,
634            hessian_psi_operator,
635        }))
636    }
637
638    pub(crate) fn exact_newton_joint_psisecond_order_terms_from_designs(
639        &self,
640        block_states: &[ParameterBlockState],
641        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
642        psi_i: usize,
643        psi_j: usize,
644        xmu: &Array2<f64>,
645        x_ls: &Array2<f64>,
646    ) -> Result<Option<crate::custom_family::ExactNewtonJointPsiSecondOrderTerms>, String> {
647        let Some(dir_i) = self.exact_newton_joint_psi_direction(
648            block_states,
649            derivative_blocks,
650            psi_i,
651            xmu,
652            x_ls,
653            &self.policy,
654        )?
655        else {
656            return Ok(None);
657        };
658        let Some(dir_j) = self.exact_newton_joint_psi_direction(
659            block_states,
660            derivative_blocks,
661            psi_j,
662            xmu,
663            x_ls,
664            &self.policy,
665        )?
666        else {
667            return Ok(None);
668        };
669        Ok(Some(
670            self.exact_newton_joint_psisecond_order_terms_from_parts(
671                block_states,
672                derivative_blocks,
673                &dir_i,
674                &dir_j,
675                xmu,
676                x_ls,
677                None,
678            )?,
679        ))
680    }
681
682    pub(crate) fn exact_newton_joint_psisecond_order_terms_from_parts(
683        &self,
684        block_states: &[ParameterBlockState],
685        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
686        dir_i: &LocationScaleJointPsiDirection,
687        dir_j: &LocationScaleJointPsiDirection,
688        xmu: &Array2<f64>,
689        x_ls: &Array2<f64>,
690        subsample: Option<&[crate::outer_subsample::WeightedOuterRow]>,
691    ) -> Result<crate::custom_family::ExactNewtonJointPsiSecondOrderTerms, String> {
692        let second_drifts = self.exact_newton_joint_psisecond_design_drifts(
693            block_states,
694            derivative_blocks,
695            dir_i,
696            dir_j,
697            xmu,
698            x_ls,
699        )?;
700        let n = self.y.len();
701        let xmu_i_map = dir_i.x_primary_psi.as_linear_map_ref();
702        let x_ls_i_map = dir_i.x_ls_psi.as_linear_map_ref();
703        let xmu_j_map = dir_j.x_primary_psi.as_linear_map_ref();
704        let x_ls_j_map = dir_j.x_ls_psi.as_linear_map_ref();
705        let xmu_ab_map = second_psi_linear_map(
706            second_drifts.x_primary_ab_action.as_ref(),
707            second_drifts.x_primary_ab.as_ref(),
708            n,
709            xmu.ncols(),
710        );
711        let x_ls_ab_map = second_psi_linear_map(
712            second_drifts.x_ls_ab_action.as_ref(),
713            second_drifts.x_ls_ab.as_ref(),
714            n,
715            x_ls.ncols(),
716        );
717        // Second fixed-beta psi objects for the same Gaussian location-scale
718        // kernel. Using the notation from the first-order comment, the rowwise
719        // second psi drifts are
720        //
721        //   w_ab     = 4 w * ell_a * ell_b - 2 w * ell_ab
722        //   alpha_ab = 2 w * (m_a * ell_b + m_b * ell_a)
723        //              + 4 alpha * ell_a * ell_b
724        //              - w * m_ab
725        //              - 2 alpha * ell_ab
726        //   b_ab     = 2 w * m_a * m_b
727        //              + 4 alpha * (m_a * ell_b + m_b * ell_a)
728        //              + 4 b * ell_a * ell_b
729        //              - 2 alpha * m_ab
730        //              - 2 b * ell_ab.
731        //
732        // The exact likelihood-only second-order objects are then:
733        //
734        //   D_ab,
735        //   D_{beta ab},
736        //   D_{beta beta ab},
737        //
738        // assembled from the usual product-rule expansion over realized
739        // design motion X_{.,a}, X_{.,b}, X_{.,ab}. Generic code adds S_ab.
740        let etamu = &block_states[Self::BLOCK_MU].eta;
741        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
742        let rows = self.get_or_compute_row_scalars(etamu, eta_ls)?;
743        let mut weights_i =
744            gaussian_joint_psi_firstweights(&rows, &dir_i.z_primary_psi, &dir_i.z_ls_psi);
745        let mut weights_j =
746            gaussian_joint_psi_firstweights(&rows, &dir_j.z_primary_psi, &dir_j.z_ls_psi);
747        let mut secondweights = gaussian_joint_psisecondweights(
748            &rows,
749            &dir_i.z_primary_psi,
750            &dir_i.z_ls_psi,
751            &dir_j.z_primary_psi,
752            &dir_j.z_ls_psi,
753            &second_drifts.z_primary_ab,
754            &second_drifts.z_ls_ab,
755        );
756        if let Some(sub_rows) = subsample {
757            // HT mask: every downstream consumer (gaussian_joint_psisecondhessian_fromweights,
758            // weighted_crossprod_psi_maps with weights_*.{hmumu,hmu_ls,h_ls_ls},
759            // fast_atv on d2score_* and dscore_*) is row-linear in these arrays, so
760            // scaling sampled rows by 1/π_i and zeroing the rest yields an unbiased
761            // estimator of the full-data second-order ψ Hessian and ψ score.
762            apply_ht_mask_first(&mut weights_i, sub_rows);
763            apply_ht_mask_first(&mut weights_j, sub_rows);
764            apply_ht_mask_second(&mut secondweights, sub_rows);
765        }
766        let objective_psi_psi = secondweights.objective_psi_psirow.sum();
767
768        let score_psi_psi = gaussian_pack_joint_score(
769            &(xmu_ab_map.transpose_mul(weights_i.scoremu.view())
770                + xmu_i_map.transpose_mul(weights_j.dscoremu.view())
771                + xmu_j_map.transpose_mul(weights_i.dscoremu.view())
772                + fast_atv(xmu, &secondweights.d2scoremu)),
773            &(x_ls_ab_map.transpose_mul(weights_i.score_ls.view())
774                + x_ls_i_map.transpose_mul(weights_j.dscore_ls.view())
775                + x_ls_j_map.transpose_mul(weights_i.dscore_ls.view())
776                + fast_atv(x_ls, &secondweights.d2score_ls)),
777        );
778        let hessian_psi_psi = gaussian_joint_psisecondhessian_fromweights(
779            xmu,
780            x_ls,
781            xmu_i_map,
782            x_ls_i_map,
783            xmu_j_map,
784            x_ls_j_map,
785            xmu_ab_map,
786            x_ls_ab_map,
787            &weights_i,
788            &weights_j,
789            &secondweights,
790        )?;
791
792        Ok(crate::custom_family::ExactNewtonJointPsiSecondOrderTerms {
793            objective_psi_psi,
794            score_psi_psi,
795            hessian_psi_psi,
796            hessian_psi_psi_operator: None,
797        })
798    }
799
800    pub(crate) fn exact_newton_joint_psihessian_directional_derivative_from_designs(
801        &self,
802        block_states: &[ParameterBlockState],
803        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
804        psi_index: usize,
805        d_beta_flat: &Array1<f64>,
806        xmu: &Array2<f64>,
807        x_ls: &Array2<f64>,
808    ) -> Result<Option<Array2<f64>>, String> {
809        let Some(dir_a) = self.exact_newton_joint_psi_direction(
810            block_states,
811            derivative_blocks,
812            psi_index,
813            xmu,
814            x_ls,
815            &self.policy,
816        )?
817        else {
818            return Ok(None);
819        };
820        Ok(Some(
821            self.exact_newton_joint_psihessian_directional_derivative_from_parts(
822                block_states,
823                &dir_a,
824                d_beta_flat,
825                xmu,
826                x_ls,
827                None,
828            )?,
829        ))
830    }
831
832    pub(crate) fn exact_newton_joint_psihessian_directional_derivative_from_parts(
833        &self,
834        block_states: &[ParameterBlockState],
835        dir_a: &LocationScaleJointPsiDirection,
836        d_beta_flat: &Array1<f64>,
837        xmu: &Array2<f64>,
838        x_ls: &Array2<f64>,
839        subsample: Option<&[crate::outer_subsample::WeightedOuterRow]>,
840    ) -> Result<Array2<f64>, String> {
841        let etamu = &block_states[Self::BLOCK_MU].eta;
842        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
843        let pmu = xmu.ncols();
844        let p_ls = x_ls.ncols();
845        let xmu_map = dir_a.x_primary_psi.as_linear_map_ref();
846        let x_ls_map = dir_a.x_ls_psi.as_linear_map_ref();
847        let total = pmu + p_ls;
848        if d_beta_flat.len() != total {
849            return Err(GamlssError::DimensionMismatch { reason: format!(
850                "GaussianLocationScaleFamily joint psi hessian directional derivative length mismatch: got {}, expected {}",
851                d_beta_flat.len(),
852                total
853            ) }.into());
854        }
855        // Only the log-σ–channel direction enters the surviving Fisher blocks
856        // of the mixed drift (the μ-channel direction fed the observed cross
857        // block, now Fisher 0; μ ⊥ σ, #684).
858        let u_ls = d_beta_flat.slice(s![pmu..pmu + p_ls]);
859        let xi_ls = fast_av(x_ls, &u_ls);
860        let uza_ls = x_ls_map.forward_mul(u_ls);
861        // Mixed drift T_a[u] = D_beta H_a^{(D)}[u] for the Gaussian family.
862        //
863        // Along u = [umu; u_sigma], define xi = Xmu umu and zeta = X_sigma u_sigma.
864        // The first beta-directional drifts of the Gaussian row scalars are
865        //
866        //   d_u w     = -2 w * zeta
867        //   d_u alpha = -w * xi - 2 alpha * zeta
868        //   d_u b     = -2 alpha * xi - 2 b * zeta.
869        //
870        // Differentiating the psi-a scalar drifts once more gives
871        //
872        //   d_u w_a     = 4 w * ell_a * zeta - 2 w * zeta_a
873        //   d_u alpha_a = 2 w * (m_a * zeta + ell_a * xi)
874        //                 - w * xi_a
875        //                 + 4 alpha * ell_a * zeta
876        //                 - 2 alpha * zeta_a
877        //   d_u b_a     = 2 w * m_a * xi
878        //                 + 4 alpha * (m_a * zeta + ell_a * xi)
879        //                 + 4 b * ell_a * zeta
880        //                 - 2 alpha * xi_a
881        //                 - 2 b * zeta_a.
882        //
883        // The matrix drift returned here is the exact likelihood-only
884        //
885        //   T_a[u] = D_beta H_{psi_a}^{(D)}[u],
886        //
887        // assembled blockwise as
888        //
889        //   Kmumu,a[u]   = Xmu^T W_a[u] Xmu
890        //                   + X_{mu,a}^T W[u] Xmu
891        //                   + Xmu^T W[u] X_{mu,a}
892        //   Kmusigma,a[u]= 2( Xmu^T A_a[u] X_sigma
893        //                   + X_{mu,a}^T A[u] X_sigma
894        //                   + Xmu^T A[u] X_{sigma,a} )
895        //   K_sigmasigma,a[u]
896        //                   = 2( X_sigma^T B_a[u] X_sigma
897        //                   + X_{sigma,a}^T B[u] X_sigma
898        //                   + X_sigma^T B[u] X_{sigma,a} ).
899        //
900        // Generic code then combines this with S(theta)-motion and the profile
901        // mode responses to form ddot H_{ij}.
902        let rows = self.get_or_compute_row_scalars(etamu, eta_ls)?;
903        let mut mixedweights =
904            gaussian_joint_psi_mixed_driftweights(&rows, &xi_ls, &dir_a.z_ls_psi, &uza_ls);
905        if let Some(sub_rows) = subsample {
906            // HT mask: `gaussian_joint_psi_mixedhessian_drift_fromweights` is
907            // row-linear in every `mixedweights.*` array via `xt_diag_*_dense`
908            // and `weighted_crossprod_psi_maps`, so the masked Hessian-drift
909            // remains an unbiased estimator of the full-data drift.
910            apply_ht_mask_mixed(&mut mixedweights, sub_rows);
911        }
912
913        gaussian_joint_psi_mixedhessian_drift_fromweights(
914            xmu,
915            x_ls,
916            xmu_map,
917            x_ls_map,
918            &mixedweights,
919        )
920    }
921
922    /// Build the [`BlockEffectiveJacobian`] for block `block_idx` given the
923    /// realised block specs.  Returns an [`AdditiveBlockJacobian`] encoding the
924    /// linear map η_r[i] = X_r[i,:] · β_r:
925    ///
926    /// - block 0 (mu):       output 0 = design rows, output 1 = zeros
927    /// - block 1 (log_sigma): output 0 = zeros, output 1 = design rows
928    pub fn block_effective_jacobian(
929        specs: &[ParameterBlockSpec],
930        block_idx: usize,
931    ) -> Result<Box<dyn BlockEffectiveJacobian>, String> {
932        crate::block_layout::block_jacobian::AdditiveWiggleBlockLayout {
933            family: "GaussianLocationScaleFamily",
934            n_outputs: 2,
935            additive_blocks: &[Self::BLOCK_MU, Self::BLOCK_LOG_SIGMA],
936            wiggle_block: None,
937        }
938        .block_effective_jacobian(specs, block_idx)
939    }
940}
941
942/// Per-subject 2×2 channel Hessian `W_i` for Gaussian location-scale.
943///
944/// The row negative log-likelihood (with per-row weight `w_i`, response `y_i`,
945/// mean predictor `μ_i`, log-scale predictor `s_i = log σ_i`) is
946///
947/// ```text
948/// ρ_i(μ, s) = w_i [s + 0.5·(y_i − μ)²·exp(−2s)]
949/// ```
950///
951/// The 2×2 Hessian in `(μ, s)` coordinates:
952///
953/// ```text
954/// W_i[0,0] = w_i · exp(−2 s_i)                        ∂²ρ/∂μ²
955/// W_i[1,1] = w_i · 2·(y_i − μ_i)²·exp(−2 s_i)        ∂²ρ/∂s²
956/// W_i[0,1] = W_i[1,0] = w_i · 2·(y_i − μ_i)·exp(−2 s_i)  ∂²ρ/∂μ∂s
957/// ```
958///
959/// The off-diagonal cross-channel term `∂²ρ/∂μ∂s` is nonzero whenever the
960/// residual `(y_i − μ_i) ≠ 0`, i.e. away from the fitted mean.
961pub struct GaussianLocationScaleChannelHessian {
962    /// Row-major `(n × 2 × 2)` PSD-clamped per-subject Hessian.
963    pub(crate) h: ndarray::Array3<f64>,
964}
965
966impl GaussianLocationScaleChannelHessian {
967    /// Construct the raw (un-PSD-clamped) per-subject observed Hessian.
968    ///
969    /// For Gaussian location-scale the 2×2 observed Hessian
970    /// `[[w·e^{-2s}, 2·w·r·e^{-2s}], [2·w·r·e^{-2s}, 2·w·r²·e^{-2s}]]`
971    /// has determinant `-2·w²·r²·e^{-4s}` which is non-positive whenever
972    /// the residual `r = y − μ ≠ 0`. Tests that finite-difference the row
973    /// NLL must compare against this raw observed Hessian — PSD clamping
974    /// alters the eigenvalues and the FD-versus-closed-form match fails.
975    ///
976    /// Production code that needs a PSD matrix (e.g. the canonicalize gate)
977    /// must call [`Self::from_pilot`] which PSD-clamps via 2×2
978    /// eigendecomposition.
979    pub fn from_pilot_observed_unclamped(
980        y: &ndarray::Array1<f64>,
981        w: &ndarray::Array1<f64>,
982        eta_mu: &ndarray::Array1<f64>,
983        eta_log_sigma: &ndarray::Array1<f64>,
984    ) -> Result<Self, String> {
985        let n = y.len();
986        if w.len() != n || eta_mu.len() != n || eta_log_sigma.len() != n {
987            return Err(format!(
988                "GaussianLocationScaleChannelHessian::from_pilot_observed_unclamped: \
989                 length mismatch y={n} w={} eta_mu={} eta_log_sigma={}",
990                w.len(),
991                eta_mu.len(),
992                eta_log_sigma.len(),
993            ));
994        }
995        let mut h = ndarray::Array3::<f64>::zeros((n, 2, 2));
996        for i in 0..n {
997            let wi = w[i];
998            let mu_i = eta_mu[i];
999            let s_i = eta_log_sigma[i];
1000            let inv_sigma2 = (-2.0 * s_i).exp();
1001            let resid = y[i] - mu_i;
1002            h[[i, 0, 0]] = wi * inv_sigma2;
1003            h[[i, 1, 1]] = wi * 2.0 * resid * resid * inv_sigma2;
1004            h[[i, 0, 1]] = wi * 2.0 * resid * inv_sigma2;
1005            h[[i, 1, 0]] = h[[i, 0, 1]];
1006        }
1007        Ok(Self { h })
1008    }
1009
1010    /// Construct from pilot predictors (μ and log σ at current β) and data,
1011    /// with PSD eigenvalue clamping applied per subject.
1012    ///
1013    /// `y` is the response, `w` the per-row sample weights, `eta_mu` and
1014    /// `eta_log_sigma` the current linear predictors. Negative eigenvalues
1015    /// are projected to zero (PSD clamp) before storage so the resulting
1016    /// matrix is a valid metric for the W-Gram identifiability compile.
1017    pub fn from_pilot(
1018        y: &ndarray::Array1<f64>,
1019        w: &ndarray::Array1<f64>,
1020        eta_mu: &ndarray::Array1<f64>,
1021        eta_log_sigma: &ndarray::Array1<f64>,
1022    ) -> Result<Self, String> {
1023        let n = y.len();
1024        if w.len() != n || eta_mu.len() != n || eta_log_sigma.len() != n {
1025            return Err(format!(
1026                "GaussianLocationScaleChannelHessian::from_pilot: \
1027                 length mismatch y={n} w={} eta_mu={} eta_log_sigma={}",
1028                w.len(),
1029                eta_mu.len(),
1030                eta_log_sigma.len(),
1031            ));
1032        }
1033        let mut h = ndarray::Array3::<f64>::zeros((n, 2, 2));
1034        for i in 0..n {
1035            let wi = w[i];
1036            let mu_i = eta_mu[i];
1037            let s_i = eta_log_sigma[i];
1038            let inv_sigma2 = (-2.0 * s_i).exp(); // exp(-2s) = 1/sigma^2
1039            let resid = y[i] - mu_i;
1040            // Hessian of w_i * ρ_i
1041            let h00 = wi * inv_sigma2;
1042            let h11 = wi * 2.0 * resid * resid * inv_sigma2;
1043            let h01 = wi * 2.0 * resid * inv_sigma2;
1044            // PSD clamp via eigendecomposition of 2×2 matrix.
1045            // psd_clamp_2x2 returns (λ1, λ2, u1[0], u1[1], u2[0], u2[1])
1046            // where u1 and u2 are unit eigenvectors for λ1 and λ2.
1047            // Reconstruction: H_psd = λ1·u1·u1ᵀ + λ2·u2·u2ᵀ
1048            let (e0, e1, u1_0, u1_1, u2_0, u2_1) = psd_clamp_2x2(h00, h01, h11);
1049            h[[i, 0, 0]] = e0 * u1_0 * u1_0 + e1 * u2_0 * u2_0;
1050            h[[i, 0, 1]] = e0 * u1_0 * u1_1 + e1 * u2_0 * u2_1;
1051            h[[i, 1, 0]] = h[[i, 0, 1]];
1052            h[[i, 1, 1]] = e0 * u1_1 * u1_1 + e1 * u2_1 * u2_1;
1053        }
1054        Ok(Self { h })
1055    }
1056}
1057
1058impl FamilyChannelHessian for GaussianLocationScaleChannelHessian {
1059    fn n_outputs(&self) -> usize {
1060        2
1061    }
1062
1063    fn n_subjects(&self) -> usize {
1064        self.h.shape()[0]
1065    }
1066
1067    fn fill_subject(&self, i: usize, out: &mut [f64]) {
1068        assert_eq!(out.len(), 4);
1069        out[0] = self.h[[i, 0, 0]];
1070        out[1] = self.h[[i, 0, 1]];
1071        out[2] = self.h[[i, 1, 0]];
1072        out[3] = self.h[[i, 1, 1]];
1073    }
1074
1075    fn evaluate_full(&self) -> ndarray::Array3<f64> {
1076        self.h.clone()
1077    }
1078}
1079
1080impl CustomFamily for GaussianLocationScaleFamily {
1081    /// The Gaussian location-scale joint Hessian depends on β because the
1082    /// cross-block (μ,log σ) and (log σ,log σ) blocks contain the residual
1083    /// r = y − μ (via the row scalars m = r·w and n = r²·w), which changes
1084    /// when β_μ moves.  The (μ,μ) block weight w = 1/σ² also depends on
1085    /// β_{log σ}.  This override is essential for correct M_j[u] drift
1086    /// corrections when ψ hyperparameters move the design matrices.
1087    fn exact_newton_joint_hessian_beta_dependent(&self) -> bool {
1088        true
1089    }
1090
1091    /// Gaussian location-scale carries a NON-profiled second (log-σ) linear
1092    /// predictor, so — unlike an ordinary Gaussian GAM whose scalar dispersion is
1093    /// profiled out analytically — its smoothing-parameter selection exhibits the
1094    /// same capped-screening over-smoothing bias as a GLM block: the capped
1095    /// inner-iteration screening proxy ranks an over-smoothed scale seed cheapest
1096    /// (its coefficients collapse into the penalty null space and the proxy looks
1097    /// converged), so the log-σ smooth is flattened toward a constant σ, the
1098    /// 1/σ² IRLS weights go wrong, and the weight-coupled mean degrades too.
1099    ///
1100    /// The default trait config classifies this as the generic
1101    /// `GeneralizedLinear` profile (seed_budget=1, capped screening, a seed grid
1102    /// reaching only ρ≈−2, and the *parsimonious* — smoothing-biased — keep-best),
1103    /// every part of which pushes the scale toward over-smoothing. The spatial
1104    /// (Matérn/GP) location-scale path already classifies the family as
1105    /// `GaussianLocationScale`; this override extends that same correct
1106    /// classification to the NON-spatial (thin-plate / P-spline) rho-only path,
1107    /// which is the one a `s(x, bs='tp')` location-scale fit actually takes. The
1108    /// `GaussianLocationScale` profile reuses Gaussian's flexible seed grid (which
1109    /// reaches the low-λ scale basin) and Gaussian's lowest-cost keep-best (no
1110    /// smoothing-biased tie-break), while still taking the interior-extreme seed
1111    /// promotion so the flexible basin is actually full-solved. The budget mirrors
1112    /// the spatial `exact_joint_seed_config(Gaussian)` (max_seeds=4, seed_budget=2).
1113    fn outer_seed_config(&self, n_params: usize) -> crate::seeding::SeedConfig {
1114        if n_params == 0 {
1115            return crate::seeding::SeedConfig::default();
1116        }
1117        let mut config = crate::seeding::SeedConfig::default();
1118        config.risk_profile = crate::seeding::SeedRiskProfile::GaussianLocationScale;
1119        config.max_seeds = 4;
1120        config.seed_budget = 2;
1121        config
1122    }
1123
1124    /// Two independent linear predictors: block 0 → μ channel, block 1 → log σ
1125    /// channel. Declaring the channel topology lets `fit_custom_family` route
1126    /// the identifiability audit channel-aware even when a caller builds the
1127    /// blocks by hand (without `build_location_scale_block`'s callbacks), so a
1128    /// shared μ/log-σ covariate basis is recognised as block-diagonal rather
1129    /// than mistaken for cross-block intercept aliases (#558).
1130    fn output_channel_assignment(&self, specs: &[ParameterBlockSpec]) -> Option<Vec<usize>> {
1131        // Two-channel families: `[mu, log_sigma]`. The optional trailing
1132        // zero-channel wiggle block (when present) also drives channel 0.
1133        Some(
1134            (0..specs.len())
1135                .map(|i| usize::from(i == Self::BLOCK_LOG_SIGMA))
1136                .collect(),
1137        )
1138    }
1139
1140    fn coefficient_hessian_cost(&self, specs: &[ParameterBlockSpec]) -> u64 {
1141        // Operator-aware: when the unified evaluator picks the matrix-free
1142        // joint Hessian path (see `use_joint_matrix_free_path`), the workspace
1143        // applies the joint Hessian via row-streaming Khatri-Rao matvecs at
1144        // O(n · (p_t + p_ℓ)) per Hv, never building the dense (p_t + p_ℓ)²
1145        // matrix. Report the operator work model so diagnostics and
1146        // first-order-only policies reflect the representation that actually
1147        // runs.
1148        crate::location_scale_engine::location_scale_coefficient_hessian_cost(
1149            self.y.len() as u64,
1150            specs,
1151        )
1152    }
1153
1154    fn evaluate(&self, block_states: &[ParameterBlockState]) -> Result<FamilyEvaluation, String> {
1155        validate_block_count::<GamlssError>("GaussianLocationScaleFamily", 2, block_states.len())?;
1156        let n = self.y.len();
1157        let etamu = &block_states[Self::BLOCK_MU].eta;
1158        let eta_log_sigma = &block_states[Self::BLOCK_LOG_SIGMA].eta;
1159        if etamu.len() != n || eta_log_sigma.len() != n || self.weights.len() != n {
1160            return Err(GamlssError::DimensionMismatch {
1161                reason: "GaussianLocationScaleFamily input size mismatch".to_string(),
1162            }
1163            .into());
1164        }
1165
1166        // Diagonal IRLS weights for the inner solver.
1167        //
1168        // For the location block (identity link): wmu = pw / sigma^2. Since the
1169        // location link is identity, observed = Fisher --- no correction needed.
1170        //
1171        // For the log-sigma block (log link): w_ls = 2 * pw * (dsigma/deta)^2 / sigma^2.
1172        // This is the Fisher weight. For the outer REML, the joint
1173        // `exact_newton_joint_hessian` provides the full observed Hessian directly,
1174        // so these Diagonal weights are only used for the inner IRLS iteration
1175        // (where Fisher scoring is fine). See response.md Section 3.
1176        //
1177        let mut zmu = Array1::<f64>::zeros(n);
1178        let mut wmu = Array1::<f64>::zeros(n);
1179        let mut z_ls = Array1::<f64>::zeros(n);
1180        let mut w_ls = Array1::<f64>::zeros(n);
1181        let ln2pi = (2.0 * std::f64::consts::PI).ln();
1182        let mut ll = 0.0;
1183
1184        const CHUNK: usize = 1024;
1185        if let (
1186            Some(y_s),
1187            Some(w_s),
1188            Some(mu_s),
1189            Some(ls_s),
1190            Some(zmu_s),
1191            Some(wmu_s),
1192            Some(zls_s),
1193            Some(wls_s),
1194        ) = (
1195            self.y.as_slice_memory_order(),
1196            self.weights.as_slice_memory_order(),
1197            etamu.as_slice_memory_order(),
1198            eta_log_sigma.as_slice_memory_order(),
1199            zmu.as_slice_memory_order_mut(),
1200            wmu.as_slice_memory_order_mut(),
1201            z_ls.as_slice_memory_order_mut(),
1202            w_ls.as_slice_memory_order_mut(),
1203        ) {
1204            // Per-row Gaussian LS kernel writes 4 working arrays directly into
1205            // the output slices; ll is reduced via Rayon's sum. Independent
1206            // across rows.
1207            ll += zmu_s
1208                .par_chunks_mut(CHUNK)
1209                .zip(wmu_s.par_chunks_mut(CHUNK))
1210                .zip(zls_s.par_chunks_mut(CHUNK))
1211                .zip(wls_s.par_chunks_mut(CHUNK))
1212                .enumerate()
1213                .map(|(chunk_idx, (((zmu_c, wmu_c), zls_c), wls_c))| {
1214                    let start = chunk_idx * CHUNK;
1215                    let mut local_ll = 0.0;
1216                    for local in 0..zmu_c.len() {
1217                        let i = start + local;
1218                        let row =
1219                            gaussian_diagonal_row_kernel(y_s[i], mu_s[i], ls_s[i], w_s[i], ln2pi);
1220                        zmu_c[local] = mu_s[i] + row.location_working_shift;
1221                        wmu_c[local] = row.location_working_weight;
1222                        zls_c[local] = row.log_sigma_working_response;
1223                        wls_c[local] = row.log_sigma_working_weight;
1224                        local_ll += row.log_likelihood;
1225                    }
1226                    local_ll
1227                })
1228                .sum::<f64>();
1229        } else {
1230            // Fallback path: inputs are not contiguous. Outputs (just-allocated
1231            // Array1::zeros) always are. Reborrow input views into the closure.
1232            let y_view = self.y.view();
1233            let w_view = self.weights.view();
1234            let mu_view = etamu.view();
1235            let ls_view = eta_log_sigma.view();
1236            let zmu_s = zmu
1237                .as_slice_memory_order_mut()
1238                .expect("zeros is contiguous");
1239            let wmu_s = wmu
1240                .as_slice_memory_order_mut()
1241                .expect("zeros is contiguous");
1242            let zls_s = z_ls
1243                .as_slice_memory_order_mut()
1244                .expect("zeros is contiguous");
1245            let wls_s = w_ls
1246                .as_slice_memory_order_mut()
1247                .expect("zeros is contiguous");
1248            ll += zmu_s
1249                .par_chunks_mut(CHUNK)
1250                .zip(wmu_s.par_chunks_mut(CHUNK))
1251                .zip(zls_s.par_chunks_mut(CHUNK))
1252                .zip(wls_s.par_chunks_mut(CHUNK))
1253                .enumerate()
1254                .map(|(chunk_idx, (((zmu_c, wmu_c), zls_c), wls_c))| {
1255                    let start = chunk_idx * CHUNK;
1256                    let mut local_ll = 0.0;
1257                    for local in 0..zmu_c.len() {
1258                        let i = start + local;
1259                        let row = gaussian_diagonal_row_kernel(
1260                            y_view[i], mu_view[i], ls_view[i], w_view[i], ln2pi,
1261                        );
1262                        zmu_c[local] = mu_view[i] + row.location_working_shift;
1263                        wmu_c[local] = row.location_working_weight;
1264                        zls_c[local] = row.log_sigma_working_response;
1265                        wls_c[local] = row.log_sigma_working_weight;
1266                        local_ll += row.log_likelihood;
1267                    }
1268                    local_ll
1269                })
1270                .sum::<f64>();
1271        }
1272
1273        Ok(FamilyEvaluation {
1274            log_likelihood: ll,
1275            blockworking_sets: vec![
1276                BlockWorkingSet::diagonal_checked(zmu, wmu)?,
1277                BlockWorkingSet::diagonal_checked(z_ls, w_ls)?,
1278            ],
1279        })
1280    }
1281
1282    fn log_likelihood_only(&self, block_states: &[ParameterBlockState]) -> Result<f64, String> {
1283        validate_block_count::<GamlssError>("GaussianLocationScaleFamily", 2, block_states.len())?;
1284        let n = self.y.len();
1285        let etamu = &block_states[Self::BLOCK_MU].eta;
1286        let eta_log_sigma = &block_states[Self::BLOCK_LOG_SIGMA].eta;
1287        if etamu.len() != n || eta_log_sigma.len() != n || self.weights.len() != n {
1288            return Err(GamlssError::DimensionMismatch {
1289                reason: "GaussianLocationScaleFamily input size mismatch".to_string(),
1290            }
1291            .into());
1292        }
1293        // logb noise link: σ(η_ls) = LOGB_SIGMA_FLOOR + exp(η_ls). σ ≥ b > 0
1294        // bounds the loglik below (−Σlog σ ≥ −n log b) and bounds 1/σ² by 1/b²,
1295        // so the previous `inv_s2.min(1e24)` cap is structurally unnecessary.
1296        let ln2pi = (2.0 * std::f64::consts::PI).ln();
1297        let mut ll = 0.0;
1298        if let (Some(y_s), Some(w_s), Some(mu_s), Some(ls_s)) = (
1299            self.y.as_slice_memory_order(),
1300            self.weights.as_slice_memory_order(),
1301            etamu.as_slice_memory_order(),
1302            eta_log_sigma.as_slice_memory_order(),
1303        ) {
1304            use rayon::iter::{IntoParallelIterator, ParallelIterator};
1305            ll += (0..n)
1306                .into_par_iter()
1307                .map(|i| {
1308                    let wi = w_s[i];
1309                    if wi == 0.0 {
1310                        return 0.0;
1311                    }
1312                    let sigma_i = logb_sigma_from_eta_scalar(ls_s[i]);
1313                    let inv_s2 = (sigma_i * sigma_i).recip();
1314                    let r = y_s[i] - mu_s[i];
1315                    wi * (-0.5 * (r * r * inv_s2 + ln2pi + 2.0 * sigma_i.ln()))
1316                })
1317                .sum::<f64>();
1318        } else {
1319            use rayon::iter::{IntoParallelIterator, ParallelIterator};
1320            ll += (0..n)
1321                .into_par_iter()
1322                .map(|i| {
1323                    let wi = self.weights[i];
1324                    if wi == 0.0 {
1325                        return 0.0;
1326                    }
1327                    let sigma_i = logb_sigma_from_eta_scalar(eta_log_sigma[i]);
1328                    let inv_s2 = (sigma_i * sigma_i).recip();
1329                    let r = self.y[i] - etamu[i];
1330                    wi * (-0.5 * (r * r * inv_s2 + ln2pi + 2.0 * sigma_i.ln()))
1331                })
1332                .sum::<f64>();
1333        }
1334        Ok(ll)
1335    }
1336
1337    /// Outer-only log-likelihood with optional row subsample.
1338    ///
1339    /// When `options.outer_score_subsample` is `Some`, only the sampled rows
1340    /// contribute; each row's per-row log-likelihood term is multiplied by
1341    /// `WeightedOuterRow.weight`, the Horvitz–Thompson inverse-inclusion
1342    /// factor 1/π_i (uniform or stratified sampling both supported), so the
1343    /// partial sum is an unbiased estimator of the full-data log-likelihood.
1344    /// When `None`, this returns the full-data `log_likelihood_only`. Inner
1345    /// PIRLS line searches never install the subsample option, so they
1346    /// continue to score the exact full-data log-likelihood.
1347    fn log_likelihood_only_with_options(
1348        &self,
1349        block_states: &[ParameterBlockState],
1350        options: &BlockwiseFitOptions,
1351    ) -> Result<f64, String> {
1352        let Some(subsample) = options.outer_score_subsample.as_ref() else {
1353            return self.log_likelihood_only(block_states);
1354        };
1355        validate_block_count::<GamlssError>("GaussianLocationScaleFamily", 2, block_states.len())?;
1356        let n = self.y.len();
1357        let etamu = &block_states[Self::BLOCK_MU].eta;
1358        let eta_log_sigma = &block_states[Self::BLOCK_LOG_SIGMA].eta;
1359        if etamu.len() != n || eta_log_sigma.len() != n || self.weights.len() != n {
1360            return Err(GamlssError::DimensionMismatch {
1361                reason: "GaussianLocationScaleFamily input size mismatch".to_string(),
1362            }
1363            .into());
1364        }
1365        let ln2pi = (2.0 * std::f64::consts::PI).ln();
1366        use rayon::iter::ParallelIterator;
1367        let ll: f64 = subsample
1368            .rows
1369            .par_iter()
1370            .map(|row| {
1371                let i = row.index;
1372                let wi = self.weights[i];
1373                if wi == 0.0 {
1374                    return 0.0;
1375                }
1376                let sigma_i = logb_sigma_from_eta_scalar(eta_log_sigma[i]);
1377                let inv_s2 = (sigma_i * sigma_i).recip();
1378                let r = self.y[i] - etamu[i];
1379                row.weight * wi * (-0.5 * (r * r * inv_s2 + ln2pi + 2.0 * sigma_i.ln()))
1380            })
1381            .sum();
1382        Ok(ll)
1383    }
1384
1385    fn exact_newton_joint_hessian(
1386        &self,
1387        block_states: &[ParameterBlockState],
1388    ) -> Result<Option<Array2<f64>>, String> {
1389        self.exact_newton_joint_hessian_for_specs(block_states, None)
1390    }
1391
1392    fn has_explicit_joint_hessian(&self) -> bool {
1393        true
1394    }
1395
1396    /// The Gaussian location-scale likelihood has no separation /
1397    /// under-identification regime that the full-span Jeffreys curvature `H_Φ`
1398    /// is meant to regularize: with the soft floor `σ ≥ b > 0` the per-row
1399    /// Fisher information `diag(a/σ², 2κ²a)` is bounded and `O(n)` on every
1400    /// identified direction at every working point, so the well-conditioned-`H`
1401    /// Jeffreys gate smooth-steps `H_Φ` to ~0 — yet the matching score `∇Φ`
1402    /// kept leaking a *phantom* penalized-stationarity residual into the inner
1403    /// joint-Newton (a nonzero `|∇L − Sβ|` paired with a numerically null `H_Φ`
1404    /// and a full-rank `H_pen`), so the KKT certificate refused every iterate
1405    /// and the outer REML rejected all seeds — aborting heteroscedastic
1406    /// location-scale fits (#684–#688). This is the same opt-out
1407    /// `TransformationNormalFamily` takes for the same structural reason
1408    /// (continuous response, `O(n)` Fisher information everywhere); it removes
1409    /// the phantom residual and drops the per-cycle `O(n·p²)` Jeffreys
1410    /// directional-derivative overhead.
1411    fn joint_jeffreys_term_required(&self) -> bool {
1412        false
1413    }
1414
1415    fn exact_newton_joint_hessian_directional_derivative(
1416        &self,
1417        block_states: &[ParameterBlockState],
1418        d_beta_flat: &Array1<f64>,
1419    ) -> Result<Option<Array2<f64>>, String> {
1420        self.exact_newton_joint_hessian_directional_derivative_for_specs(
1421            block_states,
1422            None,
1423            d_beta_flat,
1424        )
1425    }
1426
1427    fn exact_newton_joint_hessiansecond_directional_derivative(
1428        &self,
1429        block_states: &[ParameterBlockState],
1430        d_beta_u_flat: &Array1<f64>,
1431        d_betav_flat: &Array1<f64>,
1432    ) -> Result<Option<Array2<f64>>, String> {
1433        self.exact_newton_joint_hessian_second_directional_derivative_for_specs(
1434            block_states,
1435            None,
1436            d_beta_u_flat,
1437            d_betav_flat,
1438        )
1439    }
1440
1441    fn diagonalworking_weights_directional_derivative(
1442        &self,
1443        block_states: &[ParameterBlockState],
1444        block_idx: usize,
1445        d_eta: &Array1<f64>,
1446    ) -> Result<Option<Array1<f64>>, String> {
1447        validate_block_count::<GamlssError>("GaussianLocationScaleFamily", 2, block_states.len())?;
1448        let n = self.y.len();
1449        let eta_t = &block_states[Self::BLOCK_MU].eta;
1450        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
1451        if eta_t.len() != n || eta_ls.len() != n || self.weights.len() != n || d_eta.len() != n {
1452            return Err(GamlssError::DimensionMismatch {
1453                reason: "GaussianLocationScaleFamily input size mismatch".to_string(),
1454            }
1455            .into());
1456        }
1457
1458        let sigma = eta_ls.mapv(logb_sigma_from_eta_scalar);
1459        let mut dw = Array1::<f64>::zeros(n);
1460        match block_idx {
1461            Self::BLOCK_MU => {
1462                // Gaussian location block:
1463                //
1464                //   wmu = weight / sigma^2.
1465                //
1466                // This depends only on the scale predictor, so along a
1467                // location-only direction d etamu the directional derivative is
1468                // identically zero.
1469                Ok(Some(dw))
1470            }
1471            Self::BLOCK_LOG_SIGMA => {
1472                // Gaussian log-sigma block:
1473                //
1474                // The PIRLS information weight is
1475                //
1476                //   w_ls = max(2 * weight * clamp(g, -1, 1)^2, MIN_WEIGHT),
1477                //   g    = sigma'(eta_ls) / sigma(eta_ls),
1478                // with the semantic rule that zero observation weights stay zero.
1479                //
1480                // Along a direction d eta_ls,
1481                //
1482                //   dw_ls is the directional derivative of that piecewise
1483                //   definition. On the active clamp branch or active MIN_WEIGHT
1484                //   floor branch, the returned derivative is zero to match the
1485                //   selected local piece of the evaluated weight.
1486                //
1487                // This is the exact directional derivative needed by the REML
1488                // trace term
1489                //
1490                //   0.5 tr(J^{-1} D_beta J[u])
1491                //   = 0.5 sum_i (x_i^T J^{-1} x_i) dw_i
1492                //
1493                // for diagonal working-set blocks.
1494                use rayon::iter::{IntoParallelIterator, ParallelIterator};
1495                let dw_vec: Vec<f64> = (0..n)
1496                    .into_par_iter()
1497                    .map(|i| {
1498                        let d1 = crate::sigma_link::logb_sigma_jet1_scalar(eta_ls[i]).d1;
1499                        gaussian_log_sigma_irlsinfo_directional_derivative(
1500                            self.weights[i],
1501                            sigma[i],
1502                            d1,
1503                            d_eta[i],
1504                        )
1505                    })
1506                    .collect();
1507                for (i, v) in dw_vec.into_iter().enumerate() {
1508                    dw[i] = v;
1509                }
1510                Ok(Some(dw))
1511            }
1512            _ => Ok(None),
1513        }
1514    }
1515
1516    fn exact_newton_joint_hessian_with_specs(
1517        &self,
1518        block_states: &[ParameterBlockState],
1519        specs: &[ParameterBlockSpec],
1520    ) -> Result<Option<Array2<f64>>, String> {
1521        self.exact_newton_joint_hessian_for_specs(block_states, Some(specs))
1522    }
1523
1524    fn exact_newton_joint_hessian_directional_derivative_with_specs(
1525        &self,
1526        block_states: &[ParameterBlockState],
1527        specs: &[ParameterBlockSpec],
1528        d_beta_flat: &Array1<f64>,
1529    ) -> Result<Option<Array2<f64>>, String> {
1530        self.exact_newton_joint_hessian_directional_derivative_for_specs(
1531            block_states,
1532            Some(specs),
1533            d_beta_flat,
1534        )
1535    }
1536
1537    fn exact_newton_joint_hessian_second_directional_derivative_with_specs(
1538        &self,
1539        block_states: &[ParameterBlockState],
1540        specs: &[ParameterBlockSpec],
1541        d_beta_u_flat: &Array1<f64>,
1542        d_betav_flat: &Array1<f64>,
1543    ) -> Result<Option<Array2<f64>>, String> {
1544        self.exact_newton_joint_hessian_second_directional_derivative_for_specs(
1545            block_states,
1546            Some(specs),
1547            d_beta_u_flat,
1548            d_betav_flat,
1549        )
1550    }
1551
1552    fn exact_newton_joint_psi_terms(
1553        &self,
1554        block_states: &[ParameterBlockState],
1555        specs: &[ParameterBlockSpec],
1556        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
1557        psi_index: usize,
1558    ) -> Result<Option<crate::custom_family::ExactNewtonJointPsiTerms>, String> {
1559        self.exact_newton_joint_psi_terms_for_specs(
1560            block_states,
1561            specs,
1562            derivative_blocks,
1563            psi_index,
1564        )
1565    }
1566
1567    fn exact_newton_joint_psisecond_order_terms(
1568        &self,
1569        block_states: &[ParameterBlockState],
1570        specs: &[ParameterBlockSpec],
1571        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
1572        psi_i: usize,
1573        psi_j: usize,
1574    ) -> Result<Option<crate::custom_family::ExactNewtonJointPsiSecondOrderTerms>, String> {
1575        self.exact_newton_joint_psisecond_order_terms_for_specs(
1576            block_states,
1577            specs,
1578            derivative_blocks,
1579            psi_i,
1580            psi_j,
1581        )
1582    }
1583
1584    fn exact_newton_joint_psihessian_directional_derivative(
1585        &self,
1586        block_states: &[ParameterBlockState],
1587        specs: &[ParameterBlockSpec],
1588        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
1589        psi_index: usize,
1590        d_beta_flat: &Array1<f64>,
1591    ) -> Result<Option<Array2<f64>>, String> {
1592        self.exact_newton_joint_psihessian_directional_derivative_for_specs(
1593            block_states,
1594            specs,
1595            derivative_blocks,
1596            psi_index,
1597            d_beta_flat,
1598        )
1599    }
1600
1601    fn exact_newton_joint_psi_workspace(
1602        &self,
1603        block_states: &[ParameterBlockState],
1604        specs: &[ParameterBlockSpec],
1605        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
1606    ) -> Result<Option<Arc<dyn ExactNewtonJointPsiWorkspace>>, String> {
1607        validate_block_count::<GamlssError>("GaussianLocationScaleFamily", 2, block_states.len())?;
1608        if specs.len() != 2 || derivative_blocks.len() != 2 {
1609            return Err(GamlssError::DimensionMismatch { reason: format!(
1610                "GaussianLocationScaleFamily joint psi workspace expects 2 specs and 2 derivative block lists, got {} / {}",
1611                specs.len(),
1612                derivative_blocks.len()
1613            ) }.into());
1614        }
1615        Ok(Some(Arc::new(
1616            GaussianLocationScaleExactNewtonJointPsiWorkspace::new(
1617                self.clone(),
1618                block_states.to_vec(),
1619                specs,
1620                derivative_blocks.to_vec(),
1621            )?,
1622        )))
1623    }
1624
1625    /// Outer-aware joint ψ workspace with optional row subsample.
1626    ///
1627    /// When `options.outer_score_subsample` is `None`, this is byte-identical
1628    /// to `exact_newton_joint_psi_workspace`. When `Some`, the subsample is
1629    /// stored in the workspace and forwarded into every per-row weight array
1630    /// produced by `gaussian_joint_psi_firstweights`,
1631    /// `gaussian_joint_psisecondweights`, and
1632    /// `gaussian_joint_psi_mixed_driftweights`: each sampled row's
1633    /// contribution is multiplied by `WeightedOuterRow.weight = 1/π_i` and
1634    /// non-sampled rows are zeroed. Every downstream assembly
1635    /// (`gaussian_joint_psi*_fromweights`, `weighted_crossprod_psi_maps`,
1636    /// `xt_diag_*_dense`,
1637    /// `build_two_block_custom_family_joint_psi_operator_from_actions`) is
1638    /// row-linear in these arrays via `Xᵀ diag(W) Y`, so the resulting
1639    /// second-order ψ Hessian and ψ-Hessian directional derivative are
1640    /// unbiased Horvitz–Thompson estimators of the full-data quantities.
1641    /// Inner-PIRLS and final-covariance paths never install the option.
1642    fn exact_newton_joint_psi_workspace_with_options(
1643        &self,
1644        block_states: &[ParameterBlockState],
1645        specs: &[ParameterBlockSpec],
1646        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
1647        options: &BlockwiseFitOptions,
1648    ) -> Result<Option<Arc<dyn ExactNewtonJointPsiWorkspace>>, String> {
1649        validate_block_count::<GamlssError>("GaussianLocationScaleFamily", 2, block_states.len())?;
1650        if specs.len() != 2 || derivative_blocks.len() != 2 {
1651            return Err(GamlssError::DimensionMismatch { reason: format!(
1652                "GaussianLocationScaleFamily joint psi workspace expects 2 specs and 2 derivative block lists, got {} / {}",
1653                specs.len(),
1654                derivative_blocks.len()
1655            ) }.into());
1656        }
1657        Ok(Some(Arc::new(
1658            GaussianLocationScaleExactNewtonJointPsiWorkspace::new_with_subsample(
1659                self.clone(),
1660                block_states.to_vec(),
1661                specs,
1662                derivative_blocks.to_vec(),
1663                options.outer_score_subsample.clone(),
1664            )?,
1665        )))
1666    }
1667
1668    fn exact_newton_joint_hessian_workspace(
1669        &self,
1670        block_states: &[ParameterBlockState],
1671        specs: &[ParameterBlockSpec],
1672    ) -> Result<Option<Arc<dyn ExactNewtonJointHessianWorkspace>>, String> {
1673        let Some((xmu, x_ls)) = self.exact_joint_dense_block_designs(Some(specs))? else {
1674            return Ok(None);
1675        };
1676        let workspace = GaussianLocationScaleHessianWorkspace::new(
1677            self.clone(),
1678            block_states.to_vec(),
1679            xmu.into_owned(),
1680            x_ls.into_owned(),
1681        )?;
1682        Ok(Some(Arc::new(workspace)))
1683    }
1684
1685    /// Outer-aware joint-Hessian workspace with optional row subsample.
1686    ///
1687    /// When `options.outer_score_subsample` is `None`, this is byte-identical
1688    /// to `exact_newton_joint_hessian_workspace`. When `Some`, the precomputed
1689    /// per-row coefficient arrays (`coeff_mm`, `coeff_ml`, `coeff_ll`) — which
1690    /// every downstream assembly (`hessian_dense`, `hessian_matvec`,
1691    /// `hessian_diagonal`) consumes row-linearly via `Xᵀ diag(W) X` — are
1692    /// replaced by a Horvitz–Thompson mask: each sampled row's coefficient is
1693    /// multiplied by `WeightedOuterRow.weight` (the inverse-inclusion factor
1694    /// 1/π_i; uniform or stratified sampling both supported), and non-sampled
1695    /// rows are zeroed. The resulting joint Hessian is an unbiased estimator
1696    /// of the full-data joint Hessian. Inner PIRLS never installs the option,
1697    /// so the inner solve continues to consume the exact full-data Hessian.
1698    fn exact_newton_joint_hessian_workspace_with_options(
1699        &self,
1700        block_states: &[ParameterBlockState],
1701        specs: &[ParameterBlockSpec],
1702        options: &BlockwiseFitOptions,
1703    ) -> Result<Option<Arc<dyn ExactNewtonJointHessianWorkspace>>, String> {
1704        let Some((xmu, x_ls)) = self.exact_joint_dense_block_designs(Some(specs))? else {
1705            return Ok(None);
1706        };
1707        let mut workspace = GaussianLocationScaleHessianWorkspace::new(
1708            self.clone(),
1709            block_states.to_vec(),
1710            xmu.into_owned(),
1711            x_ls.into_owned(),
1712        )?;
1713        if let Some(subsample) = options.outer_score_subsample.as_ref() {
1714            workspace.apply_outer_subsample(subsample.rows.as_ref());
1715        }
1716        Ok(Some(Arc::new(workspace)))
1717    }
1718
1719    fn inner_coefficient_hessian_hvp_available(&self, specs: &[ParameterBlockSpec]) -> bool {
1720        // The Gaussian location-scale workspace is returned by
1721        // `exact_newton_joint_hessian_workspace` whenever
1722        // `exact_joint_dense_block_designs` succeeds, which itself depends on
1723        // both block designs being present. This is only a β-space operator
1724        // capability; outer θθ Hessian availability is declared separately.
1725        self.exact_joint_supported()
1726            && matches!(
1727                self.exact_joint_dense_block_designs(Some(specs)),
1728                Ok(Some(_))
1729            )
1730    }
1731
1732    /// Outer-derivative policy: declare HT-subsample capability.
1733    ///
1734    /// GaussianLocationScaleFamily overrides
1735    /// `log_likelihood_only_with_options`,
1736    /// `exact_newton_joint_hessian_workspace_with_options`, and
1737    /// `exact_newton_joint_psi_workspace_with_options` to consume
1738    /// `options.outer_score_subsample` with per-row Horvitz–Thompson weights
1739    /// (each sampled row's contribution is multiplied by
1740    /// `WeightedOuterRow.weight = 1/π_i`; non-sampled rows are zeroed),
1741    /// yielding unbiased estimators of the full-data log-likelihood, joint
1742    /// Hessian, and second-order ψ Hessian / ψ-Hessian directional
1743    /// derivative. The ψ-workspace masking happens inside
1744    /// `apply_ht_mask_first`, `apply_ht_mask_second`, and
1745    /// `apply_ht_mask_mixed` on the `GaussianJointPsi{First,Second,
1746    /// MixedDrift}Weights` per-row arrays, immediately after the row-scalar
1747    /// reductions and before the row-linear `weighted_crossprod_psi_maps` /
1748    /// `xt_diag_*_dense` assemblies, so the masked outputs remain unbiased.
1749    /// First-order ψ terms remain full-data exact (= trivially unbiased), so
1750    /// the total outer score is still unbiased. Inner-PIRLS and final-
1751    /// covariance paths never install the option, so they continue to
1752    /// consume the exact full-data quantities.
1753    fn outer_derivative_subsample_capable(&self) -> bool {
1754        true
1755    }
1756}
1757
1758impl CustomFamilyGenerative for GaussianLocationScaleFamily {
1759    fn generativespec(
1760        &self,
1761        block_states: &[ParameterBlockState],
1762    ) -> Result<GenerativeSpec, String> {
1763        validate_block_count::<GamlssError>("GaussianLocationScaleFamily", 2, block_states.len())?;
1764        let mu = block_states[Self::BLOCK_MU].eta.clone();
1765        let eta_log_sigma = &block_states[Self::BLOCK_LOG_SIGMA].eta;
1766        let sigma = gamlss_rowwise_map(eta_log_sigma.len(), |i| {
1767            logb_sigma_from_eta_scalar(eta_log_sigma[i])
1768        });
1769        Ok(GenerativeSpec {
1770            mean: mu,
1771            noise: NoiseModel::Gaussian { sigma },
1772        })
1773    }
1774}