Skip to main content

gam_models/gamlss/gaussian/
location_scale.rs

1// Real concern-organized submodule of the gamlss family stack.
2// Cross-module items are re-exported flat through the parent (`gamlss.rs`),
3// so `use super::*;` makes the sibling-concern symbols this module references
4// resolve through the parent namespace.
5use super::*;
6
7pub struct GaussianLocationScaleFamily {
8    pub y: Array1<f64>,
9    pub weights: Array1<f64>,
10    pub mu_design: Option<DesignMatrix>,
11    pub log_sigma_design: Option<DesignMatrix>,
12    /// Resource policy threaded into PsiDesignMap construction (and any other
13    /// per-call materialization decision) made during exact-Newton joint psi
14    /// derivative evaluation. Defaults to `ResourcePolicy::default_library()`
15    /// when the family is built without an explicit policy.
16    pub policy: gam_runtime::resource::ResourcePolicy,
17    /// Cached per-observation row scalars keyed by the FULL `(η_μ, η_logσ)`
18    /// predictor pair the scalars were computed at. The row scalars are a
19    /// deterministic function of `(η_μ, η_logσ)` (plus the fixed `y`/`weights`),
20    /// so a hit is only valid when both eta vectors match bit-for-bit element by
21    /// element — a lossy 3-point fingerprint could collide two genuinely
22    /// different predictors and serve STALE scalars, so the key is the whole
23    /// vectors. The compare is O(n), far cheaper than the O(n) transcendental
24    /// recompute it guards, and is hit K+ times per REML gradient/Hessian
25    /// evaluation under the same predictors.
26    pub cached_row_scalars:
27        std::sync::RwLock<Option<(Array1<f64>, Array1<f64>, Arc<GaussianJointRowScalars>)>>,
28}
29
30impl Clone for GaussianLocationScaleFamily {
31    fn clone(&self) -> Self {
32        Self {
33            y: self.y.clone(),
34            weights: self.weights.clone(),
35            mu_design: self.mu_design.clone(),
36            log_sigma_design: self.log_sigma_design.clone(),
37            policy: self.policy.clone(),
38            cached_row_scalars: std::sync::RwLock::new(
39                self.cached_row_scalars
40                    .read()
41                    .expect("lock poisoned")
42                    .clone(),
43            ),
44        }
45    }
46}
47
48impl GaussianLocationScaleFamily {
49    pub const BLOCK_MU: usize = 0;
50    pub const BLOCK_LOG_SIGMA: usize = 1;
51
52    /// Bit-exact equality of two η vectors as a cache key. Within one outer REML
53    /// evaluation η_μ and η_logσ are the fixed inner-converged predictors, so
54    /// every exact-joint consumer (Hessian / directional / second-directional /
55    /// ψ paths) is handed the identical pair; matching the full vectors lets us
56    /// recompute the O(n) transcendental row-scalar stack
57    /// (`gaussian_jointrow_scalars`: one `sigma_link` jet + `exp`/reciprocal per
58    /// row) ONCE and share an `Arc` across all of them — without ever serving a
59    /// stale cache hit to a different predictor. Comparison is on the raw bit
60    /// patterns so a stored entry whose key contains any `NaN` (e.g. a degenerate
61    /// predictor) never spuriously matches a fresh `NaN`-free key and vice versa,
62    /// and `±0.0` are kept distinct; the underlying constructor handles n = 0.
63    #[inline]
64    fn eta_keys_match(stored: &Array1<f64>, query: &Array1<f64>) -> bool {
65        stored.len() == query.len()
66            && stored
67                .iter()
68                .zip(query.iter())
69                .all(|(a, b)| a.to_bits() == b.to_bits())
70    }
71
72    pub(crate) fn get_or_compute_row_scalars(
73        &self,
74        etamu: &Array1<f64>,
75        eta_ls: &Array1<f64>,
76    ) -> Result<Arc<GaussianJointRowScalars>, String> {
77        // Fast path: full-key hit under a shared read lock. A cached entry is
78        // only reused when BOTH eta vectors match the query bit-for-bit, so a
79        // distinct predictor can never be served stale scalars.
80        if let Ok(guard) = self.cached_row_scalars.read() {
81            if let Some((cmu, cls, rows)) = guard.as_ref() {
82                if Self::eta_keys_match(cmu, etamu) && Self::eta_keys_match(cls, eta_ls) {
83                    return Ok(Arc::clone(rows));
84                }
85            }
86        }
87        // Miss: compute once and publish under the write lock. A concurrent
88        // race recomputing the same (η_μ, η_logσ) is harmless — identical
89        // inputs yield bit-identical scalars — so last-writer-wins is safe and
90        // every reader observes equal contents.
91        let rows = Arc::new(gaussian_jointrow_scalars(
92            &self.y,
93            etamu,
94            eta_ls,
95            &self.weights,
96        )?);
97        if let Ok(mut guard) = self.cached_row_scalars.write() {
98            *guard = Some((etamu.clone(), eta_ls.clone(), Arc::clone(&rows)));
99        }
100        Ok(rows)
101    }
102
103    pub fn parameternames() -> &'static [&'static str] {
104        &["mu", "log_sigma"]
105    }
106
107    pub fn parameter_links() -> &'static [ParameterLink] {
108        &[ParameterLink::Identity, ParameterLink::Log]
109    }
110
111    pub fn metadata() -> FamilyMetadata {
112        FamilyMetadata {
113            name: "gaussian_location_scale",
114            parameternames: Self::parameternames(),
115            parameter_links: Self::parameter_links(),
116        }
117    }
118
119    pub(crate) fn exact_joint_supported(&self) -> bool {
120        self.mu_design.is_some() && self.log_sigma_design.is_some()
121    }
122
123    pub(crate) fn exact_block_designs(
124        &self,
125    ) -> Result<(DenseOrOperator<'_>, DenseOrOperator<'_>), String> {
126        let mu_design = self.mu_design.as_ref().ok_or_else(|| {
127            "GaussianLocationScaleFamily exact path is missing mu design".to_string()
128        })?;
129        let log_sigma_design = self.log_sigma_design.as_ref().ok_or_else(|| {
130            "GaussianLocationScaleFamily exact path is missing log-sigma design".to_string()
131        })?;
132        let planned = dense_blocks_planned_budget(&[mu_design, log_sigma_design]);
133        let xmu = dense_block_or_operator(
134            mu_design,
135            mu_design.nrows(),
136            mu_design.ncols(),
137            planned[0],
138            &self.policy,
139        );
140        let x_ls = dense_block_or_operator(
141            log_sigma_design,
142            log_sigma_design.nrows(),
143            log_sigma_design.ncols(),
144            planned[1],
145            &self.policy,
146        );
147        Ok((xmu, x_ls))
148    }
149
150    pub(crate) fn exact_block_designs_fromspecs<'a>(
151        &self,
152        specs: &'a [ParameterBlockSpec],
153    ) -> Result<(DenseOrOperator<'a>, DenseOrOperator<'a>), String> {
154        if specs.len() != 2 {
155            return Err(GamlssError::DimensionMismatch {
156                reason: format!(
157                    "GaussianLocationScaleFamily spec-aware exact path expects 2 specs, got {}",
158                    specs.len()
159                ),
160            }
161            .into());
162        }
163        let mu_design = &specs[Self::BLOCK_MU].design;
164        let log_sigma_design = &specs[Self::BLOCK_LOG_SIGMA].design;
165        let planned = dense_blocks_planned_budget(&[mu_design, log_sigma_design]);
166        let xmu = dense_block_or_operator(
167            mu_design,
168            mu_design.nrows(),
169            mu_design.ncols(),
170            planned[0],
171            &self.policy,
172        );
173        let x_ls = dense_block_or_operator(
174            log_sigma_design,
175            log_sigma_design.nrows(),
176            log_sigma_design.ncols(),
177            planned[1],
178            &self.policy,
179        );
180        Ok((xmu, x_ls))
181    }
182
183    pub(crate) fn exact_joint_block_designs<'a>(
184        &'a self,
185        specs: Option<&'a [ParameterBlockSpec]>,
186    ) -> Result<Option<(DenseOrOperator<'a>, DenseOrOperator<'a>)>, String> {
187        // #1504: prefer the identifiability-CONSTRAINED block designs carried by
188        // `specs` whenever they are provided. The inner joint-Newton solve sizes
189        // its coefficient vector — and the consumer's dense-Hessian `total` — from
190        // these post-audit specs, which can be NARROWER than the family's stored
191        // pre-audit designs: a by-group smooth in BOTH the mean and log-σ blocks
192        // has aliased columns the identifiability audit drops. Reaching for the
193        // stored (unconstrained) designs here regardless of `specs` sized the joint
194        // Hessian to the wider pre-audit width and tripped the dense-Hessian shape
195        // check ("got 36x36, expected 32x32") on every such fit. When no audit
196        // reduction occurred the specs designs equal the stored ones, so ordinary
197        // (non-by-group) gaulss fits are byte-identical; the analytic location-scale
198        // Hessian is design-agnostic, so using the constrained designs loses no
199        // exactness. Fall back to the stored designs only when no specs are given.
200        if let Some(specs) = specs {
201            return self.exact_block_designs_fromspecs(specs).map(Some);
202        }
203        if self.exact_joint_supported() {
204            return self.exact_block_designs().map(Some);
205        }
206        Ok(None)
207    }
208
209    pub(crate) fn exact_joint_dense_block_designs<'a>(
210        &'a self,
211        specs: Option<&'a [ParameterBlockSpec]>,
212    ) -> Result<Option<(Cow<'a, Array2<f64>>, Cow<'a, Array2<f64>>)>, String> {
213        let Some((xmu, x_ls)) = self.exact_joint_block_designs(specs)? else {
214            return Ok(None);
215        };
216        let xmu = match xmu {
217            DenseOrOperator::Borrowed(dense) => Cow::Borrowed(dense),
218            DenseOrOperator::Owned(dense) => Cow::Owned(dense),
219            DenseOrOperator::Operator(_) => {
220                return Err(
221                    "GaussianLocationScaleFamily exact psi path requires chunked operator support for oversized designs"
222                        .to_string(),
223                );
224            }
225        };
226        let x_ls = match x_ls {
227            DenseOrOperator::Borrowed(dense) => Cow::Borrowed(dense),
228            DenseOrOperator::Owned(dense) => Cow::Owned(dense),
229            DenseOrOperator::Operator(_) => {
230                return Err(
231                    "GaussianLocationScaleFamily exact psi path requires chunked operator support for oversized designs"
232                        .to_string(),
233                );
234            }
235        };
236        Ok(Some((xmu, x_ls)))
237    }
238
239    pub(crate) fn exact_newton_joint_hessian_for_specs(
240        &self,
241        block_states: &[ParameterBlockState],
242        specs: Option<&[ParameterBlockSpec]>,
243    ) -> Result<Option<Array2<f64>>, String> {
244        let Some((xmu, x_ls)) = self.exact_joint_block_designs(specs)? else {
245            return Ok(None);
246        };
247        self.exact_newton_joint_hessian_from_designs(block_states, &xmu, &x_ls)
248    }
249
250    pub(crate) fn exact_newton_joint_hessian_directional_derivative_for_specs(
251        &self,
252        block_states: &[ParameterBlockState],
253        specs: Option<&[ParameterBlockSpec]>,
254        d_beta_flat: &Array1<f64>,
255    ) -> Result<Option<Array2<f64>>, String> {
256        let Some((xmu, x_ls)) = self.exact_joint_block_designs(specs)? else {
257            return Ok(None);
258        };
259        self.exact_newton_joint_hessian_directional_derivative_from_designs(
260            block_states,
261            &xmu,
262            &x_ls,
263            d_beta_flat,
264        )
265    }
266
267    pub(crate) fn exact_newton_joint_hessian_second_directional_derivative_for_specs(
268        &self,
269        block_states: &[ParameterBlockState],
270        specs: Option<&[ParameterBlockSpec]>,
271        d_beta_u_flat: &Array1<f64>,
272        d_betav_flat: &Array1<f64>,
273    ) -> Result<Option<Array2<f64>>, String> {
274        let Some((xmu, x_ls)) = self.exact_joint_block_designs(specs)? else {
275            return Ok(None);
276        };
277        self.exact_newton_joint_hessiansecond_directional_derivative_from_designs(
278            block_states,
279            &xmu,
280            &x_ls,
281            d_beta_u_flat,
282            d_betav_flat,
283        )
284    }
285
286    pub(crate) fn exact_newton_joint_psi_terms_for_specs(
287        &self,
288        block_states: &[ParameterBlockState],
289        specs: &[ParameterBlockSpec],
290        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
291        psi_index: usize,
292    ) -> Result<Option<crate::custom_family::ExactNewtonJointPsiTerms>, String> {
293        let Some((xmu, x_ls)) = self.exact_joint_dense_block_designs(Some(specs))? else {
294            return Ok(None);
295        };
296        self.exact_newton_joint_psi_terms_from_designs(
297            block_states,
298            specs,
299            derivative_blocks,
300            psi_index,
301            &xmu,
302            &x_ls,
303        )
304    }
305
306    pub(crate) fn exact_newton_joint_psisecond_order_terms_for_specs(
307        &self,
308        block_states: &[ParameterBlockState],
309        specs: &[ParameterBlockSpec],
310        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
311        psi_i: usize,
312        psi_j: usize,
313    ) -> Result<Option<crate::custom_family::ExactNewtonJointPsiSecondOrderTerms>, String> {
314        let Some((xmu, x_ls)) = self.exact_joint_dense_block_designs(Some(specs))? else {
315            return Ok(None);
316        };
317        self.exact_newton_joint_psisecond_order_terms_from_designs(
318            block_states,
319            derivative_blocks,
320            psi_i,
321            psi_j,
322            &xmu,
323            &x_ls,
324        )
325    }
326
327    pub(crate) fn exact_newton_joint_psihessian_directional_derivative_for_specs(
328        &self,
329        block_states: &[ParameterBlockState],
330        specs: &[ParameterBlockSpec],
331        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
332        psi_index: usize,
333        d_beta_flat: &Array1<f64>,
334    ) -> Result<Option<Array2<f64>>, String> {
335        let Some((xmu, x_ls)) = self.exact_joint_dense_block_designs(Some(specs))? else {
336            return Ok(None);
337        };
338        self.exact_newton_joint_psihessian_directional_derivative_from_designs(
339            block_states,
340            derivative_blocks,
341            psi_index,
342            d_beta_flat,
343            &xmu,
344            &x_ls,
345        )
346    }
347
348    pub(crate) fn exact_newton_joint_hessian_from_designs(
349        &self,
350        block_states: &[ParameterBlockState],
351        xmu: &DenseOrOperator<'_>,
352        x_ls: &DenseOrOperator<'_>,
353    ) -> Result<Option<Array2<f64>>, String> {
354        validate_block_count::<GamlssError>("GaussianLocationScaleFamily", 2, block_states.len())?;
355        let n = self.y.len();
356        let etamu = &block_states[Self::BLOCK_MU].eta;
357        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
358        if etamu.len() != n || eta_ls.len() != n || self.weights.len() != n {
359            return Err(GamlssError::DimensionMismatch {
360                reason: "GaussianLocationScaleFamily input size mismatch".to_string(),
361            }
362            .into());
363        }
364
365        let rows = self.get_or_compute_row_scalars(etamu, eta_ls)?;
366        // Block-diagonal Gaussian Fisher curvature (μ ⊥ σ ⇒ cross = 0, #684;
367        // (ls,ls) = 2κ²a, #566), built from the shared single-source-of-truth
368        // constructor so this dense path and the matrix-free workspace can never
369        // disagree on the cross block. See `gaussian_locscale_fisher_joint_row_coeffs`.
370        let (mm, cross, scale) = gaussian_locscale_fisher_joint_row_coeffs(&rows);
371        Ok(Some(gaussian_joint_hessian_from_designs(
372            xmu, x_ls, &mm, &cross, &scale,
373        )?))
374    }
375
376    pub(crate) fn exact_newton_joint_hessian_directional_derivative_from_designs(
377        &self,
378        block_states: &[ParameterBlockState],
379        xmu: &DenseOrOperator<'_>,
380        x_ls: &DenseOrOperator<'_>,
381        d_beta_flat: &Array1<f64>,
382    ) -> Result<Option<Array2<f64>>, String> {
383        validate_block_count::<GamlssError>("GaussianLocationScaleFamily", 2, block_states.len())?;
384        let n = self.y.len();
385        let etamu = &block_states[Self::BLOCK_MU].eta;
386        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
387        if etamu.len() != n || eta_ls.len() != n || self.weights.len() != n {
388            return Err(GamlssError::DimensionMismatch {
389                reason: "GaussianLocationScaleFamily input size mismatch".to_string(),
390            }
391            .into());
392        }
393
394        let pmu = xmu.ncols();
395        let p_ls = x_ls.ncols();
396        let total = pmu + p_ls;
397        if d_beta_flat.len() != total {
398            return Err(GamlssError::DimensionMismatch {
399                reason: format!(
400                    "GaussianLocationScaleFamily joint d_beta length mismatch: got {}, expected {}",
401                    d_beta_flat.len(),
402                    total
403                ),
404            }
405            .into());
406        }
407        let ximu = xmu.dot(d_beta_flat.slice(s![0..pmu]));
408        let xi_ls = x_ls.dot(d_beta_flat.slice(s![pmu..pmu + p_ls]));
409        let rows = self.get_or_compute_row_scalars(etamu, eta_ls)?;
410        let directional = gaussian_joint_first_directionalweights(&rows, &ximu, &xi_ls);
411        let dhmumu = directional.0;
412        let dh_ls_ls = directional.2;
413        // Fisher cross block E[H_{μ,ls}] ≡ 0 (μ ⊥ σ; see
414        // exact_newton_joint_hessian_from_designs / #684), so its directional
415        // derivative is identically 0 — keep the Hessian's curvature object the
416        // block-diagonal Gaussian Fisher information at every order. The
417        // observed-cross directional weight (`directional.1`) is therefore not
418        // assembled.
419        let dhmu_ls = Array1::<f64>::zeros(dhmumu.len());
420
421        Ok(Some(gaussian_joint_hessian_from_designs(
422            xmu, x_ls, &dhmumu, &dhmu_ls, &dh_ls_ls,
423        )?))
424    }
425
426    pub(crate) fn exact_newton_joint_hessiansecond_directional_derivative_from_designs(
427        &self,
428        block_states: &[ParameterBlockState],
429        xmu: &DenseOrOperator<'_>,
430        x_ls: &DenseOrOperator<'_>,
431        d_beta_u_flat: &Array1<f64>,
432        d_betav_flat: &Array1<f64>,
433    ) -> Result<Option<Array2<f64>>, String> {
434        validate_block_count::<GamlssError>("GaussianLocationScaleFamily", 2, block_states.len())?;
435        let n = self.y.len();
436        let etamu = &block_states[Self::BLOCK_MU].eta;
437        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
438        if etamu.len() != n || eta_ls.len() != n || self.weights.len() != n {
439            return Err(GamlssError::DimensionMismatch {
440                reason: "GaussianLocationScaleFamily input size mismatch".to_string(),
441            }
442            .into());
443        }
444
445        let pmu = xmu.ncols();
446        let p_ls = x_ls.ncols();
447        let total = pmu + p_ls;
448        if d_beta_u_flat.len() != total || d_betav_flat.len() != total {
449            return Err(GamlssError::DimensionMismatch { reason: format!(
450                "GaussianLocationScaleFamily joint second directional derivative length mismatch: got {} and {}, expected {}",
451                d_beta_u_flat.len(),
452                d_betav_flat.len(),
453                total
454            ) }.into());
455        }
456        let ximu_u = xmu.dot(d_beta_u_flat.slice(s![0..pmu]));
457        let xi_ls_u = x_ls.dot(d_beta_u_flat.slice(s![pmu..pmu + p_ls]));
458        let ximuv = xmu.dot(d_betav_flat.slice(s![0..pmu]));
459        let xi_lsv = x_ls.dot(d_betav_flat.slice(s![pmu..pmu + p_ls]));
460        let rows = self.get_or_compute_row_scalars(etamu, eta_ls)?;
461        let second =
462            gaussian_jointsecond_directionalweights(&rows, &ximu_u, &xi_ls_u, &ximuv, &xi_lsv);
463        let d2hmumu = second.0;
464        let d2h_ls_ls = second.2;
465        // Fisher cross block E[H_{μ,ls}] ≡ 0 (μ ⊥ σ; #684), so its second
466        // directional derivative is identically 0; `second.1` (observed) is not
467        // assembled, keeping the curvature object block-diagonal Fisher.
468        let d2hmu_ls = Array1::<f64>::zeros(d2hmumu.len());
469
470        Ok(Some(gaussian_joint_hessian_from_designs(
471            xmu, x_ls, &d2hmumu, &d2hmu_ls, &d2h_ls_ls,
472        )?))
473    }
474
475    pub(crate) fn exact_newton_joint_psi_direction(
476        &self,
477        block_states: &[ParameterBlockState],
478        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
479        psi_index: usize,
480        xmu: &Array2<f64>,
481        x_ls: &Array2<f64>,
482        policy: &gam_runtime::resource::ResourcePolicy,
483    ) -> Result<Option<LocationScaleJointPsiDirection>, String> {
484        let Some(parts) = locscale_joint_psi_direction_parts(
485            block_states,
486            derivative_blocks,
487            psi_index,
488            self.y.len(),
489            xmu.ncols(),
490            x_ls.ncols(),
491            Self::BLOCK_MU,
492            Self::BLOCK_LOG_SIGMA,
493            2,
494            "GaussianLocationScaleFamily",
495            "mu",
496            policy,
497        )?
498        else {
499            return Ok(None);
500        };
501        Ok(Some(LocationScaleJointPsiDirection {
502            block_idx: parts.block_idx,
503            local_idx: parts.local_idx,
504            z_primary_psi: parts.primary_z,
505            z_ls_psi: parts.log_sigma_z,
506            x_primary_psi: parts.primary_psi,
507            x_ls_psi: parts.log_sigma_psi,
508        }))
509    }
510
511    pub(crate) fn exact_newton_joint_psisecond_design_drifts(
512        &self,
513        block_states: &[ParameterBlockState],
514        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
515        psi_a: &LocationScaleJointPsiDirection,
516        psi_b: &LocationScaleJointPsiDirection,
517        xmu: &Array2<f64>,
518        x_ls: &Array2<f64>,
519    ) -> Result<LocationScaleJointPsiSecondDrifts, String> {
520        locscale_joint_psisecond_design_drifts(
521            block_states,
522            derivative_blocks,
523            psi_a,
524            psi_b,
525            LocScalePsiDriftConfig {
526                n: self.y.len(),
527                p_primary: xmu.ncols(),
528                p_log_sigma: x_ls.ncols(),
529                primary_block_idx: Self::BLOCK_MU,
530                log_sigma_block_idx: Self::BLOCK_LOG_SIGMA,
531                family_name: "GaussianLocationScaleFamily",
532                primary_label: "mu",
533                policy: &self.policy,
534            },
535        )
536    }
537
538    pub(crate) fn exact_newton_joint_psi_terms_from_designs(
539        &self,
540        block_states: &[ParameterBlockState],
541        specs: &[ParameterBlockSpec],
542        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
543        psi_index: usize,
544        xmu: &Array2<f64>,
545        x_ls: &Array2<f64>,
546    ) -> Result<Option<crate::custom_family::ExactNewtonJointPsiTerms>, String> {
547        validate_block_count::<GamlssError>("GaussianLocationScaleFamily", 2, block_states.len())?;
548        if specs.len() != 2 || derivative_blocks.len() != 2 {
549            return Err(GamlssError::DimensionMismatch { reason: format!(
550                "GaussianLocationScaleFamily joint psi terms expect 2 specs and 2 derivative blocks, got {} and {}",
551                specs.len(),
552                derivative_blocks.len()
553            ) }.into());
554        }
555        let Some(dir_a) = self.exact_newton_joint_psi_direction(
556            block_states,
557            derivative_blocks,
558            psi_index,
559            xmu,
560            x_ls,
561            &self.policy,
562        )?
563        else {
564            return Ok(None);
565        };
566        // Gaussian 2-block location-scale family in the unified flattened
567        // coefficient space beta = [betamu; beta_sigma]:
568        //
569        //   mu_i = z_i^T betamu,
570        //   ell_i = x_i^T beta_sigma,
571        //   s_i = exp(ell_i),
572        //   r_i = y_i - mu_i,
573        //   q_i = r_i / s_i,
574        //   w_i = s_i^{-2},
575        //   alpha_i = r_i s_i^{-2},
576        //   b_i = q_i^2.
577        //
578        // The first fixed-beta psi object returned here is likelihood-only:
579        //
580        //   D_a         = -alpha^T m_a + (1 - b)^T ell_a
581        //   D_{beta a}  = [ -Xmu^T alpha_a - X_{mu,a}^T alpha ;
582        //                   -X_sigma^T b_a + X_{sigma,a}^T (1-b) ]
583        //   D_{bb a}    = [ Xmu^T W_a Xmu + X_{mu,a}^T W Xmu + Xmu^T W X_{mu,a},
584        //                   2( Xmu^T A_a X_sigma + X_{mu,a}^T A X_sigma + Xmu^T A X_{sigma,a} );
585        //                   sym,
586        //                   2( X_sigma^T B_a X_sigma + X_{sigma,a}^T B X_sigma + X_sigma^T B X_{sigma,a} ) ]
587        //
588        // with m_a = X_{mu,a} betamu, ell_a = X_{sigma,a} beta_sigma and
589        // rowwise scalar drifts
590        //
591        //   w_a     = -2 w * ell_a
592        //   alpha_a = -w * m_a - 2 alpha * ell_a
593        //   b_a     = -2 alpha * m_a - 2 b * ell_a.
594        //
595        // Generic code in custom_family.rs promotes these likelihood-only
596        // objects to the full fixed-beta V_a / g_a / H_a by adding S_a.
597        let etamu = &block_states[Self::BLOCK_MU].eta;
598        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
599        let rows = self.get_or_compute_row_scalars(etamu, eta_ls)?;
600        let weights_a =
601            gaussian_joint_psi_firstweights(&rows, &dir_a.z_primary_psi, &dir_a.z_ls_psi);
602        let objective_psi = weights_a.objective_psirow.sum();
603        let xmu_map = dir_a.x_primary_psi.as_linear_map_ref();
604        let x_ls_map = dir_a.x_ls_psi.as_linear_map_ref();
605        let score_mu =
606            xmu_map.transpose_mul(weights_a.scoremu.view()) + fast_atv(xmu, &weights_a.dscoremu);
607        let score_ls = x_ls_map.transpose_mul(weights_a.score_ls.view())
608            + fast_atv(x_ls, &weights_a.dscore_ls);
609        let score_psi = gaussian_pack_joint_score(&score_mu, &score_ls);
610        let hessian_psi_operator = build_two_block_custom_family_joint_psi_operator_from_actions(
611            dir_a.x_primary_psi.cloned_first_action(),
612            dir_a.x_ls_psi.cloned_first_action(),
613            0..xmu.ncols(),
614            xmu.ncols()..xmu.ncols() + x_ls.ncols(),
615            xmu,
616            x_ls,
617            &weights_a.hmumu,
618            &weights_a.hmu_ls,
619            &weights_a.h_ls_ls,
620            &weights_a.dhmumu,
621            &weights_a.dhmu_ls,
622            &weights_a.dh_ls_ls,
623        )?;
624        let hessian_psi = if hessian_psi_operator.is_some() {
625            Array2::zeros((0, 0))
626        } else {
627            gaussian_joint_psihessian_fromweights(xmu, x_ls, xmu_map, x_ls_map, &weights_a)?
628        };
629
630        Ok(Some(crate::custom_family::ExactNewtonJointPsiTerms {
631            objective_psi,
632            score_psi,
633            hessian_psi,
634            hessian_psi_operator,
635        }))
636    }
637
638    pub(crate) fn exact_newton_joint_psisecond_order_terms_from_designs(
639        &self,
640        block_states: &[ParameterBlockState],
641        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
642        psi_i: usize,
643        psi_j: usize,
644        xmu: &Array2<f64>,
645        x_ls: &Array2<f64>,
646    ) -> Result<Option<crate::custom_family::ExactNewtonJointPsiSecondOrderTerms>, String> {
647        let Some(dir_i) = self.exact_newton_joint_psi_direction(
648            block_states,
649            derivative_blocks,
650            psi_i,
651            xmu,
652            x_ls,
653            &self.policy,
654        )?
655        else {
656            return Ok(None);
657        };
658        let Some(dir_j) = self.exact_newton_joint_psi_direction(
659            block_states,
660            derivative_blocks,
661            psi_j,
662            xmu,
663            x_ls,
664            &self.policy,
665        )?
666        else {
667            return Ok(None);
668        };
669        Ok(Some(
670            self.exact_newton_joint_psisecond_order_terms_from_parts(
671                block_states,
672                derivative_blocks,
673                &dir_i,
674                &dir_j,
675                xmu,
676                x_ls,
677                None,
678            )?,
679        ))
680    }
681
682    pub(crate) fn exact_newton_joint_psisecond_order_terms_from_parts(
683        &self,
684        block_states: &[ParameterBlockState],
685        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
686        dir_i: &LocationScaleJointPsiDirection,
687        dir_j: &LocationScaleJointPsiDirection,
688        xmu: &Array2<f64>,
689        x_ls: &Array2<f64>,
690        subsample: Option<&[crate::outer_subsample::WeightedOuterRow]>,
691    ) -> Result<crate::custom_family::ExactNewtonJointPsiSecondOrderTerms, String> {
692        let second_drifts = self.exact_newton_joint_psisecond_design_drifts(
693            block_states,
694            derivative_blocks,
695            dir_i,
696            dir_j,
697            xmu,
698            x_ls,
699        )?;
700        let n = self.y.len();
701        let xmu_i_map = dir_i.x_primary_psi.as_linear_map_ref();
702        let x_ls_i_map = dir_i.x_ls_psi.as_linear_map_ref();
703        let xmu_j_map = dir_j.x_primary_psi.as_linear_map_ref();
704        let x_ls_j_map = dir_j.x_ls_psi.as_linear_map_ref();
705        let xmu_ab_map = second_psi_linear_map(
706            second_drifts.x_primary_ab_action.as_ref(),
707            second_drifts.x_primary_ab.as_ref(),
708            n,
709            xmu.ncols(),
710        );
711        let x_ls_ab_map = second_psi_linear_map(
712            second_drifts.x_ls_ab_action.as_ref(),
713            second_drifts.x_ls_ab.as_ref(),
714            n,
715            x_ls.ncols(),
716        );
717        // Second fixed-beta psi objects for the same Gaussian location-scale
718        // kernel. Using the notation from the first-order comment, the rowwise
719        // second psi drifts are
720        //
721        //   w_ab     = 4 w * ell_a * ell_b - 2 w * ell_ab
722        //   alpha_ab = 2 w * (m_a * ell_b + m_b * ell_a)
723        //              + 4 alpha * ell_a * ell_b
724        //              - w * m_ab
725        //              - 2 alpha * ell_ab
726        //   b_ab     = 2 w * m_a * m_b
727        //              + 4 alpha * (m_a * ell_b + m_b * ell_a)
728        //              + 4 b * ell_a * ell_b
729        //              - 2 alpha * m_ab
730        //              - 2 b * ell_ab.
731        //
732        // The exact likelihood-only second-order objects are then:
733        //
734        //   D_ab,
735        //   D_{beta ab},
736        //   D_{beta beta ab},
737        //
738        // assembled from the usual product-rule expansion over realized
739        // design motion X_{.,a}, X_{.,b}, X_{.,ab}. Generic code adds S_ab.
740        let etamu = &block_states[Self::BLOCK_MU].eta;
741        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
742        let rows = self.get_or_compute_row_scalars(etamu, eta_ls)?;
743        let mut weights_i =
744            gaussian_joint_psi_firstweights(&rows, &dir_i.z_primary_psi, &dir_i.z_ls_psi);
745        let mut weights_j =
746            gaussian_joint_psi_firstweights(&rows, &dir_j.z_primary_psi, &dir_j.z_ls_psi);
747        let mut secondweights = gaussian_joint_psisecondweights(
748            &rows,
749            &dir_i.z_primary_psi,
750            &dir_i.z_ls_psi,
751            &dir_j.z_primary_psi,
752            &dir_j.z_ls_psi,
753            &second_drifts.z_primary_ab,
754            &second_drifts.z_ls_ab,
755        );
756        if let Some(sub_rows) = subsample {
757            // HT mask: every downstream consumer (gaussian_joint_psisecondhessian_fromweights,
758            // weighted_crossprod_psi_maps with weights_*.{hmumu,hmu_ls,h_ls_ls},
759            // fast_atv on d2score_* and dscore_*) is row-linear in these arrays, so
760            // scaling sampled rows by 1/π_i and zeroing the rest yields an unbiased
761            // estimator of the full-data second-order ψ Hessian and ψ score.
762            apply_ht_mask_first(&mut weights_i, sub_rows);
763            apply_ht_mask_first(&mut weights_j, sub_rows);
764            apply_ht_mask_second(&mut secondweights, sub_rows);
765        }
766        let objective_psi_psi = secondweights.objective_psi_psirow.sum();
767
768        let score_psi_psi = gaussian_pack_joint_score(
769            &(xmu_ab_map.transpose_mul(weights_i.scoremu.view())
770                + xmu_i_map.transpose_mul(weights_j.dscoremu.view())
771                + xmu_j_map.transpose_mul(weights_i.dscoremu.view())
772                + fast_atv(xmu, &secondweights.d2scoremu)),
773            &(x_ls_ab_map.transpose_mul(weights_i.score_ls.view())
774                + x_ls_i_map.transpose_mul(weights_j.dscore_ls.view())
775                + x_ls_j_map.transpose_mul(weights_i.dscore_ls.view())
776                + fast_atv(x_ls, &secondweights.d2score_ls)),
777        );
778        let hessian_psi_psi = gaussian_joint_psisecondhessian_fromweights(
779            xmu,
780            x_ls,
781            xmu_i_map,
782            x_ls_i_map,
783            xmu_j_map,
784            x_ls_j_map,
785            xmu_ab_map,
786            x_ls_ab_map,
787            &weights_i,
788            &weights_j,
789            &secondweights,
790        )?;
791
792        Ok(crate::custom_family::ExactNewtonJointPsiSecondOrderTerms {
793            objective_psi_psi,
794            score_psi_psi,
795            hessian_psi_psi,
796            hessian_psi_psi_operator: None,
797        })
798    }
799
800    pub(crate) fn exact_newton_joint_psihessian_directional_derivative_from_designs(
801        &self,
802        block_states: &[ParameterBlockState],
803        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
804        psi_index: usize,
805        d_beta_flat: &Array1<f64>,
806        xmu: &Array2<f64>,
807        x_ls: &Array2<f64>,
808    ) -> Result<Option<Array2<f64>>, String> {
809        let Some(dir_a) = self.exact_newton_joint_psi_direction(
810            block_states,
811            derivative_blocks,
812            psi_index,
813            xmu,
814            x_ls,
815            &self.policy,
816        )?
817        else {
818            return Ok(None);
819        };
820        Ok(Some(
821            self.exact_newton_joint_psihessian_directional_derivative_from_parts(
822                block_states,
823                &dir_a,
824                d_beta_flat,
825                xmu,
826                x_ls,
827                None,
828            )?,
829        ))
830    }
831
832    pub(crate) fn exact_newton_joint_psihessian_directional_derivative_from_parts(
833        &self,
834        block_states: &[ParameterBlockState],
835        dir_a: &LocationScaleJointPsiDirection,
836        d_beta_flat: &Array1<f64>,
837        xmu: &Array2<f64>,
838        x_ls: &Array2<f64>,
839        subsample: Option<&[crate::outer_subsample::WeightedOuterRow]>,
840    ) -> Result<Array2<f64>, String> {
841        let etamu = &block_states[Self::BLOCK_MU].eta;
842        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
843        let pmu = xmu.ncols();
844        let p_ls = x_ls.ncols();
845        let xmu_map = dir_a.x_primary_psi.as_linear_map_ref();
846        let x_ls_map = dir_a.x_ls_psi.as_linear_map_ref();
847        let total = pmu + p_ls;
848        if d_beta_flat.len() != total {
849            return Err(GamlssError::DimensionMismatch { reason: format!(
850                "GaussianLocationScaleFamily joint psi hessian directional derivative length mismatch: got {}, expected {}",
851                d_beta_flat.len(),
852                total
853            ) }.into());
854        }
855        // Only the log-σ–channel direction enters the surviving Fisher blocks
856        // of the mixed drift (the μ-channel direction fed the observed cross
857        // block, now Fisher 0; μ ⊥ σ, #684).
858        let u_ls = d_beta_flat.slice(s![pmu..pmu + p_ls]);
859        let xi_ls = fast_av(x_ls, &u_ls);
860        let uza_ls = x_ls_map.forward_mul(u_ls);
861        // Mixed drift T_a[u] = D_beta H_a^{(D)}[u] for the Gaussian family.
862        //
863        // Along u = [umu; u_sigma], define xi = Xmu umu and zeta = X_sigma u_sigma.
864        // The first beta-directional drifts of the Gaussian row scalars are
865        //
866        //   d_u w     = -2 w * zeta
867        //   d_u alpha = -w * xi - 2 alpha * zeta
868        //   d_u b     = -2 alpha * xi - 2 b * zeta.
869        //
870        // Differentiating the psi-a scalar drifts once more gives
871        //
872        //   d_u w_a     = 4 w * ell_a * zeta - 2 w * zeta_a
873        //   d_u alpha_a = 2 w * (m_a * zeta + ell_a * xi)
874        //                 - w * xi_a
875        //                 + 4 alpha * ell_a * zeta
876        //                 - 2 alpha * zeta_a
877        //   d_u b_a     = 2 w * m_a * xi
878        //                 + 4 alpha * (m_a * zeta + ell_a * xi)
879        //                 + 4 b * ell_a * zeta
880        //                 - 2 alpha * xi_a
881        //                 - 2 b * zeta_a.
882        //
883        // The matrix drift returned here is the exact likelihood-only
884        //
885        //   T_a[u] = D_beta H_{psi_a}^{(D)}[u],
886        //
887        // assembled blockwise as
888        //
889        //   Kmumu,a[u]   = Xmu^T W_a[u] Xmu
890        //                   + X_{mu,a}^T W[u] Xmu
891        //                   + Xmu^T W[u] X_{mu,a}
892        //   Kmusigma,a[u]= 2( Xmu^T A_a[u] X_sigma
893        //                   + X_{mu,a}^T A[u] X_sigma
894        //                   + Xmu^T A[u] X_{sigma,a} )
895        //   K_sigmasigma,a[u]
896        //                   = 2( X_sigma^T B_a[u] X_sigma
897        //                   + X_{sigma,a}^T B[u] X_sigma
898        //                   + X_sigma^T B[u] X_{sigma,a} ).
899        //
900        // Generic code then combines this with S(theta)-motion and the profile
901        // mode responses to form ddot H_{ij}.
902        let rows = self.get_or_compute_row_scalars(etamu, eta_ls)?;
903        let mut mixedweights =
904            gaussian_joint_psi_mixed_driftweights(&rows, &xi_ls, &dir_a.z_ls_psi, &uza_ls);
905        if let Some(sub_rows) = subsample {
906            // HT mask: `gaussian_joint_psi_mixedhessian_drift_fromweights` is
907            // row-linear in every `mixedweights.*` array via `xt_diag_*_dense`
908            // and `weighted_crossprod_psi_maps`, so the masked Hessian-drift
909            // remains an unbiased estimator of the full-data drift.
910            apply_ht_mask_mixed(&mut mixedweights, sub_rows);
911        }
912
913        gaussian_joint_psi_mixedhessian_drift_fromweights(
914            xmu,
915            x_ls,
916            xmu_map,
917            x_ls_map,
918            &mixedweights,
919        )
920    }
921
922    /// Build the [`BlockEffectiveJacobian`] for block `block_idx` given the
923    /// realised block specs.  Returns an [`AdditiveBlockJacobian`] encoding the
924    /// linear map η_r[i] = X_r[i,:] · β_r:
925    ///
926    /// - block 0 (mu):       output 0 = design rows, output 1 = zeros
927    /// - block 1 (log_sigma): output 0 = zeros, output 1 = design rows
928    pub fn block_effective_jacobian(
929        specs: &[ParameterBlockSpec],
930        block_idx: usize,
931    ) -> Result<Box<dyn BlockEffectiveJacobian>, String> {
932        crate::block_layout::block_jacobian::AdditiveWiggleBlockLayout {
933            family: "GaussianLocationScaleFamily",
934            n_outputs: 2,
935            additive_blocks: &[Self::BLOCK_MU, Self::BLOCK_LOG_SIGMA],
936            wiggle_block: None,
937        }
938        .block_effective_jacobian(specs, block_idx)
939    }
940}
941
942/// Per-subject 2×2 channel Hessian `W_i` for Gaussian location-scale.
943///
944/// The row negative log-likelihood (with per-row weight `w_i`, response `y_i`,
945/// mean predictor `μ_i`, log-scale predictor `s_i = log σ_i`) is
946///
947/// ```text
948/// ρ_i(μ, s) = w_i [s + 0.5·(y_i − μ)²·exp(−2s)]
949/// ```
950///
951/// The 2×2 Hessian in `(μ, s)` coordinates:
952///
953/// ```text
954/// W_i[0,0] = w_i · exp(−2 s_i)                        ∂²ρ/∂μ²
955/// W_i[1,1] = w_i · 2·(y_i − μ_i)²·exp(−2 s_i)        ∂²ρ/∂s²
956/// W_i[0,1] = W_i[1,0] = w_i · 2·(y_i − μ_i)·exp(−2 s_i)  ∂²ρ/∂μ∂s
957/// ```
958///
959/// The off-diagonal cross-channel term `∂²ρ/∂μ∂s` is nonzero whenever the
960/// residual `(y_i − μ_i) ≠ 0`, i.e. away from the fitted mean.
961pub struct GaussianLocationScaleChannelHessian {
962    /// Row-major `(n × 2 × 2)` PSD-clamped per-subject Hessian.
963    pub(crate) h: ndarray::Array3<f64>,
964}
965
966impl GaussianLocationScaleChannelHessian {
967    /// Construct the raw (un-PSD-clamped) per-subject observed Hessian.
968    ///
969    /// For Gaussian location-scale the 2×2 observed Hessian
970    /// `[[w·e^{-2s}, 2·w·r·e^{-2s}], [2·w·r·e^{-2s}, 2·w·r²·e^{-2s}]]`
971    /// has determinant `-2·w²·r²·e^{-4s}` which is non-positive whenever
972    /// the residual `r = y − μ ≠ 0`. Tests that finite-difference the row
973    /// NLL must compare against this raw observed Hessian — PSD clamping
974    /// alters the eigenvalues and the FD-versus-closed-form match fails.
975    ///
976    /// Production code that needs a PSD matrix (e.g. the canonicalize gate)
977    /// must call [`Self::from_pilot`] which PSD-clamps via 2×2
978    /// eigendecomposition.
979    pub fn from_pilot_observed_unclamped(
980        y: &ndarray::Array1<f64>,
981        w: &ndarray::Array1<f64>,
982        eta_mu: &ndarray::Array1<f64>,
983        eta_log_sigma: &ndarray::Array1<f64>,
984    ) -> Result<Self, String> {
985        let n = y.len();
986        if w.len() != n || eta_mu.len() != n || eta_log_sigma.len() != n {
987            return Err(format!(
988                "GaussianLocationScaleChannelHessian::from_pilot_observed_unclamped: \
989                 length mismatch y={n} w={} eta_mu={} eta_log_sigma={}",
990                w.len(),
991                eta_mu.len(),
992                eta_log_sigma.len(),
993            ));
994        }
995        let mut h = ndarray::Array3::<f64>::zeros((n, 2, 2));
996        for i in 0..n {
997            let wi = w[i];
998            let mu_i = eta_mu[i];
999            let s_i = eta_log_sigma[i];
1000            let inv_sigma2 = (-2.0 * s_i).exp();
1001            let resid = y[i] - mu_i;
1002            h[[i, 0, 0]] = wi * inv_sigma2;
1003            h[[i, 1, 1]] = wi * 2.0 * resid * resid * inv_sigma2;
1004            h[[i, 0, 1]] = wi * 2.0 * resid * inv_sigma2;
1005            h[[i, 1, 0]] = h[[i, 0, 1]];
1006        }
1007        Ok(Self { h })
1008    }
1009
1010    /// Construct from pilot predictors (μ and log σ at current β) and data,
1011    /// with PSD eigenvalue clamping applied per subject.
1012    ///
1013    /// `y` is the response, `w` the per-row sample weights, `eta_mu` and
1014    /// `eta_log_sigma` the current linear predictors. Negative eigenvalues
1015    /// are projected to zero (PSD clamp) before storage so the resulting
1016    /// matrix is a valid metric for the W-Gram identifiability compile.
1017    pub fn from_pilot(
1018        y: &ndarray::Array1<f64>,
1019        w: &ndarray::Array1<f64>,
1020        eta_mu: &ndarray::Array1<f64>,
1021        eta_log_sigma: &ndarray::Array1<f64>,
1022    ) -> Result<Self, String> {
1023        let n = y.len();
1024        if w.len() != n || eta_mu.len() != n || eta_log_sigma.len() != n {
1025            return Err(format!(
1026                "GaussianLocationScaleChannelHessian::from_pilot: \
1027                 length mismatch y={n} w={} eta_mu={} eta_log_sigma={}",
1028                w.len(),
1029                eta_mu.len(),
1030                eta_log_sigma.len(),
1031            ));
1032        }
1033        let mut h = ndarray::Array3::<f64>::zeros((n, 2, 2));
1034        for i in 0..n {
1035            let wi = w[i];
1036            let mu_i = eta_mu[i];
1037            let s_i = eta_log_sigma[i];
1038            let inv_sigma2 = (-2.0 * s_i).exp(); // exp(-2s) = 1/sigma^2
1039            let resid = y[i] - mu_i;
1040            // Hessian of w_i * ρ_i
1041            let h00 = wi * inv_sigma2;
1042            let h11 = wi * 2.0 * resid * resid * inv_sigma2;
1043            let h01 = wi * 2.0 * resid * inv_sigma2;
1044            // PSD clamp via eigendecomposition of 2×2 matrix.
1045            // psd_clamp_2x2 returns (λ1, λ2, u1[0], u1[1], u2[0], u2[1])
1046            // where u1 and u2 are unit eigenvectors for λ1 and λ2.
1047            // Reconstruction: H_psd = λ1·u1·u1ᵀ + λ2·u2·u2ᵀ
1048            let (e0, e1, u1_0, u1_1, u2_0, u2_1) = psd_clamp_2x2(h00, h01, h11);
1049            h[[i, 0, 0]] = e0 * u1_0 * u1_0 + e1 * u2_0 * u2_0;
1050            h[[i, 0, 1]] = e0 * u1_0 * u1_1 + e1 * u2_0 * u2_1;
1051            h[[i, 1, 0]] = h[[i, 0, 1]];
1052            h[[i, 1, 1]] = e0 * u1_1 * u1_1 + e1 * u2_1 * u2_1;
1053        }
1054        Ok(Self { h })
1055    }
1056}
1057
1058impl FamilyChannelHessian for GaussianLocationScaleChannelHessian {
1059    fn n_outputs(&self) -> usize {
1060        2
1061    }
1062
1063    fn n_subjects(&self) -> usize {
1064        self.h.shape()[0]
1065    }
1066
1067    fn fill_subject(&self, i: usize, out: &mut [f64]) {
1068        assert_eq!(out.len(), 4);
1069        out[0] = self.h[[i, 0, 0]];
1070        out[1] = self.h[[i, 0, 1]];
1071        out[2] = self.h[[i, 1, 0]];
1072        out[3] = self.h[[i, 1, 1]];
1073    }
1074
1075    fn evaluate_full(&self) -> ndarray::Array3<f64> {
1076        self.h.clone()
1077    }
1078}
1079
1080impl CustomFamily for GaussianLocationScaleFamily {
1081    /// The Gaussian location-scale joint curvature is the EXACT FISHER
1082    /// (expected) information, which is block-diagonal: μ ⊥ log σ so the
1083    /// cross block E[H_{μ,log σ}] ≡ 0, the (μ,μ) block weight is a/σ², and the
1084    /// (log σ,log σ) block weight is 2κ²a (κ = dlog σ/dη); see
1085    /// `gaussian_locscale_fisher_joint_row_coeffs` and #684/#566. (The earlier
1086    /// observed Hessian — with residual-dependent row scalars m = r·w, n = r²·w
1087    /// in the cross and (log σ,log σ) blocks — was replaced by this Fisher
1088    /// information; the row scalars still carry m/n but the curvature
1089    /// constructor discards them.) Both Fisher weights depend on β through the
1090    /// scale predictor (σ = σ(η_{log σ}), κ = κ(η_{log σ})), so the curvature
1091    /// moves when β_{log σ} moves — hence this override is `true`. It does NOT
1092    /// depend on β_μ. The β-dependence is essential for correct M_j[u] drift
1093    /// corrections when ψ hyperparameters move the design matrices.
1094    fn exact_newton_joint_hessian_beta_dependent(&self) -> bool {
1095        true
1096    }
1097
1098    /// Gaussian location-scale carries a NON-profiled second (log-σ) linear
1099    /// predictor, so — unlike an ordinary Gaussian GAM whose scalar dispersion is
1100    /// profiled out analytically — its smoothing-parameter selection exhibits the
1101    /// same capped-screening over-smoothing bias as a GLM block: the capped
1102    /// inner-iteration screening proxy ranks an over-smoothed scale seed cheapest
1103    /// (its coefficients collapse into the penalty null space and the proxy looks
1104    /// converged), so the log-σ smooth is flattened toward a constant σ, the
1105    /// 1/σ² IRLS weights go wrong, and the weight-coupled mean degrades too.
1106    ///
1107    /// The default trait config classifies this as the generic
1108    /// `GeneralizedLinear` profile (seed_budget=1, capped screening, a seed grid
1109    /// reaching only ρ≈−2, and the *parsimonious* — smoothing-biased — keep-best),
1110    /// every part of which pushes the scale toward over-smoothing. The spatial
1111    /// (Matérn/GP) location-scale path already classifies the family as
1112    /// `GaussianLocationScale`; this override extends that same correct
1113    /// classification to the NON-spatial (thin-plate / P-spline) rho-only path,
1114    /// which is the one a `s(x, bs='tp')` location-scale fit actually takes. The
1115    /// `GaussianLocationScale` profile reuses Gaussian's flexible seed grid (which
1116    /// reaches the low-λ scale basin) and Gaussian's lowest-cost keep-best (no
1117    /// smoothing-biased tie-break), while still taking the interior-extreme seed
1118    /// promotion so the flexible basin is actually full-solved. The budget mirrors
1119    /// the spatial `exact_joint_seed_config(Gaussian)` (max_seeds=4, seed_budget=2).
1120    fn outer_seed_config(&self, n_params: usize) -> crate::seeding::SeedConfig {
1121        if n_params == 0 {
1122            return crate::seeding::SeedConfig::default();
1123        }
1124        let mut config = crate::seeding::SeedConfig::default();
1125        config.risk_profile = crate::seeding::SeedRiskProfile::GaussianLocationScale;
1126        config.max_seeds = 4;
1127        config.seed_budget = 2;
1128        config
1129    }
1130
1131    /// Two independent linear predictors: block 0 → μ channel, block 1 → log σ
1132    /// channel. Declaring the channel topology lets `fit_custom_family` route
1133    /// the identifiability audit channel-aware even when a caller builds the
1134    /// blocks by hand (without `build_location_scale_block`'s callbacks), so a
1135    /// shared μ/log-σ covariate basis is recognised as block-diagonal rather
1136    /// than mistaken for cross-block intercept aliases (#558).
1137    fn output_channel_assignment(&self, specs: &[ParameterBlockSpec]) -> Option<Vec<usize>> {
1138        // Two-channel families: `[mu, log_sigma]`. The optional trailing
1139        // zero-channel wiggle block (when present) also drives channel 0.
1140        Some(
1141            (0..specs.len())
1142                .map(|i| usize::from(i == Self::BLOCK_LOG_SIGMA))
1143                .collect(),
1144        )
1145    }
1146
1147    fn coefficient_hessian_cost(&self, specs: &[ParameterBlockSpec]) -> u64 {
1148        // Operator-aware: when the unified evaluator picks the matrix-free
1149        // joint Hessian path (see `use_joint_matrix_free_path`), the workspace
1150        // applies the joint Hessian via row-streaming Khatri-Rao matvecs at
1151        // O(n · (p_t + p_ℓ)) per Hv, never building the dense (p_t + p_ℓ)²
1152        // matrix. Report the operator work model so diagnostics and
1153        // first-order-only policies reflect the representation that actually
1154        // runs.
1155        crate::location_scale_engine::location_scale_coefficient_hessian_cost(
1156            self.y.len() as u64,
1157            specs,
1158        )
1159    }
1160
1161    fn evaluate(&self, block_states: &[ParameterBlockState]) -> Result<FamilyEvaluation, String> {
1162        validate_block_count::<GamlssError>("GaussianLocationScaleFamily", 2, block_states.len())?;
1163        let n = self.y.len();
1164        let etamu = &block_states[Self::BLOCK_MU].eta;
1165        let eta_log_sigma = &block_states[Self::BLOCK_LOG_SIGMA].eta;
1166        if etamu.len() != n || eta_log_sigma.len() != n || self.weights.len() != n {
1167            return Err(GamlssError::DimensionMismatch {
1168                reason: "GaussianLocationScaleFamily input size mismatch".to_string(),
1169            }
1170            .into());
1171        }
1172
1173        // Diagonal IRLS weights for the inner solver.
1174        //
1175        // For the location block (identity link): wmu = pw / sigma^2. Since the
1176        // location link is identity, observed = Fisher --- no correction needed.
1177        //
1178        // For the log-sigma block (log link): w_ls = 2 * pw * (dsigma/deta)^2 / sigma^2.
1179        // This is the Fisher weight. For the outer REML, the joint
1180        // `exact_newton_joint_hessian` provides the full observed Hessian directly,
1181        // so these Diagonal weights are only used for the inner IRLS iteration
1182        // (where Fisher scoring is fine). See response.md Section 3.
1183        //
1184        let mut zmu = Array1::<f64>::zeros(n);
1185        let mut wmu = Array1::<f64>::zeros(n);
1186        let mut z_ls = Array1::<f64>::zeros(n);
1187        let mut w_ls = Array1::<f64>::zeros(n);
1188        let ln2pi = (2.0 * std::f64::consts::PI).ln();
1189        let mut ll = 0.0;
1190
1191        const CHUNK: usize = 1024;
1192        if let (
1193            Some(y_s),
1194            Some(w_s),
1195            Some(mu_s),
1196            Some(ls_s),
1197            Some(zmu_s),
1198            Some(wmu_s),
1199            Some(zls_s),
1200            Some(wls_s),
1201        ) = (
1202            self.y.as_slice_memory_order(),
1203            self.weights.as_slice_memory_order(),
1204            etamu.as_slice_memory_order(),
1205            eta_log_sigma.as_slice_memory_order(),
1206            zmu.as_slice_memory_order_mut(),
1207            wmu.as_slice_memory_order_mut(),
1208            z_ls.as_slice_memory_order_mut(),
1209            w_ls.as_slice_memory_order_mut(),
1210        ) {
1211            // Per-row Gaussian LS kernel writes 4 working arrays directly into
1212            // the output slices; ll is reduced via Rayon's sum. Independent
1213            // across rows.
1214            ll += zmu_s
1215                .par_chunks_mut(CHUNK)
1216                .zip(wmu_s.par_chunks_mut(CHUNK))
1217                .zip(zls_s.par_chunks_mut(CHUNK))
1218                .zip(wls_s.par_chunks_mut(CHUNK))
1219                .enumerate()
1220                .map(|(chunk_idx, (((zmu_c, wmu_c), zls_c), wls_c))| {
1221                    let start = chunk_idx * CHUNK;
1222                    let mut local_ll = 0.0;
1223                    for local in 0..zmu_c.len() {
1224                        let i = start + local;
1225                        let row =
1226                            gaussian_diagonal_row_kernel(y_s[i], mu_s[i], ls_s[i], w_s[i], ln2pi);
1227                        zmu_c[local] = mu_s[i] + row.location_working_shift;
1228                        wmu_c[local] = row.location_working_weight;
1229                        zls_c[local] = row.log_sigma_working_response;
1230                        wls_c[local] = row.log_sigma_working_weight;
1231                        local_ll += row.log_likelihood;
1232                    }
1233                    local_ll
1234                })
1235                .sum::<f64>();
1236        } else {
1237            // Fallback path: inputs are not contiguous. Outputs (just-allocated
1238            // Array1::zeros) always are. Reborrow input views into the closure.
1239            let y_view = self.y.view();
1240            let w_view = self.weights.view();
1241            let mu_view = etamu.view();
1242            let ls_view = eta_log_sigma.view();
1243            let zmu_s = zmu
1244                .as_slice_memory_order_mut()
1245                .expect("zeros is contiguous");
1246            let wmu_s = wmu
1247                .as_slice_memory_order_mut()
1248                .expect("zeros is contiguous");
1249            let zls_s = z_ls
1250                .as_slice_memory_order_mut()
1251                .expect("zeros is contiguous");
1252            let wls_s = w_ls
1253                .as_slice_memory_order_mut()
1254                .expect("zeros is contiguous");
1255            ll += zmu_s
1256                .par_chunks_mut(CHUNK)
1257                .zip(wmu_s.par_chunks_mut(CHUNK))
1258                .zip(zls_s.par_chunks_mut(CHUNK))
1259                .zip(wls_s.par_chunks_mut(CHUNK))
1260                .enumerate()
1261                .map(|(chunk_idx, (((zmu_c, wmu_c), zls_c), wls_c))| {
1262                    let start = chunk_idx * CHUNK;
1263                    let mut local_ll = 0.0;
1264                    for local in 0..zmu_c.len() {
1265                        let i = start + local;
1266                        let row = gaussian_diagonal_row_kernel(
1267                            y_view[i], mu_view[i], ls_view[i], w_view[i], ln2pi,
1268                        );
1269                        zmu_c[local] = mu_view[i] + row.location_working_shift;
1270                        wmu_c[local] = row.location_working_weight;
1271                        zls_c[local] = row.log_sigma_working_response;
1272                        wls_c[local] = row.log_sigma_working_weight;
1273                        local_ll += row.log_likelihood;
1274                    }
1275                    local_ll
1276                })
1277                .sum::<f64>();
1278        }
1279
1280        Ok(FamilyEvaluation {
1281            log_likelihood: ll,
1282            blockworking_sets: vec![
1283                BlockWorkingSet::diagonal_checked(zmu, wmu)?,
1284                BlockWorkingSet::diagonal_checked(z_ls, w_ls)?,
1285            ],
1286        })
1287    }
1288
1289    fn log_likelihood_only(&self, block_states: &[ParameterBlockState]) -> Result<f64, String> {
1290        validate_block_count::<GamlssError>("GaussianLocationScaleFamily", 2, block_states.len())?;
1291        let n = self.y.len();
1292        let etamu = &block_states[Self::BLOCK_MU].eta;
1293        let eta_log_sigma = &block_states[Self::BLOCK_LOG_SIGMA].eta;
1294        if etamu.len() != n || eta_log_sigma.len() != n || self.weights.len() != n {
1295            return Err(GamlssError::DimensionMismatch {
1296                reason: "GaussianLocationScaleFamily input size mismatch".to_string(),
1297            }
1298            .into());
1299        }
1300        // logb noise link: σ(η_ls) = LOGB_SIGMA_FLOOR + exp(η_ls). σ ≥ b > 0
1301        // bounds the loglik below (−Σlog σ ≥ −n log b) and bounds 1/σ² by 1/b²,
1302        // so the previous `inv_s2.min(1e24)` cap is structurally unnecessary.
1303        let ln2pi = (2.0 * std::f64::consts::PI).ln();
1304        let mut ll = 0.0;
1305        if let (Some(y_s), Some(w_s), Some(mu_s), Some(ls_s)) = (
1306            self.y.as_slice_memory_order(),
1307            self.weights.as_slice_memory_order(),
1308            etamu.as_slice_memory_order(),
1309            eta_log_sigma.as_slice_memory_order(),
1310        ) {
1311            use rayon::iter::{IntoParallelIterator, ParallelIterator};
1312            ll += (0..n)
1313                .into_par_iter()
1314                .map(|i| {
1315                    let wi = w_s[i];
1316                    if wi == 0.0 {
1317                        return 0.0;
1318                    }
1319                    let sigma_i = logb_sigma_from_eta_scalar(ls_s[i]);
1320                    let inv_s2 = (sigma_i * sigma_i).recip();
1321                    let r = y_s[i] - mu_s[i];
1322                    wi * (-0.5 * (r * r * inv_s2 + ln2pi + 2.0 * sigma_i.ln()))
1323                })
1324                .sum::<f64>();
1325        } else {
1326            use rayon::iter::{IntoParallelIterator, ParallelIterator};
1327            ll += (0..n)
1328                .into_par_iter()
1329                .map(|i| {
1330                    let wi = self.weights[i];
1331                    if wi == 0.0 {
1332                        return 0.0;
1333                    }
1334                    let sigma_i = logb_sigma_from_eta_scalar(eta_log_sigma[i]);
1335                    let inv_s2 = (sigma_i * sigma_i).recip();
1336                    let r = self.y[i] - etamu[i];
1337                    wi * (-0.5 * (r * r * inv_s2 + ln2pi + 2.0 * sigma_i.ln()))
1338                })
1339                .sum::<f64>();
1340        }
1341        Ok(ll)
1342    }
1343
1344    /// Outer-only log-likelihood with optional row subsample.
1345    ///
1346    /// When `options.outer_score_subsample` is `Some`, only the sampled rows
1347    /// contribute; each row's per-row log-likelihood term is multiplied by
1348    /// `WeightedOuterRow.weight`, the Horvitz–Thompson inverse-inclusion
1349    /// factor 1/π_i (uniform or stratified sampling both supported), so the
1350    /// partial sum is an unbiased estimator of the full-data log-likelihood.
1351    /// When `None`, this returns the full-data `log_likelihood_only`. Inner
1352    /// PIRLS line searches never install the subsample option, so they
1353    /// continue to score the exact full-data log-likelihood.
1354    fn log_likelihood_only_with_options(
1355        &self,
1356        block_states: &[ParameterBlockState],
1357        options: &BlockwiseFitOptions,
1358    ) -> Result<f64, String> {
1359        let Some(subsample) = options.outer_score_subsample.as_ref() else {
1360            return self.log_likelihood_only(block_states);
1361        };
1362        validate_block_count::<GamlssError>("GaussianLocationScaleFamily", 2, block_states.len())?;
1363        let n = self.y.len();
1364        let etamu = &block_states[Self::BLOCK_MU].eta;
1365        let eta_log_sigma = &block_states[Self::BLOCK_LOG_SIGMA].eta;
1366        if etamu.len() != n || eta_log_sigma.len() != n || self.weights.len() != n {
1367            return Err(GamlssError::DimensionMismatch {
1368                reason: "GaussianLocationScaleFamily input size mismatch".to_string(),
1369            }
1370            .into());
1371        }
1372        let ln2pi = (2.0 * std::f64::consts::PI).ln();
1373        use rayon::iter::ParallelIterator;
1374        let ll: f64 = subsample
1375            .rows
1376            .par_iter()
1377            .map(|row| {
1378                let i = row.index;
1379                let wi = self.weights[i];
1380                if wi == 0.0 {
1381                    return 0.0;
1382                }
1383                let sigma_i = logb_sigma_from_eta_scalar(eta_log_sigma[i]);
1384                let inv_s2 = (sigma_i * sigma_i).recip();
1385                let r = self.y[i] - etamu[i];
1386                row.weight * wi * (-0.5 * (r * r * inv_s2 + ln2pi + 2.0 * sigma_i.ln()))
1387            })
1388            .sum();
1389        Ok(ll)
1390    }
1391
1392    fn exact_newton_joint_hessian(
1393        &self,
1394        block_states: &[ParameterBlockState],
1395    ) -> Result<Option<Array2<f64>>, String> {
1396        self.exact_newton_joint_hessian_for_specs(block_states, None)
1397    }
1398
1399    fn has_explicit_joint_hessian(&self) -> bool {
1400        true
1401    }
1402
1403    /// The Gaussian location-scale likelihood has no separation /
1404    /// under-identification regime that the full-span Jeffreys curvature `H_Φ`
1405    /// is meant to regularize: with the soft floor `σ ≥ b > 0` the per-row
1406    /// Fisher information `diag(a/σ², 2κ²a)` is bounded and `O(n)` on every
1407    /// identified direction at every working point, so the well-conditioned-`H`
1408    /// Jeffreys gate smooth-steps `H_Φ` to ~0 — yet the matching score `∇Φ`
1409    /// kept leaking a *phantom* penalized-stationarity residual into the inner
1410    /// joint-Newton (a nonzero `|∇L − Sβ|` paired with a numerically null `H_Φ`
1411    /// and a full-rank `H_pen`), so the KKT certificate refused every iterate
1412    /// and the outer REML rejected all seeds — aborting heteroscedastic
1413    /// location-scale fits (#684–#688). This is the same opt-out
1414    /// `TransformationNormalFamily` takes for the same structural reason
1415    /// (continuous response, `O(n)` Fisher information everywhere); it removes
1416    /// the phantom residual and drops the per-cycle `O(n·p²)` Jeffreys
1417    /// directional-derivative overhead.
1418    fn joint_jeffreys_term_required(&self) -> bool {
1419        false
1420    }
1421
1422    fn exact_newton_joint_hessian_directional_derivative(
1423        &self,
1424        block_states: &[ParameterBlockState],
1425        d_beta_flat: &Array1<f64>,
1426    ) -> Result<Option<Array2<f64>>, String> {
1427        self.exact_newton_joint_hessian_directional_derivative_for_specs(
1428            block_states,
1429            None,
1430            d_beta_flat,
1431        )
1432    }
1433
1434    fn exact_newton_joint_hessiansecond_directional_derivative(
1435        &self,
1436        block_states: &[ParameterBlockState],
1437        d_beta_u_flat: &Array1<f64>,
1438        d_betav_flat: &Array1<f64>,
1439    ) -> Result<Option<Array2<f64>>, String> {
1440        self.exact_newton_joint_hessian_second_directional_derivative_for_specs(
1441            block_states,
1442            None,
1443            d_beta_u_flat,
1444            d_betav_flat,
1445        )
1446    }
1447
1448    fn diagonalworking_weights_directional_derivative(
1449        &self,
1450        block_states: &[ParameterBlockState],
1451        block_idx: usize,
1452        d_eta: &Array1<f64>,
1453    ) -> Result<Option<Array1<f64>>, String> {
1454        validate_block_count::<GamlssError>("GaussianLocationScaleFamily", 2, block_states.len())?;
1455        let n = self.y.len();
1456        let eta_t = &block_states[Self::BLOCK_MU].eta;
1457        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
1458        if eta_t.len() != n || eta_ls.len() != n || self.weights.len() != n || d_eta.len() != n {
1459            return Err(GamlssError::DimensionMismatch {
1460                reason: "GaussianLocationScaleFamily input size mismatch".to_string(),
1461            }
1462            .into());
1463        }
1464
1465        let sigma = eta_ls.mapv(logb_sigma_from_eta_scalar);
1466        let mut dw = Array1::<f64>::zeros(n);
1467        match block_idx {
1468            Self::BLOCK_MU => {
1469                // Gaussian location block:
1470                //
1471                //   wmu = weight / sigma^2.
1472                //
1473                // This depends only on the scale predictor, so along a
1474                // location-only direction d etamu the directional derivative is
1475                // identically zero.
1476                Ok(Some(dw))
1477            }
1478            Self::BLOCK_LOG_SIGMA => {
1479                // Gaussian log-sigma block:
1480                //
1481                // The PIRLS information weight is
1482                //
1483                //   w_ls = max(2 * weight * clamp(g, -1, 1)^2, MIN_WEIGHT),
1484                //   g    = sigma'(eta_ls) / sigma(eta_ls),
1485                // with the semantic rule that zero observation weights stay zero.
1486                //
1487                // Along a direction d eta_ls,
1488                //
1489                //   dw_ls is the directional derivative of that piecewise
1490                //   definition. On the active clamp branch or active MIN_WEIGHT
1491                //   floor branch, the returned derivative is zero to match the
1492                //   selected local piece of the evaluated weight.
1493                //
1494                // This is the exact directional derivative needed by the REML
1495                // trace term
1496                //
1497                //   0.5 tr(J^{-1} D_beta J[u])
1498                //   = 0.5 sum_i (x_i^T J^{-1} x_i) dw_i
1499                //
1500                // for diagonal working-set blocks.
1501                use rayon::iter::{IntoParallelIterator, ParallelIterator};
1502                let dw_vec: Vec<f64> = (0..n)
1503                    .into_par_iter()
1504                    .map(|i| {
1505                        let d1 = crate::sigma_link::logb_sigma_jet1_scalar(eta_ls[i]).d1;
1506                        gaussian_log_sigma_irlsinfo_directional_derivative(
1507                            self.weights[i],
1508                            sigma[i],
1509                            d1,
1510                            d_eta[i],
1511                        )
1512                    })
1513                    .collect();
1514                for (i, v) in dw_vec.into_iter().enumerate() {
1515                    dw[i] = v;
1516                }
1517                Ok(Some(dw))
1518            }
1519            _ => Ok(None),
1520        }
1521    }
1522
1523    fn exact_newton_joint_hessian_with_specs(
1524        &self,
1525        block_states: &[ParameterBlockState],
1526        specs: &[ParameterBlockSpec],
1527    ) -> Result<Option<Array2<f64>>, String> {
1528        self.exact_newton_joint_hessian_for_specs(block_states, Some(specs))
1529    }
1530
1531    fn exact_newton_joint_hessian_directional_derivative_with_specs(
1532        &self,
1533        block_states: &[ParameterBlockState],
1534        specs: &[ParameterBlockSpec],
1535        d_beta_flat: &Array1<f64>,
1536    ) -> Result<Option<Array2<f64>>, String> {
1537        self.exact_newton_joint_hessian_directional_derivative_for_specs(
1538            block_states,
1539            Some(specs),
1540            d_beta_flat,
1541        )
1542    }
1543
1544    fn exact_newton_joint_hessian_second_directional_derivative_with_specs(
1545        &self,
1546        block_states: &[ParameterBlockState],
1547        specs: &[ParameterBlockSpec],
1548        d_beta_u_flat: &Array1<f64>,
1549        d_betav_flat: &Array1<f64>,
1550    ) -> Result<Option<Array2<f64>>, String> {
1551        self.exact_newton_joint_hessian_second_directional_derivative_for_specs(
1552            block_states,
1553            Some(specs),
1554            d_beta_u_flat,
1555            d_betav_flat,
1556        )
1557    }
1558
1559    fn exact_newton_joint_psi_terms(
1560        &self,
1561        block_states: &[ParameterBlockState],
1562        specs: &[ParameterBlockSpec],
1563        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
1564        psi_index: usize,
1565    ) -> Result<Option<crate::custom_family::ExactNewtonJointPsiTerms>, String> {
1566        self.exact_newton_joint_psi_terms_for_specs(
1567            block_states,
1568            specs,
1569            derivative_blocks,
1570            psi_index,
1571        )
1572    }
1573
1574    fn exact_newton_joint_psisecond_order_terms(
1575        &self,
1576        block_states: &[ParameterBlockState],
1577        specs: &[ParameterBlockSpec],
1578        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
1579        psi_i: usize,
1580        psi_j: usize,
1581    ) -> Result<Option<crate::custom_family::ExactNewtonJointPsiSecondOrderTerms>, String> {
1582        self.exact_newton_joint_psisecond_order_terms_for_specs(
1583            block_states,
1584            specs,
1585            derivative_blocks,
1586            psi_i,
1587            psi_j,
1588        )
1589    }
1590
1591    fn exact_newton_joint_psihessian_directional_derivative(
1592        &self,
1593        block_states: &[ParameterBlockState],
1594        specs: &[ParameterBlockSpec],
1595        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
1596        psi_index: usize,
1597        d_beta_flat: &Array1<f64>,
1598    ) -> Result<Option<Array2<f64>>, String> {
1599        self.exact_newton_joint_psihessian_directional_derivative_for_specs(
1600            block_states,
1601            specs,
1602            derivative_blocks,
1603            psi_index,
1604            d_beta_flat,
1605        )
1606    }
1607
1608    fn exact_newton_joint_psi_workspace(
1609        &self,
1610        block_states: &[ParameterBlockState],
1611        specs: &[ParameterBlockSpec],
1612        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
1613    ) -> Result<Option<Arc<dyn ExactNewtonJointPsiWorkspace>>, String> {
1614        validate_block_count::<GamlssError>("GaussianLocationScaleFamily", 2, block_states.len())?;
1615        if specs.len() != 2 || derivative_blocks.len() != 2 {
1616            return Err(GamlssError::DimensionMismatch { reason: format!(
1617                "GaussianLocationScaleFamily joint psi workspace expects 2 specs and 2 derivative block lists, got {} / {}",
1618                specs.len(),
1619                derivative_blocks.len()
1620            ) }.into());
1621        }
1622        Ok(Some(Arc::new(
1623            GaussianLocationScaleExactNewtonJointPsiWorkspace::new(
1624                self.clone(),
1625                block_states.to_vec(),
1626                specs,
1627                derivative_blocks.to_vec(),
1628            )?,
1629        )))
1630    }
1631
1632    /// Outer-aware joint ψ workspace with optional row subsample.
1633    ///
1634    /// When `options.outer_score_subsample` is `None`, this is byte-identical
1635    /// to `exact_newton_joint_psi_workspace`. When `Some`, the subsample is
1636    /// stored in the workspace and forwarded into every per-row weight array
1637    /// produced by `gaussian_joint_psi_firstweights`,
1638    /// `gaussian_joint_psisecondweights`, and
1639    /// `gaussian_joint_psi_mixed_driftweights`: each sampled row's
1640    /// contribution is multiplied by `WeightedOuterRow.weight = 1/π_i` and
1641    /// non-sampled rows are zeroed. Every downstream assembly
1642    /// (`gaussian_joint_psi*_fromweights`, `weighted_crossprod_psi_maps`,
1643    /// `xt_diag_*_dense`,
1644    /// `build_two_block_custom_family_joint_psi_operator_from_actions`) is
1645    /// row-linear in these arrays via `Xᵀ diag(W) Y`, so the resulting
1646    /// second-order ψ Hessian and ψ-Hessian directional derivative are
1647    /// unbiased Horvitz–Thompson estimators of the full-data quantities.
1648    /// Inner-PIRLS and final-covariance paths never install the option.
1649    fn exact_newton_joint_psi_workspace_with_options(
1650        &self,
1651        block_states: &[ParameterBlockState],
1652        specs: &[ParameterBlockSpec],
1653        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
1654        options: &BlockwiseFitOptions,
1655    ) -> Result<Option<Arc<dyn ExactNewtonJointPsiWorkspace>>, String> {
1656        validate_block_count::<GamlssError>("GaussianLocationScaleFamily", 2, block_states.len())?;
1657        if specs.len() != 2 || derivative_blocks.len() != 2 {
1658            return Err(GamlssError::DimensionMismatch { reason: format!(
1659                "GaussianLocationScaleFamily joint psi workspace expects 2 specs and 2 derivative block lists, got {} / {}",
1660                specs.len(),
1661                derivative_blocks.len()
1662            ) }.into());
1663        }
1664        Ok(Some(Arc::new(
1665            GaussianLocationScaleExactNewtonJointPsiWorkspace::new_with_subsample(
1666                self.clone(),
1667                block_states.to_vec(),
1668                specs,
1669                derivative_blocks.to_vec(),
1670                options.outer_score_subsample.clone(),
1671            )?,
1672        )))
1673    }
1674
1675    fn exact_newton_joint_hessian_workspace(
1676        &self,
1677        block_states: &[ParameterBlockState],
1678        specs: &[ParameterBlockSpec],
1679    ) -> Result<Option<Arc<dyn ExactNewtonJointHessianWorkspace>>, String> {
1680        let Some((xmu, x_ls)) = self.exact_joint_dense_block_designs(Some(specs))? else {
1681            return Ok(None);
1682        };
1683        let workspace = GaussianLocationScaleHessianWorkspace::new(
1684            self.clone(),
1685            block_states.to_vec(),
1686            xmu.into_owned(),
1687            x_ls.into_owned(),
1688        )?;
1689        Ok(Some(Arc::new(workspace)))
1690    }
1691
1692    /// Outer-aware joint-Hessian workspace with optional row subsample.
1693    ///
1694    /// When `options.outer_score_subsample` is `None`, this is byte-identical
1695    /// to `exact_newton_joint_hessian_workspace`. When `Some`, the precomputed
1696    /// per-row coefficient arrays (`coeff_mm`, `coeff_ml`, `coeff_ll`) — which
1697    /// every downstream assembly (`hessian_dense`, `hessian_matvec`,
1698    /// `hessian_diagonal`) consumes row-linearly via `Xᵀ diag(W) X` — are
1699    /// replaced by a Horvitz–Thompson mask: each sampled row's coefficient is
1700    /// multiplied by `WeightedOuterRow.weight` (the inverse-inclusion factor
1701    /// 1/π_i; uniform or stratified sampling both supported), and non-sampled
1702    /// rows are zeroed. The resulting joint Hessian is an unbiased estimator
1703    /// of the full-data joint Hessian. Inner PIRLS never installs the option,
1704    /// so the inner solve continues to consume the exact full-data Hessian.
1705    fn exact_newton_joint_hessian_workspace_with_options(
1706        &self,
1707        block_states: &[ParameterBlockState],
1708        specs: &[ParameterBlockSpec],
1709        options: &BlockwiseFitOptions,
1710    ) -> Result<Option<Arc<dyn ExactNewtonJointHessianWorkspace>>, String> {
1711        let Some((xmu, x_ls)) = self.exact_joint_dense_block_designs(Some(specs))? else {
1712            return Ok(None);
1713        };
1714        let mut workspace = GaussianLocationScaleHessianWorkspace::new(
1715            self.clone(),
1716            block_states.to_vec(),
1717            xmu.into_owned(),
1718            x_ls.into_owned(),
1719        )?;
1720        if let Some(subsample) = options.outer_score_subsample.as_ref() {
1721            workspace.apply_outer_subsample(subsample.rows.as_ref());
1722        }
1723        Ok(Some(Arc::new(workspace)))
1724    }
1725
1726    fn inner_coefficient_hessian_hvp_available(&self, specs: &[ParameterBlockSpec]) -> bool {
1727        // The Gaussian location-scale workspace is returned by
1728        // `exact_newton_joint_hessian_workspace` whenever
1729        // `exact_joint_dense_block_designs` succeeds, which itself depends on
1730        // both block designs being present. This is only a β-space operator
1731        // capability; outer θθ Hessian availability is declared separately.
1732        self.exact_joint_supported()
1733            && matches!(
1734                self.exact_joint_dense_block_designs(Some(specs)),
1735                Ok(Some(_))
1736            )
1737    }
1738
1739    /// Outer-derivative policy: declare HT-subsample capability.
1740    ///
1741    /// GaussianLocationScaleFamily overrides
1742    /// `log_likelihood_only_with_options`,
1743    /// `exact_newton_joint_hessian_workspace_with_options`, and
1744    /// `exact_newton_joint_psi_workspace_with_options` to consume
1745    /// `options.outer_score_subsample` with per-row Horvitz–Thompson weights
1746    /// (each sampled row's contribution is multiplied by
1747    /// `WeightedOuterRow.weight = 1/π_i`; non-sampled rows are zeroed),
1748    /// yielding unbiased estimators of the full-data log-likelihood, joint
1749    /// Hessian, and second-order ψ Hessian / ψ-Hessian directional
1750    /// derivative. The ψ-workspace masking happens inside
1751    /// `apply_ht_mask_first`, `apply_ht_mask_second`, and
1752    /// `apply_ht_mask_mixed` on the `GaussianJointPsi{First,Second,
1753    /// MixedDrift}Weights` per-row arrays, immediately after the row-scalar
1754    /// reductions and before the row-linear `weighted_crossprod_psi_maps` /
1755    /// `xt_diag_*_dense` assemblies, so the masked outputs remain unbiased.
1756    /// First-order ψ terms remain full-data exact (= trivially unbiased), so
1757    /// the total outer score is still unbiased. Inner-PIRLS and final-
1758    /// covariance paths never install the option, so they continue to
1759    /// consume the exact full-data quantities.
1760    fn outer_derivative_subsample_capable(&self) -> bool {
1761        true
1762    }
1763}
1764
1765impl CustomFamilyGenerative for GaussianLocationScaleFamily {
1766    fn generativespec(
1767        &self,
1768        block_states: &[ParameterBlockState],
1769    ) -> Result<GenerativeSpec, String> {
1770        validate_block_count::<GamlssError>("GaussianLocationScaleFamily", 2, block_states.len())?;
1771        let mu = block_states[Self::BLOCK_MU].eta.clone();
1772        let eta_log_sigma = &block_states[Self::BLOCK_LOG_SIGMA].eta;
1773        let sigma = gamlss_rowwise_map(eta_log_sigma.len(), |i| {
1774            logb_sigma_from_eta_scalar(eta_log_sigma[i])
1775        });
1776        Ok(GenerativeSpec {
1777            mean: mu,
1778            noise: NoiseModel::Gaussian { sigma },
1779        })
1780    }
1781}