gam_models/inference/
generative.rs

1use gam_custom_family::{CustomFamily, ParameterBlockState};
2use gam_solve::estimate::EstimationError;
3use crate::inference::predict_io::PredictResult;
4use gam_problem::types::{
5    LikelihoodScaleMetadata, LikelihoodSpec, ResponseFamily, is_valid_tweedie_power,
6};
7use ndarray::{Array1, Array2};
8
9/// THE single source of truth for the scalar dispersion the generative
10/// observation model uses for a fitted family — the value handed to
11/// [`NoiseModel::from_likelihood`] / [`generativespec_from_predict`] as
12/// `gaussian_scale`.
13///
14/// For every exponential-dispersion / overdispersed family the dispersion is
15/// **estimated jointly with the mean** and recorded in the fit's
16/// [`LikelihoodScaleMetadata`] (`scale`); the value embedded in the response
17/// spec (`likelihood.response`) is only the construction-time *seed* (e.g.
18/// `theta = 1.0`, `phi = 1.0`), left un-updated after the fit refreshes the
19/// estimate. Generation must therefore read the *fitted* dispersion off `scale`,
20/// falling back to the seed only for fit-free construction. Reading the seed was
21/// the shared root cause of a whole family of bugs — Gamma #678, Beta #769/#770,
22/// Tweedie #771, and the NB sibling #1124 (`Var = mu + mu^2` instead of
23/// `mu + mu^2/theta_hat`).
24///
25/// This helper exists in exactly one place precisely because that bug class
26/// recurred: the dispersion-picking logic had been duplicated across the CLI
27/// `gam generate` path and the Python `sample_replicates` path, and fixing one
28/// copy left the other drawing at the seed. Both paths now call this function,
29/// so the set of supported families and the interpretation of each dispersion
30/// parameter can never diverge again. (The per-row dispersion location-scale
31/// path, #913/#1125, is the one exception that bypasses this scalar picker — it
32/// threads a full `exp(eta_d(x))` vector via
33/// [`NoiseModel::from_likelihood_with_per_row_dispersion`] instead.)
34///
35/// `standard_deviation` is the fit's residual scale, used as the Gamma-shape and
36/// Gaussian-`sigma` fallback. Returns `None` only for families that carry no
37/// dispersion at all in the fallback arm (never, in practice, for the families
38/// above).
39pub fn family_noise_parameter(
40    scale: LikelihoodScaleMetadata,
41    standard_deviation: f64,
42    likelihood: &LikelihoodSpec,
43) -> Option<f64> {
44    match likelihood.response {
45        // Tweedie: `gaussian_scale` carries the *dispersion* phi; the variance
46        // power `p` is read straight off the family spec by `from_likelihood`.
47        // phi is estimated jointly with the mean (#771), so consult the fit's
48        // scale metadata; unit dispersion is the fit-free fallback.
49        ResponseFamily::Tweedie { .. } => scale.fixed_phi().or(Some(1.0)),
50        // NB overdispersion theta is estimated jointly with the mean and stored
51        // as `EstimatedNegBinTheta`; the spec theta is only the seed (#1124).
52        ResponseFamily::NegativeBinomial { theta, .. } => scale.negbin_theta().or(Some(theta)),
53        // Beta precision phi is estimated jointly with the mean (#567/#770); the
54        // spec phi is only the seed.
55        ResponseFamily::Beta { phi } => scale.fixed_phi().or(Some(phi)),
56        // Gamma shape k is estimated jointly with the mean (#678); fall back to
57        // the residual scale only when the fit recorded no shape.
58        ResponseFamily::Gamma => scale.gamma_shape().or(Some(standard_deviation)),
59        // Gaussian / Poisson / Binomial: the residual scale is the generative
60        // sigma (Poisson/Binomial ignore it downstream).
61        _ => Some(standard_deviation),
62    }
63}
64
65/// Observation-noise model used for generative sampling.
66#[derive(Clone, Debug)]
67pub enum NoiseModel {
68    Gaussian {
69        /// Per-observation standard deviation.
70        sigma: Array1<f64>,
71    },
72    Poisson,
73    Tweedie {
74        p: f64,
75        /// Per-observation dispersion φ (> 0). A scalar-dispersion fit broadcasts
76        /// one value to every row; a dispersion location-scale fit (#913/#1125)
77        /// supplies the fitted per-row φ = 1/exp(eta_d(x)).
78        phi: Array1<f64>,
79    },
80    NegativeBinomial {
81        /// Per-observation overdispersion θ (> 0); see `Tweedie::phi`.
82        theta: Array1<f64>,
83    },
84    Beta {
85        /// Per-observation precision φ (> 0); see `Tweedie::phi`.
86        phi: Array1<f64>,
87    },
88    Gamma {
89        /// Per-observation Gamma shape k (> 0), with mean-driven scale; see
90        /// `Tweedie::phi`.
91        shape: Array1<f64>,
92    },
93    Bernoulli,
94    /// Inverse-transform sampling for a conditional transformation-normal (CTM)
95    /// model (issue #1613). The fitted latent transform `h(·|x_i)` is strictly
96    /// increasing in `y` and `h(Y|x) ~ N(0, 1)`, so a response-scale draw is
97    /// `Y = h⁻¹(Z | x_i)` with `Z ~ N(0, 1)`. The earlier generate path drew
98    /// Gaussian noise around the mean, which produced latent-scale draws whose
99    /// per-row mean moved the wrong way with the covariate; this variant instead
100    /// samples from the genuine conditional law `F(·|x)`.
101    ///
102    /// Both this sampler and the response-scale conditional mean `E[Y|x]` used by
103    /// `predict` (#1612) invert the SAME per-row monotone curve, materialized on
104    /// a shared response grid, so the two paths cannot disagree on the underlying
105    /// transform.
106    TransformationNormalQuantile {
107        /// Shared, strictly increasing response grid (length `g ≥ 2`).
108        grid_y: Array1<f64>,
109        /// `h_grid[[i, k]] = h(grid_y[k] | x_i)`, strictly increasing in `k` for
110        /// every row `i` (one row per observation).
111        h_grid: Array2<f64>,
112    },
113}
114
115/// Invert a monotone increasing tabulated function `z = h_row(grid_y)` at the
116/// latent value `target`: find the bracketing grid interval and linearly
117/// interpolate `y`. Values below/above the tabulated range map to the support
118/// endpoints (the finite-support tails carry no mass between the clamp and the
119/// endpoint). This is the same bracketing inversion the CTM `predict` mean
120/// (#1612) uses on its quadrature nodes, applied here to a random latent draw.
121fn invert_monotone_grid(grid_y: &Array1<f64>, h_row: ndarray::ArrayView1<'_, f64>, target: f64) -> f64 {
122    let g = grid_y.len();
123    if target <= h_row[0] {
124        return grid_y[0];
125    }
126    if target >= h_row[g - 1] {
127        return grid_y[g - 1];
128    }
129    let mut lo = 0usize;
130    let mut hi = g - 1;
131    while hi - lo > 1 {
132        let mid = (lo + hi) / 2;
133        if h_row[mid] <= target {
134            lo = mid;
135        } else {
136            hi = mid;
137        }
138    }
139    let t = (target - h_row[lo]) / (h_row[hi] - h_row[lo]);
140    grid_y[lo] + t * (grid_y[hi] - grid_y[lo])
141}
142
143/// First-class generative specification: mean process + observation noise.
144#[derive(Clone, Debug)]
145pub struct GenerativeSpec {
146    pub mean: Array1<f64>,
147    pub noise: NoiseModel,
148}
149
150impl GenerativeSpec {
151    /// Number of observations `n` in the mean vector, matching the row
152    /// count of the design used to produce this generative specification.
153    pub fn nobs(&self) -> usize {
154        self.mean.len()
155    }
156}
157
158/// Build a generative specification for built-in GAM families from eta/mean.
159pub fn generativespec_from_predict(
160    prediction: PredictResult,
161    likelihood: LikelihoodSpec,
162    gaussian_scale: Option<f64>,
163) -> Result<GenerativeSpec, EstimationError> {
164    let noise = NoiseModel::from_likelihood(&likelihood, prediction.mean.len(), gaussian_scale)?;
165    Ok(GenerativeSpec {
166        mean: prediction.mean,
167        noise,
168    })
169}
170
171impl NoiseModel {
172    /// Single canonical mapping from a fitted `LikelihoodSpec` (response
173    /// distribution + dispersion `gaussian_scale`) to the observation
174    /// `NoiseModel` used for generative sampling. Both simulation
175    /// (`FamilyStrategy::simulate_noise`) and generative inference
176    /// (`generativespec_from_predict`) route through this one helper so the
177    /// set of supported likelihoods and the interpretation of dispersion
178    /// parameters can never diverge between the two paths.
179    ///
180    /// `nobs` is the number of observations the resulting per-observation
181    /// Gaussian `sigma` vector should span; it is ignored for families whose
182    /// noise carries no per-observation state.
183    pub fn from_likelihood(
184        likelihood: &LikelihoodSpec,
185        nobs: usize,
186        gaussian_scale: Option<f64>,
187    ) -> Result<NoiseModel, EstimationError> {
188        match &likelihood.response {
189            ResponseFamily::Gaussian => {
190                let sigma =
191                    Self::require_noise_parameter(likelihood, "Gaussian sigma", gaussian_scale)?;
192                if sigma < 0.0 {
193                    crate::bail_invalid_estim!(
194                        "{} generative sampling requires Gaussian sigma >= 0; got {sigma}",
195                        likelihood.pretty_name()
196                    );
197                }
198                Ok(NoiseModel::Gaussian {
199                    sigma: Array1::from_elem(nobs, sigma),
200                })
201            }
202            ResponseFamily::Binomial => Ok(NoiseModel::Bernoulli),
203            ResponseFamily::Poisson => Ok(NoiseModel::Poisson),
204            ResponseFamily::Tweedie { p } => {
205                let p = *p;
206                if !is_valid_tweedie_power(p) {
207                    crate::bail_invalid_estim!(
208                        "Tweedie variance power must be finite and strictly between 1 and 2; got {p}"
209                    );
210                }
211                let phi = Self::require_positive_noise_parameter(
212                    likelihood,
213                    "Tweedie dispersion phi",
214                    gaussian_scale,
215                )?;
216                Ok(NoiseModel::Tweedie {
217                    p,
218                    // Scalar-dispersion fit: broadcast one φ to every row. The
219                    // dispersion location-scale path (#1125) builds the per-row
220                    // vector directly in `run_generate_unified` instead.
221                    phi: Array1::from_elem(nobs, phi),
222                })
223            }
224            ResponseFamily::NegativeBinomial { theta, .. } => {
225                // The NB overdispersion θ is estimated jointly with the mean and
226                // the authoritative post-fit value is handed in as
227                // `gaussian_scale` (from `likelihood_scale.negbin_theta()`);
228                // the θ embedded in the response spec is only the seed (1.0).
229                // Reading the seed was the NB sibling of the Beta #770 bug:
230                // generate drew Var = μ + μ² (θ = 1) regardless of the fitted
231                // overdispersion (#1124). Mirror the Beta arm below.
232                let theta = gaussian_scale.unwrap_or(*theta);
233                if !(theta.is_finite() && theta > 0.0) {
234                    crate::bail_invalid_estim!(
235                        "negative-binomial theta must be finite and > 0; got {theta}"
236                    );
237                }
238                Ok(NoiseModel::NegativeBinomial {
239                    theta: Array1::from_elem(nobs, theta),
240                })
241            }
242            ResponseFamily::Beta { phi } => {
243                // The Beta precision φ is estimated jointly with the mean
244                // (issue #567), so the authoritative value after fitting is the
245                // dispersion handed in as `gaussian_scale` — exactly as Gamma's
246                // shape and Tweedie's φ already take theirs. The `phi` embedded
247                // in the response spec is only the construction-time *seed* (left
248                // at its original value, e.g. 1.0, after the fit refreshes the
249                // estimate in `likelihood_scale`), so it serves solely as a
250                // fallback for fit-free construction where no fitted dispersion
251                // is supplied. Reading the seed instead of `gaussian_scale` was
252                // issue #770: the generative/observation path drew Beta responses
253                // with φ = 1.0 regardless of the data — nearly uniform on (0,1),
254                // ~20× too much variance — even though the fit estimated φ and
255                // the caller forwarded it here.
256                let phi = gaussian_scale.unwrap_or(*phi);
257                if !(phi.is_finite() && phi > 0.0) {
258                    crate::bail_invalid_estim!(
259                        "beta-regression phi must be finite and > 0; got {phi}"
260                    );
261                }
262                Ok(NoiseModel::Beta {
263                    phi: Array1::from_elem(nobs, phi),
264                })
265            }
266            ResponseFamily::Gamma => {
267                let shape = Self::require_positive_noise_parameter(
268                    likelihood,
269                    "Gamma shape",
270                    gaussian_scale,
271                )?;
272                Ok(NoiseModel::Gamma {
273                    shape: Array1::from_elem(nobs, shape),
274                })
275            }
276            ResponseFamily::RoystonParmar => Err(EstimationError::InvalidInput(
277                "RoystonParmar generative sampling is not exposed via generic generation"
278                    .to_string(),
279            )),
280        }
281    }
282
283    /// Build the observation `NoiseModel` for a dispersion location-scale fit
284    /// (#1125) from a fitted PER-ROW dispersion surface `dispersion[i]` (the
285    /// predictor's `exp(eta_d(x_i))` mapped into NoiseModel units — NB θ, Gamma
286    /// shape, Beta φ directly, Tweedie φ as the reciprocal). Unlike
287    /// `from_likelihood`, which broadcasts a single scalar dispersion to every
288    /// row, this threads the genuine per-observation precision channel so
289    /// generated data reproduces the fitted non-constant dispersion instead of
290    /// coming out homoscedastic at the seed.
291    pub fn from_likelihood_with_per_row_dispersion(
292        likelihood: &LikelihoodSpec,
293        dispersion: Array1<f64>,
294    ) -> Result<NoiseModel, EstimationError> {
295        match &likelihood.response {
296            ResponseFamily::Tweedie { p } => {
297                let p = *p;
298                if !is_valid_tweedie_power(p) {
299                    crate::bail_invalid_estim!(
300                        "Tweedie variance power must be finite and strictly between 1 and 2; got {p}"
301                    );
302                }
303                Ok(NoiseModel::Tweedie { p, phi: dispersion })
304            }
305            ResponseFamily::NegativeBinomial { .. } => {
306                Ok(NoiseModel::NegativeBinomial { theta: dispersion })
307            }
308            ResponseFamily::Beta { .. } => Ok(NoiseModel::Beta { phi: dispersion }),
309            ResponseFamily::Gamma => Ok(NoiseModel::Gamma { shape: dispersion }),
310            other => Err(EstimationError::InvalidInput(format!(
311                "per-row dispersion generative sampling is only defined for the dispersion \
312                 location-scale families (Gamma/NegativeBinomial/Beta/Tweedie); got {other:?}"
313            ))),
314        }
315    }
316
317    fn require_noise_parameter(
318        likelihood: &LikelihoodSpec,
319        parameter_name: &str,
320        value: Option<f64>,
321    ) -> Result<f64, EstimationError> {
322        let value = value.ok_or_else(|| {
323            EstimationError::InvalidInput(format!(
324                "{} generative sampling requires fitted {parameter_name}",
325                likelihood.pretty_name()
326            ))
327        })?;
328        if value.is_finite() {
329            Ok(value)
330        } else {
331            Err(EstimationError::InvalidInput(format!(
332                "{} generative sampling requires finite {parameter_name}; got {value}",
333                likelihood.pretty_name()
334            )))
335        }
336    }
337
338    fn require_positive_noise_parameter(
339        likelihood: &LikelihoodSpec,
340        parameter_name: &str,
341        value: Option<f64>,
342    ) -> Result<f64, EstimationError> {
343        let value = Self::require_noise_parameter(likelihood, parameter_name, value)?;
344        if value > 0.0 {
345            Ok(value)
346        } else {
347            Err(EstimationError::InvalidInput(format!(
348                "{} generative sampling requires {parameter_name} > 0; got {value}",
349                likelihood.pretty_name()
350            )))
351        }
352    }
353}
354
355/// Validate that a per-observation dispersion vector matches the mean length.
356/// Scalar-dispersion fits broadcast one value across all rows (length `n`);
357/// dispersion location-scale fits (#1125) carry the genuine per-row vector.
358fn check_dispersion_len(
359    dispersion: &Array1<f64>,
360    nobs: usize,
361    name: &str,
362) -> Result<(), EstimationError> {
363    if dispersion.len() != nobs {
364        crate::bail_invalid_estim!(
365            "{name} length {} does not match mean length {nobs}",
366            dispersion.len()
367        );
368    }
369    Ok(())
370}
371
372/// Draw one synthetic observation vector from a generative spec.
373pub fn sampleobservations<R: rand::Rng + ?Sized>(
374    spec: &GenerativeSpec,
375    rng: &mut R,
376) -> Result<Array1<f64>, EstimationError> {
377    if spec.mean.iter().any(|m| !m.is_finite()) {
378        crate::bail_invalid_estim!("generative mean contains non-finite values");
379    }
380    match &spec.noise {
381        NoiseModel::Gaussian { sigma } => {
382            if sigma.len() != spec.mean.len() {
383                crate::bail_invalid_estim!(
384                    "Gaussian sigma length {} does not match mean length {}",
385                    sigma.len(),
386                    spec.mean.len()
387                );
388            }
389            let mut y = spec.mean.clone();
390            for i in 0..y.len() {
391                let sd = sigma[i].max(0.0);
392                if sd == 0.0 {
393                    continue;
394                }
395                let dist = rand_distr::Normal::new(0.0, sd).map_err(|e| {
396                    EstimationError::InvalidInput(format!("invalid Gaussian noise scale {sd}: {e}"))
397                })?;
398                y[i] += rand_distr::Distribution::sample(&dist, rng);
399            }
400            Ok(y)
401        }
402        NoiseModel::Poisson => {
403            let mut y = Array1::<f64>::zeros(spec.mean.len());
404            for i in 0..y.len() {
405                let lam = spec.mean[i].max(1e-12);
406                let dist = rand_distr::Poisson::new(lam).map_err(|e| {
407                    EstimationError::InvalidInput(format!("invalid Poisson rate {lam}: {e}"))
408                })?;
409                let draw = rand_distr::Distribution::sample(&dist, rng);
410                y[i] = draw;
411            }
412            Ok(y)
413        }
414        NoiseModel::Tweedie { p, phi } => {
415            if !(p.is_finite() && *p >= 1.0 && *p <= 2.0) {
416                crate::bail_invalid_estim!("invalid Tweedie power p: {p}");
417            }
418            check_dispersion_len(phi, spec.mean.len(), "Tweedie dispersion phi")?;
419            for (i, &phi_i) in phi.iter().enumerate() {
420                if !(phi_i.is_finite() && phi_i > 0.0) {
421                    crate::bail_invalid_estim!(
422                        "invalid Tweedie dispersion phi at row {i}: {phi_i}"
423                    );
424                }
425            }
426            let mut y = Array1::<f64>::zeros(spec.mean.len());
427            if (*p - 1.0).abs() <= 1.0e-12 {
428                for i in 0..y.len() {
429                    let phi_i = phi[i];
430                    let lam = (spec.mean[i] / phi_i).max(1e-12);
431                    let dist = rand_distr::Poisson::new(lam).map_err(|e| {
432                        EstimationError::InvalidInput(format!(
433                            "invalid Tweedie-Poisson rate {lam}: {e}"
434                        ))
435                    })?;
436                    y[i] = phi_i * rand_distr::Distribution::sample(&dist, rng);
437                }
438                return Ok(y);
439            }
440            if (*p - 2.0).abs() <= 1.0e-12 {
441                for i in 0..y.len() {
442                    let phi_i = phi[i];
443                    let shape = (1.0 / phi_i).max(1e-12);
444                    let mu = spec.mean[i].max(1e-12);
445                    let scale = (mu * phi_i).max(1e-12);
446                    let dist = rand_distr::Gamma::new(shape, scale).map_err(|e| {
447                        EstimationError::InvalidInput(format!(
448                            "invalid Tweedie-Gamma params shape={shape} scale={scale}: {e}"
449                        ))
450                    })?;
451                    y[i] = rand_distr::Distribution::sample(&dist, rng);
452                }
453                return Ok(y);
454            }
455            let alpha = (2.0 - *p) / (*p - 1.0);
456            for i in 0..y.len() {
457                let phi_i = phi[i];
458                let mu = spec.mean[i].max(1e-12);
459                let lambda = (mu.powf(2.0 - *p) / (phi_i * (2.0 - *p))).max(1e-12);
460                let scale = (phi_i * (*p - 1.0) * mu.powf(*p - 1.0)).max(1e-12);
461                let count_dist = rand_distr::Poisson::new(lambda).map_err(|e| {
462                    EstimationError::InvalidInput(format!(
463                        "invalid Tweedie compound-Poisson rate {lambda}: {e}"
464                    ))
465                })?;
466                let count = rand_distr::Distribution::sample(&count_dist, rng) as usize;
467                if count == 0 {
468                    continue;
469                }
470                let jump_dist = rand_distr::Gamma::new(alpha, scale).map_err(|e| {
471                    EstimationError::InvalidInput(format!(
472                        "invalid Tweedie jump params shape={alpha} scale={scale}: {e}"
473                    ))
474                })?;
475                y[i] = (0..count)
476                    .map(|_| rand_distr::Distribution::sample(&jump_dist, rng))
477                    .sum();
478            }
479            Ok(y)
480        }
481        NoiseModel::NegativeBinomial { theta } => {
482            check_dispersion_len(theta, spec.mean.len(), "NegativeBinomial theta")?;
483            let mut y = Array1::<f64>::zeros(spec.mean.len());
484            for i in 0..y.len() {
485                let theta_i = theta[i];
486                if !(theta_i.is_finite() && theta_i > 0.0) {
487                    crate::bail_invalid_estim!(
488                        "invalid negative-binomial theta at row {i}: {theta_i}"
489                    );
490                }
491                let mu = spec.mean[i].max(1e-12);
492                let scale = (mu / theta_i).max(1e-12);
493                let gamma = rand_distr::Gamma::new(theta_i, scale).map_err(|e| {
494                    EstimationError::InvalidInput(format!(
495                        "invalid NegativeBinomial gamma mixture params theta={theta_i} scale={scale}: {e}"
496                    ))
497                })?;
498                let lambda = rand_distr::Distribution::sample(&gamma, rng).max(1e-12);
499                let poisson = rand_distr::Poisson::new(lambda).map_err(|e| {
500                    EstimationError::InvalidInput(format!(
501                        "invalid NegativeBinomial Poisson rate {lambda}: {e}"
502                    ))
503                })?;
504                y[i] = rand_distr::Distribution::sample(&poisson, rng);
505            }
506            Ok(y)
507        }
508        NoiseModel::Beta { phi } => {
509            check_dispersion_len(phi, spec.mean.len(), "Beta phi")?;
510            let mut y = Array1::<f64>::zeros(spec.mean.len());
511            for i in 0..y.len() {
512                let phi_i = phi[i];
513                if !(phi_i.is_finite() && phi_i > 0.0) {
514                    crate::bail_invalid_estim!("invalid beta-regression phi at row {i}: {phi_i}");
515                }
516                let mu = spec.mean[i].clamp(1e-12, 1.0 - 1e-12);
517                let alpha = (mu * phi_i).max(1e-12);
518                let beta = ((1.0 - mu) * phi_i).max(1e-12);
519                let dist = rand_distr::Beta::new(alpha, beta).map_err(|e| {
520                    EstimationError::InvalidInput(format!(
521                        "invalid Beta params alpha={alpha} beta={beta}: {e}"
522                    ))
523                })?;
524                y[i] = rand_distr::Distribution::sample(&dist, rng);
525            }
526            Ok(y)
527        }
528        NoiseModel::Gamma { shape } => {
529            check_dispersion_len(shape, spec.mean.len(), "Gamma shape")?;
530            let mut y = Array1::<f64>::zeros(spec.mean.len());
531            for i in 0..y.len() {
532                let shape_i = shape[i];
533                if !shape_i.is_finite() || shape_i <= 0.0 {
534                    crate::bail_invalid_estim!("invalid Gamma shape at row {i}: {shape_i}");
535                }
536                let mu = spec.mean[i].max(1e-12);
537                let scale = (mu / shape_i).max(1e-12);
538                let dist = rand_distr::Gamma::new(shape_i, scale).map_err(|e| {
539                    EstimationError::InvalidInput(format!(
540                        "invalid Gamma params shape={shape_i} scale={scale}: {e}"
541                    ))
542                })?;
543                y[i] = rand_distr::Distribution::sample(&dist, rng);
544            }
545            Ok(y)
546        }
547        NoiseModel::Bernoulli => {
548            let mut y = Array1::<f64>::zeros(spec.mean.len());
549            for i in 0..y.len() {
550                let p = spec.mean[i];
551                let dist = rand_distr::Bernoulli::new(p).map_err(|e| {
552                    EstimationError::InvalidInput(format!("invalid Bernoulli probability {p}: {e}"))
553                })?;
554                y[i] = if rand_distr::Distribution::sample(&dist, rng) {
555                    1.0
556                } else {
557                    0.0
558                };
559            }
560            Ok(y)
561        }
562        NoiseModel::TransformationNormalQuantile { grid_y, h_grid } => {
563            let n = spec.mean.len();
564            if h_grid.nrows() != n {
565                crate::bail_invalid_estim!(
566                    "transformation-normal h_grid has {} rows but mean length is {n}",
567                    h_grid.nrows()
568                );
569            }
570            let g = grid_y.len();
571            if g < 2 || h_grid.ncols() != g {
572                crate::bail_invalid_estim!(
573                    "transformation-normal grid is degenerate: grid_y len {g}, h_grid cols {}",
574                    h_grid.ncols()
575                );
576            }
577            // `h(Y|x) ~ N(0,1)` ⇒ a response-scale draw is `Y = h⁻¹(Z | x)`,
578            // `Z ~ N(0,1)`. One independent latent draw per observation, inverted
579            // through that row's monotone transform.
580            let dist = rand_distr::Normal::new(0.0, 1.0).map_err(|e| {
581                EstimationError::InvalidInput(format!(
582                    "invalid standard-normal latent sampler: {e}"
583                ))
584            })?;
585            let mut y = Array1::<f64>::zeros(n);
586            for i in 0..n {
587                let z: f64 = rand_distr::Distribution::sample(&dist, rng);
588                y[i] = invert_monotone_grid(grid_y, h_grid.row(i), z);
589            }
590            Ok(y)
591        }
592    }
593}
594
595/// Draw multiple synthetic replicates (n_draws x nobs).
596pub fn sampleobservation_replicates<R: rand::Rng + ?Sized>(
597    spec: &GenerativeSpec,
598    n_draws: usize,
599    rng: &mut R,
600) -> Result<Array2<f64>, EstimationError> {
601    let n = spec.nobs();
602    let mut out = Array2::<f64>::zeros((n_draws, n));
603    for d in 0..n_draws {
604        let draw = sampleobservations(spec, rng)?;
605        out.row_mut(d).assign(&draw);
606    }
607    Ok(out)
608}
609
610/// Extension trait for custom multi-block families that provide explicit
611/// generative semantics (mean + observation noise) at a fitted state.
612pub trait CustomFamilyGenerative: CustomFamily {
613    fn generativespec(
614        &self,
615        block_states: &[ParameterBlockState],
616    ) -> Result<GenerativeSpec, String>;
617}
618
619#[cfg(test)]
620mod tests {
621    use super::*;
622    use crate::family_runtime::{FamilyStrategy, strategy_for_spec};
623
624    /// The CTM inverse-transform sampler (#1613) must draw `Y = h⁻¹(Z|x)`,
625    /// `Z ~ N(0,1)`, from each row's monotone transform — NOT Gaussian noise on
626    /// the latent scale. With the analytically invertible linear transform
627    /// `h(y|x_i) = slope_i·(y − center_i)` we have `h⁻¹(z) = center_i + z/slope_i`,
628    /// so the draws must be `N(center_i, (1/slope_i)²)`: the per-row mean tracks
629    /// `center_i` (response scale) and the spread is `1/slope_i` (NOT ≈ 1, the
630    /// latent scale of the old buggy path).
631    #[test]
632    fn transformation_normal_quantile_sampler_is_inverse_transform() {
633        use rand::SeedableRng;
634
635        let g = 801usize;
636        let (y_lo, y_hi) = (-12.0_f64, 12.0_f64);
637        let grid_y = Array1::from_shape_fn(g, |k| {
638            y_lo + (y_hi - y_lo) * (k as f64) / ((g - 1) as f64)
639        });
640        // Row 0: center -1, slope 2 (sd 0.5). Row 1: center +2, slope 4 (sd 0.25).
641        let centers = [-1.0_f64, 2.0_f64];
642        let slopes = [2.0_f64, 4.0_f64];
643        let mut h_grid = Array2::<f64>::zeros((2, g));
644        for i in 0..2 {
645            for k in 0..g {
646                h_grid[[i, k]] = slopes[i] * (grid_y[k] - centers[i]);
647            }
648        }
649        let spec = GenerativeSpec {
650            mean: Array1::from_vec(vec![centers[0], centers[1]]),
651            noise: NoiseModel::TransformationNormalQuantile {
652                grid_y: grid_y.clone(),
653                h_grid,
654            },
655        };
656
657        let mut rng = rand::rngs::StdRng::seed_from_u64(20240613);
658        let n_draws = 40_000usize;
659        let draws = sampleobservation_replicates(&spec, n_draws, &mut rng).unwrap();
660        assert_eq!(draws.shape(), &[n_draws, 2]);
661
662        let mut row_means = [0.0_f64; 2];
663        for i in 0..2 {
664            let col = draws.column(i);
665            let mean = col.sum() / (n_draws as f64);
666            let var =
667                col.iter().map(|v| (v - mean) * (v - mean)).sum::<f64>() / (n_draws as f64);
668            let sd = var.sqrt();
669            row_means[i] = mean;
670            assert!(
671                (mean - centers[i]).abs() < 0.02,
672                "row {i} draw mean {mean:.4} should be the response-scale center {:.4}",
673                centers[i]
674            );
675            let expected_sd = 1.0 / slopes[i];
676            assert!(
677                (sd - expected_sd).abs() < 0.02,
678                "row {i} draw sd {sd:.4} should be the response-scale 1/slope {expected_sd:.4}, \
679                 not the latent ≈1 of the old Gaussian-noise path"
680            );
681        }
682        // The conditional mean must INCREASE with the covariate-driven center —
683        // the exact direction the #1613 bug got backwards.
684        assert!(
685            row_means[1] > row_means[0],
686            "draw means must increase with center: row0={:.4} row1={:.4}",
687            row_means[0],
688            row_means[1]
689        );
690    }
691
692    /// The canonical dispersion picker must read the *fitted* dispersion off the
693    /// scale metadata, never the construction seed embedded in the response
694    /// spec. This is the single guard for the whole "generate draws at the seed
695    /// dispersion" bug family — Gamma #678, Beta #769/#770, Tweedie #771, and
696    /// the NB sibling #1124 — now that the picker lives in exactly one place
697    /// (previously three divergent copies let a fix in one miss the others).
698    #[test]
699    fn family_noise_parameter_reads_fitted_dispersion_not_seed() {
700        // NB: spec carries the seed theta = 1; the fit estimated theta_hat.
701        let nb = LikelihoodSpec::negative_binomial_log(1.0);
702        assert_eq!(
703            family_noise_parameter(
704                LikelihoodScaleMetadata::EstimatedNegBinTheta { theta: 2.97 },
705                0.0,
706                &nb,
707            ),
708            Some(2.97),
709            "NB picker must read theta_hat (#1124), not the seed theta=1"
710        );
711
712        // Tweedie: the picker must return the dispersion phi, never the variance
713        // power p that lives on the spec.
714        let tw = LikelihoodSpec::tweedie_log(1.5);
715        assert_eq!(
716            family_noise_parameter(
717                LikelihoodScaleMetadata::EstimatedTweediePhi { phi: 7.25 },
718                0.0,
719                &tw,
720            ),
721            Some(7.25),
722            "Tweedie picker must read phi_hat (#771), not the variance power p"
723        );
724
725        // Beta: spec carries the seed phi = 1; the fit estimated phi_hat.
726        let beta = LikelihoodSpec::beta_logit(1.0);
727        assert_eq!(
728            family_noise_parameter(
729                LikelihoodScaleMetadata::EstimatedBetaPhi { phi: 12.0 },
730                0.0,
731                &beta,
732            ),
733            Some(12.0),
734            "Beta picker must read phi_hat (#770), not the seed phi=1"
735        );
736
737        // Gamma: the estimated shape must win over the residual-scale fallback.
738        let gamma = LikelihoodSpec::gamma_log();
739        assert_eq!(
740            family_noise_parameter(
741                LikelihoodScaleMetadata::EstimatedGammaShape { shape: 4.5 },
742                0.123,
743                &gamma,
744            ),
745            Some(4.5),
746            "Gamma picker must read shape_hat (#678), not the residual-scale fallback"
747        );
748    }
749
750    /// With no fitted dispersion recorded (fit-free construction), the picker
751    /// falls back to the seed on the spec / the residual scale. It must never
752    /// return `None` for a dispersion family, or generation would have nothing
753    /// to draw with.
754    #[test]
755    fn family_noise_parameter_falls_back_to_seed_when_unfitted() {
756        // `ProfiledGaussian` carries no fixed_phi / negbin_theta / gamma_shape,
757        // so every accessor returns `None` and the picker must use the fallback.
758        let none = LikelihoodScaleMetadata::ProfiledGaussian;
759        assert_eq!(
760            family_noise_parameter(none, 0.0, &LikelihoodSpec::negative_binomial_log(3.5)),
761            Some(3.5),
762            "NB picker must fall back to the spec seed theta"
763        );
764        assert_eq!(
765            family_noise_parameter(none, 0.0, &LikelihoodSpec::beta_logit(8.0)),
766            Some(8.0),
767            "Beta picker must fall back to the spec seed phi"
768        );
769        assert_eq!(
770            family_noise_parameter(none, 0.0, &LikelihoodSpec::tweedie_log(1.5)),
771            Some(1.0),
772            "Tweedie picker must fall back to unit dispersion"
773        );
774        assert_eq!(
775            family_noise_parameter(none, 2.0, &LikelihoodSpec::gamma_log()),
776            Some(2.0),
777            "Gamma picker must fall back to the residual scale"
778        );
779    }
780
781    /// End-to-end through the exact composition `gam generate` and
782    /// `sample_replicates` use — picker → `from_likelihood`. The seed-spec
783    /// theta = 1 plus an estimated theta_hat must yield a per-row NB noise model
784    /// at theta_hat, not at the seed. This is the #1124 repro at the unit level,
785    /// from the angle of the *composed* path rather than `from_likelihood` alone.
786    #[test]
787    fn picker_then_from_likelihood_threads_fitted_nb_theta() {
788        let nobs = 6usize;
789        let seed_spec = LikelihoodSpec::negative_binomial_log(1.0);
790        let scale = LikelihoodScaleMetadata::EstimatedNegBinTheta { theta: 2.751 };
791        let picked = family_noise_parameter(scale, 0.0, &seed_spec);
792        let noise =
793            NoiseModel::from_likelihood(&seed_spec, nobs, picked).expect("NB noise model builds");
794        let NoiseModel::NegativeBinomial { theta } = noise else {
795            panic!("expected an NB observation noise model");
796        };
797        assert!(
798            theta.len() == nobs && theta.iter().all(|&t| (t - 2.751).abs() < 1e-12),
799            "NB generate composes the seed theta=1 instead of theta_hat (#1124): {theta:?}"
800        );
801    }
802
803    /// Structural equality for `NoiseModel` (no derived `PartialEq` so that
804    /// the live enum can carry per-observation arrays). Two models are equal
805    /// when they are the same variant with bitwise-identical parameters.
806    fn noise_models_match(a: &NoiseModel, b: &NoiseModel) -> bool {
807        match (a, b) {
808            (NoiseModel::Gaussian { sigma: sa }, NoiseModel::Gaussian { sigma: sb }) => sa == sb,
809            (NoiseModel::Poisson, NoiseModel::Poisson) => true,
810            (NoiseModel::Bernoulli, NoiseModel::Bernoulli) => true,
811            (NoiseModel::Tweedie { p: pa, phi: pha }, NoiseModel::Tweedie { p: pb, phi: phb }) => {
812                pa == pb && pha == phb
813            }
814            (
815                NoiseModel::NegativeBinomial { theta: ta },
816                NoiseModel::NegativeBinomial { theta: tb },
817            ) => ta == tb,
818            (NoiseModel::Beta { phi: pa }, NoiseModel::Beta { phi: pb }) => pa == pb,
819            (NoiseModel::Gamma { shape: sa }, NoiseModel::Gamma { shape: sb }) => sa == sb,
820            _ => false,
821        }
822    }
823
824    /// For every supported built-in family, the canonical
825    /// `NoiseModel::from_likelihood` mapping and the simulation adapter
826    /// `FamilyStrategy::simulate_noise` must produce the same `NoiseModel`
827    /// from the same fitted dispersion — this is the single-mapping guarantee
828    /// the unification provides.
829    #[test]
830    fn from_likelihood_matches_simulate_noise_for_each_family() {
831        let nobs = 5usize;
832        let mean = Array1::from_elem(nobs, 0.5_f64);
833
834        // (spec, dispersion/gaussian_scale, expected noise variant).
835        let cases: [(LikelihoodSpec, Option<f64>, NoiseModel); 7] = [
836            (
837                LikelihoodSpec::gaussian_identity(),
838                Some(0.7),
839                NoiseModel::Gaussian {
840                    sigma: Array1::from_elem(nobs, 0.7),
841                },
842            ),
843            (
844                LikelihoodSpec::binomial_logit(),
845                None,
846                NoiseModel::Bernoulli,
847            ),
848            (LikelihoodSpec::poisson_log(), None, NoiseModel::Poisson),
849            (
850                LikelihoodSpec::tweedie_log(1.4),
851                Some(0.9),
852                NoiseModel::Tweedie {
853                    p: 1.4,
854                    phi: Array1::from_elem(nobs, 0.9),
855                },
856            ),
857            (
858                LikelihoodSpec::negative_binomial_log(2.5),
859                None,
860                NoiseModel::NegativeBinomial {
861                    theta: Array1::from_elem(nobs, 2.5),
862                },
863            ),
864            (
865                LikelihoodSpec::beta_logit(3.0),
866                None,
867                NoiseModel::Beta {
868                    phi: Array1::from_elem(nobs, 3.0),
869                },
870            ),
871            (
872                LikelihoodSpec::gamma_log(),
873                Some(1.5),
874                NoiseModel::Gamma {
875                    shape: Array1::from_elem(nobs, 1.5),
876                },
877            ),
878        ];
879
880        for (spec, scale, expected) in cases {
881            let from_helper = NoiseModel::from_likelihood(&spec, nobs, scale)
882                .expect("canonical mapping must accept a supported family");
883            let from_strategy = strategy_for_spec(&spec)
884                .simulate_noise(&mean, scale)
885                .expect("simulation adapter must accept a supported family");
886
887            assert!(
888                noise_models_match(&from_helper, &expected),
889                "{} canonical mapping produced an unexpected NoiseModel",
890                spec.pretty_name()
891            );
892            assert!(
893                noise_models_match(&from_helper, &from_strategy),
894                "{} simulation and inference disagree on the NoiseModel",
895                spec.pretty_name()
896            );
897        }
898    }
899
900    /// RoystonParmar is not exposed through the generic generative path, and
901    /// both the canonical mapping and the simulation adapter must reject it
902    /// identically so the two paths stay in lockstep.
903    #[test]
904    fn royston_parmar_rejected_on_both_paths() {
905        let spec = LikelihoodSpec::royston_parmar();
906        let mean = Array1::from_elem(3, 0.0_f64);
907        assert!(NoiseModel::from_likelihood(&spec, 3, None).is_err());
908        assert!(
909            strategy_for_spec(&spec)
910                .simulate_noise(&mean, None)
911                .is_err()
912        );
913    }
914
915    /// Invalid / missing dispersion is rejected the same way regardless of
916    /// which entry point is used.
917    #[test]
918    fn invalid_dispersion_rejected_on_both_paths() {
919        let mean = Array1::from_elem(4, 0.0_f64);
920
921        // Gaussian sigma missing.
922        let gauss = LikelihoodSpec::gaussian_identity();
923        assert!(NoiseModel::from_likelihood(&gauss, 4, None).is_err());
924        assert!(
925            strategy_for_spec(&gauss)
926                .simulate_noise(&mean, None)
927                .is_err()
928        );
929
930        // Tweedie power outside (1, 2).
931        let bad_tweedie = LikelihoodSpec::tweedie_log(2.5);
932        assert!(NoiseModel::from_likelihood(&bad_tweedie, 4, Some(0.5)).is_err());
933        assert!(
934            strategy_for_spec(&bad_tweedie)
935                .simulate_noise(&mean, Some(0.5))
936                .is_err()
937        );
938
939        // Gamma shape non-positive.
940        let gamma = LikelihoodSpec::gamma_log();
941        assert!(NoiseModel::from_likelihood(&gamma, 4, Some(-1.0)).is_err());
942        assert!(
943            strategy_for_spec(&gamma)
944                .simulate_noise(&mean, Some(-1.0))
945                .is_err()
946        );
947    }
948}
gam_models/inference/generative.rs

gam_models/inference/
generative.rs