Skip to main content

gam_models/fit_orchestration/materialize/
validation.rs

1use super::*;
2use gam_terms::inference::formula_dsl::LinkMode;
3
4pub(crate) fn reject_marginal_slope_controls_for_transformation_normal(
5    config: &FitConfig,
6) -> Result<(), WorkflowError> {
7    let family_requests_marginal_slope = config.family.as_deref().is_some_and(|family| {
8        let canonical = family.to_ascii_lowercase().replace('_', "-");
9        canonical == "bernoulli-marginal-slope" || canonical == "binary-marginal-slope"
10    });
11    if family_requests_marginal_slope
12        || config.logslope_formula.is_some()
13        || config.z_column.is_some()
14        || config.ctn_stage1.is_some()
15    {
16        return Err(WorkflowError::InvalidConfig {
17            reason: "transformation_normal cannot be combined with marginal-slope family controls"
18                .to_string(),
19        });
20    }
21    Ok(())
22}
23
24/// Reject `timewiggle(...)` / `survmodel(...)` in a formula whose response is
25/// not `Surv(...)`.
26///
27/// These two DSL controls only have meaning under the survival likelihood: a
28/// `timewiggle(...)` term parameterizes the time-varying baseline-hazard /
29/// log-cumulative-hazard surface, and `survmodel(...)` selects the survival
30/// likelihood mode. Both are read exclusively by `materialize_survival`. When
31/// the main formula has no `Surv(...)` response, leaving them unguarded means
32/// the term is parsed and option-validated and then dropped on the floor —
33/// the contract violation reported in #371. We error instead, with the same
34/// "only supported in the main survival formula" phrasing the auxiliary-formula
35/// path already uses.
36pub(crate) fn reject_survival_only_terms_for_nonsurvival(
37    parsed: &ParsedFormula,
38) -> Result<(), WorkflowError> {
39    if parsed.timewiggle.is_some() {
40        return Err(WorkflowError::InvalidConfig {
41            reason: "timewiggle(...) is only supported in the main survival formula \
42                     (a formula with a Surv(...) response); it is meaningless for a \
43                     non-survival response and would otherwise be silently ignored"
44                .to_string(),
45        });
46    }
47    if parsed.survivalspec.is_some() {
48        return Err(WorkflowError::InvalidConfig {
49            reason: "survmodel(...) is only supported in the main survival formula \
50                     (a formula with a Surv(...) response); it is meaningless for a \
51                     non-survival response and would otherwise be silently ignored"
52                .to_string(),
53        });
54    }
55    Ok(())
56}
57
58/// Reject a non-default `config.survival_likelihood` when the response is not
59/// `Surv(...)`.
60///
61/// `survival_likelihood` selects the survival likelihood mode
62/// (`"location-scale"`, `"weibull"`, `"marginal-slope"`, `"latent"`,
63/// `"latent-binary"`, …) and is read *exclusively* inside `materialize_survival`.
64/// When the main formula has no `Surv(...)` response the survival materializer is
65/// never reached, so a survival-only knob like
66/// `survival_likelihood="weibull"` is parsed, validated, and then dropped on the
67/// floor — the request silently degrades to an ordinary Gaussian GAM (#1767),
68/// the same silent-no-op contract violation as the survival-only *terms* guarded
69/// by [`reject_survival_only_terms_for_nonsurvival`].
70///
71/// The default value is `"transformation"`, which is indistinguishable from
72/// "unset", so it (and only it) is allowed through here; every other,
73/// explicitly-requested mode is a configuration error and is rejected with the
74/// same phrasing the survival-only-term path uses.
75pub(crate) fn reject_survival_likelihood_for_nonsurvival(
76    config: &FitConfig,
77) -> Result<(), WorkflowError> {
78    let mode = config.survival_likelihood.trim();
79    // `"transformation"` is the default and is indistinguishable from "unset",
80    // so it is the only mode permitted on a non-survival response.
81    if mode.eq_ignore_ascii_case("transformation") {
82        return Ok(());
83    }
84    Err(WorkflowError::InvalidConfig {
85        reason: format!(
86            "survival_likelihood=\"{mode}\" is only supported in the main survival formula \
87             (a formula with a Surv(...) response); it selects a survival likelihood mode that \
88             is read exclusively by the survival fit path, so for a non-survival response it is \
89             meaningless and would otherwise be silently ignored (the requested survival model \
90             would degrade to an ordinary GAM). Wrap the response in Surv(...) or drop the \
91             survival_likelihood configuration."
92        ),
93    })
94}
95
96/// Reject an *explicitly requested* `linkwiggle(...)` term when the resolved
97/// response family is not binomial.
98///
99/// `linkwiggle(...)` adds a spline-flexible correction to the *link* function
100/// (logit / probit / cloglog), which only carries meaning for a binomial mean
101/// model — the standard and location-scale materializers wire `wiggle` into the
102/// fit only inside their `family.is_binomial()` arm. For a Gaussian / Gamma /
103/// Poisson / etc. response the term is built and then dropped on the floor,
104/// the same silent-no-op contract violation as #371. We error here.
105///
106/// This guards only the *explicit* formula term (`parsed.linkwiggle`). The
107/// implicit wiggle auto-derived from a `Flexible` link choice is checked by
108/// [`reject_flexible_link_for_nonbinomial`].
109pub(super) fn reject_explicit_linkwiggle_for_nonbinomial(
110    parsed: &ParsedFormula,
111    family: &LikelihoodSpec,
112) -> Result<(), WorkflowError> {
113    if parsed.linkwiggle.is_some() && !family.is_binomial() {
114        return Err(WorkflowError::InvalidConfig {
115            reason: "linkwiggle(...) corrects the link function of a binomial mean model \
116                     and is only supported for a binomial response; it is meaningless for \
117                     the resolved non-binomial family and would otherwise be silently ignored"
118                .to_string(),
119        });
120    }
121    Ok(())
122}
123
124pub(super) fn effective_link_choice_for_materialize(
125    parsed: &ParsedFormula,
126    config: &FitConfig,
127) -> Result<Option<LinkChoice>, WorkflowError> {
128    if let Some(linkspec) = parsed.linkspec.as_ref() {
129        if linkspec.mixture_rho.is_some()
130            || linkspec.sas_init.is_some()
131            || linkspec.beta_logistic_init.is_some()
132        {
133            return Err(WorkflowError::InvalidConfig {
134                reason: "link(...) initialization options are not supported by the materialized fit path; pass only link(type=...) in the formula"
135                    .to_string(),
136            });
137        }
138        return parse_link_choice(Some(&linkspec.link), false).map_err(WorkflowError::from);
139    }
140    parse_link_choice(config.link.as_deref(), config.flexible_link).map_err(WorkflowError::from)
141}
142
143/// Reject a `flexible(...)` link choice (the implicit link wiggle) when the
144/// resolved response family is not binomial.
145///
146/// `flexible(base)` adds a jointly-fit anchored spline offset to the base link.
147/// The whole offset engine ([`crate::gamlss::gaussian::BinomialMeanWiggleFamily`]
148/// and the location-scale wiggle solver) is specialised to the binomial mean
149/// likelihood: it differentiates the binomial neg-log-likelihood through the
150/// warped link to fourth order under a monotone-spline constraint. For a
151/// Gaussian / Poisson / Gamma / etc. response there is no implemented mean-wiggle
152/// solver, so the standard and location-scale materializers used to build the
153/// implicit wiggle and then drop it on the floor: a silent no-op of a
154/// documented link (`flexible(identity)` on Gaussian, `flexible(log)` on
155/// Poisson/Gamma fit bit-identically to the plain base link), gam#1275. Rather
156/// than silently discard a requested-and-documented link configuration we error
157/// loudly here, exactly as [`reject_explicit_linkwiggle_for_nonbinomial`] does
158/// for the explicit term. Wiring a genuine non-binomial mean-wiggle is tracked
159/// as a separate feature.
160pub(super) fn reject_flexible_link_for_nonbinomial(
161    link_choice: Option<&LinkChoice>,
162    family: &LikelihoodSpec,
163) -> Result<(), WorkflowError> {
164    let requested_flexible =
165        link_choice.is_some_and(|choice| matches!(choice.mode, LinkMode::Flexible));
166    if requested_flexible && !family.is_binomial() {
167        return Err(WorkflowError::InvalidConfig {
168            reason: format!(
169                "flexible(...) links (the jointly-fit anchored spline link offset) are \
170                 implemented only for a binomial response; the resolved family is {} (a \
171                 non-binomial family), for which the link offset has no solver and would \
172                 otherwise be silently discarded. Use the plain base link, or fit a binomial \
173                 response.",
174                family.pretty_name()
175            ),
176        });
177    }
178    Ok(())
179}
180
181/// Detect whether a response column is binary (0/1 only).
182pub fn is_binary_response(y: ArrayView1<'_, f64>) -> bool {
183    if y.is_empty() {
184        return false;
185    }
186    y.iter()
187        .all(|v| (*v - 0.0).abs() < 1e-12 || (*v - 1.0).abs() < 1e-12)
188}
189
190/// Verify that the dataset has at least as many rows as the smooth terms in
191/// `spec` need for their bases to be well-posed.
192///
193/// Each [`SmoothBasisSpec`] owns its own `min_sample_rows` lower bound — the
194/// B-spline knot count, the *penalized* tensor-product floor (the sum of the
195/// per-marginal column counts, not their Kronecker product, because a `te()`
196/// is regularized and its effective dof is a small fraction of the column
197/// count), the PCA matrix width — so this helper is a thin sum-and-compare:
198/// the workflow has no per-basis-kind knowledge. Adding a new smooth kind
199/// extends the basis `match` in `min_sample_rows`, not this gate.
200///
201/// Catches the README-quickstart failure mode (#309) where `n=4` against
202/// `y ~ s(x)` would otherwise surface as an opaque `cached inner beta has
203/// length 8` message from the inner-state seeding hook.
204pub(super) fn check_smooth_capacity(
205    spec: &gam_terms::smooth::TermCollectionSpec,
206    n_rows: usize,
207    response_name: &str,
208) -> Result<(), WorkflowError> {
209    // Intercept + 1 dof for the smoothing-parameter optimizer.
210    let mut required: usize = 2;
211    let mut per_term: Vec<(String, usize)> = Vec::new();
212    for term in &spec.smooth_terms {
213        let need = term.basis.min_sample_rows();
214        required = required.saturating_add(need);
215        per_term.push((term.name.clone(), need));
216    }
217    if per_term.is_empty() || n_rows >= required {
218        return Ok(());
219    }
220    let breakdown = per_term
221        .iter()
222        .map(|(name, k)| format!("{name}≥{k}"))
223        .collect::<Vec<_>>()
224        .join(", ");
225    Err(WorkflowError::InvalidConfig {
226        reason: format!(
227            "not enough observations to fit the requested formula: dataset has n={n_rows} \
228             rows but the smooth terms on response '{response_name}' need at least \
229             {required} rows total ({breakdown}, plus intercept + smoothing-parameter dof) \
230             before REML estimation is well-posed. \
231             Fix: add more training rows, replace `s(x)` with a linear term, or pass a \
232             smaller basis via `s(x, k=3)`."
233        ),
234    })
235}