// gam_models/fit_orchestration/materialize/validation.rs

1use super::*;
2use gam_terms::inference::formula_dsl::LinkMode;
3
4pub(crate) fn reject_marginal_slope_controls_for_transformation_normal(
5    config: &FitConfig,
6) -> Result<(), WorkflowError> {
7    let family_requests_marginal_slope = config.family.as_deref().is_some_and(|family| {
8        let canonical = family.to_ascii_lowercase().replace('_', "-");
9        canonical == "bernoulli-marginal-slope" || canonical == "binary-marginal-slope"
10    });
11    if family_requests_marginal_slope
12        || config.logslope_formula.is_some()
13        || config.z_column.is_some()
14        || config.ctn_stage1.is_some()
15    {
16        return Err(WorkflowError::InvalidConfig {
17            reason: "transformation_normal cannot be combined with marginal-slope family controls"
18                .to_string(),
19        });
20    }
21    Ok(())
22}
23
24/// Reject `timewiggle(...)` / `survmodel(...)` in a formula whose response is
25/// not `Surv(...)`.
26///
27/// These two DSL controls only have meaning under the survival likelihood: a
28/// `timewiggle(...)` term parameterizes the time-varying baseline-hazard /
29/// log-cumulative-hazard surface, and `survmodel(...)` selects the survival
30/// likelihood mode. Both are read exclusively by `materialize_survival`. When
31/// the main formula has no `Surv(...)` response, leaving them unguarded means
32/// the term is parsed and option-validated and then dropped on the floor —
33/// the contract violation reported in #371. We error instead, with the same
34/// "only supported in the main survival formula" phrasing the auxiliary-formula
35/// path already uses.
36pub(crate) fn reject_survival_only_terms_for_nonsurvival(
37    parsed: &ParsedFormula,
38) -> Result<(), WorkflowError> {
39    if parsed.timewiggle.is_some() {
40        return Err(WorkflowError::InvalidConfig {
41            reason: "timewiggle(...) is only supported in the main survival formula \
42                     (a formula with a Surv(...) response); it is meaningless for a \
43                     non-survival response and would otherwise be silently ignored"
44                .to_string(),
45        });
46    }
47    if parsed.survivalspec.is_some() {
48        return Err(WorkflowError::InvalidConfig {
49            reason: "survmodel(...) is only supported in the main survival formula \
50                     (a formula with a Surv(...) response); it is meaningless for a \
51                     non-survival response and would otherwise be silently ignored"
52                .to_string(),
53        });
54    }
55    Ok(())
56}
57
58/// Reject an *explicitly requested* `linkwiggle(...)` term when the resolved
59/// response family is not binomial.
60///
61/// `linkwiggle(...)` adds a spline-flexible correction to the *link* function
62/// (logit / probit / cloglog), which only carries meaning for a binomial mean
63/// model — the standard and location-scale materializers wire `wiggle` into the
64/// fit only inside their `family.is_binomial()` arm. For a Gaussian / Gamma /
65/// Poisson / etc. response the term is built and then dropped on the floor,
66/// the same silent-no-op contract violation as #371. We error here.
67///
68/// This guards only the *explicit* formula term (`parsed.linkwiggle`). The
69/// implicit wiggle auto-derived from a `Flexible` link choice is checked by
70/// [`reject_flexible_link_for_nonbinomial`].
71pub(super) fn reject_explicit_linkwiggle_for_nonbinomial(
72    parsed: &ParsedFormula,
73    family: &LikelihoodSpec,
74) -> Result<(), WorkflowError> {
75    if parsed.linkwiggle.is_some() && !family.is_binomial() {
76        return Err(WorkflowError::InvalidConfig {
77            reason: "linkwiggle(...) corrects the link function of a binomial mean model \
78                     and is only supported for a binomial response; it is meaningless for \
79                     the resolved non-binomial family and would otherwise be silently ignored"
80                .to_string(),
81        });
82    }
83    Ok(())
84}
85
86pub(super) fn effective_link_choice_for_materialize(
87    parsed: &ParsedFormula,
88    config: &FitConfig,
89) -> Result<Option<LinkChoice>, WorkflowError> {
90    if let Some(linkspec) = parsed.linkspec.as_ref() {
91        if linkspec.mixture_rho.is_some()
92            || linkspec.sas_init.is_some()
93            || linkspec.beta_logistic_init.is_some()
94        {
95            return Err(WorkflowError::InvalidConfig {
96                reason: "link(...) initialization options are not supported by the materialized fit path; pass only link(type=...) in the formula"
97                    .to_string(),
98            });
99        }
100        return parse_link_choice(Some(&linkspec.link), false).map_err(WorkflowError::from);
101    }
102    parse_link_choice(config.link.as_deref(), config.flexible_link).map_err(WorkflowError::from)
103}
104
105/// Reject a `flexible(...)` link choice (the implicit link wiggle) when the
106/// resolved response family is not binomial.
107///
108/// `flexible(base)` adds a jointly-fit anchored spline offset to the base link.
109/// The whole offset engine ([`crate::gamlss::gaussian::BinomialMeanWiggleFamily`]
110/// and the location-scale wiggle solver) is specialised to the binomial mean
111/// likelihood: it differentiates the binomial neg-log-likelihood through the
112/// warped link to fourth order under a monotone-spline constraint. For a
113/// Gaussian / Poisson / Gamma / etc. response there is no implemented mean-wiggle
114/// solver, so the standard and location-scale materializers used to build the
115/// implicit wiggle and then drop it on the floor: a silent no-op of a
116/// documented link (`flexible(identity)` on Gaussian, `flexible(log)` on
117/// Poisson/Gamma fit bit-identically to the plain base link), gam#1275. Rather
118/// than silently discard a requested-and-documented link configuration we error
119/// loudly here, exactly as [`reject_explicit_linkwiggle_for_nonbinomial`] does
120/// for the explicit term. Wiring a genuine non-binomial mean-wiggle is tracked
121/// as a separate feature.
122pub(super) fn reject_flexible_link_for_nonbinomial(
123    link_choice: Option<&LinkChoice>,
124    family: &LikelihoodSpec,
125) -> Result<(), WorkflowError> {
126    let requested_flexible =
127        link_choice.is_some_and(|choice| matches!(choice.mode, LinkMode::Flexible));
128    if requested_flexible && !family.is_binomial() {
129        return Err(WorkflowError::InvalidConfig {
130            reason: format!(
131                "flexible(...) links (the jointly-fit anchored spline link offset) are \
132                 implemented only for a binomial response; the resolved family is {} (a \
133                 non-binomial family), for which the link offset has no solver and would \
134                 otherwise be silently discarded. Use the plain base link, or fit a binomial \
135                 response.",
136                family.pretty_name()
137            ),
138        });
139    }
140    Ok(())
141}
142
143/// Detect whether a response column is binary (0/1 only).
144pub fn is_binary_response(y: ArrayView1<'_, f64>) -> bool {
145    if y.is_empty() {
146        return false;
147    }
148    y.iter()
149        .all(|v| (*v - 0.0).abs() < 1e-12 || (*v - 1.0).abs() < 1e-12)
150}
151
152/// Verify that the dataset has at least as many rows as the smooth terms in
153/// `spec` need for their bases to be well-posed.
154///
155/// Each [`SmoothBasisSpec`] owns its own `min_sample_rows` lower bound — the
156/// B-spline knot count, the *penalized* tensor-product floor (the sum of the
157/// per-marginal column counts, not their Kronecker product, because a `te()`
158/// is regularized and its effective dof is a small fraction of the column
159/// count), the PCA matrix width — so this helper is a thin sum-and-compare:
160/// the workflow has no per-basis-kind knowledge. Adding a new smooth kind
161/// extends the basis `match` in `min_sample_rows`, not this gate.
162///
163/// Catches the README-quickstart failure mode (#309) where `n=4` against
164/// `y ~ s(x)` would otherwise surface as an opaque `cached inner beta has
165/// length 8` message from the inner-state seeding hook.
166pub(super) fn check_smooth_capacity(
167    spec: &gam_terms::smooth::TermCollectionSpec,
168    n_rows: usize,
169    response_name: &str,
170) -> Result<(), WorkflowError> {
171    // Intercept + 1 dof for the smoothing-parameter optimizer.
172    let mut required: usize = 2;
173    let mut per_term: Vec<(String, usize)> = Vec::new();
174    for term in &spec.smooth_terms {
175        let need = term.basis.min_sample_rows();
176        required = required.saturating_add(need);
177        per_term.push((term.name.clone(), need));
178    }
179    if per_term.is_empty() || n_rows >= required {
180        return Ok(());
181    }
182    let breakdown = per_term
183        .iter()
184        .map(|(name, k)| format!("{name}≥{k}"))
185        .collect::<Vec<_>>()
186        .join(", ");
187    Err(WorkflowError::InvalidConfig {
188        reason: format!(
189            "not enough observations to fit the requested formula: dataset has n={n_rows} \
190             rows but the smooth terms on response '{response_name}' need at least \
191             {required} rows total ({breakdown}, plus intercept + smoothing-parameter dof) \
192             before REML estimation is well-posed. \
193             Fix: add more training rows, replace `s(x)` with a linear term, or pass a \
194             smaller basis via `s(x, k=3)`."
195        ),
196    })
197}