gam_models/fit_orchestration/materialize/validation.rs
1use super::*;
2use gam_terms::inference::formula_dsl::LinkMode;
3
4pub(crate) fn reject_marginal_slope_controls_for_transformation_normal(
5 config: &FitConfig,
6) -> Result<(), WorkflowError> {
7 let family_requests_marginal_slope = config.family.as_deref().is_some_and(|family| {
8 let canonical = family.to_ascii_lowercase().replace('_', "-");
9 canonical == "bernoulli-marginal-slope" || canonical == "binary-marginal-slope"
10 });
11 if family_requests_marginal_slope
12 || config.logslope_formula.is_some()
13 || config.z_column.is_some()
14 || config.ctn_stage1.is_some()
15 {
16 return Err(WorkflowError::InvalidConfig {
17 reason: "transformation_normal cannot be combined with marginal-slope family controls"
18 .to_string(),
19 });
20 }
21 Ok(())
22}
23
24/// Reject `timewiggle(...)` / `survmodel(...)` in a formula whose response is
25/// not `Surv(...)`.
26///
27/// These two DSL controls only have meaning under the survival likelihood: a
28/// `timewiggle(...)` term parameterizes the time-varying baseline-hazard /
29/// log-cumulative-hazard surface, and `survmodel(...)` selects the survival
30/// likelihood mode. Both are read exclusively by `materialize_survival`. When
31/// the main formula has no `Surv(...)` response, leaving them unguarded means
32/// the term is parsed and option-validated and then dropped on the floor —
33/// the contract violation reported in #371. We error instead, with the same
34/// "only supported in the main survival formula" phrasing the auxiliary-formula
35/// path already uses.
36pub(crate) fn reject_survival_only_terms_for_nonsurvival(
37 parsed: &ParsedFormula,
38) -> Result<(), WorkflowError> {
39 if parsed.timewiggle.is_some() {
40 return Err(WorkflowError::InvalidConfig {
41 reason: "timewiggle(...) is only supported in the main survival formula \
42 (a formula with a Surv(...) response); it is meaningless for a \
43 non-survival response and would otherwise be silently ignored"
44 .to_string(),
45 });
46 }
47 if parsed.survivalspec.is_some() {
48 return Err(WorkflowError::InvalidConfig {
49 reason: "survmodel(...) is only supported in the main survival formula \
50 (a formula with a Surv(...) response); it is meaningless for a \
51 non-survival response and would otherwise be silently ignored"
52 .to_string(),
53 });
54 }
55 Ok(())
56}
57
58/// Reject a non-default `config.survival_likelihood` when the response is not
59/// `Surv(...)`.
60///
61/// `survival_likelihood` selects the survival likelihood mode
62/// (`"location-scale"`, `"weibull"`, `"marginal-slope"`, `"latent"`,
63/// `"latent-binary"`, …) and is read *exclusively* inside `materialize_survival`.
64/// When the main formula has no `Surv(...)` response the survival materializer is
65/// never reached, so a survival-only knob like
66/// `survival_likelihood="weibull"` is parsed, validated, and then dropped on the
67/// floor — the request silently degrades to an ordinary Gaussian GAM (#1767),
68/// the same silent-no-op contract violation as the survival-only *terms* guarded
69/// by [`reject_survival_only_terms_for_nonsurvival`].
70///
71/// The default value is `"transformation"`, which is indistinguishable from
72/// "unset", so it (and only it) is allowed through here; every other,
73/// explicitly-requested mode is a configuration error and is rejected with the
74/// same phrasing the survival-only-term path uses.
75pub(crate) fn reject_survival_likelihood_for_nonsurvival(
76 config: &FitConfig,
77) -> Result<(), WorkflowError> {
78 let mode = config.survival_likelihood.trim();
79 // `"transformation"` is the default and is indistinguishable from "unset",
80 // so it is the only mode permitted on a non-survival response.
81 if mode.eq_ignore_ascii_case("transformation") {
82 return Ok(());
83 }
84 Err(WorkflowError::InvalidConfig {
85 reason: format!(
86 "survival_likelihood=\"{mode}\" is only supported in the main survival formula \
87 (a formula with a Surv(...) response); it selects a survival likelihood mode that \
88 is read exclusively by the survival fit path, so for a non-survival response it is \
89 meaningless and would otherwise be silently ignored (the requested survival model \
90 would degrade to an ordinary GAM). Wrap the response in Surv(...) or drop the \
91 survival_likelihood configuration."
92 ),
93 })
94}
95
96/// Reject an *explicitly requested* `linkwiggle(...)` term when the resolved
97/// response family is not binomial.
98///
99/// `linkwiggle(...)` adds a spline-flexible correction to the *link* function
100/// (logit / probit / cloglog), which only carries meaning for a binomial mean
101/// model — the standard and location-scale materializers wire `wiggle` into the
102/// fit only inside their `family.is_binomial()` arm. For a Gaussian / Gamma /
103/// Poisson / etc. response the term is built and then dropped on the floor,
104/// the same silent-no-op contract violation as #371. We error here.
105///
106/// This guards only the *explicit* formula term (`parsed.linkwiggle`). The
107/// implicit wiggle auto-derived from a `Flexible` link choice is checked by
108/// [`reject_flexible_link_for_nonbinomial`].
109pub(super) fn reject_explicit_linkwiggle_for_nonbinomial(
110 parsed: &ParsedFormula,
111 family: &LikelihoodSpec,
112) -> Result<(), WorkflowError> {
113 if parsed.linkwiggle.is_some() && !family.is_binomial() {
114 return Err(WorkflowError::InvalidConfig {
115 reason: "linkwiggle(...) corrects the link function of a binomial mean model \
116 and is only supported for a binomial response; it is meaningless for \
117 the resolved non-binomial family and would otherwise be silently ignored"
118 .to_string(),
119 });
120 }
121 Ok(())
122}
123
124pub(super) fn effective_link_choice_for_materialize(
125 parsed: &ParsedFormula,
126 config: &FitConfig,
127) -> Result<Option<LinkChoice>, WorkflowError> {
128 if let Some(linkspec) = parsed.linkspec.as_ref() {
129 if linkspec.mixture_rho.is_some()
130 || linkspec.sas_init.is_some()
131 || linkspec.beta_logistic_init.is_some()
132 {
133 return Err(WorkflowError::InvalidConfig {
134 reason: "link(...) initialization options are not supported by the materialized fit path; pass only link(type=...) in the formula"
135 .to_string(),
136 });
137 }
138 return parse_link_choice(Some(&linkspec.link), false).map_err(WorkflowError::from);
139 }
140 parse_link_choice(config.link.as_deref(), config.flexible_link).map_err(WorkflowError::from)
141}
142
143/// Reject a `flexible(...)` link choice (the implicit link wiggle) when the
144/// resolved response family is not binomial.
145///
146/// `flexible(base)` adds a jointly-fit anchored spline offset to the base link.
147/// The whole offset engine ([`crate::gamlss::gaussian::BinomialMeanWiggleFamily`]
148/// and the location-scale wiggle solver) is specialised to the binomial mean
149/// likelihood: it differentiates the binomial neg-log-likelihood through the
150/// warped link to fourth order under a monotone-spline constraint. For a
151/// Gaussian / Poisson / Gamma / etc. response there is no implemented mean-wiggle
152/// solver, so the standard and location-scale materializers used to build the
153/// implicit wiggle and then drop it on the floor: a silent no-op of a
154/// documented link (`flexible(identity)` on Gaussian, `flexible(log)` on
155/// Poisson/Gamma fit bit-identically to the plain base link), gam#1275. Rather
156/// than silently discard a requested-and-documented link configuration we error
157/// loudly here, exactly as [`reject_explicit_linkwiggle_for_nonbinomial`] does
158/// for the explicit term. Wiring a genuine non-binomial mean-wiggle is tracked
159/// as a separate feature.
160pub(super) fn reject_flexible_link_for_nonbinomial(
161 link_choice: Option<&LinkChoice>,
162 family: &LikelihoodSpec,
163) -> Result<(), WorkflowError> {
164 let requested_flexible =
165 link_choice.is_some_and(|choice| matches!(choice.mode, LinkMode::Flexible));
166 if requested_flexible && !family.is_binomial() {
167 return Err(WorkflowError::InvalidConfig {
168 reason: format!(
169 "flexible(...) links (the jointly-fit anchored spline link offset) are \
170 implemented only for a binomial response; the resolved family is {} (a \
171 non-binomial family), for which the link offset has no solver and would \
172 otherwise be silently discarded. Use the plain base link, or fit a binomial \
173 response.",
174 family.pretty_name()
175 ),
176 });
177 }
178 Ok(())
179}
180
181/// Detect whether a response column is binary (0/1 only).
182pub fn is_binary_response(y: ArrayView1<'_, f64>) -> bool {
183 if y.is_empty() {
184 return false;
185 }
186 y.iter()
187 .all(|v| (*v - 0.0).abs() < 1e-12 || (*v - 1.0).abs() < 1e-12)
188}
189
190/// Verify that the dataset has at least as many rows as the smooth terms in
191/// `spec` need for their bases to be well-posed.
192///
193/// Each [`SmoothBasisSpec`] owns its own `min_sample_rows` lower bound — the
194/// B-spline knot count, the *penalized* tensor-product floor (the sum of the
195/// per-marginal column counts, not their Kronecker product, because a `te()`
196/// is regularized and its effective dof is a small fraction of the column
197/// count), the PCA matrix width — so this helper is a thin sum-and-compare:
198/// the workflow has no per-basis-kind knowledge. Adding a new smooth kind
199/// extends the basis `match` in `min_sample_rows`, not this gate.
200///
201/// Catches the README-quickstart failure mode (#309) where `n=4` against
202/// `y ~ s(x)` would otherwise surface as an opaque `cached inner beta has
203/// length 8` message from the inner-state seeding hook.
204pub(super) fn check_smooth_capacity(
205 spec: &gam_terms::smooth::TermCollectionSpec,
206 n_rows: usize,
207 response_name: &str,
208) -> Result<(), WorkflowError> {
209 // Intercept + 1 dof for the smoothing-parameter optimizer.
210 let mut required: usize = 2;
211 let mut per_term: Vec<(String, usize)> = Vec::new();
212 for term in &spec.smooth_terms {
213 let need = term.basis.min_sample_rows();
214 required = required.saturating_add(need);
215 per_term.push((term.name.clone(), need));
216 }
217 if per_term.is_empty() || n_rows >= required {
218 return Ok(());
219 }
220 let breakdown = per_term
221 .iter()
222 .map(|(name, k)| format!("{name}≥{k}"))
223 .collect::<Vec<_>>()
224 .join(", ");
225 Err(WorkflowError::InvalidConfig {
226 reason: format!(
227 "not enough observations to fit the requested formula: dataset has n={n_rows} \
228 rows but the smooth terms on response '{response_name}' need at least \
229 {required} rows total ({breakdown}, plus intercept + smoothing-parameter dof) \
230 before REML estimation is well-posed. \
231 Fix: add more training rows, replace `s(x)` with a linear term, or pass a \
232 smaller basis via `s(x, k=3)`."
233 ),
234 })
235}