gam_models/fit_orchestration/materialize/validation.rs
1use super::*;
2use gam_terms::inference::formula_dsl::LinkMode;
3
4pub(crate) fn reject_marginal_slope_controls_for_transformation_normal(
5 config: &FitConfig,
6) -> Result<(), WorkflowError> {
7 let family_requests_marginal_slope = config.family.as_deref().is_some_and(|family| {
8 let canonical = family.to_ascii_lowercase().replace('_', "-");
9 canonical == "bernoulli-marginal-slope" || canonical == "binary-marginal-slope"
10 });
11 if family_requests_marginal_slope
12 || config.logslope_formula.is_some()
13 || config.z_column.is_some()
14 || config.ctn_stage1.is_some()
15 {
16 return Err(WorkflowError::InvalidConfig {
17 reason: "transformation_normal cannot be combined with marginal-slope family controls"
18 .to_string(),
19 });
20 }
21 Ok(())
22}
23
24/// Reject `timewiggle(...)` / `survmodel(...)` in a formula whose response is
25/// not `Surv(...)`.
26///
27/// These two DSL controls only have meaning under the survival likelihood: a
28/// `timewiggle(...)` term parameterizes the time-varying baseline-hazard /
29/// log-cumulative-hazard surface, and `survmodel(...)` selects the survival
30/// likelihood mode. Both are read exclusively by `materialize_survival`. When
31/// the main formula has no `Surv(...)` response, leaving them unguarded means
32/// the term is parsed and option-validated and then dropped on the floor —
33/// the contract violation reported in #371. We error instead, with the same
34/// "only supported in the main survival formula" phrasing the auxiliary-formula
35/// path already uses.
36pub(crate) fn reject_survival_only_terms_for_nonsurvival(
37 parsed: &ParsedFormula,
38) -> Result<(), WorkflowError> {
39 if parsed.timewiggle.is_some() {
40 return Err(WorkflowError::InvalidConfig {
41 reason: "timewiggle(...) is only supported in the main survival formula \
42 (a formula with a Surv(...) response); it is meaningless for a \
43 non-survival response and would otherwise be silently ignored"
44 .to_string(),
45 });
46 }
47 if parsed.survivalspec.is_some() {
48 return Err(WorkflowError::InvalidConfig {
49 reason: "survmodel(...) is only supported in the main survival formula \
50 (a formula with a Surv(...) response); it is meaningless for a \
51 non-survival response and would otherwise be silently ignored"
52 .to_string(),
53 });
54 }
55 Ok(())
56}
57
58/// Reject a non-default `config.survival_likelihood` when the response is not
59/// `Surv(...)`.
60///
61/// `survival_likelihood` selects the survival likelihood mode
62/// (`"location-scale"`, `"weibull"`, `"marginal-slope"`, `"latent"`,
63/// `"latent-binary"`, …) and is read *exclusively* inside `materialize_survival`.
64/// When the main formula has no `Surv(...)` response the survival materializer is
65/// never reached, so a survival-only knob like
66/// `survival_likelihood="weibull"` is parsed, validated, and then dropped on the
67/// floor — the request silently degrades to an ordinary Gaussian GAM (#1767),
68/// the same silent-no-op contract violation as the survival-only *terms* guarded
69/// by [`reject_survival_only_terms_for_nonsurvival`].
70///
71/// The direct Rust API default is `"location-scale"` (lognormal AFT), while
72/// CLI/config-layer callers may still pass their documented `"transformation"`
73/// default explicitly. Both defaults are indistinguishable from "unset" at this
74/// validation seam, so they are allowed through here; every other,
75/// explicitly-requested mode is a configuration error and is rejected with the
76/// same phrasing the survival-only-term path uses.
77pub(crate) fn reject_survival_likelihood_for_nonsurvival(
78 config: &FitConfig,
79) -> Result<(), WorkflowError> {
80 let mode = config.survival_likelihood.trim();
81 // The library and CLI/config layers have different survival defaults; both are
82 // indistinguishable from "unset" by the time a non-survival formula reaches
83 // this seam, so neither should poison ordinary GAM materialization.
84 if mode.eq_ignore_ascii_case("transformation") || mode.eq_ignore_ascii_case("location-scale") {
85 return Ok(());
86 }
87 Err(WorkflowError::InvalidConfig {
88 reason: format!(
89 "survival_likelihood=\"{mode}\" is only supported in the main survival formula \
90 (a formula with a Surv(...) response); it selects a survival likelihood mode that \
91 is read exclusively by the survival fit path, so for a non-survival response it is \
92 meaningless and would otherwise be silently ignored (the requested survival model \
93 would degrade to an ordinary GAM). Wrap the response in Surv(...) or drop the \
94 survival_likelihood configuration."
95 ),
96 })
97}
98
99/// Reject an *explicitly requested* `linkwiggle(...)` term when the resolved
100/// response family is not binomial.
101///
102/// `linkwiggle(...)` adds a spline-flexible correction to the *link* function
103/// (logit / probit / cloglog), which only carries meaning for a binomial mean
104/// model — the standard and location-scale materializers wire `wiggle` into the
105/// fit only inside their `family.is_binomial()` arm. For a Gaussian / Gamma /
106/// Poisson / etc. response the term is built and then dropped on the floor,
107/// the same silent-no-op contract violation as #371. We error here.
108///
109/// This guards only the *explicit* formula term (`parsed.linkwiggle`). The
110/// implicit wiggle auto-derived from a `Flexible` link choice is checked by
111/// [`reject_flexible_link_for_nonbinomial`].
112pub(super) fn reject_explicit_linkwiggle_for_nonbinomial(
113 parsed: &ParsedFormula,
114 family: &LikelihoodSpec,
115) -> Result<(), WorkflowError> {
116 if parsed.linkwiggle.is_some() && !family.is_binomial() {
117 return Err(WorkflowError::InvalidConfig {
118 reason: "linkwiggle(...) corrects the link function of a binomial mean model \
119 and is only supported for a binomial response; it is meaningless for \
120 the resolved non-binomial family and would otherwise be silently ignored"
121 .to_string(),
122 });
123 }
124 Ok(())
125}
126
127pub(super) fn effective_link_choice_for_materialize(
128 parsed: &ParsedFormula,
129 config: &FitConfig,
130) -> Result<Option<LinkChoice>, WorkflowError> {
131 if let Some(linkspec) = parsed.linkspec.as_ref() {
132 if linkspec.mixture_rho.is_some()
133 || linkspec.sas_init.is_some()
134 || linkspec.beta_logistic_init.is_some()
135 {
136 return Err(WorkflowError::InvalidConfig {
137 reason: "link(...) initialization options are not supported by the materialized fit path; pass only link(type=...) in the formula"
138 .to_string(),
139 });
140 }
141 return parse_link_choice(Some(&linkspec.link), false).map_err(WorkflowError::from);
142 }
143 parse_link_choice(config.link.as_deref(), config.flexible_link).map_err(WorkflowError::from)
144}
145
146/// Reject a `flexible(...)` link choice (the implicit link wiggle) when the
147/// resolved response family is not binomial.
148///
149/// `flexible(base)` adds a jointly-fit anchored spline offset to the base link.
150/// The whole offset engine ([`crate::gamlss::gaussian::BinomialMeanWiggleFamily`]
151/// and the location-scale wiggle solver) is specialised to the binomial mean
152/// likelihood: it differentiates the binomial neg-log-likelihood through the
153/// warped link to fourth order under a monotone-spline constraint. For a
154/// Gaussian / Poisson / Gamma / etc. response there is no implemented mean-wiggle
155/// solver, so the standard and location-scale materializers used to build the
156/// implicit wiggle and then drop it on the floor: a silent no-op of a
157/// documented link (`flexible(identity)` on Gaussian, `flexible(log)` on
158/// Poisson/Gamma fit bit-identically to the plain base link), gam#1275. Rather
159/// than silently discard a requested-and-documented link configuration we error
160/// loudly here, exactly as [`reject_explicit_linkwiggle_for_nonbinomial`] does
161/// for the explicit term. Wiring a genuine non-binomial mean-wiggle is tracked
162/// as a separate feature.
163pub(super) fn reject_flexible_link_for_nonbinomial(
164 link_choice: Option<&LinkChoice>,
165 family: &LikelihoodSpec,
166) -> Result<(), WorkflowError> {
167 let requested_flexible =
168 link_choice.is_some_and(|choice| matches!(choice.mode, LinkMode::Flexible));
169 if requested_flexible && !family.is_binomial() {
170 return Err(WorkflowError::InvalidConfig {
171 reason: format!(
172 "flexible(...) links (the jointly-fit anchored spline link offset) are \
173 implemented only for a binomial response; the resolved family is {} (a \
174 non-binomial family), for which the link offset has no solver and would \
175 otherwise be silently discarded. Use the plain base link, or fit a binomial \
176 response.",
177 family.pretty_name()
178 ),
179 });
180 }
181 Ok(())
182}
183
184/// Detect whether a response column is binary (0/1 only).
185pub fn is_binary_response(y: ArrayView1<'_, f64>) -> bool {
186 if y.is_empty() {
187 return false;
188 }
189 y.iter()
190 .all(|v| (*v - 0.0).abs() < 1e-12 || (*v - 1.0).abs() < 1e-12)
191}
192
193/// Verify that the dataset has at least as many rows as the smooth terms in
194/// `spec` need for their bases to be well-posed.
195///
196/// Each [`SmoothBasisSpec`] owns its own `min_sample_rows` lower bound — the
197/// B-spline knot count, the *penalized* tensor-product floor (the sum of the
198/// per-marginal column counts, not their Kronecker product, because a `te()`
199/// is regularized and its effective dof is a small fraction of the column
200/// count), the PCA matrix width — so this helper is a thin sum-and-compare:
201/// the workflow has no per-basis-kind knowledge. Adding a new smooth kind
202/// extends the basis `match` in `min_sample_rows`, not this gate.
203///
204/// Catches the README-quickstart failure mode (#309) where `n=4` against
205/// `y ~ s(x)` would otherwise surface as an opaque `cached inner beta has
206/// length 8` message from the inner-state seeding hook.
207pub(super) fn check_smooth_capacity(
208 spec: &gam_terms::smooth::TermCollectionSpec,
209 n_rows: usize,
210 response_name: &str,
211) -> Result<(), WorkflowError> {
212 // Intercept + 1 dof for the smoothing-parameter optimizer.
213 let mut required: usize = 2;
214 let mut per_term: Vec<(String, usize)> = Vec::new();
215 for term in &spec.smooth_terms {
216 let need = term.basis.min_sample_rows();
217 required = required.saturating_add(need);
218 per_term.push((term.name.clone(), need));
219 }
220 if per_term.is_empty() || n_rows >= required {
221 return Ok(());
222 }
223 let breakdown = per_term
224 .iter()
225 .map(|(name, k)| format!("{name}≥{k}"))
226 .collect::<Vec<_>>()
227 .join(", ");
228 Err(WorkflowError::InvalidConfig {
229 reason: format!(
230 "not enough observations to fit the requested formula: dataset has n={n_rows} \
231 rows but the smooth terms on response '{response_name}' need at least \
232 {required} rows total ({breakdown}, plus intercept + smoothing-parameter dof) \
233 before REML estimation is well-posed. \
234 Fix: add more training rows, replace `s(x)` with a linear term, or pass a \
235 smaller basis via `s(x, k=3)`."
236 ),
237 })
238}