gam_terms/term_builder.rs
1//! Term construction: bridge from parsed formula terms to `TermCollectionSpec`.
2//!
3//! This module takes the AST produced by `inference::formula_dsl` and a loaded
4//! dataset, resolves column references, infers knot counts and center strategies,
5//! and produces a `TermCollectionSpec` ready for `build_term_collection_design`.
6
7use std::collections::{BTreeMap, BTreeSet, HashMap};
8use std::path::PathBuf;
9
10use ndarray::{Array2, ArrayView1};
11
12use crate::basis::{
13 BSplineBasisSpec, BSplineBoundaryConditions, BSplineEndpointBoundaryCondition,
14 BSplineIdentifiability, BSplineKnotSpec, CenterCountRequest, CenterStrategy,
15 ConstantCurvatureBasisSpec, ConstantCurvatureIdentifiability, DuchonBasisSpec,
16 DuchonNullspaceOrder, DuchonOperatorPenaltySpec, MaternBasisSpec, MaternIdentifiability,
17 MaternNu, MeasureJetBasisSpec, MeasureJetIdentifiability, OneDimensionalBoundary,
18 SpatialIdentifiability, SphereMethod, SphereWahbaKernel, SphericalSplineBasisSpec,
19 SphericalSplineIdentifiability, ThinPlateBasisSpec, auto_spatial_center_strategy,
20 default_num_centers, default_spatial_center_strategy, default_spherical_harmonic_degree,
21 plan_spatial_basis, thin_plate_penalty_order,
22};
23use crate::inference::formula_dsl::{
24 ParsedTerm, SmoothKind, option_bool, option_f64, option_f64_strict, option_usize,
25 option_usize_any, option_usize_any_strict, option_usize_strict, strip_quotes,
26};
27use crate::smooth::{
28 ByVarKind, FactorSmoothFlavour, FactorSmoothSpec, LinearCoefficientGeometry, LinearTermSpec,
29 RandomEffectTermSpec, ShapeConstraint, SmoothBasisSpec, SmoothTermSpec,
30 TensorBSplineIdentifiability, TensorBSplinePenaltyDecomposition, TensorBSplineSpec,
31 TermCollectionSpec,
32};
33use gam_problem::types::ColIdx;
34use gam_data::{ColumnKindTag, DataError, EncodedDataset as Dataset};
35use gam_runtime::resource::ResourcePolicy;
36
37/// Default B-spline degree when a smooth's `degree=` option is absent. Cubic
38/// (degree 3) is the standard GAM convention: C² continuity with a low knot
39/// count.
40const DEFAULT_BSPLINE_DEGREE: usize = 3;
41
42/// Default difference-penalty order when a smooth's `penalty_order=` (alias
43/// `m=`) option is absent. Second-order (curvature) is the standard P-spline
44/// convention.
45const DEFAULT_PENALTY_ORDER: usize = 2;
46
47/// Default basis dimension for one-dimensional cyclic cubic P-splines.
48///
49/// Periodic smooths spend no coefficients on free endpoints, so they should not
50/// inherit the larger open B-spline knot ceiling by default. This is still only
51/// a default: callers can request a richer periodic space with `k=`.
52const CYCLIC_DEFAULT_BASIS_DIM: usize = 12;
53
54/// Default shared-marginal basis dimension for `bs="fs"`/`bs="sz"` factor smooths,
55/// matching mgcv's factor-smooth default `k=10`. A factor smooth shares one
56/// marginal across all levels; a modest basis recovers the shared signal without
57/// over-fitting each group's within-group noise (gam#903). Overridden by an
58/// explicit `k`/`basis_dim`.
59const FACTOR_SMOOTH_DEFAULT_BASIS_DIM: usize = 10;
60
61/// Default row-chunk size for the out-of-core PCA-basis smooth when the
62/// `chunk_size=` option is absent. Streams the design in row blocks to bound
63/// peak memory independent of the dataset row count.
64const DEFAULT_PCA_CHUNK_SIZE: usize = 4096;
65
66// ---------------------------------------------------------------------------
67// Typed errors
68// ---------------------------------------------------------------------------
69
70/// Typed errors emitted by term-builder helpers. `Display` reproduces the exact
71/// pre-refactor `format!(...)` text byte-for-byte, so callers that string-match
72/// on the message (tests, log assertions) keep working unchanged. Public-API
73/// functions still return `Result<_, String>` and use `.to_string()` shims at
74/// their boundary to stay compatible with callers in protected modules.
75#[derive(Clone, Debug)]
76pub enum TermBuilderError {
77 /// Column-resolution / column-kind lookup failures whose context is purely
78 /// internal (column-kind table out-of-sync, alias map missing an entry,
79 /// etc.). User-facing "this formula references a column that doesn't
80 /// exist" diagnostics use the dedicated `ColumnNotFound` variant so the
81 /// FFI boundary can lift the structured payload into a Python
82 /// `ColumnNotFoundError` without parsing prose.
83 MissingColumn { reason: String },
84 /// A formula referenced a column that is not present in the input data.
85 /// Mirrors `DataError::ColumnNotFound` field-for-field so the conversion
86 /// across module boundaries is a pure data move (no re-derivation, no
87 /// string re-parsing). Public callers see byte-identical `Display`
88 /// output to the legacy `missing_column_message` text.
89 ColumnNotFound {
90 name: String,
91 role: Option<String>,
92 available: Vec<String>,
93 similar: Vec<String>,
94 tsv_hint: bool,
95 },
96 /// User-specified configuration is internally inconsistent (e.g. too few
97 /// variables for a smooth type, conflicting size options, requested basis
98 /// dimension below the polynomial nullspace).
99 IncompatibleConfig { reason: String },
100 /// Option parsing failure: malformed numeric expression, unknown option
101 /// key, out-of-range integer, list-length mismatch, etc.
102 InvalidOption { reason: String },
103 /// User requested a feature that is intentionally not supported (unknown
104 /// smooth type / method / kernel / identifiability, non-zero anchor,
105 /// internal-only token, etc.).
106 UnsupportedFeature { reason: String },
107 /// Input data is degenerate for the requested term (constant column,
108 /// non-finite categorical entries, ...).
109 DegenerateData { reason: String },
110 /// Term-collection-stage formula error — a node that the caller was
111 /// supposed to resolve upstream reached the builder.
112 MalformedFormula { reason: String },
113}
114
115impl std::fmt::Display for TermBuilderError {
116 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
117 match self {
118 TermBuilderError::MissingColumn { reason }
119 | TermBuilderError::IncompatibleConfig { reason }
120 | TermBuilderError::InvalidOption { reason }
121 | TermBuilderError::UnsupportedFeature { reason }
122 | TermBuilderError::DegenerateData { reason }
123 | TermBuilderError::MalformedFormula { reason } => f.write_str(reason),
124 // Delegate to the canonical `DataError::ColumnNotFound` formatter
125 // so a single source of truth defines the human text. The
126 // intermediate `DataError` constructed here owns its strings only
127 // for the duration of the Display call — no allocation cost
128 // beyond the original payload that this variant already holds.
129 TermBuilderError::ColumnNotFound {
130 name,
131 role,
132 available,
133 similar,
134 tsv_hint,
135 } => {
136 let canonical = DataError::ColumnNotFound {
137 name: name.clone(),
138 role: role.clone(),
139 available: available.clone(),
140 similar: similar.clone(),
141 tsv_hint: *tsv_hint,
142 };
143 std::fmt::Display::fmt(&canonical, f)
144 }
145 }
146 }
147}
148
149impl From<TermBuilderError> for String {
150 fn from(err: TermBuilderError) -> String {
151 err.to_string()
152 }
153}
154
155/// Catchall lift for the term-builder's internal `Result<_, String>` helpers
156/// (numeric expression parsing, option lookup, boundary-condition parsing,
157/// ...) that flow into `build_termspec` via `?`. Maps to
158/// `IncompatibleConfig`, which is the most appropriate generic bucket for
159/// option/config-style failures — leaf sites that emit structured payloads
160/// (`From<DataError>` for column-not-found) bypass this fallback.
161impl From<String> for TermBuilderError {
162 fn from(reason: String) -> Self {
163 Self::IncompatibleConfig { reason }
164 }
165}
166
167/// Typed lift from data-layer errors. `DataError::ColumnNotFound` becomes
168/// `TermBuilderError::ColumnNotFound` field-for-field — no stringification,
169/// no information loss — so the FFI boundary downstream can dispatch on
170/// the typed variant. Other `DataError` variants degrade into
171/// `MissingColumn` since they describe column-resolution-time failures
172/// without a dedicated structured destination.
173impl From<DataError> for TermBuilderError {
174 fn from(err: DataError) -> Self {
175 match err {
176 DataError::ColumnNotFound {
177 name,
178 role,
179 available,
180 similar,
181 tsv_hint,
182 } => Self::ColumnNotFound {
183 name,
184 role,
185 available,
186 similar,
187 tsv_hint,
188 },
189 DataError::SchemaMismatch { reason }
190 | DataError::ParseError { reason }
191 | DataError::EncodingFailure { reason }
192 | DataError::EmptyInput { reason }
193 | DataError::InvalidValue { reason } => Self::MissingColumn { reason },
194 }
195 }
196}
197
198// Constructor helpers — keep error-site code compact and consistent.
199impl TermBuilderError {
200 #[inline]
201 fn missing_column(reason: impl Into<String>) -> Self {
202 TermBuilderError::MissingColumn {
203 reason: reason.into(),
204 }
205 }
206 #[inline]
207 fn incompatible_config(reason: impl Into<String>) -> Self {
208 TermBuilderError::IncompatibleConfig {
209 reason: reason.into(),
210 }
211 }
212 #[inline]
213 fn invalid_option(reason: impl Into<String>) -> Self {
214 TermBuilderError::InvalidOption {
215 reason: reason.into(),
216 }
217 }
218 #[inline]
219 fn unsupported_feature(reason: impl Into<String>) -> Self {
220 TermBuilderError::UnsupportedFeature {
221 reason: reason.into(),
222 }
223 }
224 #[inline]
225 fn degenerate_data(reason: impl Into<String>) -> Self {
226 TermBuilderError::DegenerateData {
227 reason: reason.into(),
228 }
229 }
230 #[inline]
231 fn malformed_formula(reason: impl Into<String>) -> Self {
232 TermBuilderError::MalformedFormula {
233 reason: reason.into(),
234 }
235 }
236}
237
238// ---------------------------------------------------------------------------
239// Column resolution
240// ---------------------------------------------------------------------------
241
242/// Resolve a bare column name to its index, returning a typed
243/// `DataError::ColumnNotFound` on miss so the FFI boundary can surface a
244/// structured `gamfit.ColumnNotFoundError(column=…, available=…)` rather
245/// than rely on string-classification of human prose. Internal callers that
246/// still flow `Result<_, String>` get byte-identical text via
247/// `From<DataError> for String`.
248pub fn resolve_col(col_map: &HashMap<String, usize>, name: &str) -> Result<usize, DataError> {
249 col_map
250 .get(name)
251 .copied()
252 .ok_or_else(|| DataError::column_not_found(col_map, name, None))
253}
254
255/// Like `resolve_col` but tags the missing-column payload with a role label
256/// (`"response"`, `"entry"`, `"exit"`, `"event"`, `"z"`, `"id"`, …) so the
257/// boundary-side Python exception can disambiguate which formula slot held
258/// the bad reference.
259pub fn resolve_role_col(
260 col_map: &HashMap<String, usize>,
261 name: &str,
262 role: &str,
263) -> Result<usize, DataError> {
264 col_map
265 .get(name)
266 .copied()
267 .ok_or_else(|| DataError::column_not_found(col_map, name, Some(role)))
268}
269
270fn encoded_levels_for_column(ds: &Dataset, col: ColIdx) -> Vec<(u64, String)> {
271 let mut seen = BTreeSet::<u64>::new();
272 for value in ds.values.column(col.get()) {
273 if value.is_finite() {
274 seen.insert(value.to_bits());
275 }
276 }
277 let schema_levels = ds
278 .schema
279 .columns
280 .get(col.get())
281 .map(|column| column.levels.as_slice())
282 .unwrap_or(&[]);
283 seen.into_iter()
284 .enumerate()
285 .map(|(idx, bits)| {
286 let fallback = format!("level{}", idx + 1);
287 let label = schema_levels.get(idx).cloned().unwrap_or(fallback);
288 (bits, label)
289 })
290 .collect()
291}
292
293pub fn column_map_with_alias(
294 col_map: &HashMap<String, usize>,
295 alias: &str,
296 target_column: &str,
297) -> HashMap<String, usize> {
298 let mut aliased = col_map.clone();
299 if let Some(idx) = col_map.get(target_column).copied() {
300 aliased.entry(alias.to_string()).or_insert(idx);
301 }
302 aliased
303}
304
305// ---------------------------------------------------------------------------
306// ParsedTerm[] + Dataset → TermCollectionSpec
307// ---------------------------------------------------------------------------
308
309pub fn build_termspec(
310 terms: &[ParsedTerm],
311 ds: &Dataset,
312 col_map: &HashMap<String, usize>,
313 inference_notes: &mut Vec<String>,
314 policy: &ResourcePolicy,
315) -> Result<TermCollectionSpec, TermBuilderError> {
316 let mut linear_terms = Vec::<LinearTermSpec>::new();
317 let mut random_terms = Vec::<RandomEffectTermSpec>::new();
318 let mut smooth_terms = Vec::<SmoothTermSpec>::new();
319 let smooth_coordinate_count = terms
320 .iter()
321 .map(|term| match term {
322 ParsedTerm::Smooth { vars, .. } => vars.len(),
323 _ => 0,
324 })
325 .sum::<usize>();
326
327 for t in terms {
328 match t {
329 ParsedTerm::Linear {
330 name,
331 explicit,
332 coefficient_min,
333 coefficient_max,
334 } => {
335 let col = resolve_col(col_map, name)?;
336 let auto_kind = ds.column_kinds.get(col).copied().ok_or_else(|| {
337 TermBuilderError::missing_column(format!(
338 "internal column-kind lookup failed for '{name}'"
339 ))
340 .to_string()
341 })?;
342 if *explicit {
343 linear_terms.push(LinearTermSpec {
344 name: name.clone(),
345 feature_col: col,
346 feature_cols: vec![col],
347 categorical_levels: vec![],
348 // Parametric linear terms are unpenalized by default
349 // (MLE, matching mgcv/glm); see #749.
350 double_penalty: false,
351 coefficient_geometry: LinearCoefficientGeometry::Unconstrained,
352 coefficient_min: *coefficient_min,
353 coefficient_max: *coefficient_max,
354 });
355 } else {
356 match auto_kind {
357 ColumnKindTag::Continuous | ColumnKindTag::Binary => {
358 linear_terms.push(LinearTermSpec {
359 name: name.clone(),
360 feature_col: col,
361 feature_cols: vec![col],
362 categorical_levels: vec![],
363 // Unpenalized parametric effect by default (#749).
364 double_penalty: false,
365 coefficient_geometry: LinearCoefficientGeometry::Unconstrained,
366 coefficient_min: *coefficient_min,
367 coefficient_max: *coefficient_max,
368 });
369 }
370 ColumnKindTag::Categorical => {
371 if coefficient_min.is_some() || coefficient_max.is_some() {
372 return Err(TermBuilderError::incompatible_config(format!(
373 "coefficient constraints are not supported for categorical auto-random-effect term '{name}'; use group({name}) or an unconstrained numeric term"
374 )));
375 }
376 random_terms.push(RandomEffectTermSpec {
377 name: name.clone(),
378 feature_col: col,
379 drop_first_level: false,
380 penalized: true,
381 frozen_levels: None,
382 });
383 }
384 }
385 }
386 }
387 ParsedTerm::BoundedLinear {
388 name,
389 min,
390 max,
391 prior,
392 } => {
393 let col = resolve_col(col_map, name)?;
394 let auto_kind = ds.column_kinds.get(col).copied().ok_or_else(|| {
395 TermBuilderError::missing_column(format!(
396 "internal column-kind lookup failed for '{name}'"
397 ))
398 .to_string()
399 })?;
400 if !matches!(auto_kind, ColumnKindTag::Continuous | ColumnKindTag::Binary) {
401 return Err(TermBuilderError::incompatible_config(format!(
402 "bounded() currently supports only numeric columns, got categorical '{name}'"
403 )));
404 }
405 linear_terms.push(LinearTermSpec {
406 name: name.clone(),
407 feature_col: col,
408 feature_cols: vec![col],
409 categorical_levels: vec![],
410 double_penalty: false,
411 coefficient_geometry: LinearCoefficientGeometry::Bounded {
412 min: *min,
413 max: *max,
414 prior: prior.clone(),
415 },
416 coefficient_min: None,
417 coefficient_max: None,
418 });
419 }
420 ParsedTerm::RandomEffect { name } => {
421 let col = resolve_col(col_map, name)?;
422 random_terms.push(RandomEffectTermSpec {
423 name: name.clone(),
424 feature_col: col,
425 drop_first_level: false,
426 penalized: true,
427 frozen_levels: None,
428 });
429 }
430 ParsedTerm::Smooth {
431 label,
432 vars,
433 kind,
434 options,
435 } => {
436 let smooth_vars = vars.clone();
437 let by_name = options.get("by").cloned();
438 // `bs="sz"` (sum-to-zero), like `bs="fs"`/`bs="re"`, is a
439 // factor-smooth family handled natively by `build_smooth_basis`'s
440 // fs/sz/re path: it detects the categorical factor among the
441 // variables and emits a `SmoothBasisSpec::FactorSmooth { Sz }`
442 // with the correct single-penalty marginal and modest default
443 // basis. Route sz straight through `build_smooth_basis` rather
444 // than intercepting it into a legacy `FactorSumToZero` envelope
445 // here (which left `sz(fac, x)` mis-typed as `FactorSumToZero`
446 // instead of the expected `FactorSmooth { Sz }`).
447 let cols = smooth_vars
448 .iter()
449 .map(|v| resolve_col(col_map, v))
450 .collect::<Result<Vec<_>, _>>()?;
451 let mut inner_options = options.clone();
452 inner_options.remove("by");
453 // `ordered=` is consumed here (ByVarKind::Factor routing) and
454 // must not propagate to the inner basis builder, which has no
455 // allow-list entry for it and would reject it as an unknown option.
456 inner_options.remove("ordered");
457 // Pop the shape constraint before `build_smooth_basis` runs so
458 // it never reaches the per-kind `validate_known_options`
459 // allow-lists (the constraint is a property of the smooth term,
460 // not of any one basis kind). Basis-incompatible requests still
461 // fail loudly downstream via `shape_supports_basis`.
462 let shape = match inner_options.remove("shape") {
463 None => ShapeConstraint::None,
464 Some(raw) => crate::smooth::parse_shape_constraint(&raw)
465 .map_err(TermBuilderError::invalid_option)?,
466 };
467 let inner_basis = build_smooth_basis(
468 *kind,
469 &smooth_vars,
470 &cols,
471 &inner_options,
472 ds,
473 inference_notes,
474 policy,
475 smooth_coordinate_count,
476 )?;
477 if let Some(by_name) = by_name {
478 let by_col = resolve_col(col_map, &by_name)?;
479 match ds.column_kinds.get(by_col).copied().ok_or_else(|| {
480 format!("internal column-kind lookup failed for by variable '{by_name}'")
481 })? {
482 ColumnKindTag::Categorical => {
483 let levels = encoded_levels_for_column(ds, ColIdx::new(by_col));
484 // A penalized random block for this factor already
485 // owns its full level offsets when EITHER an explicit
486 // `group(factor)` appears, OR a *bare* categorical
487 // `+ factor` does — the latter is auto-promoted to a
488 // penalized random-effect block (see the
489 // `ParsedTerm::Linear` / `ColumnKindTag::Categorical`
490 // arm above, `penalized: true`). Both representations
491 // carry the same per-level offsets, so #1457: the
492 // `by=` branch must NOT additionally add its own
493 // unpenalized treatment-coded main effect, which would
494 // double-represent the factor (two `g` design blocks +
495 // a spurious extra smoothing parameter).
496 let penalized_group_owner_present =
497 terms.iter().any(|other| match other {
498 ParsedTerm::RandomEffect { name } => name == &by_name,
499 ParsedTerm::Linear {
500 name,
501 explicit: false,
502 ..
503 } if name == &by_name => col_map
504 .get(name)
505 .and_then(|c| ds.column_kinds.get(*c).copied())
506 .map(|kind| matches!(kind, ColumnKindTag::Categorical))
507 .unwrap_or(false),
508 _ => false,
509 });
510 // Add an unpenalized treatment-coded fixed main
511 // effect for a standalone factor-by smooth, unless
512 // the same factor already has an explicit
513 // `group(factor)` term OR a bare categorical `+
514 // factor` that was auto-promoted to a penalized
515 // random block (#1457). In those mixed-model forms
516 // the penalized random intercept is the coherent
517 // owner of level offsets; adding a no-pooling fixed
518 // factor effect would bypass random-effect
519 // shrinkage and degrade BLUP-style predictions.
520 if !random_terms.iter().any(|rt| rt.name == by_name)
521 && !penalized_group_owner_present
522 {
523 random_terms.push(RandomEffectTermSpec {
524 name: by_name.clone(),
525 feature_col: by_col,
526 drop_first_level: true,
527 penalized: false,
528 frozen_levels: None,
529 });
530 }
531 // Route to a single BySmooth::Factor term with
532 // frozen levels pre-populated from the training data.
533 // Design building later gates each level into its own
534 // column block (see build_by_smooth_local in term_specs).
535 let frozen_levels: Vec<u64> =
536 levels.iter().map(|(bits, _)| *bits).collect();
537 smooth_terms.push(SmoothTermSpec {
538 name: label.clone(),
539 basis: SmoothBasisSpec::BySmooth {
540 smooth: Box::new(inner_basis),
541 by_kind: ByVarKind::Factor {
542 feature_col: by_col,
543 ordered: option_bool(options, "ordered").unwrap_or(false),
544 frozen_levels: Some(frozen_levels),
545 },
546 },
547 shape,
548 joint_null_rotation: None,
549 });
550 }
551 ColumnKindTag::Binary | ColumnKindTag::Continuous => {
552 smooth_terms.push(SmoothTermSpec {
553 name: label.clone(),
554 basis: SmoothBasisSpec::BySmooth {
555 smooth: Box::new(inner_basis),
556 by_kind: ByVarKind::Numeric {
557 feature_col: by_col,
558 },
559 },
560 shape,
561 joint_null_rotation: None,
562 });
563 }
564 }
565 } else {
566 smooth_terms.push(SmoothTermSpec {
567 name: label.clone(),
568 basis: inner_basis,
569 shape,
570 joint_null_rotation: None,
571 });
572 }
573 }
574 ParsedTerm::LinkWiggle { .. }
575 | ParsedTerm::TimeWiggle { .. }
576 | ParsedTerm::LinkConfig { .. }
577 | ParsedTerm::SurvivalConfig { .. } => {
578 // Consumed at formula level, not design terms.
579 }
580 ParsedTerm::LogSlopeSurface { .. } => {
581 return Err(TermBuilderError::malformed_formula(
582 "logslope(...) declarations must be resolved by the marginal-slope formula path before building a term spec",
583 ));
584 }
585 ParsedTerm::Interaction { vars } => {
586 // A linear `:` interaction realizes one design column equal to
587 // the elementwise product of its operands. Numeric (continuous/
588 // binary) operands multiply directly; a categorical operand is
589 // a factor, so the product is expanded factor-aware: one design
590 // column per surviving cell of the factor(s), each an indicator
591 // `1[factor == level]` gating the numeric product.
592 //
593 // Coding is MARGINALITY-AWARE (gam#1158, gam#1159). A categorical
594 // operand `g` is treatment-coded (its lexicographically first
595 // reference level dropped) ONLY when the lower-order term obtained
596 // by removing `g` from this interaction is also present in the
597 // model — that lower-order term is what makes the dropped level
598 // identifiable, exactly mgcv's marginality rule. When that parent
599 // is ABSENT (the interaction-only form), dropping the reference
600 // level instead pins a group to the reference fit (a rank-deficient
601 // design), so we keep ALL levels (full dummy coding) and rely on a
602 // single intercept cell-drop below for identifiability:
603 // * `y ~ x:g` with no `x` main effect → "common intercept,
604 // separate slopes": every group keeps its own x-slope.
605 // * `y ~ g:h` with no `g`/`h` main effects → the saturated
606 // cell-means model: full cross of all levels minus one
607 // reference cell absorbed by the intercept.
608 // When the parents ARE present (`x + x:g`, or `g*h` = `g + h +
609 // g:h`), the historical treatment coding is preserved so those
610 // forms stay correct.
611 //
612 // A main effect for var V is a `Linear`/`BoundedLinear`/
613 // `RandomEffect` ParsedTerm whose referenced name is V (an
614 // auto-detected categorical `Linear` becomes a RandomEffect main
615 // effect; either spelling counts). We only treat such standalone
616 // main-effect terms as parents — not V appearing inside another
617 // interaction.
618 let main_effect_present = |target: &str| -> bool {
619 terms.iter().any(|other| match other {
620 ParsedTerm::Linear { name, .. }
621 | ParsedTerm::BoundedLinear { name, .. }
622 | ParsedTerm::RandomEffect { name } => name == target,
623 _ => false,
624 })
625 };
626 // The lower-order parent of dropping operand `drop_var` from this
627 // interaction is present iff EVERY other operand is a main effect.
628 // For the two cases we care about (`x:g`, `g:h`) the interaction
629 // has two operands, so this reduces to "is the single remaining
630 // operand a main effect"; the general form handles any arity.
631 let parent_present = |drop_var: &str| -> bool {
632 vars.iter()
633 .filter(|v| v.as_str() != drop_var)
634 .all(|v| main_effect_present(v))
635 };
636
637 let mut numeric_cols = Vec::<usize>::new();
638 // Per categorical operand: (var name, col, kept levels, was the
639 // reference level dropped / treatment-coded?).
640 let mut categorical_factors =
641 Vec::<(String, usize, Vec<(u64, String)>, bool)>::new();
642 for var in vars {
643 let col = resolve_col(col_map, var)?;
644 let kind = ds.column_kinds.get(col).copied().ok_or_else(|| {
645 TermBuilderError::missing_column(format!(
646 "internal column-kind lookup failed for '{var}'"
647 ))
648 .to_string()
649 })?;
650 match kind {
651 ColumnKindTag::Continuous | ColumnKindTag::Binary => numeric_cols.push(col),
652 ColumnKindTag::Categorical => {
653 let mut levels = encoded_levels_for_column(ds, ColIdx::new(col));
654 // Treatment-code (drop the reference level) only when
655 // the marginal parent that identifies it is present;
656 // otherwise keep every level (full dummy coding).
657 let treatment_coded = parent_present(var);
658 if treatment_coded && levels.len() > 1 {
659 levels.remove(0);
660 }
661 if levels.is_empty() {
662 return Err(TermBuilderError::incompatible_config(format!(
663 "interaction `{}` references categorical column `{var}` with no usable levels",
664 vars.join(":")
665 )));
666 }
667 categorical_factors.push((var.clone(), col, levels, treatment_coded));
668 }
669 }
670 }
671
672 let label = vars.join(":");
673
674 if categorical_factors.is_empty() {
675 // Pure numeric `:` interaction — single product column,
676 // identical to the historical behaviour.
677 linear_terms.push(LinearTermSpec {
678 name: label,
679 feature_col: numeric_cols[0],
680 feature_cols: numeric_cols,
681 categorical_levels: vec![],
682 // Parametric `:` interaction column is unpenalized by
683 // default, same as any other linear term (#749).
684 double_penalty: false,
685 coefficient_geometry: LinearCoefficientGeometry::Unconstrained,
686 coefficient_min: None,
687 coefficient_max: None,
688 });
689 inference_notes.push(format!(
690 "wired linear interaction `{}` as product of numeric columns",
691 vars.join(":")
692 ));
693 } else {
694 // Factor-aware expansion: cartesian product over the kept
695 // levels of every categorical operand. Each cell yields one
696 // column gating the numeric product (or, with no numeric
697 // operand, a pure cell indicator).
698 let mut cells: Vec<Vec<(usize, u64, String)>> = vec![Vec::new()];
699 for (_var, col, levels, _treatment_coded) in &categorical_factors {
700 let mut next = Vec::with_capacity(cells.len() * levels.len());
701 for cell in &cells {
702 for (bits, level_label) in levels {
703 let mut extended = cell.clone();
704 extended.push((*col, *bits, level_label.clone()));
705 next.push(extended);
706 }
707 }
708 cells = next;
709 }
710
711 // Intercept-identifiability cell drop. When the cells are PURE
712 // INDICATORS (no numeric operand) and at least one factor was
713 // dummy-coded (kept all its levels), the full set of cell
714 // columns sums to the all-ones intercept and is rank-deficient
715 // against it. Drop exactly ONE reference cell — the cell where
716 // every factor sits at its reference (lexicographically first)
717 // level — so the remaining saturated cells are identifiable
718 // (rank n_g*n_h - 1 cells + intercept). With a numeric operand
719 // the cells gate `x` and sum to `x`, not the intercept, so no
720 // cell is dropped (the collinearity there is with the absent
721 // `x` main effect, which is exactly why full coding is right).
722 let any_dummy_coded = categorical_factors
723 .iter()
724 .any(|(_, _, _, treatment_coded)| !*treatment_coded);
725 if numeric_cols.is_empty() && any_dummy_coded {
726 // The reference cell pairs each factor's column with the
727 // bits of its lexicographically-first (index 0) level.
728 let reference_cell: Vec<(usize, u64)> = categorical_factors
729 .iter()
730 .map(|(_, col, _, _)| {
731 let levels = encoded_levels_for_column(ds, ColIdx::new(*col));
732 (*col, levels[0].0)
733 })
734 .collect();
735 cells.retain(|cell| {
736 !reference_cell.iter().all(|(rcol, rbits)| {
737 cell.iter()
738 .any(|(col, bits, _)| col == rcol && bits == rbits)
739 })
740 });
741 }
742
743 let n_cells = cells.len();
744 for cell in cells {
745 let cell_suffix = cell
746 .iter()
747 .map(|(_, _, level_label)| level_label.as_str())
748 .collect::<Vec<_>>()
749 .join(":");
750 let categorical_levels =
751 cell.iter().map(|(col, bits, _)| (*col, *bits)).collect();
752 // `feature_col` is required to point at a real column;
753 // use the first numeric operand when present, otherwise
754 // the first categorical column (its raw value is never
755 // multiplied — `realized_design_column` starts from ones
756 // and only gates by the level indicators).
757 let feature_col = numeric_cols
758 .first()
759 .copied()
760 .unwrap_or(categorical_factors[0].1);
761 linear_terms.push(LinearTermSpec {
762 name: format!("{label}:{cell_suffix}"),
763 feature_col,
764 feature_cols: numeric_cols.clone(),
765 categorical_levels,
766 double_penalty: false,
767 coefficient_geometry: LinearCoefficientGeometry::Unconstrained,
768 coefficient_min: None,
769 coefficient_max: None,
770 });
771 }
772 let all_treatment_coded = !any_dummy_coded;
773 let coding = if all_treatment_coded {
774 "treatment-coded"
775 } else {
776 "marginality-aware (full dummy / saturated)"
777 };
778 inference_notes.push(format!(
779 "wired factor-aware linear interaction `{}` as {} {} cell column(s)",
780 vars.join(":"),
781 n_cells,
782 coding
783 ));
784 }
785 }
786 }
787 }
788
789 Ok(TermCollectionSpec {
790 linear_terms,
791 random_effect_terms: random_terms,
792 smooth_terms,
793 })
794}
795
796fn split_list_option(raw: &str) -> Vec<String> {
797 let t = raw.trim();
798 // Accept the Python/JSON list form `[a, b]` AND mgcv's R-vector forms
799 // `c(a, b)` / `(a, b)` as bracketed wrappers around a comma-separated body.
800 // mgcv-style formulas pass per-margin numeric options as `k=c(5,5)` /
801 // `period=c(2*pi, pi)`; without R-vector peeling here those entries were
802 // split into `["c(5", "5)"]` and the downstream numeric parser then
803 // misreported the leading garbage as the invalid digit.
804 let inner = t
805 .strip_prefix('[')
806 .and_then(|u| u.strip_suffix(']'))
807 .or_else(|| {
808 t.strip_prefix("c(")
809 .or_else(|| t.strip_prefix("C("))
810 .or_else(|| t.strip_prefix('('))
811 .and_then(|u| u.strip_suffix(')'))
812 })
813 .unwrap_or(t);
814 inner
815 .split(',')
816 .map(|v| v.trim().to_string())
817 .filter(|v| !v.is_empty())
818 .collect()
819}
820
821fn parse_numeric_expr(raw: &str) -> Result<f64, String> {
822 let mut acc = 1.0f64;
823 let normalized = raw.replace(' ', "");
824 if normalized.eq_ignore_ascii_case("none") {
825 return Err("None is not numeric".to_string());
826 }
827 for factor in normalized.split('*') {
828 if factor.is_empty() {
829 return Err(format!("invalid numeric expression '{raw}'"));
830 }
831 let value = if factor.eq_ignore_ascii_case("pi") || factor == "π" {
832 std::f64::consts::PI
833 } else if factor.eq_ignore_ascii_case("tau") || factor == "τ" {
834 std::f64::consts::TAU
835 } else if let Some(prefix) = factor
836 .strip_suffix("pi")
837 .or_else(|| factor.strip_suffix("π"))
838 {
839 let coefficient = if prefix.is_empty() {
840 1.0
841 } else {
842 prefix
843 .parse::<f64>()
844 .map_err(|err| format!("invalid numeric expression '{raw}': {err}"))?
845 };
846 coefficient * std::f64::consts::PI
847 } else if let Some(prefix) = factor
848 .strip_suffix("tau")
849 .or_else(|| factor.strip_suffix("τ"))
850 {
851 let coefficient = if prefix.is_empty() {
852 1.0
853 } else {
854 prefix
855 .parse::<f64>()
856 .map_err(|err| format!("invalid numeric expression '{raw}': {err}"))?
857 };
858 coefficient * std::f64::consts::TAU
859 } else {
860 factor
861 .parse::<f64>()
862 .map_err(|err| format!("invalid numeric expression '{raw}': {err}"))?
863 };
864 acc *= value;
865 }
866 Ok(acc)
867}
868
869/// Read an endpoint/period option as a numeric *expression* (`2*pi`, `tau`,
870/// `0.5*tau`, `6.283185307179586`, ...) — the same grammar that `period=` and
871/// `origin=` already accept via [`parse_numeric_expr`].
872///
873/// Returns `Ok(None)` when the key is absent, `Ok(Some(v))` when it parses, and
874/// a hard `Err` when the key is *present but unparseable*. The crucial contrast
875/// is with the lenient [`option_f64`], which collapses an unparseable value to
876/// `None` and lets the caller silently substitute the data range — wrapping a
877/// cyclic smooth at the wrong period with no diagnostic (the #815 failure mode).
878fn option_numeric_expr(
879 options: &BTreeMap<String, String>,
880 key: &str,
881) -> Result<Option<f64>, String> {
882 match options.get(key) {
883 None => Ok(None),
884 Some(raw) => parse_numeric_expr(raw)
885 .map(Some)
886 .map_err(|err| format!("option `{key}={raw}` is not a valid numeric value: {err}")),
887 }
888}
889
890fn parse_periods_option(
891 options: &BTreeMap<String, String>,
892 dim: usize,
893) -> Result<Option<Vec<Option<f64>>>, String> {
894 let Some(raw) = options.get("period") else {
895 return Ok(None);
896 };
897 let values = split_list_option(raw);
898 let mut periods = vec![None; dim];
899 if values.len() == 1 && dim == 1 {
900 periods[0] = Some(parse_numeric_expr(&values[0])?);
901 } else {
902 if values.len() != dim {
903 return Err(format!(
904 "period list length {} must match smooth dimension {}",
905 values.len(),
906 dim
907 ));
908 }
909 for (i, v) in values.iter().enumerate() {
910 if v.eq_ignore_ascii_case("none") {
911 continue;
912 }
913 periods[i] = Some(parse_numeric_expr(v)?);
914 }
915 }
916 Ok(Some(periods))
917}
918
919fn parse_periodic_axes_option(
920 options: &BTreeMap<String, String>,
921 dim: usize,
922) -> Result<Option<Vec<Option<f64>>>, String> {
923 let Some(raw_axes) = options.get("periodic") else {
924 return Ok(None);
925 };
926 let mut periods = parse_periods_option(options, dim)?.unwrap_or_else(|| vec![None; dim]);
927 // Scalar boolean form (`periodic=true` / `false`, `yes` / `no`) applies to
928 // every axis — the documented per-axis-flag broadcast (see the doc on
929 // `parse_periodic_axes`, the tensor sibling that already accepts it). A
930 // 1-D `duchon(x, periodic=true)` lands here: the cyclic *domain* is then
931 // resolved from the data range by `parse_cyclic_boundary` (the 1-D builder
932 // consults `boundary` first), so a finite explicit period is NOT required —
933 // we only need to NOT mis-read "true" as an axis index (#1074). `false`
934 // means no axis is periodic.
935 let lowered = raw_axes.trim().to_ascii_lowercase();
936 match lowered.as_str() {
937 "true" | "yes" | "y" => return Ok(Some(periods)),
938 "false" | "no" | "n" => return Ok(Some(vec![None; dim])),
939 _ => {}
940 }
941 let axes = split_list_option(raw_axes);
942 if axes.is_empty() {
943 return Ok(Some(periods));
944 }
945
946 // Boolean forms `periodic=true` / `periodic=[true, false, ...]`, mirroring
947 // `parse_tensor_periodic_axes`. The radial 1-D builders (`duchon`/`tps`/
948 // `matern`) intentionally DERIVE the wrap period from the closed center
949 // lattice when none is supplied (`prepare_periodic_duchon_centers_1d_with_period`,
950 // gam#580: `None => span`), so a boolean-selected periodic axis legitimately
951 // omits `period`. Without this branch, `duchon(x, periodic=true)`-style
952 // radial formulas failed with the misleading "invalid periodic axis 'true'".
953 let is_bool = |t: &str| {
954 matches!(
955 t.to_ascii_lowercase().as_str(),
956 "true" | "yes" | "y" | "false" | "no" | "n"
957 )
958 };
959 let is_truthy = |t: &str| matches!(t.to_ascii_lowercase().as_str(), "true" | "yes" | "y");
960
961 // Scalar boolean: `periodic=true` / `periodic=false`.
962 if axes.len() == 1 && is_bool(&axes[0]) {
963 if !is_truthy(&axes[0]) {
964 // Non-periodic: return None so the 1-D builder (which routes on
965 // `spec.periodic.is_some()`) does NOT take the periodic path.
966 return Ok(None);
967 }
968 // Every axis periodic; honor any explicit per-axis period, else leave
969 // `None` for the caller (formula arm) / builder to derive the span.
970 return Ok(Some(periods));
971 }
972
973 // Per-axis boolean list: `periodic=[true, false, ...]` (length must match dim).
974 if axes.iter().all(|a| is_bool(a)) {
975 if axes.len() != dim {
976 return Err(format!(
977 "periodic flag list length {} must match smooth dimension {dim}",
978 axes.len()
979 ));
980 }
981 if !axes.iter().any(|a| is_truthy(a)) {
982 return Ok(None);
983 }
984 for (i, a) in axes.iter().enumerate() {
985 if !is_truthy(a) {
986 periods[i] = None;
987 }
988 }
989 return Ok(Some(periods));
990 }
991
992 // Index-list form: `periodic=[0, 2]`. Each listed axis must carry an
993 // explicit finite period — an index gives no per-axis span-derive hint.
994 for a in &axes {
995 let axis = a
996 .parse::<usize>()
997 .map_err(|err| format!("invalid periodic axis '{a}': {err}"))?;
998 if axis >= dim {
999 return Err(format!(
1000 "periodic axis {axis} out of range for {dim}D smooth"
1001 ));
1002 }
1003 if periods[axis].is_none() {
1004 return Err(format!(
1005 "periodic axis {axis} requires period[{axis}] to be finite"
1006 ));
1007 }
1008 }
1009 // Axes not listed are non-periodic even if period list has a finite placeholder.
1010 let listed: std::collections::BTreeSet<usize> = axes
1011 .iter()
1012 .filter_map(|a| a.parse::<usize>().ok())
1013 .collect();
1014 for i in 0..dim {
1015 if !listed.contains(&i) {
1016 periods[i] = None;
1017 }
1018 }
1019 Ok(Some(periods))
1020}
1021
1022// ---------------------------------------------------------------------------
1023// Smooth basis spec construction
1024// ---------------------------------------------------------------------------
1025
1026fn parse_option_list(raw: &str) -> Vec<String> {
1027 let trimmed = raw.trim();
1028 // Accept both the Python/JSON list form `[a, b]` and mgcv's R vector form
1029 // `c(a, b)` (and a bare `(a, b)`) as the bracketed wrapper around a
1030 // comma-separated option list. mgcv writes per-margin options as
1031 // `bs=c('tp','tp')` / `m=c(2,2)`, so the `c(...)` form must round-trip
1032 // through the same splitter the `[...]` form uses.
1033 let inner = trimmed
1034 .strip_prefix('[')
1035 .and_then(|v| v.strip_suffix(']'))
1036 .or_else(|| {
1037 trimmed
1038 .strip_prefix("c(")
1039 .or_else(|| trimmed.strip_prefix("C("))
1040 .or_else(|| trimmed.strip_prefix('('))
1041 .and_then(|v| v.strip_suffix(')'))
1042 })
1043 .unwrap_or(trimmed);
1044 inner
1045 .split(',')
1046 .map(|v| {
1047 v.trim()
1048 .trim_matches('"')
1049 .trim_matches('\'')
1050 .to_ascii_lowercase()
1051 })
1052 .filter(|v| !v.is_empty())
1053 .collect()
1054}
1055
1056fn parse_periodic_axes(
1057 options: &BTreeMap<String, String>,
1058 dim: usize,
1059) -> Result<Vec<bool>, String> {
1060 let mut axes = vec![false; dim];
1061 if let Some(raw) = options.get("periodic").or_else(|| options.get("cyclic")) {
1062 let lowered = raw.trim().to_ascii_lowercase();
1063 match lowered.as_str() {
1064 "true" | "yes" | "y" => {
1065 axes.fill(true);
1066 return Ok(axes);
1067 }
1068 "false" | "no" | "n" => return Ok(axes),
1069 _ => {}
1070 }
1071 for axis_raw in parse_option_list(raw) {
1072 let axis = axis_raw
1073 .parse::<usize>()
1074 .map_err(|err| format!("invalid periodic axis '{axis_raw}': {err}"))?;
1075 if axis >= dim {
1076 return Err(format!(
1077 "periodic axis {axis} out of range for {dim}D smooth"
1078 ));
1079 }
1080 axes[axis] = true;
1081 }
1082 }
1083 if let Some(raw) = options.get("boundary").or_else(|| options.get("bc")) {
1084 let boundary = parse_option_list(raw);
1085 if boundary.len() == dim {
1086 for (axis, value) in boundary.iter().enumerate() {
1087 if matches!(value.as_str(), "periodic" | "cyclic" | "cc") {
1088 axes[axis] = true;
1089 }
1090 }
1091 } else if dim == 1
1092 && matches!(
1093 boundary.first().map(String::as_str),
1094 Some("periodic" | "cyclic" | "cc")
1095 )
1096 {
1097 axes[0] = true;
1098 }
1099 }
1100 Ok(axes)
1101}
1102
1103fn parse_optional_numeric_list(
1104 options: &BTreeMap<String, String>,
1105 keys: &[&str],
1106 dim: usize,
1107) -> Result<Vec<Option<f64>>, String> {
1108 let Some(raw) = keys.iter().find_map(|key| options.get(*key)) else {
1109 return Ok(vec![None; dim]);
1110 };
1111 let values = split_list_option(raw);
1112 let mut out = vec![None; dim];
1113 if values.len() == 1 && dim == 1 {
1114 if !values[0].eq_ignore_ascii_case("none") {
1115 out[0] = Some(parse_numeric_expr(&values[0])?);
1116 }
1117 return Ok(out);
1118 }
1119 if values.len() != dim {
1120 return Err(format!(
1121 "numeric option list length {} must match smooth dimension {}",
1122 values.len(),
1123 dim
1124 ));
1125 }
1126 for (i, value) in values.iter().enumerate() {
1127 if !value.eq_ignore_ascii_case("none") {
1128 out[i] = Some(parse_numeric_expr(value)?);
1129 }
1130 }
1131 Ok(out)
1132}
1133
1134fn parse_periods(
1135 options: &BTreeMap<String, String>,
1136 periodic_axes: &[bool],
1137) -> Result<Vec<Option<f64>>, String> {
1138 let dim = periodic_axes.len();
1139 // Broadcast a single-element `period=[v]` onto the lone periodic axis
1140 // of a multi-axis smooth (e.g. `te(th, h, bc=['periodic','natural'],
1141 // period=[2*pi])`): with only one periodic margin, the value can only
1142 // belong there.
1143 let lone_periodic_broadcast = options
1144 .get("period")
1145 .or_else(|| options.get("periods"))
1146 .and_then(|raw| {
1147 let values = split_list_option(raw);
1148 if values.len() != 1 || dim <= 1 {
1149 return None;
1150 }
1151 let mut iter = periodic_axes.iter().enumerate().filter(|(_, p)| **p);
1152 let first = iter.next()?;
1153 if iter.next().is_some() {
1154 return None;
1155 }
1156 Some((first.0, values.into_iter().next().unwrap()))
1157 });
1158 let periods = if let Some((axis, value)) = lone_periodic_broadcast {
1159 let mut out = vec![None; dim];
1160 if !value.eq_ignore_ascii_case("none") {
1161 out[axis] = Some(parse_numeric_expr(&value)?);
1162 }
1163 out
1164 } else {
1165 parse_optional_numeric_list(options, &["period", "periods"], dim)?
1166 };
1167 for (axis, (periodic, period)) in periodic_axes.iter().zip(periods.iter()).enumerate() {
1168 if *periodic
1169 && let Some(value) = period
1170 && (!value.is_finite() || *value <= 0.0)
1171 {
1172 return Err(format!(
1173 "period for periodic axis {axis} must be finite and positive, got {value}"
1174 ));
1175 }
1176 }
1177 Ok(periods)
1178}
1179
1180fn parse_period_origins(
1181 options: &BTreeMap<String, String>,
1182 periodic_axes: &[bool],
1183) -> Result<Vec<Option<f64>>, String> {
1184 parse_optional_numeric_list(
1185 options,
1186 &[
1187 "origin",
1188 "origins",
1189 "period_origin",
1190 "period-origin",
1191 "domain_origin",
1192 ],
1193 periodic_axes.len(),
1194 )
1195}
1196
1197/// Parse a per-axis periodic flag list for tensor smooths. Accepts three forms:
1198/// - `periodic=true` / `periodic=false` (scalar applied to every axis),
1199/// - `periodic=[true, false, ...]` (one flag per axis, length `dim`),
1200/// - `periodic=c(1, 1)` / `c(0, 0)` (a length-`dim` 0/1 mask, mgcv's
1201/// per-margin spelling — distinguished from an axis-index list by the
1202/// repeated 0/1 value), and
1203/// - `periodic=[0, 2, ...]` (axis indices that are periodic; others are not).
1204///
1205/// `boundary=[..., "periodic"/"cyclic"/"cc", ...]` may also flip individual
1206/// axes on; non-matching tokens leave the existing flag unchanged.
1207fn parse_tensor_periodic_axes(
1208 options: &BTreeMap<String, String>,
1209 dim: usize,
1210) -> Result<Vec<bool>, String> {
1211 let mut axes = vec![false; dim];
1212 if let Some(raw) = options.get("periodic").or_else(|| options.get("cyclic")) {
1213 let lowered = raw.trim().to_ascii_lowercase();
1214 match lowered.as_str() {
1215 "true" | "yes" | "y" => {
1216 axes.fill(true);
1217 }
1218 "false" | "no" | "n" => {
1219 // Already false; allow `boundary=` below to flip axes if set.
1220 }
1221 _ => {
1222 let entries = parse_option_list(raw);
1223 let all_bool = !entries.is_empty()
1224 && entries.iter().all(|v| {
1225 matches!(
1226 v.as_str(),
1227 "true" | "yes" | "y" | "false" | "no" | "n" | "none"
1228 )
1229 });
1230 // mgcv writes per-margin flag vectors as `periodic=c(1,1)` /
1231 // `periodic=c(0,0)` — a length-`dim` mask where each entry is a
1232 // 0/1 flag for THAT margin, not an axis index. A bare axis-index
1233 // list (`periodic=[0,1]`, `periodic=[0]`) lists DISTINCT margin
1234 // indices to turn on. The two collide only when the list is all
1235 // 0/1 of length `dim`; disambiguate by the repeated-value
1236 // signature `c(1,1)`/`c(0,0)` (a valid axis-index set never
1237 // repeats an index), which is the canonical mask spelling. This
1238 // is what makes the leading tensor margin honor its periodic flag
1239 // (#1751: `periodic=c(1,1)` previously parsed `1,1` as axis
1240 // indices, marking only axis 1 and dropping axis 0).
1241 let all_zero_one = !entries.is_empty()
1242 && entries.iter().all(|v| v == "0" || v == "1");
1243 let has_repeat = {
1244 let mut seen = std::collections::BTreeSet::new();
1245 !entries.iter().all(|v| seen.insert(v.clone()))
1246 };
1247 let numeric_mask = all_zero_one && entries.len() == dim && has_repeat;
1248 if all_bool || numeric_mask {
1249 if entries.len() != dim {
1250 return Err(format!(
1251 "periodic list length {} must match smooth dimension {}",
1252 entries.len(),
1253 dim
1254 ));
1255 }
1256 for (i, v) in entries.iter().enumerate() {
1257 axes[i] = matches!(v.as_str(), "true" | "yes" | "y" | "1");
1258 }
1259 } else {
1260 for axis_raw in entries {
1261 let axis = axis_raw
1262 .parse::<usize>()
1263 .map_err(|err| format!("invalid periodic axis '{axis_raw}': {err}"))?;
1264 if axis >= dim {
1265 return Err(format!(
1266 "periodic axis {axis} out of range for {dim}D smooth"
1267 ));
1268 }
1269 axes[axis] = true;
1270 }
1271 }
1272 }
1273 }
1274 }
1275 if let Some(raw) = options.get("boundary").or_else(|| options.get("bc")) {
1276 let boundary = parse_option_list(raw);
1277 if boundary.len() == dim {
1278 for (axis, value) in boundary.iter().enumerate() {
1279 if matches!(value.as_str(), "periodic" | "cyclic" | "cc") {
1280 axes[axis] = true;
1281 }
1282 }
1283 }
1284 }
1285 // A per-margin basis vector (`bs=c('cc','ps')` / `type=[...]`) declares each
1286 // margin's basis family, and a cyclic family (`cc`/`cp`/`cyclic`) makes THAT
1287 // margin periodic — exactly as the 1-D `s(x, bs='cc')` smooth wraps its lone
1288 // axis. Without this, the per-margin `cc` token was validated but discarded:
1289 // every `bs=c(...)` spelling collapsed to the same open B-spline tensor
1290 // (#1752). Only honor the vector form here; a scalar `bs='cc'` on a tensor is
1291 // ambiguous about which margins wrap, so it does not flip any axis on.
1292 if let Some(raw) = options.get("bs").or_else(|| options.get("type"))
1293 && bs_selector_is_vector(raw)
1294 {
1295 let per_margin = parse_option_list(raw);
1296 if per_margin.len() == dim {
1297 for (axis, margin_bs) in per_margin.iter().enumerate() {
1298 if matches!(
1299 canonicalize_smooth_type(margin_bs),
1300 "cc" | "cp" | "cyclic"
1301 ) {
1302 axes[axis] = true;
1303 }
1304 }
1305 }
1306 }
1307 Ok(axes)
1308}
1309
1310/// Validate the per-margin `boundary=`/`bc=` tokens on a tensor-product smooth.
1311///
1312/// The tensor `boundary`/`bc` list selects, per margin, whether the margin
1313/// *wraps* (a `periodic`/`cyclic`/`cc` token, consumed by
1314/// [`parse_tensor_periodic_axes`]) or is an ordinary non-periodic margin. In the
1315/// tensor DSL a *non-periodic* margin is spelled `clamped` — in the B-spline
1316/// sense of a **clamped knot vector**, i.e. the standard open spline that is
1317/// free at its two ends and does not wrap (exactly how the callers document it:
1318/// "non-periodic / clamped … free at the two ends, no wrap"). It is therefore an
1319/// inert marker here, not a zero-derivative endpoint reparameterization: a
1320/// cylinder `te(theta, z, boundary=['periodic','clamped'], …)` is a cyclic θ
1321/// margin tensor-producted with an ordinary open z margin, the direct analog of
1322/// mgcv `te(bs=c("cc","ps"))` / `te(bs=c("cc","cr"))`.
1323///
1324/// The periodic selectors and the inert non-periodic markers
1325/// (`clamped`/`open`/`natural`/`free`/`none`/empty) are accepted; anything else
1326/// (e.g. a genuine `anchored` zero-value endpoint constraint, which has no
1327/// ordinary-margin meaning in a tensor) is surfaced as a clean
1328/// unsupported-feature error rather than silently dropped. Previously `clamped`
1329/// itself was rejected, so the cylinder/torus mixed-boundary tensors — the exact
1330/// construction the manifold quality suite builds — could not be fit at all.
1331fn validate_tensor_boundary_tokens(
1332 options: &BTreeMap<String, String>,
1333 dim: usize,
1334) -> Result<(), String> {
1335 let Some(raw) = options.get("boundary").or_else(|| options.get("bc")) else {
1336 return Ok(());
1337 };
1338 let entries = parse_option_list(raw);
1339 for (axis, value) in entries.iter().enumerate() {
1340 let inert = matches!(
1341 value.trim().to_ascii_lowercase().as_str(),
1342 "clamped" | "open" | "natural" | "free" | "none" | "" | "periodic" | "cyclic" | "cc"
1343 );
1344 if !inert {
1345 return Err(TermBuilderError::unsupported_feature(format!(
1346 "tensor smooth margin {axis} boundary token '{value}' is not supported \
1347 (got bc/boundary={raw:?} on a {dim}-D tensor); tensor margins accept the periodic \
1348 selectors (periodic/cyclic/cc) or the non-periodic markers (clamped/open/natural/free). \
1349 Apply anchored/zero-value endpoint constraints with a 1-D s(x, bc=...) term instead."
1350 ))
1351 .to_string());
1352 }
1353 }
1354 Ok(())
1355}
1356
1357fn tensor_k_axis_option_axis(
1358 key: &str,
1359 cols: &[usize],
1360 ds: &Dataset,
1361) -> Result<Option<usize>, String> {
1362 let Some(suffix) = key.strip_prefix("k_") else {
1363 return Ok(None);
1364 };
1365 if suffix.is_empty() {
1366 return Err("tensor k axis option must be named k_<axis> or k_<variable>".to_string());
1367 }
1368 if let Ok(axis) = suffix.parse::<usize>() {
1369 return if axis < cols.len() {
1370 Ok(Some(axis))
1371 } else {
1372 Err(format!(
1373 "tensor k axis option `{key}` references axis {axis}, but the smooth has {} margins",
1374 cols.len()
1375 ))
1376 };
1377 }
1378
1379 let mut matches = cols
1380 .iter()
1381 .enumerate()
1382 .filter(|(_, col)| ds.headers.get(**col).is_some_and(|name| name == suffix))
1383 .map(|(axis, _)| axis);
1384 let first = matches.next();
1385 if matches.next().is_some() {
1386 return Err(format!(
1387 "tensor k axis option `{key}` matches more than one margin named `{suffix}`"
1388 ));
1389 }
1390 first.map(Some).ok_or_else(|| {
1391 let margin_names = cols
1392 .iter()
1393 .enumerate()
1394 .map(|(axis, col)| {
1395 let name = ds
1396 .headers
1397 .get(*col)
1398 .map(String::as_str)
1399 .unwrap_or("<unnamed>");
1400 format!("{axis}:{name}")
1401 })
1402 .collect::<Vec<_>>()
1403 .join(", ");
1404 format!(
1405 "tensor k axis option `{key}` does not match a margin index or name; tensor margins are [{margin_names}]"
1406 )
1407 })
1408}
1409
1410fn is_tensor_k_axis_option_key(key: &str) -> bool {
1411 key.strip_prefix("k_")
1412 .is_some_and(|suffix| !suffix.is_empty())
1413}
1414
1415/// Parse a per-margin basis dimension list (`k=<scalar>`, `k=[k0, k1, ...]`,
1416/// or axis aliases like `k_x=...` / `k_0=...`). A scalar is broadcast across
1417/// all axes; `None` returns the heuristic from the data column.
1418fn parse_tensor_k_list(
1419 options: &BTreeMap<String, String>,
1420 cols: &[usize],
1421 ds: &Dataset,
1422) -> Result<(Vec<usize>, bool), String> {
1423 let mut axis_values = vec![None; cols.len()];
1424 let mut saw_axis_alias = false;
1425 for (key, value) in options {
1426 let Some(axis) = tensor_k_axis_option_axis(key, cols, ds)? else {
1427 continue;
1428 };
1429 saw_axis_alias = true;
1430 if axis_values[axis].is_some() {
1431 return Err(format!("tensor k axis {axis} is specified more than once"));
1432 }
1433 let k: usize = value
1434 .parse()
1435 .map_err(|err| format!("invalid tensor k option `{key}={value}`: {err}"))?;
1436 axis_values[axis] = Some(k);
1437 }
1438
1439 let raw = options
1440 .get("k")
1441 .or_else(|| options.get("basis_dim"))
1442 .or_else(|| options.get("basis-dim"))
1443 .or_else(|| options.get("basisdim"));
1444 if saw_axis_alias {
1445 if raw.is_some() {
1446 return Err(
1447 "tensor k axis aliases cannot be combined with k= or basis_dim=".to_string(),
1448 );
1449 }
1450 if let Some(missing_axis) = axis_values.iter().position(Option::is_none) {
1451 let margin_name = cols
1452 .get(missing_axis)
1453 .and_then(|col| ds.headers.get(*col))
1454 .map(String::as_str)
1455 .unwrap_or("<unnamed>");
1456 return Err(format!(
1457 "tensor k axis aliases must specify every margin; missing axis {missing_axis} ({margin_name})"
1458 ));
1459 }
1460 return Ok((
1461 axis_values
1462 .into_iter()
1463 .map(|k| k.expect("missing axis values rejected above"))
1464 .collect(),
1465 false,
1466 ));
1467 }
1468 let Some(raw) = raw else {
1469 let inferred = heuristic_tensor_margin_knots(cols, ds);
1470 return Ok((inferred, true));
1471 };
1472 let entries = split_list_option(raw);
1473 if entries.len() == 1 {
1474 let k: usize = entries[0]
1475 .parse()
1476 .map_err(|err| format!("invalid tensor k '{}': {err}", entries[0]))?;
1477 return Ok((vec![k; cols.len()], false));
1478 }
1479 if entries.len() != cols.len() {
1480 return Err(format!(
1481 "tensor k list length {} must match smooth dimension {}",
1482 entries.len(),
1483 cols.len()
1484 ));
1485 }
1486 let mut out = Vec::with_capacity(entries.len());
1487 for entry in entries {
1488 let k: usize = entry
1489 .parse()
1490 .map_err(|err| format!("invalid tensor k '{entry}': {err}"))?;
1491 out.push(k);
1492 }
1493 Ok((out, false))
1494}
1495
1496/// Parse the `identifiability=` option for tensor-product smooths. Mirrors the
1497/// vocabulary of the Matern/Duchon parsers so the formula DSL is consistent.
1498///
1499/// `kind` selects the default identifiability when no explicit
1500/// `identifiability=` option is supplied: `te(...)` ([`SmoothKind::Te`]) keeps
1501/// the full-tensor sum-to-zero default, while `ti(...)` ([`SmoothKind::Ti`])
1502/// defaults to per-margin sum-to-zero so the marginal main effects are excluded
1503/// (the mgcv tensor-interaction semantics). An explicit option always wins.
1504fn parse_tensor_identifiability(
1505 options: &BTreeMap<String, String>,
1506 kind: SmoothKind,
1507) -> Result<TensorBSplineIdentifiability, String> {
1508 let Some(raw) = options.get("identifiability").map(String::as_str) else {
1509 return Ok(match kind {
1510 SmoothKind::Ti => TensorBSplineIdentifiability::MarginalSumToZero,
1511 _ => TensorBSplineIdentifiability::default(),
1512 });
1513 };
1514 match raw.trim().to_ascii_lowercase().as_str() {
1515 "none" => Ok(TensorBSplineIdentifiability::None),
1516 "sum_tozero" | "sum-to-zero" | "center_sum_tozero" | "center-sum-to-zero" | "centered"
1517 | "sumtozero" => Ok(TensorBSplineIdentifiability::SumToZero),
1518 "marginal_sum_tozero" | "marginal-sum-to-zero" | "marginal_sumtozero"
1519 | "marginalsumtozero" | "interaction" => {
1520 Ok(TensorBSplineIdentifiability::MarginalSumToZero)
1521 }
1522 other => Err(TermBuilderError::unsupported_feature(format!(
1523 "invalid tensor identifiability '{other}'; expected one of: none, sum_tozero, marginal_sum_tozero"
1524 ))
1525 .to_string()),
1526 }
1527}
1528
1529fn bspline_boundary_declares_periodic_axis(options: &BTreeMap<String, String>) -> bool {
1530 options
1531 .get("boundary")
1532 .or_else(|| options.get("bc"))
1533 .map(|raw| {
1534 parse_option_list(raw)
1535 .into_iter()
1536 .any(|value| matches!(value.as_str(), "periodic" | "cyclic" | "cc"))
1537 })
1538 .unwrap_or(false)
1539}
1540
1541/// Canonical-name lookup for the `bs=`/`type=` smooth selector.
1542///
1543/// User-facing names — including mgcv-compatible spellings whose semantics
1544/// match an existing gamfit smooth exactly — collapse to the engine-internal
1545/// canonical names used by the dispatch in [`build_smooth_basis`]. Adding a
1546/// new exactly-equivalent alias is a one-line entry here; the match arms
1547/// below remain the single dispatch site.
1548///
1549/// Aliases listed here MUST be true semantic equivalents of the canonical
1550/// target, not approximations. mgcv names whose semantics differ from any
1551/// gamfit smooth (e.g. `bs="ts"` shrinkage thin-plate, `bs="ad"` adaptive)
1552/// are intentionally NOT mapped here — they should reach the unsupported-type
1553/// path so users get a real diagnostic instead of a silent semantic
1554/// substitution. mgcv's `bs="cr"`/`"cs"` (cubic regression and its shrinkage
1555/// twin) are handled directly in the [`build_smooth_basis`] dispatch — they
1556/// are not aliased here because the `cr`/`cs` distinction controls a default
1557/// (`double_penalty`) that the canonical-name layer cannot see.
1558///
1559/// Unrecognised inputs pass through unchanged so the dispatch can produce its
1560/// usual "unsupported smooth type" error, preserving the existing diagnostic
1561/// surface for genuine typos.
1562pub(crate) fn canonicalize_smooth_type(raw: &str) -> &str {
1563 match raw {
1564 // Thin-plate spline. mgcv `bs="tp"` is the default thin-plate
1565 // regression spline — exact semantic equivalent of gamfit's `"tps"`.
1566 "tp" => "tps",
1567 // Gaussian process / Matérn. mgcv `bs="gp"` defaults to a Matérn
1568 // covariance kernel with REML smoothing parameter selection, which
1569 // matches gamfit's `"matern"` exactly (same kernel-Gram identity,
1570 // same REML route).
1571 "gp" => "matern",
1572 // Constant-curvature (M_κ) geodesic-kernel smooth (#944). All aliases
1573 // collapse to one canonical type so `bs="curv"`/`bs="mkappa"` cannot
1574 // diverge from `curv(...)`.
1575 "curv" | "constant_curvature" | "mkappa" => "curvature",
1576 // Measure-jet spline: multiscale local-jet-residual energy of the
1577 // empirical measure. No mgcv equivalent (mgcv has no measure-learned
1578 // geometry smooth), so no mgcv alias is mapped.
1579 "mjs" | "measure_jet" | "web" => "measurejet",
1580 other => other,
1581 }
1582}
1583
1584/// Is `margin_bs` a per-margin basis name that the tensor builder realizes as a
1585/// penalized 1-D B-spline margin?
1586///
1587/// gam's tensor product is built from penalized B-spline marginals. mgcv's
1588/// thin-plate (`tp`/`tps`), P-spline (`ps`), B-spline (`bs`), cubic-regression
1589/// (`cr`/`cs`), and cyclic (`cc`/`cp`/`cyclic`) marginals are all penalized
1590/// splines spanning the same per-axis smoothing space, so a B-spline margin
1591/// reproduces the same tensor smoothing class. Margin kinds with fundamentally
1592/// different structure (adaptive, random-effect, sphere) are NOT accepted as
1593/// tensor margins.
1594pub(crate) fn tensor_margin_bs_is_supported(margin_bs: &str) -> bool {
1595 matches!(
1596 canonicalize_smooth_type(margin_bs),
1597 "tps" | "ps" | "bs" | "bspline" | "cr" | "cs" | "cc" | "cp" | "cyclic"
1598 )
1599}
1600
1601/// Does the smooth request a periodic/cyclic axis via its options?
1602///
1603/// Mirrors the boundary-condition reading used by the periodic-aware dispatch
1604/// branches. Factored out so the type resolver and `build_smooth_basis` agree
1605/// on a single notion of "periodic requested".
1606pub(crate) fn smooth_options_declare_periodic(options: &BTreeMap<String, String>) -> bool {
1607 options.contains_key("periodic")
1608 || options.contains_key("cyclic")
1609 || options
1610 .get("boundary")
1611 .or_else(|| options.get("bc"))
1612 .map(|boundary| {
1613 boundary.to_ascii_lowercase().contains("periodic")
1614 || boundary.to_ascii_lowercase().contains("cyclic")
1615 })
1616 .unwrap_or(false)
1617}
1618
1619/// Resolve the canonical engine-internal smooth-type name for a term.
1620///
1621/// Reads the user-facing `type=`/`bs=` selector and collapses mgcv-compatible
1622/// aliases (`tp`→`tps`, `gp`→`matern`) via [`canonicalize_smooth_type`], or
1623/// derives the default from the smooth kind/arity when no selector is given.
1624/// This is the single source of truth for the dispatch in
1625/// [`build_smooth_basis`]; other call sites (e.g. predictor-specific basis
1626/// policy) use it so the classification never drifts from the dispatch.
1627/// Is the raw `bs=`/`type=` selector a vector literal (`c('tp','tp')`,
1628/// `['tp','tp']`, `(tp, tp)`) rather than a scalar smooth-type name?
1629///
1630/// mgcv's tensor smooths take a *per-margin* basis vector
1631/// (`te(x1, x2, bs=c('tp','tp'))`). Such a value is not a scalar canonical
1632/// type and must not be fed through [`canonicalize_smooth_type`] — it has to be
1633/// recognized as a tensor request and split into per-margin types. A scalar
1634/// selector (`bs="tp"`) is left untouched.
1635pub(crate) fn bs_selector_is_vector(raw: &str) -> bool {
1636 let trimmed = raw.trim();
1637 let bracketed = (trimmed.starts_with('[') && trimmed.ends_with(']'))
1638 || (trimmed.starts_with("c(") || trimmed.starts_with("C(")) && trimmed.ends_with(')')
1639 || (trimmed.starts_with('(') && trimmed.ends_with(')'));
1640 bracketed && !parse_option_list(trimmed).is_empty()
1641}
1642
1643pub fn resolve_smooth_type_name(
1644 kind: SmoothKind,
1645 n_cols: usize,
1646 options: &BTreeMap<String, String>,
1647) -> String {
1648 let selector = options.get("type").or_else(|| options.get("bs"));
1649 // A per-margin basis vector is a tensor request, never a scalar type. Route
1650 // it to the tensor builder, which reads the per-margin types out of the
1651 // same `bs=` option. (A vector on a non-tensor smooth is ill-formed and
1652 // falls through to the scalar path below so the existing diagnostic fires.)
1653 if let Some(raw) = selector
1654 && bs_selector_is_vector(raw)
1655 && matches!(kind, SmoothKind::Te | SmoothKind::Ti | SmoothKind::T2)
1656 {
1657 return "tensor".to_string();
1658 }
1659 selector
1660 .map(|s| canonicalize_smooth_type(&s.to_ascii_lowercase()).to_string())
1661 .unwrap_or_else(|| match kind {
1662 SmoothKind::Te | SmoothKind::Ti | SmoothKind::T2 => "tensor".to_string(),
1663 SmoothKind::S if n_cols == 1 => "bspline".to_string(),
1664 // Mixed periodic Euclidean radial kernels are not separable on the
1665 // cylinder. Use a tensor product with a cyclic margin so s(theta,h)
1666 // honors seam continuity while preserving the formula-level s(...).
1667 SmoothKind::S if smooth_options_declare_periodic(options) => "tensor".to_string(),
1668 SmoothKind::S => "tps".to_string(),
1669 })
1670}
1671
1672/// Does this canonical smooth type size its basis through the generous spatial
1673/// center heuristic ([`crate::basis::default_num_centers`])?
1674///
1675/// Only the radial spatial bases (thin-plate, Matérn/GP, Duchon) route their
1676/// default basis dimension through `plan_spatial_basis(.., Default, ..)`. The
1677/// B-spline, cyclic, tensor, and factor-smooth bases use their own modest
1678/// knot-based defaults, so they are unaffected by — and must not be perturbed
1679/// by — secondary-predictor basis-parsimony adjustments (#501).
1680pub fn smooth_type_uses_spatial_center_heuristic(canonical_type: &str) -> bool {
1681 matches!(canonical_type, "tps" | "matern" | "duchon")
1682}
1683
1684pub fn build_smooth_basis(
1685 kind: SmoothKind,
1686 vars: &[String],
1687 cols: &[usize],
1688 options: &BTreeMap<String, String>,
1689 ds: &Dataset,
1690 inference_notes: &mut Vec<String>,
1691 policy: &ResourcePolicy,
1692 smooth_coordinate_count: usize,
1693) -> Result<SmoothBasisSpec, String> {
1694 // Fail fast on degenerate input: a smooth whose (non-categorical) coordinate
1695 // columns collapse to a SINGLE distinct point can only ever fit the response
1696 // mean — its design matrix is rank-1. For a UNIVARIATE smooth this is exactly
1697 // "the one column is constant": `smooth(x)`/`matern(x)` on constant `x` would
1698 // otherwise silently fit the mean of `y` with no visible cue (Duchon already
1699 // errors loudly via the basis layer; this makes the diagnosis explicit and
1700 // uniform). For a MULTIVARIATE smooth (tensor, sphere, tps, ...) a single
1701 // constant coordinate is NOT degenerate — the basis still varies along the
1702 // other coordinate(s) and the penalty absorbs the rank-deficient direction
1703 // (e.g. a constant-longitude meridian arc on the sphere is a well-posed 1-D
1704 // slice of S²). Such a term is degenerate only when EVERY coordinate is
1705 // constant at once, i.e. the joint input is a single point. Test the JOINT
1706 // cardinality, not each column independently, so the loud diagnosis still
1707 // fires for the genuinely rank-1 case without rejecting well-posed
1708 // lower-dimensional slices.
1709 let coord_cols: Vec<(&String, usize)> = vars
1710 .iter()
1711 .zip(cols.iter().copied())
1712 .filter(|(_, col)| !matches!(ds.column_kinds.get(*col), Some(ColumnKindTag::Categorical)))
1713 .collect();
1714 if !coord_cols.is_empty() {
1715 let views: Vec<ArrayView1<'_, f64>> = coord_cols
1716 .iter()
1717 .map(|(_, col)| ds.values.column(*col))
1718 .collect();
1719 let n_rows = views[0].len();
1720 let mut distinct_points = std::collections::HashSet::<Vec<u64>>::new();
1721 for r in 0..n_rows {
1722 let key: Vec<u64> = views
1723 .iter()
1724 .map(|v| {
1725 let x = v[r];
1726 let norm = if x == 0.0 { 0.0 } else { x };
1727 norm.to_bits()
1728 })
1729 .collect();
1730 distinct_points.insert(key);
1731 if distinct_points.len() > 1 {
1732 break;
1733 }
1734 }
1735 if distinct_points.len() <= 1 {
1736 return Err(TermBuilderError::degenerate_data(if coord_cols.len() == 1 {
1737 let var = coord_cols[0].0;
1738 format!(
1739 "smooth term over '{var}' has only one unique value in the training data \
1740 — a smooth on a constant column is degenerate and would only fit the response mean. \
1741 Remove `{var}` from the smooth, drop the term, or check the data."
1742 )
1743 } else {
1744 let names = coord_cols
1745 .iter()
1746 .map(|(v, _)| v.as_str())
1747 .collect::<Vec<_>>()
1748 .join(", ");
1749 format!(
1750 "smooth term over ({names}) has only one unique joint coordinate in the training \
1751 data — every coordinate is constant, so the smooth is degenerate and would only \
1752 fit the response mean. Drop the term or check the data."
1753 )
1754 })
1755 .to_string());
1756 }
1757 }
1758 if let Some(by_name) = options.get("by").cloned() {
1759 let by_col = options
1760 .get("__by_col")
1761 .and_then(|raw| raw.parse::<usize>().ok())
1762 .or_else(|| vars.iter().position(|v| v == &by_name).map(|idx| cols[idx]))
1763 .ok_or_else(|| format!("unknown by= column '{by_name}'"))?;
1764 let mut inner_options = options.clone();
1765 inner_options.remove("by");
1766 inner_options.remove("__by_col");
1767 inner_options.remove("id");
1768 let inner = build_smooth_basis(
1769 kind,
1770 vars,
1771 cols,
1772 &inner_options,
1773 ds,
1774 inference_notes,
1775 policy,
1776 smooth_coordinate_count,
1777 )?;
1778 let by_kind = match ds.column_kinds.get(by_col).copied() {
1779 Some(ColumnKindTag::Categorical) => ByVarKind::Factor {
1780 feature_col: by_col,
1781 ordered: option_bool(options, "ordered").unwrap_or(false),
1782 frozen_levels: None,
1783 },
1784 Some(ColumnKindTag::Continuous | ColumnKindTag::Binary) => ByVarKind::Numeric {
1785 feature_col: by_col,
1786 },
1787 None => {
1788 return Err(format!(
1789 "internal column-kind lookup failed for by='{by_name}'"
1790 ));
1791 }
1792 };
1793 return Ok(SmoothBasisSpec::BySmooth {
1794 smooth: Box::new(inner),
1795 by_kind,
1796 });
1797 }
1798
1799 let smooth_double_penalty = option_bool(options, "double_penalty").unwrap_or(true);
1800 let type_opt = resolve_smooth_type_name(kind, cols.len(), options);
1801
1802 if matches!(type_opt.as_str(), "fs" | "sz" | "re") {
1803 validate_known_options(
1804 type_opt.as_str(),
1805 options,
1806 &[
1807 "type",
1808 "bs",
1809 "k",
1810 "basis_dim",
1811 "basis-dim",
1812 "basisdim",
1813 "knots",
1814 "knot_placement",
1815 "knot-placement",
1816 "knotplacement",
1817 "degree",
1818 "penalty_order",
1819 "m",
1820 "double_penalty",
1821 "ordered",
1822 ],
1823 )?;
1824 if cols.len() != 2 {
1825 return Err(format!(
1826 "{} factor-smooth currently expects exactly two variables (one numeric, one categorical)",
1827 type_opt
1828 ));
1829 }
1830 let kinds = cols
1831 .iter()
1832 .map(|&c| ds.column_kinds.get(c).copied())
1833 .collect::<Vec<_>>();
1834 let (cont_idx, group_idx) = if type_opt == "re" {
1835 // mgcv random-slope examples are often s(g, x, bs="re").
1836 match (kinds[0], kinds[1]) {
1837 (Some(ColumnKindTag::Categorical), _) => (1usize, 0usize),
1838 (_, Some(ColumnKindTag::Categorical)) => (0usize, 1usize),
1839 _ => (1usize, 0usize),
1840 }
1841 } else {
1842 match (kinds[0], kinds[1]) {
1843 (_, Some(ColumnKindTag::Categorical)) => (0usize, 1usize),
1844 (Some(ColumnKindTag::Categorical), _) => (1usize, 0usize),
1845 _ => {
1846 return Err(format!(
1847 "{} factor-smooth requires one categorical factor variable",
1848 type_opt
1849 ));
1850 }
1851 }
1852 };
1853 let c = cols[cont_idx];
1854 let (minv, maxv) = col_minmax(ds.values.column(c))?;
1855 let degree = if type_opt == "re" {
1856 1
1857 } else {
1858 option_usize(options, "degree").unwrap_or(DEFAULT_BSPLINE_DEGREE)
1859 };
1860 // For a factor smooth every group's curve is fit from THAT group's rows
1861 // alone, so the marginal's flexibility must respect the least-resolved
1862 // group, not the pooled column. The pooled heuristic can hand the marginal
1863 // a basis that saturates (or exceeds) a small group's sample — e.g. the
1864 // sleepstudy panel has 8 training days per subject, and a default cubic
1865 // basis of 8 functions interpolates each subject's 8 points, leaving no
1866 // room for the wiggliness penalty to collapse the curve toward the
1867 // per-subject line. The factor smooth then fits within-group noise and
1868 // extrapolates badly (held-out forecast worse than the population mean).
1869 //
1870 // Cap the marginal basis below the minimum per-group covariate resolution
1871 // so the penalty always retains residual degrees of freedom to shrink each
1872 // group's curvature toward its linear null space (the random-slope
1873 // estimand). This small-group cap composes with a separate upper bound at
1874 // mgcv's factor-smooth default k=10 (FACTOR_SMOOTH_DEFAULT_BASIS_DIM,
1875 // applied below), so even ample-data groups get the modest SHARED marginal
1876 // a factor smooth wants rather than the full pooled basis. The explicit
1877 // `re` random-effect form takes neither cap: it is a raw linear `[1, x]`
1878 // random effect (0 internal knots), handled in the branch above.
1879 let pooled_internal = heuristic_knots_for_column(ds.values.column(c));
1880 let default_internal = if type_opt == "re" {
1881 // `bs="re"` is a PARAMETRIC random effect, not a smooth of the
1882 // covariate: `s(x, g, bs="re")` is the mgcv random intercept+slope
1883 // `(1 + x | g)`, i.e. a per-group line `[1, x]`, penalized by an iid
1884 // ridge. A degree-1 marginal with ZERO internal knots spans exactly
1885 // that linear space (2 coefficients per group). Using the pooled
1886 // knot heuristic here instead turned the marginal into a
1887 // piecewise-linear B-spline (e.g. 6 functions/group on sleepstudy),
1888 // i.e. a *smooth* with kinks rather than a random slope — many extra
1889 // collinear-across-levels coefficients that ill-condition the joint
1890 // Newton/REML solve (minutes-long fits, and a singular block when
1891 // combined with a separate random intercept `s(g, bs="re")`). The
1892 // raw linear basis is both the correct `re` semantics and fast.
1893 0
1894 } else {
1895 let min_group_resolution =
1896 min_per_group_unique_count(ds.values.column(c), ds.values.column(cols[group_idx]));
1897 // Per-group basis dim = degree + 1 + internal. Hold it well below the
1898 // smallest group's resolution (leave at least two residual points per
1899 // group) so the smooth cannot interpolate that group and the
1900 // wiggliness penalty retains the room to collapse each curve toward
1901 // its linear null space. Never drop below `degree + 2`, which keeps
1902 // exactly the linear span plus a single curvature direction — the
1903 // minimal smoother that can still bend if the data demand it.
1904 let basis_cap = min_group_resolution.saturating_sub(2).max(degree + 2);
1905 let internal_cap = basis_cap.saturating_sub(degree + 1);
1906 let capped = pooled_internal.min(internal_cap.max(1));
1907 // A factor smooth (`fs` AND `sz`) shares ONE marginal across ALL
1908 // levels, each level's curve fit from that group's rows alone. The
1909 // pooled knot heuristic (driven by the full column's sample) hands it
1910 // a much richer basis than the shared signal needs — ~24
1911 // functions/group on the gam#903 factor-smooth-recovery fixtures — so
1912 // REML has the capacity to fit within-group noise and over-fits the
1913 // shared shape (fs: edf 58 vs mgcv's k=10/edf 39; sz: gam 0.068 vs
1914 // mgcv 0.046 truth RMSE), losing the truth-recovery head-to-head with
1915 // the mature tool. mgcv's factor-smooth default `k=10` embodies the
1916 // right convention: a modest shared marginal. Cap the marginal there
1917 // (basis ≈ degree+1+internal ≈ 10) for both flavours when the
1918 // small-group cap above is not already tighter, so REML is not handed
1919 // noise-fitting capacity it does not need. An explicit `k`/`basis_dim`
1920 // overrides this (parse_ps_internal_knots); `re` is the raw linear
1921 // effect handled above.
1922 let fs_default_internal = FACTOR_SMOOTH_DEFAULT_BASIS_DIM
1923 .saturating_sub(degree + 1)
1924 .max(1);
1925 capped.min(fs_default_internal)
1926 };
1927 let (n_knots, _, effective_degree) =
1928 parse_ps_internal_knots(options, degree, default_internal)?;
1929 let penalty_order = option_usize(options, "penalty_order")
1930 .unwrap_or(if effective_degree > 1 { 2 } else { 1 })
1931 .min(effective_degree);
1932 // All factor-smooth flavours (`fs`, `sz`, `re`) place their per-level
1933 // marginal on the SAME penalized B-spline (P-spline) basis. The flavours
1934 // differ ONLY in their penalty/constraint structure (handled below) —
1935 // sz: zero-sum deviation blocks with the per-level null space left
1936 // unpenalized; fs: random-effect double penalty; re: identity ridge.
1937 //
1938 // `sz` USED to route its default-degree marginal to a NATURAL cubic
1939 // regression spline (`cr`), on the belief that mgcv's `bs="sz"` does the
1940 // same and that cr recovers smooth signals more efficiently than the
1941 // (then uncapped) B-spline margin (#1074). That introduced a consistency
1942 // failure (#1605): the `cr` basis enforces the natural boundary
1943 // conditions f''(x_1)=f''(x_k)=0 and extrapolates linearly past the end
1944 // knots, so it CANNOT represent a per-group deviation curve with non-zero
1945 // curvature at the data boundary. Phase-shifted deviation shapes
1946 // (f''(0) = -(2π)² sin(φ) ≠ 0) are then biased toward "free linear +
1947 // anchored wiggle", under-shooting the amplitude — a bias that does NOT
1948 // vanish as n→∞ (n-independent: a genuine consistency failure, not
1949 // finite-sample shrinkage). The earlier #700/#1074 sz fixtures used
1950 // d_g ∝ sin(2πx), whose f'' happens to vanish at x=0 and x=1, so they
1951 // accidentally satisfied the natural BC and never exposed the gap; the
1952 // `fs` sibling, on this very B-spline marginal, recovers the SAME
1953 // phase-shifted data to the noise floor.
1954 //
1955 // The penalized B-spline marginal makes no boundary assumption, so it
1956 // represents arbitrary deviation shapes, and — with the
1957 // FACTOR_SMOOTH_DEFAULT_BASIS_DIM cap above already removing the
1958 // noise-fitting capacity that originally motivated leaving B-splines —
1959 // it recovers the BC-satisfying #700/#1074 signals just as well. Sharing
1960 // one marginal basis across all flavours also lets the B-spline degree/
1961 // knot degradation handle low-cardinality covariates uniformly (what
1962 // `fs` already does), so the `sz`-only cr data-support cap (#1541/#1542)
1963 // — and the asymmetry where only the cr-marginal `sz` spelling hard-
1964 // failed a 3-level ordinal — is no longer needed.
1965 let marginal_knotspec = resolve_nonperiodic_bspline_knotspec(
1966 options,
1967 ds.values.column(c),
1968 (minv, maxv),
1969 effective_degree,
1970 n_knots,
1971 )?;
1972 let marginal = BSplineBasisSpec {
1973 degree: effective_degree,
1974 penalty_order,
1975 knotspec: marginal_knotspec,
1976 // mgcv's `bs="fs"` is a random-effect-style smooth: EVERY per-level
1977 // coefficient, including the marginal null space, is penalized so
1978 // unobserved groups can be predicted — so `fs` keeps the null-space
1979 // (double) penalty. mgcv's `bs="sz"` is a pure across-level
1980 // *deviation* smooth that, under the default `select=FALSE`, leaves
1981 // the per-level null space UNPENALIZED; carrying the double penalty
1982 // there shrinks the genuine deviation signal and over-smooths the
1983 // recovered curves relative to mgcv (gam#700). `re` carries its own
1984 // identity ridge below and ignores this flag. Honour an explicit
1985 // user `double_penalty=` either way.
1986 double_penalty: option_bool(options, "double_penalty")
1987 .unwrap_or(type_opt.as_str() != "sz"),
1988 identifiability: BSplineIdentifiability::None,
1989 boundary_conditions: Default::default(),
1990 boundary: OneDimensionalBoundary::Open,
1991 };
1992 let flavour = match type_opt.as_str() {
1993 "fs" => FactorSmoothFlavour::Fs {
1994 m_null_penalty_orders: vec![
1995 option_usize(options, "m").unwrap_or(DEFAULT_PENALTY_ORDER),
1996 ],
1997 },
1998 "sz" => FactorSmoothFlavour::Sz,
1999 "re" => FactorSmoothFlavour::Re,
2000 // Outer `matches!` already restricts to fs/sz/re.
2001 other => {
2002 return Err(format!(
2003 "internal: factor-smooth flavour dispatch reached unexpected type `{}`",
2004 other
2005 ));
2006 }
2007 };
2008 return Ok(SmoothBasisSpec::FactorSmooth {
2009 spec: FactorSmoothSpec {
2010 continuous_cols: vec![c],
2011 group_col: cols[group_idx],
2012 marginal,
2013 flavour,
2014 group_frozen_levels: None,
2015 frozen_global_orthogonality: None,
2016 },
2017 });
2018 }
2019
2020 match type_opt.as_str() {
2021 "cyclic" | "cc" | "cp" | "cyclic-ps" => {
2022 validate_known_options(
2023 "cyclic",
2024 options,
2025 &[
2026 "type",
2027 "bs",
2028 "by",
2029 "k",
2030 "basis_dim",
2031 "basis-dim",
2032 "basisdim",
2033 "degree",
2034 "penalty_order",
2035 "period",
2036 "periods",
2037 "period_start",
2038 "period_end",
2039 "start",
2040 "end",
2041 "origin",
2042 "origins",
2043 "period_origin",
2044 "period-origin",
2045 "domain_origin",
2046 "double_penalty",
2047 "id",
2048 "__by_col",
2049 "identifiability",
2050 ],
2051 )?;
2052 if cols.len() != 1 {
2053 return Err(format!(
2054 "periodic smooth expects one variable, got {}",
2055 cols.len()
2056 ));
2057 }
2058 let c = cols[0];
2059 let (minv, maxv) = col_minmax(ds.values.column(c))?;
2060 let degree = option_usize(options, "degree").unwrap_or(DEFAULT_BSPLINE_DEGREE);
2061 let mut default_internal = heuristic_knots_for_column(ds.values.column(c));
2062 if ds.values.nrows() <= 32 && smooth_coordinate_count >= 5 {
2063 default_internal = default_internal.min(1);
2064 }
2065 // A periodic cubic spline has no free endpoint behaviour to spend
2066 // degrees of freedom on: the wrap constraint removes the ordinary
2067 // boundary wiggle, and the cyclic second-difference penalty leaves
2068 // only the constant direction (handled by the smooth
2069 // identifiability constraint). An over-rich default would give
2070 // small binomial/continuation-ratio fits a large penalized nuisance
2071 // space whose REML/LAML optimum is driven by finite-sample Bernoulli
2072 // noise rather than the low-frequency periodic signal. Cap the
2073 // cyclic default in the mgcv `bs="cc"` spirit: a modest basis unless
2074 // the caller explicitly requests `k=...`; high-frequency periodic
2075 // structure remains available through that explicit contract. Since
2076 // gam#1680 lowered the open-spline univariate default to ≈12
2077 // functions this cap and the open-spline default coincide, so it now
2078 // acts as an explicit floor/guard that keeps the cyclic default lean
2079 // even if the open-spline heuristic is later widened.
2080 let cyclic_default_basis_cap = CYCLIC_DEFAULT_BASIS_DIM.max(degree + 1);
2081 let default_basis = (default_internal + degree + 1).min(cyclic_default_basis_cap);
2082 let num_basis = option_usize_any(options, &["k", "basis_dim", "basis-dim", "basisdim"])
2083 .unwrap_or(default_basis);
2084 if num_basis < degree + 1 {
2085 return Err(format!(
2086 "periodic smooth: k={} too small for degree {}; expected k >= {}",
2087 num_basis,
2088 degree,
2089 degree + 1
2090 ));
2091 }
2092 // The cyclic arm is periodic on its single axis by construction, so
2093 // resolve the period exactly the way the `s()`/`ps` arm does: honour
2094 // `period=`/`periods=` first (with `origin=` setting the domain
2095 // start), and fall back to the `period_start`/`period_end` endpoint
2096 // form only when `period=` is absent. Previously this arm jumped
2097 // straight to `parse_periodic_domain_1d`, so a `period=<v>`
2098 // declaration was silently dropped and the smooth wrapped at the
2099 // data range (#816). All three helpers route through
2100 // `parse_numeric_expr`, so `period=2*pi` and `period_end=2*pi` parse
2101 // identically (#815).
2102 let periodic_axes = [true];
2103 let periods = parse_periods(options, &periodic_axes)?;
2104 let origins = parse_period_origins(options, &periodic_axes)?;
2105 // Distinguish a *cyclic basis selector* (`bs='cc'`/`cp'`/`cyclic`,
2106 // this whole arm) from a generic B-spline forced periodic by a
2107 // `periodic=`/`boundary=` flag (the `ps`/`bspline` arm). Only the
2108 // latter carries the sample-dependent off-by-ε seam that #1771's
2109 // guard in `parse_periodic_domain_1d` requires an explicit period
2110 // to avoid. A bare `s(x, bs='cc')` opts INTO mgcv's `bs="cc"`
2111 // semantics — the wrap IS the observed data range — exactly like
2112 // the tensor cc-margin fallback (`te(x, z, bs=c('cc','cc'))`). The
2113 // cyclic arm was left routing through the now-strict helper when
2114 // #1771 tightened it, so a bare cyclic smooth hard-errored with
2115 // "periodic B-spline smooth requires an explicit period" even
2116 // though its period is well-defined. Honor `period=`/`periods=`
2117 // first, then the half-open `period_start`/`period_end` endpoint
2118 // form, and only otherwise wrap at the observed `[min, max]` span.
2119 let has_endpoint_decl = ["period_start", "start", "period_end", "end"]
2120 .iter()
2121 .any(|key| options.contains_key(*key));
2122 let (domain_start, period) = if let Some(p) = periods[0] {
2123 (origins[0].unwrap_or(minv), p)
2124 } else if has_endpoint_decl {
2125 parse_periodic_domain_1d(options, minv, maxv)?
2126 } else {
2127 let span = maxv - minv;
2128 if !(span.is_finite() && span > 0.0) {
2129 return Err(format!(
2130 "cyclic smooth requires a positive observed data range to derive \
2131 its period, got [{minv}, {maxv}]"
2132 ));
2133 }
2134 (origins[0].unwrap_or(minv), span)
2135 };
2136 Ok(SmoothBasisSpec::BSpline1D {
2137 feature_col: c,
2138 spec: BSplineBasisSpec {
2139 degree,
2140 penalty_order: option_usize(options, "penalty_order")
2141 .unwrap_or(DEFAULT_PENALTY_ORDER),
2142 knotspec: BSplineKnotSpec::PeriodicUniform {
2143 data_range: (domain_start, domain_start + period),
2144 num_basis,
2145 },
2146 double_penalty: smooth_double_penalty,
2147 identifiability: BSplineIdentifiability::default(),
2148 boundary_conditions: Default::default(),
2149 boundary: OneDimensionalBoundary::Cyclic {
2150 start: domain_start,
2151 end: domain_start + period,
2152 },
2153 },
2154 })
2155 }
2156 "bspline" | "ps" | "p-spline" | "cr" | "cs" => {
2157 // mgcv's `bs="cr"` (cubic regression spline) and `bs="cs"` (its
2158 // shrinkage twin) are penalized cubic-regression smooths that span
2159 // the same per-axis function space as gamfit's `bspline` (cubic
2160 // B-spline, second-derivative penalty). Route both through the
2161 // 1-D B-spline arm; the only semantic difference is whether the
2162 // null space is shrunk: `cr` is the no-shrinkage form (mgcv's
2163 // default) and `cs` is the shrinkage form (mgcv's `cs`/gamfit's
2164 // double_penalty). Without this route, a stand-alone
2165 // `s(x, bs='cr')` (which is otherwise a routine 1-D smooth in
2166 // mgcv-compatible formulae) reached the dispatch's default arm
2167 // and aborted the whole fit with `unsupported smooth type 'cr'`,
2168 // even though the same name was already recognized as a tensor
2169 // margin (`tensor_margin_bs_is_supported`).
2170 let validation_name = match type_opt.as_str() {
2171 "cr" => "cr",
2172 "cs" => "cs",
2173 _ => "bspline",
2174 };
2175 validate_known_options(
2176 validation_name,
2177 options,
2178 &[
2179 "type",
2180 "bs",
2181 "by",
2182 "k",
2183 "basis_dim",
2184 "basis-dim",
2185 "basisdim",
2186 "knots",
2187 "knot_placement",
2188 "knot-placement",
2189 "knotplacement",
2190 "degree",
2191 "penalty_order",
2192 "boundary",
2193 "bc",
2194 "boundary_conditions",
2195 "bc_left",
2196 "bc_right",
2197 "left_bc",
2198 "right_bc",
2199 "start_bc",
2200 "end_bc",
2201 "side",
2202 "anchor",
2203 "anchor_value",
2204 "value",
2205 "anchor_left",
2206 "left_anchor",
2207 "anchor_right",
2208 "right_anchor",
2209 "periodic",
2210 "period",
2211 "periods",
2212 "period_start",
2213 "period_end",
2214 "origin",
2215 "double_penalty",
2216 "by",
2217 "id",
2218 "__by_col",
2219 "identifiability",
2220 "by",
2221 ],
2222 )?;
2223 if cols.len() != 1 {
2224 return Err(TermBuilderError::incompatible_config(format!(
2225 "bspline smooth expects one variable, got {}",
2226 cols.len()
2227 ))
2228 .to_string());
2229 }
2230 let c = cols[0];
2231 let (minv, maxv) = col_minmax(ds.values.column(c))?;
2232 let degree = option_usize(options, "degree").unwrap_or(DEFAULT_BSPLINE_DEGREE);
2233 let default_internal = heuristic_knots_for_column(ds.values.column(c));
2234 let (mut n_knots, inferred, effective_degree) =
2235 parse_ps_internal_knots(options, degree, default_internal)?;
2236 let periodic_axes = parse_periodic_axes(options, 1).map_err(|e| e.to_string())?;
2237 // Periodic margins still need enough basis functions to wrap, so
2238 // surface the per-axis degree reduction as a config error when the
2239 // user explicitly asked for a periodic-but-too-small basis. The
2240 // non-periodic path silently degrades degree to match mgcv.
2241 if periodic_axes[0] && effective_degree != degree {
2242 return Err(TermBuilderError::invalid_option(format!(
2243 "periodic smooth: k={} too small for degree {}; expected k >= {}",
2244 effective_degree + 1,
2245 degree,
2246 degree + 1
2247 ))
2248 .to_string());
2249 }
2250 if inferred && ds.values.nrows() <= 32 && smooth_coordinate_count >= 5 {
2251 n_knots = n_knots.min(1);
2252 }
2253 if inferred {
2254 let unique = unique_count_column(ds.values.column(c));
2255 let ceiling = ((unique as f64).cbrt() as usize).max(20);
2256 inference_notes.push(format!(
2257 "Automatically set {} internal knots for smooth '{}' from {} unique values (rule: clamp(unique/4, 4..max(20, cbrt(unique))) = clamp(unique/4, 4..{})). Override with knots=... or k=....",
2258 n_knots,
2259 vars.join(","),
2260 unique,
2261 ceiling,
2262 ));
2263 }
2264 let boundary_conditions =
2265 if periodic_axes[0] && bspline_boundary_declares_periodic_axis(options) {
2266 BSplineBoundaryConditions::default()
2267 } else {
2268 parse_bspline_boundary_conditions(options).map_err(|e| e.to_string())?
2269 };
2270 let periods = parse_periods(options, &periodic_axes).map_err(|e| e.to_string())?;
2271 let origins =
2272 parse_period_origins(options, &periodic_axes).map_err(|e| e.to_string())?;
2273 let (knotspec, boundary) = if periodic_axes[0] {
2274 if !boundary_conditions.is_free() {
2275 return Err(TermBuilderError::incompatible_config(
2276 "periodic B-splines cannot also declare endpoint boundary conditions",
2277 )
2278 .to_string());
2279 }
2280 {
2281 let (domain_start, p_value) = if periods[0].is_some() {
2282 (origins[0].unwrap_or(minv), periods[0].unwrap())
2283 } else {
2284 parse_periodic_domain_1d(options, minv, maxv).map_err(|e| e.to_string())?
2285 };
2286 let domain_end = domain_start + p_value;
2287 (
2288 BSplineKnotSpec::PeriodicUniform {
2289 data_range: (domain_start, domain_end),
2290 num_basis: n_knots + effective_degree + 1,
2291 },
2292 OneDimensionalBoundary::Cyclic {
2293 start: domain_start,
2294 end: domain_end,
2295 },
2296 )
2297 }
2298 } else if type_opt == "cr" || type_opt == "cs" {
2299 // mgcv `bs="cr"`/`"cs"`: a natural cubic regression spline whose
2300 // basis is indexed by `k` values at quantile-placed knots (#1074),
2301 // NOT a B-spline knot vector. Match gam's `k=` convention by
2302 // requesting the same total basis size the B-spline arm would
2303 // produce (`n_knots` internal + degree + 1), floored at the cr
2304 // minimum of 3 knots. `cr` vs `cs` (shrinkage) is carried by the
2305 // `double_penalty` flag resolved below, which the cr builder reads.
2306 //
2307 // Cap that request to the covariate's data support (#1541): a cr
2308 // basis cannot place more value-knots than there are distinct
2309 // covariate values, so an unclamped `k` on a low-cardinality
2310 // predictor (binary indicator, 3-level ordinal, small count) used
2311 // to hard-fail in `select_cr_knots` instead of reducing like mgcv
2312 // and gam's tensor path. Below the cr minimum (a binary covariate)
2313 // degrade to the B-spline marginal the default `s(x, k=..)` basis
2314 // already fits on the same data — never a hard error.
2315 let k_cr = (n_knots + effective_degree + 1).max(CR_MIN_KNOTS);
2316 let knotspec = match capped_cr_marginal_knotspec(
2317 ds.values.column(c),
2318 k_cr,
2319 &vars.join(","),
2320 inference_notes,
2321 )? {
2322 Some(cr_knotspec) => cr_knotspec,
2323 None => resolve_nonperiodic_bspline_knotspec(
2324 options,
2325 ds.values.column(c),
2326 (minv, maxv),
2327 effective_degree,
2328 n_knots,
2329 )?,
2330 };
2331 (knotspec, parse_cyclic_boundary(options, minv, maxv)?)
2332 } else {
2333 (
2334 resolve_nonperiodic_bspline_knotspec(
2335 options,
2336 ds.values.column(c),
2337 (minv, maxv),
2338 effective_degree,
2339 n_knots,
2340 )?,
2341 parse_cyclic_boundary(options, minv, maxv)?,
2342 )
2343 };
2344 // mgcv `bs="cr"` does not shrink the linear null space; only `cs`
2345 // (and the gamfit-flavoured `bspline`/`ps`) do. Honour an explicit
2346 // `double_penalty=` either way.
2347 let double_penalty = if type_opt == "cr" {
2348 option_bool(options, "double_penalty").unwrap_or(false)
2349 } else {
2350 smooth_double_penalty
2351 };
2352 // Clamp the marginal difference penalty to `<= effective_degree`
2353 // so it stays well-defined when the per-axis degree was reduced
2354 // (mirrors the tensor margin path: `create_difference_penalty_matrix`
2355 // requires order < num_basis_functions).
2356 let penalty_order = option_usize(options, "penalty_order")
2357 .unwrap_or(DEFAULT_PENALTY_ORDER)
2358 .min(effective_degree);
2359 Ok(SmoothBasisSpec::BSpline1D {
2360 feature_col: c,
2361 spec: BSplineBasisSpec {
2362 degree: effective_degree,
2363 penalty_order,
2364 knotspec,
2365 double_penalty,
2366 identifiability: BSplineIdentifiability::default(),
2367 boundary,
2368 boundary_conditions,
2369 },
2370 })
2371 }
2372 "tps" | "thinplate" | "thin-plate" => {
2373 validate_known_options(
2374 "thinplate",
2375 options,
2376 &[
2377 SECONDARY_CENTER_CAP_OPTION,
2378 "type",
2379 "bs",
2380 "by",
2381 "length_scale",
2382 "centers",
2383 "k",
2384 "basis_dim",
2385 "basis-dim",
2386 "basisdim",
2387 "knots",
2388 "include_intercept",
2389 "double_penalty",
2390 "by",
2391 "id",
2392 "__by_col",
2393 "identifiability",
2394 "by",
2395 "periodic",
2396 "cyclic",
2397 "period",
2398 "period_start",
2399 "period_end",
2400 "scale_dims",
2401 ],
2402 )?;
2403 let plan = plan_spatial_basis(
2404 ds.values.nrows(),
2405 cols.len(),
2406 CenterCountRequest::Default,
2407 DuchonNullspaceOrder::Linear,
2408 option_bool(options, "scale_dims").unwrap_or(false),
2409 policy,
2410 )
2411 .map_err(|e| e.to_string())?;
2412 // #1074: the mgcv-sized basis cap (`k = 10·3^(d-1)`) that used to live
2413 // here was DELETED. It masked the real defect — the n-scaling default
2414 // over-sizes a thin-plate field, producing a weakly-identified
2415 // two-penalty ρ-surface the outer optimizer stalls on (row-order
2416 // dependent, #1378), and surplus columns REML can't penalize away on
2417 // weak-signal fits. Capping the basis hid that stall instead of fixing
2418 // it. The default now uses the generic spatial center heuristic; the
2419 // root fix (a well-identified ρ-surface / optimizer that doesn't stall)
2420 // is tracked separately. Explicit `k`/`centers` still take full effect.
2421 let default_centers = plan.centers;
2422 let centers = parse_countwith_basis_alias(
2423 options,
2424 "centers",
2425 cap_default_spatial_centers(options, default_centers),
2426 )?;
2427 let center_strategy = if has_explicit_countwith_basis_alias(options, "centers") {
2428 spatial_center_strategy_for_dimension(centers, cols.len())
2429 } else {
2430 auto_spatial_center_strategy(centers, cols.len())
2431 };
2432 Ok(SmoothBasisSpec::ThinPlate {
2433 feature_cols: cols.to_vec(),
2434 spec: ThinPlateBasisSpec {
2435 center_strategy,
2436 periodic: parse_periodic_axes_option(options, cols.len())?,
2437 // Sentinel: leave at 0.0 when the user didn't pass an
2438 // explicit length_scale so `auto_init_length_scale_in_place`
2439 // can replace it with a data-derived initialization. The
2440 // old hard-coded 1.0 was the documented basin (see
2441 // smooth.rs `auto_init_length_scale_in_place`) that the
2442 // spatial optimizer could not escape, leaving TPS terms
2443 // initialized off the data scale.
2444 length_scale: option_f64(options, "length_scale").unwrap_or(0.0),
2445 double_penalty: smooth_double_penalty,
2446 identifiability: parse_spatial_identifiability(options)
2447 .map_err(|e| e.to_string())?,
2448 radial_reparam: None,
2449 },
2450 input_scales: None,
2451 })
2452 }
2453 "sphere" | "s2" | "sos" => {
2454 validate_known_options(
2455 "sphere",
2456 options,
2457 &[
2458 "type",
2459 "bs",
2460 "by",
2461 "centers",
2462 "k",
2463 "basis_dim",
2464 "basis-dim",
2465 "basisdim",
2466 "knots",
2467 "penalty_order",
2468 "m",
2469 "double_penalty",
2470 "id",
2471 "__by_col",
2472 "kernel",
2473 "method",
2474 "radians",
2475 "units",
2476 "degree",
2477 "l",
2478 "max_degree",
2479 "max-degree",
2480 ],
2481 )?;
2482 if cols.len() != 2 {
2483 return Err(format!(
2484 "sphere smooth expects exactly two variables (lat, lon), got {}",
2485 cols.len()
2486 ));
2487 }
2488 let radians = option_bool(options, "radians").unwrap_or_else(|| {
2489 options
2490 .get("units")
2491 .map(|u| u.eq_ignore_ascii_case("radian") || u.eq_ignore_ascii_case("radians"))
2492 .unwrap_or(false)
2493 });
2494 // An explicit `degree`/`l`/`max_degree` names a spherical-harmonic
2495 // truncation, so with no explicit kernel/method it selects the
2496 // Harmonic construction (the Wahba kernel ignores `degree` and would
2497 // silently emit a 1-column kernel design). An explicit kernel/method
2498 // still wins.
2499 let degree_requested = options.contains_key("degree")
2500 || options.contains_key("l")
2501 || options.contains_key("max_degree")
2502 || options.contains_key("max-degree");
2503 let kernel = options
2504 .get("kernel")
2505 .or_else(|| options.get("method"))
2506 .map(|raw| strip_quotes(raw).trim().to_ascii_lowercase())
2507 .unwrap_or_else(|| {
2508 if degree_requested {
2509 "harmonic".to_string()
2510 } else {
2511 "sobolev".to_string()
2512 }
2513 });
2514 let (method, wahba_kernel) = match kernel.as_str() {
2515 "sobolev" | "wahba" | "wahba_sobolev" | "wahba-sobolev" => {
2516 (SphereMethod::Wahba, SphereWahbaKernel::Sobolev)
2517 }
2518 "pseudo" | "mgcv" | "sos" | "wahba_pseudo" | "wahba-pseudo" => {
2519 (SphereMethod::Wahba, SphereWahbaKernel::Pseudo)
2520 }
2521 "harmonic" | "spherical_harmonic" | "spherical-harmonic" => {
2522 (SphereMethod::Harmonic, SphereWahbaKernel::Sobolev)
2523 }
2524 other => {
2525 return Err(format!(
2526 "unsupported sphere kernel '{other}'; expected sobolev, pseudo, or harmonic"
2527 ));
2528 }
2529 };
2530 let max_degree = if matches!(method, SphereMethod::Harmonic) {
2531 let degree =
2532 option_usize_any(options, &["degree", "l", "max_degree", "max-degree"])
2533 .or_else(|| option_usize(options, "centers"))
2534 .or_else(|| {
2535 option_usize_any(options, &["k", "basis_dim", "basis-dim", "basisdim"])
2536 .and_then(|k| (1..=128).find(|&l| l * (l + 2) >= k))
2537 })
2538 .unwrap_or_else(|| default_spherical_harmonic_degree(ds.values.nrows()));
2539 if degree == 0 {
2540 return Err("sphere smooth requires degree/max_degree >= 1".to_string());
2541 }
2542 if degree > 32 {
2543 return Err(format!(
2544 "sphere smooth max_degree={} is too large for the dense harmonic engine (limit 32)",
2545 degree
2546 ));
2547 }
2548 Some(degree)
2549 } else {
2550 None
2551 };
2552 let penalty_order = option_usize(options, "penalty_order")
2553 .or_else(|| option_usize(options, "m"))
2554 .unwrap_or(DEFAULT_PENALTY_ORDER);
2555 let center_strategy = if matches!(method, SphereMethod::Wahba) {
2556 let mut centers = parse_countwith_basis_alias(
2557 options,
2558 "centers",
2559 default_num_centers(ds.values.nrows(), cols.len()),
2560 )?;
2561 if penalty_order >= 4 {
2562 centers = centers.max(30);
2563 }
2564 CenterStrategy::FarthestPoint {
2565 num_centers: centers,
2566 }
2567 } else {
2568 CenterStrategy::FarthestPoint { num_centers: 0 }
2569 };
2570 Ok(SmoothBasisSpec::Sphere {
2571 feature_cols: cols.to_vec(),
2572 spec: SphericalSplineBasisSpec {
2573 center_strategy,
2574 penalty_order,
2575 double_penalty: smooth_double_penalty,
2576 radians,
2577 method,
2578 max_degree,
2579 wahba_kernel,
2580 identifiability: SphericalSplineIdentifiability::CenterSumToZero,
2581 },
2582 })
2583 }
2584 "curvature" => {
2585 // Constant-curvature (M_κ) geodesic-kernel smooth (#944): the
2586 // κ-generic sibling of the intrinsic S² smooth above. The feature
2587 // columns are κ-stereographic chart coordinates; `kappa=` is the
2588 // fixed sectional curvature (default 0 = flat), and the geometry
2589 // comes from `geometry::constant_curvature::ConstantCurvature`.
2590 validate_known_options(
2591 "curvature",
2592 options,
2593 &[
2594 "type",
2595 "bs",
2596 "by",
2597 "centers",
2598 "k",
2599 "basis_dim",
2600 "basis-dim",
2601 "basisdim",
2602 "knots",
2603 "kappa",
2604 "length_scale",
2605 "double_penalty",
2606 "id",
2607 "__by_col",
2608 ],
2609 )?;
2610 let kappa = option_f64(options, "kappa").unwrap_or(0.0);
2611 if !kappa.is_finite() {
2612 return Err("curvature smooth requires a finite kappa".to_string());
2613 }
2614 let length_scale = option_f64(options, "length_scale").unwrap_or(0.0);
2615 if !length_scale.is_finite() || length_scale < 0.0 {
2616 return Err(format!(
2617 "curvature smooth length_scale must be positive (or omitted for auto); got {length_scale}"
2618 ));
2619 }
2620 let centers = parse_countwith_basis_alias(
2621 options,
2622 "centers",
2623 default_num_centers(ds.values.nrows(), cols.len()),
2624 )?;
2625 if centers < 2 {
2626 return Err("curvature smooth requires at least 2 centers".to_string());
2627 }
2628 Ok(SmoothBasisSpec::ConstantCurvature {
2629 feature_cols: cols.to_vec(),
2630 spec: ConstantCurvatureBasisSpec {
2631 center_strategy: CenterStrategy::FarthestPoint {
2632 num_centers: centers,
2633 },
2634 kappa,
2635 // 0.0 sentinel = κ-independent auto initialization in the
2636 // basis builder (median chart center spacing, doubled).
2637 length_scale,
2638 // Curvature smooth defaults to NO double-penalty ridge
2639 // (#1464): the curvature-blind ridge `I` absorbs the data fit
2640 // independently of κ and rails the fitted curvature to the
2641 // +chart bound (hyperbolic truth recovered as spherical). The
2642 // RKHS Gram penalty is already full-rank PD, so the ridge adds
2643 // no stability. Honour an EXPLICIT `double_penalty=` only.
2644 double_penalty: option_bool(options, "double_penalty").unwrap_or(false),
2645 identifiability: ConstantCurvatureIdentifiability::CenterSumToZero,
2646 },
2647 })
2648 }
2649 "measurejet" => {
2650 // Measure-jet spline: multiscale local-jet-residual energy of the
2651 // empirical measure. The feature columns are ambient coordinates
2652 // of data concentrated near an unknown low-dimensional set; the
2653 // geometry (centers, masses, scale band) is read off the measure
2654 // at build time — magic by default, every option optional.
2655 validate_known_options(
2656 "measurejet",
2657 options,
2658 &[
2659 "type",
2660 "bs",
2661 "by",
2662 "centers",
2663 "k",
2664 "basis_dim",
2665 "basis-dim",
2666 "basisdim",
2667 "knots",
2668 "s",
2669 "alpha",
2670 "tau",
2671 "scales",
2672 "length_scale",
2673 "double_penalty",
2674 "multiscale",
2675 "learn_length_scale",
2676 "id",
2677 "__by_col",
2678 ],
2679 )?;
2680 let order_s = option_f64(options, "s").unwrap_or(0.0);
2681 // 0.0 = auto sentinel; explicit values must sit inside the
2682 // admissible order interval of the affine-jet (r = 2) energy.
2683 if !(order_s.is_finite() && (order_s == 0.0 || (order_s > 0.0 && order_s < 2.0))) {
2684 return Err(format!(
2685 "measurejet smooth s must lie in (0, 2) (or be omitted for auto); got {order_s}"
2686 ));
2687 }
2688 // Default to the spec Default (α = 1, density-WEIGHTED Hessian
2689 // energy — the module-header default). The density-free α = 3/2
2690 // (q^{−2}) over-smooths low-intrinsic-dimension manifolds where the
2691 // local mass q is tiny and varies along the stratum (#1116:
2692 // 13×-worse-than-matérn on a 1-D curve in 3-D); α = 1's q^{−1} is
2693 // gentler and robust across intrinsic dimensions. An explicit
2694 // `alpha=` still overrides for full-dimensional density-free use.
2695 let alpha =
2696 option_f64(options, "alpha").unwrap_or(MeasureJetBasisSpec::default().alpha);
2697 if !alpha.is_finite() {
2698 return Err("measurejet smooth requires a finite alpha".to_string());
2699 }
2700 let tau0 = option_f64(options, "tau").unwrap_or(1e-3);
2701 if !(tau0.is_finite() && tau0 >= 0.0) {
2702 return Err(format!(
2703 "measurejet smooth tau must be finite and nonnegative; got {tau0}"
2704 ));
2705 }
2706 let num_scales = option_usize(options, "scales").unwrap_or(0);
2707 let length_scale = option_f64(options, "length_scale").unwrap_or(0.0);
2708 if !length_scale.is_finite() || length_scale < 0.0 {
2709 return Err(format!(
2710 "measurejet smooth length_scale must be positive (or omitted for auto); got {length_scale}"
2711 ));
2712 }
2713 let centers = parse_countwith_basis_alias(
2714 options,
2715 "centers",
2716 default_num_centers(ds.values.nrows(), cols.len()),
2717 )?;
2718 if centers < 3 {
2719 return Err("measurejet smooth requires at least 3 centers".to_string());
2720 }
2721 // Multiscale (per-scale spectral split + (α, lnτ) ψ dials + the
2722 // affine-preserving ridge) is an explicit opt-in (#1116): default
2723 // single-scale at any center count, the Duchon/Matérn footprint.
2724 let multiscale = option_bool(options, "multiscale").unwrap_or(false);
2725 // REML-learning the representer range ℓ is an explicit opt-in.
2726 // The stable default freezes ℓ at the auto/user value; the
2727 // design-moving coordinate is expensive and can overfit low-signal
2728 // surfaces when enabled implicitly.
2729 let learn_length_scale = option_bool(options, "learn_length_scale").unwrap_or(false);
2730 Ok(SmoothBasisSpec::MeasureJet {
2731 feature_cols: cols.to_vec(),
2732 spec: MeasureJetBasisSpec {
2733 center_strategy: CenterStrategy::FarthestPoint {
2734 num_centers: centers,
2735 },
2736 order_s,
2737 alpha,
2738 tau0,
2739 num_scales,
2740 // 0.0 sentinel = auto initialization in the basis builder
2741 // (median nearest-center spacing).
2742 length_scale,
2743 double_penalty: smooth_double_penalty,
2744 learn_length_scale,
2745 multiscale,
2746 identifiability: MeasureJetIdentifiability::CenterSumToZero,
2747 frozen_quadrature: None,
2748 },
2749 input_scales: None,
2750 })
2751 }
2752 "matern" => {
2753 // Catch typos like `lengt_scale=` / `nyu=` / `centerz=` before
2754 // they get silently ignored and the user wonders why their
2755 // option had no effect. The matern() term accepts exactly
2756 // these options.
2757 validate_known_options(
2758 "matern",
2759 options,
2760 &[
2761 SECONDARY_CENTER_CAP_OPTION,
2762 "type",
2763 "bs",
2764 "by",
2765 "nu",
2766 "length_scale",
2767 "centers",
2768 "k",
2769 "basis_dim",
2770 "basis-dim",
2771 "basisdim",
2772 "knots",
2773 "include_intercept",
2774 "double_penalty",
2775 "by",
2776 "id",
2777 "__by_col",
2778 "identifiability",
2779 "by",
2780 "periodic",
2781 "cyclic",
2782 "period",
2783 "period_start",
2784 "period_end",
2785 "scale_dims",
2786 ],
2787 )?;
2788 let plan = plan_spatial_basis(
2789 ds.values.nrows(),
2790 cols.len(),
2791 CenterCountRequest::Default,
2792 DuchonNullspaceOrder::Zero,
2793 option_bool(options, "scale_dims").unwrap_or(false),
2794 policy,
2795 )
2796 .map_err(|e| e.to_string())?;
2797 let centers = parse_countwith_basis_alias(
2798 options,
2799 "centers",
2800 cap_default_spatial_centers(
2801 options,
2802 default_matern_center_count(ds.values.nrows(), cols.len(), plan.centers),
2803 ),
2804 )?;
2805 let center_strategy = if has_explicit_countwith_basis_alias(options, "centers") {
2806 spatial_center_strategy_for_dimension(centers, cols.len())
2807 } else {
2808 auto_spatial_center_strategy(centers, cols.len())
2809 };
2810 let nu = parse_matern_nu(options.get("nu").map(String::as_str).unwrap_or("5/2"))?;
2811 // The exponential (ν = 1/2) Matérn kernel has a singular Laplacian
2812 // at zero in d ≥ 2, so the operator-collocation penalty machinery
2813 // hits a non-invertible matrix during fit. Surface the cause
2814 // up-front instead of letting the user see the generic
2815 // "Matrix conditioning issue detected" wrapper from PIRLS.
2816 if matches!(nu, MaternNu::Half) && cols.len() >= 2 {
2817 return Err(TermBuilderError::unsupported_feature(format!(
2818 "matern() with nu=1/2 is not supported for d>=2 (got {} covariates): \
2819 the exponential kernel's Laplacian is singular at center collisions, \
2820 which makes the operator-collocation penalty non-invertible. \
2821 Choose nu>=3/2 (e.g. nu=3/2 or the default nu=5/2) for multi-dimensional smooths.",
2822 cols.len()
2823 ))
2824 .to_string());
2825 }
2826 let aniso_log_scales = if option_bool(options, "scale_dims").unwrap_or(false) {
2827 Some(vec![0.0; cols.len()])
2828 } else {
2829 None
2830 };
2831 Ok(SmoothBasisSpec::Matern {
2832 feature_cols: cols.to_vec(),
2833 spec: MaternBasisSpec {
2834 center_strategy,
2835 periodic: parse_periodic_axes_option(options, cols.len())?,
2836 // Sentinel: leave at 0.0 when the user didn't pass an
2837 // explicit length_scale so the planner's
2838 // `auto_init_length_scale_in_place` can replace it with the
2839 // SAME data-derived wiggly-side initialization the thin-plate
2840 // path uses (`max_range / sqrt(n)`), then let the κ-optimizer
2841 // refine from there.
2842 //
2843 // gam#1629: the previous `default_matern_length_scale` seeded
2844 // the FULL data diameter — the maximally over-smoothed corner.
2845 // Because that value is non-zero, the `0.0`-gated auto-init was
2846 // a no-op for Matérn, so the κ-optimizer started in the flat
2847 // over-smoothed basin and parked there, leaving high-frequency
2848 // 2-D surfaces unresolved (truth-RMSE ~6× worse than
2849 // thin-plate/tensor on identical data, and insensitive to `k`).
2850 // Routing Matérn through the same `0.0` sentinel as thin-plate
2851 // (see the ThinPlate branch above) starts REML in the resolving
2852 // regime it can actually escape from.
2853 length_scale: option_f64(options, "length_scale").unwrap_or(0.0),
2854 nu,
2855 include_intercept: option_bool(options, "include_intercept").unwrap_or(false),
2856 double_penalty: smooth_double_penalty,
2857 identifiability: parse_matern_identifiability(options)
2858 .map_err(|e| e.to_string())?,
2859 aniso_log_scales,
2860 // Cold build: let the bootstrap-κ spectral test decide whether
2861 // the double-penalty nullspace shrinkage survives; the freeze
2862 // step then pins that decision into the FrozenTransform so the
2863 // κ-optimizer's rebuilds keep the count invariant (gam#787/#860).
2864 nullspace_shrinkage_survived: None,
2865 },
2866 input_scales: None,
2867 })
2868 }
2869 "duchon" => {
2870 validate_known_options(
2871 "duchon",
2872 options,
2873 &[
2874 SECONDARY_CENTER_CAP_OPTION,
2875 "type",
2876 "bs",
2877 "by",
2878 "length_scale",
2879 "centers",
2880 "k",
2881 "basis_dim",
2882 "basis-dim",
2883 "basisdim",
2884 "knots",
2885 "power",
2886 "p",
2887 "nullspace_order",
2888 "order",
2889 "identifiability",
2890 "by",
2891 "periodic",
2892 "cyclic",
2893 "period",
2894 "period_start",
2895 "period_end",
2896 "scale_dims",
2897 "double_penalty",
2898 "by",
2899 "id",
2900 "__by_col",
2901 ],
2902 )?;
2903 if options.contains_key("double_penalty") {
2904 return Err(TermBuilderError::incompatible_config(format!(
2905 "Duchon smooth '{}' does not support double_penalty; the Duchon smoother already ships its native reproducing-norm penalty plus a null-space shrinkage ridge.",
2906 vars.join(", ")
2907 ))
2908 .to_string());
2909 }
2910 let requested_nullspace_order = parse_duchon_order(options)?;
2911 let length_scale = option_f64_strict(options, "length_scale")?;
2912 // Resolve `(nullspace_order, power)`. The default (magic) path is a
2913 // structural amplitude/slope/curvature smoother: an affine (`Linear`)
2914 // polynomial nullspace and spectral power `s = (d - 1)/2`, giving the
2915 // cubic kernel `r^3` in 1D. There is no nullspace-order escalation —
2916 // the structural cubic smoother is well-defined for every dimension.
2917 //
2918 // Explicit `power=...` honors the user's value verbatim against their
2919 // requested nullspace order; the kernel validator emits a precise
2920 // diagnostic for any inadmissible combination. In the scale-free
2921 // (non-hybrid) regime fractional powers are admitted and threaded as
2922 // `f64`. The hybrid Duchon-Matérn kernel (`length_scale=Some`) is
2923 // restricted to integer powers.
2924 let (nullspace_order, power) = match parse_duchon_power_policy(options)? {
2925 DuchonPowerPolicy::Explicit(req_power) => {
2926 if length_scale.is_some() && req_power.fract() != 0.0 {
2927 return Err(TermBuilderError::incompatible_config(format!(
2928 "hybrid Duchon-Matern smooth '{}' (length_scale=...) requires an integer power, got power={}; \
2929 drop length_scale to use the scale-free structural kernel with a fractional power.",
2930 vars.join(", "),
2931 req_power,
2932 ))
2933 .to_string());
2934 }
2935 (requested_nullspace_order, req_power)
2936 }
2937 DuchonPowerPolicy::CubicStructuralDefault => {
2938 // Magic cubic rule (REQUEST-LAYER default): no explicit power ⇒
2939 // affine null space + fractional spectral power s = (d-1)/2, i.e.
2940 // the Duchon kernel φ(r)=r³ in every dimension. An EXPLICIT
2941 // `power=0` is handled above and is honored as the s=0 Duchon
2942 // kernel (r²·log r ≡ the thin-plate kernel in even d) — the magic
2943 // default lives here, not in the basis builder.
2944 match length_scale {
2945 None => crate::basis::duchon_cubic_default(cols.len()),
2946 Some(_) => {
2947 // The hybrid Matérn-blended kernel (`length_scale=Some`)
2948 // requires an INTEGER spectral power `s` (the partial-
2949 // fraction split `1/(ρ^{2p}(κ²+ρ²)^s)` is only defined for
2950 // integer `s`). The fractional cubic default `s=(d-1)/2` is
2951 // a half-integer for even `d`, and the basis builder's
2952 // `power_as_usize` maps a NON-integer to `0` (not its
2953 // floor) — so for even `d ≥ 4` the realized kernel has
2954 // `2(p+s) = 2p = 4 ≤ d`, which is non-finite at the origin
2955 // and crashes the fit (historically a non-finite
2956 // eigendecomposition; now a fit-time validation error).
2957 //
2958 // Rather than emit the fractional cubic and let it truncate
2959 // into an inadmissible kernel, resolve the SMALLEST
2960 // admissible integer `(nullspace, s)` at the requested
2961 // nullspace order, honoring the collocation order of the
2962 // default operator penalties (mass + tension ⇒ D1). This
2963 // recovers the canonical thin-plate smoothness order
2964 // `m = p + s = ⌊d/2⌋ + 1` for the hybrid kernel and agrees
2965 // with the fractional cubic default for odd `d` (where the
2966 // collocation floor already forces `s = (d-1)/2`).
2967 let max_op = crate::basis::duchon_max_active_operator_derivative_order(
2968 &DuchonOperatorPenaltySpec::default(),
2969 );
2970 let (ns, s) = crate::basis::resolve_duchon_orders(
2971 cols.len(),
2972 requested_nullspace_order,
2973 max_op,
2974 length_scale,
2975 );
2976 (ns, s as f64)
2977 }
2978 }
2979 }
2980 };
2981 let plan = plan_spatial_basis(
2982 ds.values.nrows(),
2983 cols.len(),
2984 CenterCountRequest::Default,
2985 nullspace_order,
2986 option_bool(options, "scale_dims").unwrap_or(false),
2987 policy,
2988 )
2989 .map_err(|e| e.to_string())?;
2990 let centers_explicit = has_explicit_countwith_basis_alias(options, "centers");
2991 let requested_centers = parse_countwith_basis_alias(
2992 options,
2993 "centers",
2994 cap_default_spatial_centers(options, plan.centers),
2995 )?;
2996 let polynomial_cols = match nullspace_order {
2997 DuchonNullspaceOrder::Zero => 1,
2998 DuchonNullspaceOrder::Linear => cols.len() + 1,
2999 DuchonNullspaceOrder::Degree(degree) => {
3000 crate::basis::duchon_nullspace_dimension(cols.len(), degree)
3001 }
3002 };
3003 if requested_centers <= polynomial_cols {
3004 return Err(TermBuilderError::incompatible_config(format!(
3005 "Duchon smooth '{}' requested basis dimension {} but order={:?} in {}D needs {} polynomial null-space columns; choose centers/k > {}",
3006 vars.join(", "),
3007 requested_centers,
3008 nullspace_order,
3009 cols.len(),
3010 polynomial_cols,
3011 polynomial_cols,
3012 ))
3013 .to_string());
3014 }
3015 let mut centers = requested_centers;
3016 if !centers_explicit && ds.values.nrows() <= 32 && smooth_coordinate_count >= 5 {
3017 centers = centers.max(polynomial_cols + 4);
3018 }
3019 let center_strategy = if centers_explicit {
3020 spatial_center_strategy_for_dimension(centers, cols.len())
3021 } else {
3022 auto_spatial_center_strategy(centers, cols.len())
3023 };
3024 let aniso_log_scales = if option_bool(options, "scale_dims").unwrap_or(false) {
3025 Some(vec![0.0; cols.len()])
3026 } else {
3027 None
3028 };
3029 // The default is the full Hilbert scale (curvature `Primary` + trend
3030 // ridge + mass + tension); REML deselects what the data don't support.
3031 let operator_penalties = DuchonOperatorPenaltySpec::default();
3032 // For a 1-D periodic Duchon with no EXPLICIT period, anchor the wrap
3033 // to the covariate DATA range rather than letting the basis builder
3034 // derive it from the (k-subsampled) center span. The center span is a
3035 // strict subset of the data and undershoots the true period, seaming
3036 // the curve (f(0) ≠ f(2π)); the data range is the caller's actual
3037 // domain. Honors any explicit `period=` (parse_periodic_axes_option
3038 // already threaded it) and leaves multi-D / non-periodic untouched.
3039 let mut periodic = parse_periodic_axes_option(options, cols.len())?;
3040 if cols.len() == 1
3041 && let Some(axes) = periodic.as_mut()
3042 && axes.len() == 1
3043 && axes[0].is_none()
3044 {
3045 let (minv, maxv) = col_minmax(ds.values.column(cols[0]))?;
3046 if maxv > minv {
3047 axes[0] = Some(maxv - minv);
3048 }
3049 }
3050 Ok(SmoothBasisSpec::Duchon {
3051 feature_cols: cols.to_vec(),
3052 spec: DuchonBasisSpec {
3053 center_strategy,
3054 periodic,
3055 length_scale,
3056 power,
3057 nullspace_order,
3058 identifiability: parse_spatial_identifiability(options)
3059 .map_err(|e| e.to_string())?,
3060 aniso_log_scales,
3061 operator_penalties,
3062 boundary: if cols.len() == 1 {
3063 let c = cols[0];
3064 let (minv, maxv) = col_minmax(ds.values.column(c))?;
3065 parse_cyclic_boundary(options, minv, maxv)?
3066 } else {
3067 OneDimensionalBoundary::Open
3068 },
3069 radial_reparam: None,
3070 },
3071 input_scales: None,
3072 })
3073 }
3074 "tensor" | "te" | "ti" | "t2" => {
3075 validate_known_options(
3076 "tensor",
3077 options,
3078 &[
3079 "type",
3080 "bs",
3081 "by",
3082 "k",
3083 "basis_dim",
3084 "basis-dim",
3085 "basisdim",
3086 "knot_placement",
3087 "knot-placement",
3088 "knotplacement",
3089 "degree",
3090 "penalty_order",
3091 "double_penalty",
3092 "periodic",
3093 "cyclic",
3094 "period",
3095 "periods",
3096 "period_start",
3097 "period_end",
3098 "origin",
3099 "origins",
3100 "period_origin",
3101 "period-origin",
3102 "domain_origin",
3103 "boundary",
3104 "bc",
3105 "identifiability",
3106 "id",
3107 "__by_col",
3108 ],
3109 )?;
3110 if cols.len() < 2 {
3111 return Err(TermBuilderError::incompatible_config(format!(
3112 "tensor smooth expects at least 2 variables, got {}",
3113 cols.len()
3114 ))
3115 .to_string());
3116 }
3117 let dim = cols.len();
3118
3119 // Tensor-product contract (#1082). `te(x1, x2, ...)` ALWAYS builds a
3120 // genuine anisotropic tensor product of per-margin bases (the arm
3121 // below), exactly as mgcv's `te()` does — one smoothing parameter per
3122 // margin, a marginal-Kronecker-sum penalty, and the bilinear null
3123 // space left unpenalized under the default `select = FALSE`. A margin
3124 // vector `bs=c('tp','tp')` requests a thin-plate FUNCTION SPACE per
3125 // axis; the tensor realizes each axis as a 1-D penalized B-spline
3126 // margin spanning that same per-axis space (tp/ps/cr/bs/cc all share
3127 // it). We deliberately do NOT silently swap the requested tensor for a
3128 // single multi-D ISOTROPIC thin-plate radial smooth (`s(x,y,bs='tp')`):
3129 // that is a different model — one isotropic smoothing parameter, no
3130 // per-margin anisotropy — and substituting it while the user wrote a
3131 // tensor formula is dishonest. A user who genuinely wants the isotropic
3132 // radial smooth asks for it directly with `s(x1, x2, bs='tp')`.
3133 // Per-margin basis vector (`bs=c('tp','tp')` / `bs=['ps','cr']`):
3134 // validate each requested margin is a penalized-spline basis that
3135 // the tensor product realizes as a 1-D B-spline margin. mgcv's
3136 // `tp`/`ps`/`cr`/`bs`/`cc` margins are all penalized splines over
3137 // the same per-axis function space, so a B-spline margin recovers
3138 // the same tensor smoothing space; genuinely different margin kinds
3139 // (e.g. adaptive `ad`, random `re`) are rejected loudly rather than
3140 // silently substituted.
3141 if let Some(raw) = options.get("bs").or_else(|| options.get("type"))
3142 && bs_selector_is_vector(raw)
3143 {
3144 let per_margin = parse_option_list(raw);
3145 if per_margin.len() != dim {
3146 return Err(TermBuilderError::invalid_option(format!(
3147 "tensor smooth per-margin bs vector has {} entries but the smooth has {} margins",
3148 per_margin.len(),
3149 dim
3150 ))
3151 .to_string());
3152 }
3153 for (axis, margin_bs) in per_margin.iter().enumerate() {
3154 if !tensor_margin_bs_is_supported(margin_bs) {
3155 return Err(TermBuilderError::unsupported_feature(format!(
3156 "tensor smooth margin {axis} basis '{margin_bs}' is not a supported penalized-spline margin; \
3157 tensor margins accept tp/tps/ps/bs/cr/cc"
3158 ))
3159 .to_string());
3160 }
3161 }
3162 }
3163 let periodic_axes = parse_tensor_periodic_axes(options, dim)?;
3164 validate_tensor_boundary_tokens(options, dim)?;
3165 let periods_opt = parse_periods(options, &periodic_axes)?;
3166 let origins_opt = parse_period_origins(options, &periodic_axes)?;
3167 let degree = option_usize(options, "degree").unwrap_or(DEFAULT_BSPLINE_DEGREE);
3168 let penalty_order =
3169 option_usize(options, "penalty_order").unwrap_or(if degree > 1 { 2 } else { 1 });
3170 let (mut k_list, k_inferred) = parse_tensor_k_list(options, cols, ds)?;
3171 if ds.values.nrows() <= 32 && smooth_coordinate_count >= 5 {
3172 for k in &mut k_list {
3173 *k = (*k).min(degree + 2);
3174 }
3175 }
3176 if k_inferred {
3177 inference_notes.push(format!(
3178 "Automatically set per-margin basis sizes {:?} for tensor smooth '{}' \
3179 (dimension-aware tensor budget: total ∏k kept near the mgcv-te default \
3180 and within the data support, distributed geometrically across margins and \
3181 capped per margin by each column's resolution). \
3182 Override with k=<int> or k=[k0,k1,...].",
3183 k_list,
3184 vars.join(",")
3185 ));
3186 }
3187 // Per-axis requested marginal basis family. mgcv's `te()`/`ti()`
3188 // default marginal basis is the cubic regression spline (`cr`), and
3189 // the te_3d quality gap (#1074) is precisely the marginal-basis
3190 // resolution at small `k`: a `cr` margin places k value-knots at
3191 // data quantiles (finer interior resolution under natural boundary
3192 // constraints) where the cubic B-spline margin has only
3193 // `k-degree-1` interior knots. Resolve each axis to either an
3194 // explicit per-margin `bs` (vector `bs=c('cr','ps')`), a single
3195 // scalar `bs`, or the unset default — and route
3196 // `cr`/`cs`/unset/`tp`/`tps` margins through the natural cubic
3197 // regression builder (`NaturalCubicRegression` knotspec), keeping
3198 // explicit `ps`/`bs`/`bspline` on the B-spline margin.
3199 let per_axis_bs: Vec<Option<String>> =
3200 match options.get("bs").or_else(|| options.get("type")) {
3201 Some(raw) if bs_selector_is_vector(raw) => {
3202 let list = parse_option_list(raw);
3203 (0..dim).map(|a| list.get(a).cloned()).collect()
3204 }
3205 Some(raw) => {
3206 let scalar = raw
3207 .trim()
3208 .trim_matches('"')
3209 .trim_matches('\'')
3210 .to_ascii_lowercase();
3211 vec![Some(scalar); dim]
3212 }
3213 None => vec![None; dim],
3214 };
3215 // A margin is realized as a natural cubic regression spline when it
3216 // is the (unset) mgcv default, an explicit `cr`/`cs`, or a
3217 // `tp`/`tps` (same per-axis penalized-spline space). Explicit
3218 // B-spline-family margins (`ps`/`bs`/`bspline`/`p-spline`) keep the
3219 // open B-spline margin.
3220 let margin_wants_cr = |bs: &Option<String>| -> bool {
3221 matches!(
3222 bs.as_deref(),
3223 None | Some("cr") | Some("cs") | Some("tp") | Some("tps")
3224 )
3225 };
3226 let mut margins: Vec<BSplineBasisSpec> = Vec::with_capacity(dim);
3227 let mut emitted_periods: Vec<Option<f64>> = Vec::with_capacity(dim);
3228 for axis in 0..dim {
3229 let c = cols[axis];
3230 let (data_min, data_max) = col_minmax(ds.values.column(c))?;
3231 // mgcv reduces a tensor margin's basis dimension to what its data
3232 // can support: a cr or B-spline margin cannot place more value
3233 // knots / basis functions than there are DISTINCT covariate
3234 // values on that axis. Without this cap an explicit `k` on a
3235 // low-cardinality margin — e.g. the binary `badh ∈ {0,1}` in
3236 // `te(age, badh, k=5)` — hard-failed in `select_cr_knots` ("cubic
3237 // regression spline with k=5 requires at least 5 distinct values,
3238 // got 2") instead of degrading to the 2-function (linear) margin
3239 // mgcv builds there. The auto-`k` path already caps per margin via
3240 // `heuristic_tensor_margin_knots`; mirror that for explicit `k`.
3241 // The cap propagates correctly: every per-axis quantity below
3242 // (effective degree, knot set, penalty order) is derived from
3243 // `k_axis`, and the marginal basis size is read from the resulting
3244 // knot spec — never from `k_list`. Floor at 2 so a margin still
3245 // carries at least a linear basis (tensor margins require k >= 2).
3246 let k_requested = k_list[axis];
3247 let n_distinct_axis = unique_count_column(ds.values.column(c));
3248 let k_axis = k_requested.min(n_distinct_axis).max(2);
3249 if k_axis < k_requested {
3250 log::info!(
3251 "tensor smooth: margin axis {axis} requested k={k_requested}, but the \
3252 covariate has only {n_distinct_axis} distinct value(s); reducing this \
3253 margin to k={k_axis} (mgcv-style data-support cap on the per-axis basis)."
3254 );
3255 }
3256 // Per-axis effective spline degree. The B-spline basis with `k`
3257 // functions is well-defined for any `degree <= k - 1`; mgcv's
3258 // `te(...)` exploits this so a binary tensor margin
3259 // (`k=2` → linear basis) or a ternary margin (`k=3` → quadratic)
3260 // can coexist with a smoother continuous margin under one
3261 // shared `degree=` request. We mirror that: if the caller
3262 // explicitly asks for `k < degree + 1`, drop the degree on
3263 // THAT axis only to the largest feasible spline, and track the
3264 // penalty order so the marginal difference penalty stays
3265 // well-defined (`order < num_basis_functions` is required by
3266 // `create_difference_penalty_matrix`). Periodic axes still
3267 // need enough basis functions to wrap; reject k there.
3268 if k_axis < 2 {
3269 return Err(TermBuilderError::invalid_option(format!(
3270 "tensor smooth: k[{axis}]={k_axis} too small; tensor margins require k >= 2"
3271 ))
3272 .to_string());
3273 }
3274 if periodic_axes[axis] && k_axis < degree + 1 {
3275 return Err(TermBuilderError::invalid_option(format!(
3276 "tensor smooth: periodic axis {axis} requires k >= {} for degree {degree}, got k={k_axis}",
3277 degree + 1
3278 ))
3279 .to_string());
3280 }
3281 let effective_degree = degree.min(k_axis - 1).max(1);
3282 let effective_penalty_order = penalty_order.min(effective_degree);
3283 // A `cc`/`cp`/`cyclic` per-margin basis declares periodicity
3284 // without necessarily supplying a `period=`: mgcv's `bs="cc"`
3285 // wraps at the covariate's observed data range. Mirror the 1-D
3286 // cyclic fallback (`parse_periodic_domain_1d`) here so a bare
3287 // `te(x, z, bs=c('cc','cc'))` wraps each margin on its own
3288 // [min, max] span instead of hard-erroring (#1752).
3289 let margin_is_cc = matches!(
3290 canonicalize_smooth_type(per_axis_bs[axis].as_deref().unwrap_or("")),
3291 "cc" | "cp" | "cyclic"
3292 );
3293 let (knotspec, boundary, axis_period) = if periodic_axes[axis] {
3294 // A `cc`/`cp`/`cyclic` per-margin basis declares periodicity
3295 // without necessarily supplying a `period=`; in that case wrap
3296 // at the covariate's observed [min, max] span, mirroring the
3297 // 1-D cyclic fallback (`parse_periodic_domain_1d`) so a bare
3298 // `te(x, z, bs=c('cc','cc'))` wraps each margin on its own
3299 // range instead of hard-erroring (#1752). An axis made
3300 // periodic by an explicit `periodic=`/`boundary=` selector
3301 // (not a cyclic margin basis) still requires an explicit
3302 // `period=`: a data-derived period there is a sample-dependent
3303 // off-by-ε seam and is not inferred.
3304 let (domain_start, period_value) = match periods_opt[axis] {
3305 Some(period_value) => {
3306 if !period_value.is_finite() || period_value <= 0.0 {
3307 return Err(format!(
3308 "tensor smooth axis {axis}: period must be a positive finite value, got {period_value}"
3309 ));
3310 }
3311 (origins_opt[axis].unwrap_or(data_min), period_value)
3312 }
3313 None if margin_is_cc => {
3314 let span = data_max - data_min;
3315 if !span.is_finite() || span <= 0.0 {
3316 return Err(format!(
3317 "tensor smooth axis {axis}: cyclic margin requires a positive \
3318 observed data range to derive its period, got [{data_min}, {data_max}]"
3319 ));
3320 }
3321 (origins_opt[axis].unwrap_or(data_min), span)
3322 }
3323 None => {
3324 return Err(format!(
3325 "tensor smooth axis {axis} is periodic but requires an explicit \
3326 period: pass period=<value> (scalar) or period=[..., <value>, ...]. \
3327 Deriving the period from the observed data range is sample-dependent \
3328 (off-by-ε seam), so it is not inferred."
3329 ));
3330 }
3331 };
3332 let domain_end = domain_start + period_value;
3333 (
3334 BSplineKnotSpec::PeriodicUniform {
3335 data_range: (domain_start, domain_end),
3336 num_basis: k_axis,
3337 },
3338 OneDimensionalBoundary::Cyclic {
3339 start: domain_start,
3340 end: domain_end,
3341 },
3342 Some(period_value),
3343 )
3344 } else if margin_wants_cr(&per_axis_bs[axis]) && k_axis >= 3 {
3345 // mgcv `te()`/`ti()` default cr margin: place exactly
3346 // `k_axis` Lancaster–Salkauskas value-knots at data
3347 // quantiles. The cr basis dimension equals the knot count,
3348 // so this reproduces the requested per-margin `k` directly.
3349 // A natural cubic regression spline needs at least 3 knots
3350 // (one interior); a `k_axis < 3` margin (e.g. a binary
3351 // tensor axis requesting a linear margin) falls through to
3352 // the B-spline branch below, exactly as before #1074 — mgcv
3353 // likewise does not build a `cr` margin below k=3.
3354 let cr_knots =
3355 crate::basis::select_cr_knots(ds.values.column(c), k_axis)
3356 .map_err(|e| e.to_string())?;
3357 (
3358 BSplineKnotSpec::NaturalCubicRegression { knots: cr_knots },
3359 OneDimensionalBoundary::Open,
3360 None,
3361 )
3362 } else {
3363 // `num_internal_knots = k - degree - 1` reproduces the
3364 // requested basis size exactly when degree was reduced for
3365 // a low-cardinality margin; keep the legacy `.max(1)`
3366 // floor on the un-reduced path so the existing knot
3367 // geometry is unchanged whenever the user already passed
3368 // k >= degree + 1.
3369 let num_internal_knots = if effective_degree < degree {
3370 k_axis.saturating_sub(effective_degree + 1)
3371 } else {
3372 k_axis.saturating_sub(degree + 1).max(1)
3373 };
3374 let knotspec = match parse_knot_placement(options)? {
3375 crate::basis::BSplineKnotPlacement::Uniform => BSplineKnotSpec::Generate {
3376 data_range: (data_min, data_max),
3377 num_internal_knots,
3378 },
3379 crate::basis::BSplineKnotPlacement::Quantile => {
3380 crate::basis::auto_knot_vector_1d_quantile(
3381 ds.values.column(c),
3382 num_internal_knots,
3383 effective_degree,
3384 )
3385 .map_err(|e| e.to_string())?;
3386 BSplineKnotSpec::Automatic {
3387 num_internal_knots: Some(num_internal_knots),
3388 placement: crate::basis::BSplineKnotPlacement::Quantile,
3389 }
3390 }
3391 };
3392 (knotspec, OneDimensionalBoundary::Open, None)
3393 };
3394 // A `cr` margin fixes cubic regression geometry; the cr builder
3395 // reads only the knot set + `double_penalty`. Enable null-space
3396 // shrinkage for an explicit `cs` margin. B-spline margins keep
3397 // the resolved effective degree / penalty order with no extra
3398 // null-space penalty (mgcv `select = FALSE` tensor default).
3399 let is_cr_margin =
3400 matches!(knotspec, BSplineKnotSpec::NaturalCubicRegression { .. });
3401 let margin_double_penalty =
3402 is_cr_margin && matches!(per_axis_bs[axis].as_deref(), Some("cs"));
3403 margins.push(BSplineBasisSpec {
3404 degree: effective_degree,
3405 penalty_order: effective_penalty_order,
3406 knotspec,
3407 double_penalty: margin_double_penalty,
3408 identifiability: BSplineIdentifiability::None,
3409 boundary,
3410 boundary_conditions: BSplineBoundaryConditions::default(),
3411 });
3412 emitted_periods.push(axis_period);
3413 }
3414 // #1593: canonicalize the margin order so a tensor smooth is invariant
3415 // to the typed order of its covariates. `te(x, z)` and `te(z, x)` span
3416 // the IDENTICAL tensor-product space under the identical per-margin
3417 // penalty family, but the design is the Khatri–Rao product
3418 // `B_first ⊙ B_second`, so the typed order permutes the design columns
3419 // (and the per-margin penalty blocks `S_first⊗I`, `I⊗S_second`). That
3420 // permutation is a pure relabelling in exact arithmetic — REML is
3421 // invariant to it — yet it reorders the penalized normal-equation / REML
3422 // eigen/Cholesky linear algebra, and the resulting sub-ULP differences
3423 // route the outer λ optimizer to a different terminal point in te's flat
3424 // REML valley (the over-smoothed margin rails to the ρ bound while the
3425 // other lands on a materially different λ̂). So the shipped surface
3426 // drifted ~2–6 % of range with a cosmetic swap of the covariate order
3427 // (the #1378 row-permutation / #1456 rotation flat-valley gauge family).
3428 // Sorting the margins by their source feature-column index makes the same
3429 // physical model build the identical problem regardless of typed order,
3430 // so the fit — and every prediction rebuilt from the resolved spec — is
3431 // genuinely order-invariant. `ti`/`t2` share this arm and become exactly
3432 // invariant too (they were already ~1e-5 by centring each margin
3433 // separately; canonicalization makes the swap bit-identical).
3434 let canon_cols: Vec<usize> = {
3435 let mut perm: Vec<usize> = (0..dim).collect();
3436 perm.sort_by_key(|&a| cols[a]);
3437 if perm.iter().enumerate().any(|(i, &a)| i != a) {
3438 margins = perm.iter().map(|&a| margins[a].clone()).collect();
3439 emitted_periods = perm.iter().map(|&a| emitted_periods[a]).collect();
3440 }
3441 perm.iter().map(|&a| cols[a]).collect()
3442 };
3443 let any_periodic = emitted_periods.iter().any(|p| p.is_some());
3444 let periods_vec = if any_periodic {
3445 emitted_periods
3446 } else {
3447 Vec::new()
3448 };
3449 // Tensor smooths (`te`/`ti`/`t2`) must match mgcv's DEFAULT
3450 // `select = FALSE`: the joint null space of the per-margin
3451 // penalties — the bilinear, low-order interaction directions that
3452 // no marginal roughness operator can see — is left UNPENALIZED.
3453 // mgcv only adds a null-space shrinkage penalty there under the
3454 // opt-in `select = TRUE` (which gam exposes as `double_penalty`).
3455 //
3456 // The general smooth default (`smooth_double_penalty`, true) is
3457 // calibrated for 1-D `s()` terms; carrying it into tensors silently
3458 // shrinks the genuinely-present bilinear interaction signal, so
3459 // REML places positive weight on the extra ridge and systematically
3460 // OVER-SMOOTHS the recovered surface relative to mgcv's plain
3461 // `te`/`ti` (gam#700/#701/#702/#703). Default tensors to no extra
3462 // null-space penalty; an explicit user `double_penalty=`/`select=`
3463 // still wins.
3464 let tensor_double_penalty = option_bool(options, "double_penalty").unwrap_or(false);
3465 Ok(SmoothBasisSpec::TensorBSpline {
3466 feature_cols: canon_cols,
3467 spec: TensorBSplineSpec {
3468 marginalspecs: margins,
3469 periods: periods_vec,
3470 double_penalty: tensor_double_penalty,
3471 identifiability: parse_tensor_identifiability(options, kind)?,
3472 // `t2` selects mgcv's separable (Wood, Scheipl & Faraway
3473 // 2013) decomposition. It can arrive either as the `t2(...)`
3474 // function form (`SmoothKind::T2`) or as a `type="t2"` /
3475 // `bs="t2"` option on an `s(...)`/`te(...)` term, in which
3476 // case `kind` is *not* `T2` but the resolved type string is
3477 // "t2". Keying only off `kind` silently aliased the option
3478 // form to `te`'s Kronecker-sum penalty (gam#1185); key off
3479 // the resolved type string as well so both routes build the
3480 // separable penalty.
3481 penalty_decomposition: if matches!(kind, SmoothKind::T2)
3482 || type_opt.as_str() == "t2"
3483 {
3484 TensorBSplinePenaltyDecomposition::Separable
3485 } else {
3486 TensorBSplinePenaltyDecomposition::MarginalKroneckerSum
3487 },
3488 },
3489 })
3490 }
3491 "pca" => {
3492 validate_known_options(
3493 "pca",
3494 options,
3495 &[
3496 "type",
3497 "bs",
3498 "by",
3499 "k",
3500 "basis_dim",
3501 "basis-dim",
3502 "basisdim",
3503 "lazy_path",
3504 "path",
3505 "pca_basis_path",
3506 "chunk_size",
3507 "smooth_penalty",
3508 "centered",
3509 "double_penalty",
3510 "id",
3511 "__by_col",
3512 ],
3513 )?;
3514 let path = options
3515 .get("lazy_path")
3516 .or_else(|| options.get("pca_basis_path"))
3517 .or_else(|| options.get("path"))
3518 .map(|raw| PathBuf::from(strip_quotes(raw)));
3519 let Some(path) = path else {
3520 return Err(TermBuilderError::incompatible_config(
3521 "pca smooth requires lazy_path=... on the formula path",
3522 )
3523 .to_string());
3524 };
3525 let k = option_usize_any(options, &["k", "basis_dim", "basis-dim", "basisdim"])
3526 .unwrap_or(0);
3527 let chunk_size = option_usize(options, "chunk_size").unwrap_or(DEFAULT_PCA_CHUNK_SIZE);
3528 Ok(SmoothBasisSpec::Pca {
3529 feature_cols: cols.to_vec(),
3530 basis_matrix: Array2::<f64>::zeros((cols.len(), k)),
3531 centered: option_bool(options, "centered").unwrap_or(true),
3532 smooth_penalty: option_f64(options, "smooth_penalty").unwrap_or(1.0),
3533 center_mean: None,
3534 pca_basis_path: Some(path),
3535 chunk_size,
3536 })
3537 }
3538 other => Err(TermBuilderError::unsupported_feature(format!(
3539 "unsupported smooth type '{other}'"
3540 ))
3541 .to_string()),
3542 }
3543}
3544
3545/// Initialise per-axis anisotropic log-scales on eligible spatial smooth specs.
3546pub fn enable_scale_dimensions(spec: &mut TermCollectionSpec) {
3547 for smooth in spec.smooth_terms.iter_mut() {
3548 // A multi-axis thin-plate term cannot carry per-axis anisotropy on its
3549 // single curvature penalty, so `scale_dimensions` was historically a
3550 // silent no-op for `bs="tp"` (gam#1676). Rewrite it to the
3551 // mathematically-equivalent anisotropic s=0 Duchon spline first; the
3552 // Duchon arm below then sees an already-seeded `aniso_log_scales` and
3553 // leaves it untouched.
3554 promote_thin_plate_for_scale_dimensions(&mut smooth.basis);
3555 match &mut smooth.basis {
3556 SmoothBasisSpec::Matern {
3557 feature_cols,
3558 spec: matern,
3559 ..
3560 } => {
3561 if matern.aniso_log_scales.is_none() {
3562 let d = feature_cols.len();
3563 matern.aniso_log_scales = Some(vec![0.0; d]);
3564 }
3565 }
3566 SmoothBasisSpec::Duchon {
3567 feature_cols,
3568 spec: duchon,
3569 ..
3570 } => {
3571 if duchon.aniso_log_scales.is_none() {
3572 let d = feature_cols.len();
3573 duchon.aniso_log_scales = Some(vec![0.0; d]);
3574 }
3575 }
3576 _ => {}
3577 }
3578 }
3579}
3580
3581/// Rewrite a multi-axis thin-plate term into the mathematically-equivalent
3582/// anisotropic s=0 Duchon spline so that `scale_dimensions` genuinely engages
3583/// (gam#1676).
3584///
3585/// ## Why a rewrite rather than a new field on the TPS builder
3586///
3587/// A canonical thin-plate regression spline carries a *single* curvature
3588/// penalty — the exact `∫|Dᵐ f|²` reproducing-kernel Gram. That penalty has no
3589/// per-axis structure to make one direction more or less relevant than another,
3590/// so per-axis anisotropy (`scale_dimensions`) cannot be expressed on it. The
3591/// flag was therefore a silent no-op for `bs="tp"` while it engaged for
3592/// `duchon()`/`matern()`.
3593///
3594/// The thin-plate kernel `r^{2m−d}` (the `r²·log r` log-case in even `d`) is
3595/// *exactly* the s=0 Duchon kernel (`DuchonBasisSpec::power = 0`,
3596/// `length_scale = None`) at the matching polynomial null-space order
3597/// `m = thin_plate_penalty_order(d)`. The Duchon polyharmonic family already
3598/// carries the per-axis tension ARD that `scale_dimensions` requests: its
3599/// isotropic first-order roughness penalty `Σ‖∇f‖²` splits into `d` directional
3600/// penalties `Σ(∂f/∂x_a)²`, each with its own REML `λ_a`
3601/// (`duchon_operator_penalty_candidates`). So the well-posed *anisotropic
3602/// thin-plate spline is the anisotropic s=0 Duchon spline*. Rewriting to that
3603/// representation reuses the battle-tested Duchon anisotropy / ψ-derivative /
3604/// freeze / predict machinery instead of duplicating it onto the TPS metadata
3605/// path, and keeps the polyharmonic family internally consistent. The codebase
3606/// already promotes infeasible-`k` TPS to Duchon for the same reason (the
3607/// canonical TPS single curvature penalty cannot deliver a requested
3608/// capability); per-axis anisotropy is another such capability.
3609///
3610/// This fires *only* when the user opts into `scale_dimensions`; the default
3611/// thin-plate path (`scale_dimensions` off) is left bit-for-bit unchanged.
3612/// A 1-D thin-plate term is left untouched — anisotropy is meaningless on a
3613/// single axis (its `Σ η = 0` contrast vector is empty), exactly as for a 1-D
3614/// Matérn/Duchon term.
3615fn promote_thin_plate_for_scale_dimensions(basis: &mut SmoothBasisSpec) {
3616 let SmoothBasisSpec::ThinPlate {
3617 feature_cols,
3618 spec,
3619 input_scales,
3620 } = &*basis
3621 else {
3622 return;
3623 };
3624 let d = feature_cols.len();
3625 if d <= 1 {
3626 return;
3627 }
3628 // m = thin_plate_penalty_order(d) is the TPS penalty order; the Duchon
3629 // null-space order naming is `Zero → m=1`, `Linear → m=2`,
3630 // `Degree(g) → m=g+1`, so the s=0 Duchon kernel exponent
3631 // `2(p+s) − d = 2m − d` reproduces the TPS kernel exactly.
3632 let m = thin_plate_penalty_order(d);
3633 let nullspace_order = match m {
3634 0 | 1 => DuchonNullspaceOrder::Zero,
3635 2 => DuchonNullspaceOrder::Linear,
3636 _ => DuchonNullspaceOrder::Degree(m - 1),
3637 };
3638 let duchon_spec = DuchonBasisSpec {
3639 center_strategy: spec.center_strategy.clone(),
3640 periodic: spec.periodic.clone(),
3641 // Pure, scale-free Duchon — the thin-plate kernel has no length scale
3642 // (a global TPS kernel scale is non-identifiable once REML learns the
3643 // smoothing penalty: gam#718/#721/#731/#732). The per-axis relevance
3644 // the user asked for is carried by the tension-ARD `λ_a`, not a κ axis.
3645 length_scale: None,
3646 // s = 0 ⇒ thin-plate kernel `r^{2m−d}`.
3647 power: 0.0,
3648 nullspace_order,
3649 identifiability: spec.identifiability.clone(),
3650 // All-zero geometry seed sentinel: `auto_seed_aniso_contrasts` resolves
3651 // it from the (standardized) knot cloud, and the per-axis tension split
3652 // engages on `aniso.is_some()`.
3653 aniso_log_scales: Some(vec![0.0; d]),
3654 operator_penalties: DuchonOperatorPenaltySpec::default(),
3655 boundary: OneDimensionalBoundary::Open,
3656 radial_reparam: None,
3657 };
3658 let feature_cols = feature_cols.clone();
3659 let input_scales = input_scales.clone();
3660 // All borrows of `*basis` (the `&*basis` destructure above) end with the
3661 // clones on the two preceding lines, so the reassignment is sound.
3662 *basis = SmoothBasisSpec::Duchon {
3663 feature_cols,
3664 spec: duchon_spec,
3665 input_scales,
3666 };
3667}
3668
3669// ---------------------------------------------------------------------------
3670// Data-aware helpers
3671// ---------------------------------------------------------------------------
3672
3673pub fn spatial_center_strategy_for_dimension(num_centers: usize, d: usize) -> CenterStrategy {
3674 if d <= 3 {
3675 // In low-dimensional spatial smooths, an explicit `k` is a resolution
3676 // request rather than a request for marginal quantile-midpoint centers.
3677 // Use deterministic maximin geometry so Matérn/GP and Duchon REML see a
3678 // well-resolved native kernel block with small fill distance instead of
3679 // compensating for holes or endpoint under-resolution by over-smoothing
3680 // low-noise signals (#504).
3681 CenterStrategy::FarthestPoint { num_centers }
3682 } else {
3683 default_spatial_center_strategy(num_centers, d)
3684 }
3685}
3686
3687pub fn col_minmax(col: ArrayView1<'_, f64>) -> Result<(f64, f64), String> {
3688 let min = col.iter().fold(f64::INFINITY, |a, &b| a.min(b));
3689 let max = col.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
3690 if !min.is_finite() || !max.is_finite() {
3691 return Err(TermBuilderError::degenerate_data(
3692 "non-finite data encountered while inferring knot range",
3693 )
3694 .to_string());
3695 }
3696 if (max - min).abs() < 1e-12 {
3697 Ok((min, min + 1e-6))
3698 } else {
3699 Ok((min, max))
3700 }
3701}
3702
3703pub fn unique_count_column(col: ArrayView1<'_, f64>) -> usize {
3704 use std::collections::HashSet;
3705 let mut set = HashSet::<u64>::with_capacity(col.len());
3706 for &v in col {
3707 let norm = if v == 0.0 { 0.0 } else { v };
3708 set.insert(norm.to_bits());
3709 }
3710 set.len().max(1)
3711}
3712
3713/// Minimum knot count for a natural cubic regression spline: `select_cr_knots`
3714/// places one value-knot per basis function and needs at least an interior knot,
3715/// so the sparsest representable cr basis is `{const, linear, curvature}` at
3716/// three knots. Below this a cr spline is not constructible and the caller must
3717/// degrade to the linear B-spline marginal.
3718pub(crate) const CR_MIN_KNOTS: usize = 3;
3719
3720/// Build a cubic-regression marginal knot spec capped to the covariate's data
3721/// support, mgcv-style.
3722///
3723/// A `cr`/`cs`/`sz` marginal places exactly one basis function per value-knot,
3724/// so `select_cr_knots` cannot place more knots than the covariate has DISTINCT
3725/// values — it `bail`s with "cubic regression spline with k=N requires at least
3726/// N distinct values" otherwise. An unclamped `k` on an ordinary low-cardinality
3727/// covariate (a binary indicator, a 3-level ordinal/Likert score, a small count)
3728/// therefore hard-failed the whole fit instead of reducing the basis the way
3729/// mgcv — and gam's own tensor-margin path (996f829d7, `term_builder.rs:2986` /
3730/// the `k_axis >= 3` cr gate at `:3047`) — do. This is the univariate / factor-
3731/// smooth sibling of that tensor cap (#1541, #1542).
3732///
3733/// Returns:
3734/// - `Some(NaturalCubicRegression { .. })` with `k = min(k_requested, n_distinct)`
3735/// value-knots when the data supports a cr spline (`n_distinct >= CR_MIN_KNOTS`).
3736/// A cr basis of exactly `n_distinct` knots is full-rank for the data — it can
3737/// represent any per-distinct-value structure (e.g. 3 arbitrary group means on
3738/// a ternary covariate) — so the cap never costs recoverable signal.
3739/// - `None` when `n_distinct < CR_MIN_KNOTS` (a binary covariate): too few
3740/// distinct values for ANY cr spline, so the caller degrades to the linear
3741/// B-spline marginal — exactly what the default `s(x, k=..)` basis already
3742/// builds on the same data, and what the tensor path's `< 3` branch builds.
3743///
3744/// `inference_notes` records any reduction so the user sees that `k` was capped
3745/// (mgcv emits a warning in the same situation).
3746fn capped_cr_marginal_knotspec(
3747 col: ArrayView1<'_, f64>,
3748 k_cr_requested: usize,
3749 label: &str,
3750 inference_notes: &mut Vec<String>,
3751) -> Result<Option<BSplineKnotSpec>, String> {
3752 let n_distinct = unique_count_column(col);
3753 let k_cr = k_cr_requested.min(n_distinct);
3754 if k_cr < CR_MIN_KNOTS {
3755 inference_notes.push(format!(
3756 "Smooth '{label}': cubic-regression ('cr'/'cs'/'sz') basis requested k={k_cr_requested}, \
3757 but the covariate has only {n_distinct} distinct value(s) — too few to support a cubic \
3758 regression spline (needs >= {CR_MIN_KNOTS} distinct values). Degraded to the linear \
3759 B-spline marginal the default basis builds on the same data."
3760 ));
3761 return Ok(None);
3762 }
3763 if k_cr < k_cr_requested {
3764 inference_notes.push(format!(
3765 "Smooth '{label}': cubic-regression ('cr'/'cs'/'sz') basis reduced from k={k_cr_requested} \
3766 to k={k_cr} to match the covariate's {n_distinct} distinct value(s) (mgcv-style \
3767 data-support cap; a cr basis cannot place more value-knots than the data has)."
3768 ));
3769 }
3770 let cr_knots = crate::basis::select_cr_knots(col, k_cr).map_err(|e| e.to_string())?;
3771 Ok(Some(BSplineKnotSpec::NaturalCubicRegression {
3772 knots: cr_knots,
3773 }))
3774}
3775
3776/// Smallest number of distinct covariate values seen within any single group
3777/// of `group_col`. For a factor smooth this is the resolution that bounds the
3778/// marginal basis: a group with `m` distinct covariate values can only inform
3779/// `m` basis coefficients, so a marginal richer than that interpolates the
3780/// group instead of estimating a penalized trend. Bits are compared exactly so
3781/// integer-valued covariates (days, dose levels) collapse to their true count.
3782fn min_per_group_unique_count(
3783 feature_col: ArrayView1<'_, f64>,
3784 group_col: ArrayView1<'_, f64>,
3785) -> usize {
3786 use std::collections::{HashMap, HashSet};
3787 let mut per_group: HashMap<u64, HashSet<u64>> = HashMap::new();
3788 for (xi, gi) in feature_col.iter().zip(group_col.iter()) {
3789 let xnorm = if *xi == 0.0 { 0.0 } else { *xi };
3790 let gnorm = if *gi == 0.0 { 0.0 } else { *gi };
3791 per_group
3792 .entry(gnorm.to_bits())
3793 .or_default()
3794 .insert(xnorm.to_bits());
3795 }
3796 per_group
3797 .values()
3798 .map(|s| s.len())
3799 .min()
3800 .unwrap_or(1)
3801 .max(1)
3802}
3803
3804/// Default internal-knot count for an *additive* univariate smooth, derived
3805/// from the column's unique-value count.
3806///
3807/// The basis dimension is `internal_knots + degree + 1`, so the cap below maps
3808/// to a default cubic basis of ~12 functions — deliberately close to mgcv's
3809/// univariate default (`k = 10`). A penalized smooth controls its wiggliness
3810/// through the *penalty*, not the basis size: REML/LAML shrinks a too-rich
3811/// basis toward the null, but it cannot do so cleanly when the basis is so
3812/// over-sized that the design becomes weakly identified. Growing the basis with
3813/// `n` (the old `n^(1/3)`-ceilinged `unique/4` rule, which pinned to 20 internal
3814/// knots ⇒ a 24-function basis for any column with ≥80 unique values) therefore
3815/// *hurts* recovery on finite, weak-signal fits: a 4-smooth additive model on
3816/// n=120 asks for ~92 coefficients, the outer optimizer stalls on the resulting
3817/// flat two-penalty (range + null-space) REML surface, and the truth leaks into
3818/// surplus columns the penalty can't shrink away (gam#1680; the same defect was
3819/// documented for thin-plate fields in gam#1074). A k-sweep on the #1680 design
3820/// confirms a basis of ~10–15 recovers truth at RMSE ≈ 0.12 while the old
3821/// 24-function default lands at ≈ 0.39 (~3× worse) — *whether or not* the
3822/// covariates are collinear, so this is basis over-richness, not collinearity.
3823///
3824/// The cap is flat in `n`: a user who genuinely needs a wigglier fit raises `k`
3825/// explicitly (mgcv's contract — opt *in* to more flexibility), and the SPEC
3826/// requires the default to allow recovering the null rather than forcing the
3827/// user to opt out of overfitting. The 4-knot floor stays put because we still
3828/// need enough basis functions to fit a non-trivial smooth at all, and the
3829/// `unique/4` growth below the cap keeps small/sparse columns (n ≤ 32, where
3830/// `unique/4 ≤ 8`) on exactly their previous knot count.
3831pub fn heuristic_knots_for_column(col: ArrayView1<'_, f64>) -> usize {
3832 /// Default cubic basis ≈ `MAX_DEFAULT_INTERNAL_KNOTS + degree + 1` = 12
3833 /// functions, matching mgcv's lean univariate default.
3834 const MAX_DEFAULT_INTERNAL_KNOTS: usize = 8;
3835 let unique = unique_count_column(col);
3836 (unique / 4).clamp(4, MAX_DEFAULT_INTERNAL_KNOTS)
3837}
3838
3839/// Per-margin basis sizes for a tensor-product smooth (`te`/`ti`/`t2`).
3840///
3841/// The 1-D heuristic [`heuristic_knots_for_column`] is calibrated for an
3842/// *additive* margin: a well-resolved column asks for the lean univariate
3843/// default (≈12 basis functions, the mgcv-like cap of 8 internal knots; see
3844/// gam#1680), which is sensible for a single `s(x)` term.
3845/// A tensor product, however, multiplies the per-margin sizes:
3846/// `p = ∏_d k_d`. Reusing the 1-D rule per margin makes `p` explode with the
3847/// tensor dimension — a 3-D `te(x,y,z)` at the 1-D ceiling of 12/margin is
3848/// `12³ ≈ 1728` columns, and every REML evaluation pays an O(p³) dense
3849/// penalty reparameterization (the full-tensor sum-to-zero constraint is not
3850/// Kronecker-factorable), turning model selection over tensor candidates into
3851/// a multi-minute single-threaded stall (gam#813). It also requests far more
3852/// coefficients than the data can identify whenever `p ≫ n`.
3853///
3854/// mgcv's `te(...)` uses a small per-margin default (`k = 5`, i.e. `5^d`).
3855/// We match that spirit while staying data-adaptive: budget the *total* tensor
3856/// column count `p_target` and distribute it geometrically across the margins
3857/// so `∏ k_d ≈ p_target`, never asking a margin for more functions than its
3858/// own unique values (and the data set) can support.
3859fn heuristic_tensor_margin_knots(cols: &[usize], ds: &Dataset) -> Vec<usize> {
3860 let d = cols.len().max(1);
3861 let degree = DEFAULT_BSPLINE_DEGREE;
3862 let min_k = degree + 2; // smallest margin that carries a difference penalty
3863 let n = ds.values.nrows();
3864
3865 // Per-margin 1-D ceiling: never request more basis functions than the
3866 // margin's own resolution (unique values) supports. This caps each axis
3867 // independently before the joint budget is applied.
3868 let per_margin_cap: Vec<usize> = cols
3869 .iter()
3870 .map(|&c| heuristic_knots_for_column(ds.values.column(c)).max(min_k))
3871 .collect();
3872
3873 // Total-basis budget. A tensor with ∏k ≫ n coefficients is rank-deficient
3874 // and pure REML cost; cap the product at a generous fraction of n while
3875 // honoring mgcv's small default for the common small-d case. The budget
3876 // grows with n but the geometric split below keeps each margin modest.
3877 // d=2 → up to ~7²=49 (mgcv-`te`-like), d=3 → ~5³=125, larger d shrinks
3878 // per-margin further so the product never blows past the data support.
3879 let mgcv_like_per_margin = match d {
3880 2 => 7usize,
3881 3 => 5usize,
3882 _ => 4usize,
3883 };
3884 let mgcv_like_total = (mgcv_like_per_margin as f64).powi(d as i32);
3885 let data_budget = (n as f64) * 0.8;
3886 let p_target = mgcv_like_total
3887 .max(min_k.pow(d as u32) as f64)
3888 .min(data_budget);
3889
3890 // Geometric per-margin target so ∏k ≈ p_target, then clamp each margin to
3891 // its own 1-D resolution cap and the difference-penalty floor.
3892 let geo_per_margin = p_target.powf(1.0 / d as f64).round() as usize;
3893 let unclamped: Vec<usize> = per_margin_cap
3894 .iter()
3895 .map(|&cap| geo_per_margin.clamp(min_k, cap))
3896 .collect();
3897
3898 // The per-margin clamps can pull some axes below `geo_per_margin` (a
3899 // low-resolution column), leaving headroom in the joint budget. Redistribute
3900 // that headroom to the margins that can still grow, so the realized ∏k stays
3901 // close to p_target instead of systematically under-shooting it.
3902 let mut k_list = unclamped;
3903 loop {
3904 let product: f64 = k_list.iter().map(|&k| k as f64).product();
3905 if product >= p_target {
3906 break;
3907 }
3908 // Grow the axis with the most remaining headroom (cap − current),
3909 // breaking ties toward the largest cap. Stop when none can grow.
3910 let Some(idx) = k_list
3911 .iter()
3912 .zip(per_margin_cap.iter())
3913 .enumerate()
3914 .filter(|&(_, (k, cap))| k < cap)
3915 .max_by_key(|&(_, (k, cap))| (cap - k, *cap))
3916 .map(|(i, _)| i)
3917 else {
3918 break;
3919 };
3920 k_list[idx] += 1;
3921 }
3922 k_list
3923}
3924
3925pub fn heuristic_centers(n: usize, d: usize) -> usize {
3926 default_num_centers(n, d)
3927}
3928
3929// ---------------------------------------------------------------------------
3930// Smooth option parsers
3931// ---------------------------------------------------------------------------
3932
3933fn parse_endpoint_side(
3934 value: &str,
3935 context: &str,
3936) -> Result<BSplineEndpointBoundaryCondition, String> {
3937 match value.trim().to_ascii_lowercase().as_str() {
3938 "" | "none" | "open" | "unconstrained" | "free" => {
3939 Ok(BSplineEndpointBoundaryCondition::Free)
3940 }
3941 "clamped" | "clamp" | "zero_derivative" | "zero-derivative" => {
3942 Ok(BSplineEndpointBoundaryCondition::Clamped)
3943 }
3944 "anchored" | "anchor" | "zero" | "zero_value" | "zero-value" => {
3945 Ok(BSplineEndpointBoundaryCondition::Anchored { value: 0.0 })
3946 }
3947 other => Err(format!(
3948 "unsupported {context} boundary condition '{other}'; expected free, clamped, or anchored"
3949 )),
3950 }
3951}
3952
3953fn boundary_anchor_value(
3954 options: &BTreeMap<String, String>,
3955 side: &str,
3956 fallback: Option<f64>,
3957) -> Option<f64> {
3958 [
3959 format!("anchor_{side}"),
3960 format!("{side}_anchor"),
3961 format!("anchor-value-{side}"),
3962 ]
3963 .iter()
3964 .find_map(|key| option_f64(options, key))
3965 .or(fallback)
3966}
3967
3968fn apply_anchor_value(
3969 cond: BSplineEndpointBoundaryCondition,
3970 value: Option<f64>,
3971) -> BSplineEndpointBoundaryCondition {
3972 match cond {
3973 BSplineEndpointBoundaryCondition::Anchored { .. } => {
3974 BSplineEndpointBoundaryCondition::Anchored {
3975 value: value.unwrap_or(0.0),
3976 }
3977 }
3978 other => other,
3979 }
3980}
3981
3982fn parse_bspline_boundary_conditions(
3983 options: &BTreeMap<String, String>,
3984) -> Result<BSplineBoundaryConditions, String> {
3985 let fallback_anchor = option_f64(options, "anchor")
3986 .or_else(|| option_f64(options, "anchor_value"))
3987 .or_else(|| option_f64(options, "value"));
3988 let global_boundary_conditions = options
3989 .get("boundary_conditions")
3990 .or_else(|| options.get("bc"));
3991 let mut boundary_conditions = BSplineBoundaryConditions::default();
3992
3993 if let Some(raw_boundary_conditions) = global_boundary_conditions {
3994 let cond = parse_endpoint_side(raw_boundary_conditions, "boundary_conditions")?;
3995 let side = options
3996 .get("side")
3997 .map(|s| s.trim().to_ascii_lowercase())
3998 .unwrap_or_else(|| "both".to_string());
3999 match side.as_str() {
4000 "both" | "all" | "endpoints" => {
4001 boundary_conditions.left = cond;
4002 boundary_conditions.right = cond;
4003 }
4004 "left" | "start" | "lower" => boundary_conditions.left = cond,
4005 "right" | "end" | "upper" => boundary_conditions.right = cond,
4006 other => {
4007 return Err(format!(
4008 "unsupported B-spline boundary side '{other}'; expected left, right, or both"
4009 ));
4010 }
4011 }
4012 }
4013
4014 if let Some(raw) = options
4015 .get("bc_left")
4016 .or_else(|| options.get("left_bc"))
4017 .or_else(|| options.get("bc_start"))
4018 .or_else(|| options.get("start_bc"))
4019 {
4020 boundary_conditions.left = parse_endpoint_side(raw, "left endpoint")?;
4021 }
4022 if let Some(raw) = options
4023 .get("bc_right")
4024 .or_else(|| options.get("right_bc"))
4025 .or_else(|| options.get("bc_end"))
4026 .or_else(|| options.get("end_bc"))
4027 {
4028 boundary_conditions.right = parse_endpoint_side(raw, "right endpoint")?;
4029 }
4030
4031 boundary_conditions.left = apply_anchor_value(
4032 boundary_conditions.left,
4033 boundary_anchor_value(options, "left", fallback_anchor),
4034 );
4035 boundary_conditions.right = apply_anchor_value(
4036 boundary_conditions.right,
4037 boundary_anchor_value(options, "right", fallback_anchor),
4038 );
4039
4040 // Non-zero anchors require an affine offset term that the current basis
4041 // builder does not synthesize (see `build_bspline_basis_1d` in
4042 // src/terms/basis.rs). Surface the rejection at parse time with the side
4043 // and value in the diagnostic, instead of letting the value-only error
4044 // emerge deep inside the basis builder where the user has no context
4045 // about which anchor key (`anchor`, `left_anchor`, `right_anchor`, …)
4046 // routed into which endpoint.
4047 reject_nonzero_anchor("left", boundary_conditions.left)?;
4048 reject_nonzero_anchor("right", boundary_conditions.right)?;
4049
4050 Ok(boundary_conditions)
4051}
4052
4053fn reject_nonzero_anchor(side: &str, cond: BSplineEndpointBoundaryCondition) -> Result<(), String> {
4054 if let BSplineEndpointBoundaryCondition::Anchored { value } = cond {
4055 if value.abs() > 1e-12 {
4056 return Err(format!(
4057 "non-zero {side} anchor {value} requires an affine offset term that is not yet supported; only anchored value 0 is accepted at parse time"
4058 ));
4059 }
4060 }
4061 Ok(())
4062}
4063
4064/// Resolve the requested internal-knot count and effective spline degree for
4065/// a 1-D penalized B-spline smooth. This mirrors the tensor-margin per-axis
4066/// degree-reduction policy: a 1-D B-spline basis with `k` functions
4067/// is well-defined for any `degree <= k - 1`, so an explicit
4068/// `s(x, bs="ps", k=3)` with default `degree=3` is interpreted as the
4069/// largest representable spline (`effective_degree = k - 1 = 2`, quadratic)
4070/// rather than rejected. The `penalty_order` carried by the caller must be
4071/// clamped to `<= effective_degree` so the marginal difference penalty
4072/// stays well-defined; the returned `effective_degree` makes that explicit.
4073///
4074/// Mirrors the tensor margin treatment in the `te(...)` builder so a
4075/// standalone smooth, a factor smooth, and a tensor margin all interpret
4076/// "small k" the same way.
4077fn parse_ps_internal_knots(
4078 options: &BTreeMap<String, String>,
4079 degree: usize,
4080 default_internal_knots: usize,
4081) -> Result<(usize, bool, usize), String> {
4082 const MIN_EXPRESSIVE_INTERNAL_KNOTS: usize = 2;
4083 // Strict variants: reject `k=-1`, `k=1.5`, `knots=-2` etc. with a
4084 // focused error instead of silently dropping the value and using the
4085 // default. Lenient `option_usize` / `option_usize_any` silently swallow
4086 // unparseable values, which leaves the user thinking they configured
4087 // something when they did not.
4088 // A list-valued `knots=[...]` carries explicit internal positions, not a
4089 // count; it is consumed by `parse_explicit_internal_knots`. Treat it as
4090 // "count not specified" here so the strict integer parse does not reject
4091 // the bracketed value (the Provided path ignores the returned count).
4092 let knots_internal = if knots_option_is_list(options) {
4093 None
4094 } else {
4095 option_usize_strict(options, "knots")?
4096 };
4097 let basis_dim = option_usize_any_strict(options, &["k", "basis_dim", "basis-dim", "basisdim"])?;
4098 if knots_internal.is_some() && basis_dim.is_some() {
4099 return Err(TermBuilderError::incompatible_config(
4100 "ps/bspline smooth: specify either knots=<internal_knots> or k=<basis_dim> (not both)",
4101 )
4102 .to_string());
4103 }
4104 if let Some(k) = basis_dim {
4105 if k < 2 {
4106 return Err(TermBuilderError::invalid_option(format!(
4107 "ps/bspline smooth: k={} too small; B-spline basis requires k >= 2",
4108 k
4109 ))
4110 .to_string());
4111 }
4112 // `degree <= k - 1` is required for the B-spline basis to be
4113 // well-defined; reduce on this axis only when the user asked for
4114 // a smaller k than the cubic default supports. This matches mgcv's
4115 // behaviour (e.g. `s(x, bs="ps", k=3)` becomes a quadratic basis)
4116 // and the per-axis reduction the tensor builder already does.
4117 let effective_degree = degree.min(k - 1).max(1);
4118 let num_internal_knots = if effective_degree < degree {
4119 // Reproduce the requested basis size exactly when degree was
4120 // reduced for a low-cardinality axis: num_basis = k.
4121 k.saturating_sub(effective_degree + 1)
4122 } else {
4123 (k - degree - 1).max(MIN_EXPRESSIVE_INTERNAL_KNOTS)
4124 };
4125 Ok((num_internal_knots, false, effective_degree))
4126 } else {
4127 Ok((
4128 knots_internal.unwrap_or(default_internal_knots),
4129 knots_internal.is_none(),
4130 degree,
4131 ))
4132 }
4133}
4134
4135/// True when the `knots` option value is a *list* literal (`[...]`, `c(...)`,
4136/// or `(...)`) rather than a scalar count. mgcv's `knots=` accepts both: a
4137/// single integer is an internal-knot count, while a vector is explicit
4138/// internal knot positions. We disambiguate purely on the wrapper syntax so a
4139/// bare `knots=5` keeps its historical count meaning.
4140fn knots_option_is_list(options: &BTreeMap<String, String>) -> bool {
4141 options
4142 .get("knots")
4143 .map(|raw| {
4144 let t = raw.trim();
4145 t.starts_with('[') || t.starts_with("c(") || t.starts_with("C(") || t.starts_with('(')
4146 })
4147 .unwrap_or(false)
4148}
4149
4150/// Parse `knots=[k0, k1, ...]` (or `c(...)` / `(...)`) into explicit internal
4151/// knot positions. Returns `Ok(None)` when `knots` is absent or a scalar count
4152/// (handled by [`parse_ps_internal_knots`]); `Ok(Some(positions))` when it is a
4153/// non-empty numeric list; and an error for an empty or unparseable list.
4154fn parse_explicit_internal_knots(
4155 options: &BTreeMap<String, String>,
4156) -> Result<Option<Vec<f64>>, String> {
4157 if !knots_option_is_list(options) {
4158 return Ok(None);
4159 }
4160 let raw = options
4161 .get("knots")
4162 .expect("knots_option_is_list implies the key is present");
4163 let tokens = split_list_option(raw);
4164 if tokens.is_empty() {
4165 return Err(TermBuilderError::invalid_option(format!(
4166 "knots={raw} is an empty list; supply at least one internal knot position \
4167 (e.g. knots=[0.2, 0.5, 0.8]) or a scalar count (e.g. knots=8)"
4168 ))
4169 .to_string());
4170 }
4171 let mut positions = Vec::with_capacity(tokens.len());
4172 for tok in &tokens {
4173 let value = parse_numeric_expr(tok).map_err(|err| {
4174 TermBuilderError::invalid_option(format!(
4175 "knots list entry '{tok}' is not a numeric position: {err}"
4176 ))
4177 .to_string()
4178 })?;
4179 positions.push(value);
4180 }
4181 Ok(Some(positions))
4182}
4183
4184/// Resolve the `knot_placement=` option for an automatically generated knot
4185/// vector. Accepts `"uniform"` (the default, equal spacing on the data range)
4186/// and `"quantile"` (interior knots at empirical data quantiles, better for
4187/// skewed covariates). Unknown values are rejected so typos do not silently
4188/// fall back to uniform.
4189fn parse_knot_placement(
4190 options: &BTreeMap<String, String>,
4191) -> Result<crate::basis::BSplineKnotPlacement, String> {
4192 use crate::basis::BSplineKnotPlacement;
4193 match options
4194 .get("knot_placement")
4195 .or_else(|| options.get("knot-placement"))
4196 .or_else(|| options.get("knotplacement"))
4197 {
4198 None => Ok(BSplineKnotPlacement::Uniform),
4199 Some(raw) => match raw
4200 .trim()
4201 .trim_matches('"')
4202 .trim_matches('\'')
4203 .to_ascii_lowercase()
4204 .as_str()
4205 {
4206 "uniform" | "even" | "equal" => Ok(BSplineKnotPlacement::Uniform),
4207 "quantile" | "quantiles" | "data" | "empirical" => Ok(BSplineKnotPlacement::Quantile),
4208 other => Err(TermBuilderError::invalid_option(format!(
4209 "knot_placement={other} is not recognised; expected \"uniform\" or \"quantile\""
4210 ))
4211 .to_string()),
4212 },
4213 }
4214}
4215
4216/// Build the non-periodic 1D B-spline knot spec for the `ps`/`bspline` and
4217/// factor-smooth marginal paths, honoring (in priority order):
4218/// 1. `knots=[...]` explicit internal positions → [`BSplineKnotSpec::Provided`]
4219/// 2. `knot_placement="quantile"` → [`BSplineKnotSpec::Automatic`]
4220/// 3. uniform generation → [`BSplineKnotSpec::Generate`]
4221///
4222/// `data` is the covariate column (used to clamp explicit positions to the
4223/// observed range and to drive quantile placement); `n_knots` is the resolved
4224/// internal-knot count from [`parse_ps_internal_knots`] used for the automatic
4225/// strategies.
4226fn resolve_nonperiodic_bspline_knotspec(
4227 options: &BTreeMap<String, String>,
4228 data: ArrayView1<'_, f64>,
4229 data_range: (f64, f64),
4230 degree: usize,
4231 n_knots: usize,
4232) -> Result<BSplineKnotSpec, String> {
4233 use crate::basis::{BSplineKnotPlacement, clamped_knot_vector_from_internal_positions};
4234 if let Some(positions) = parse_explicit_internal_knots(options)? {
4235 if option_usize_any_strict(options, &["k", "basis_dim", "basis-dim", "basisdim"])?.is_some()
4236 {
4237 return Err(TermBuilderError::incompatible_config(
4238 "ps/bspline smooth: specify either explicit knots=[...] positions or \
4239 k=<basis_dim> (not both); the basis size is fixed by the knot vector",
4240 )
4241 .to_string());
4242 }
4243 let knots = clamped_knot_vector_from_internal_positions(data_range, &positions, degree)
4244 .map_err(|e| e.to_string())?;
4245 return Ok(BSplineKnotSpec::Provided(knots));
4246 }
4247 match parse_knot_placement(options)? {
4248 BSplineKnotPlacement::Uniform => Ok(BSplineKnotSpec::Generate {
4249 data_range,
4250 num_internal_knots: n_knots,
4251 }),
4252 BSplineKnotPlacement::Quantile => {
4253 // Validate the column up-front so an unfittable request surfaces a
4254 // user-correctable error at parse time rather than deep in basis
4255 // construction. The same data drives the eventual quantile knots.
4256 crate::basis::auto_knot_vector_1d_quantile(data, n_knots, degree)
4257 .map_err(|e| e.to_string())?;
4258 Ok(BSplineKnotSpec::Automatic {
4259 num_internal_knots: Some(n_knots),
4260 placement: BSplineKnotPlacement::Quantile,
4261 })
4262 }
4263 }
4264}
4265
4266/// Reject unknown option keys with a focused error that names the term and
4267/// the offending key, plus suggests near-matches from the known-key list.
4268/// Without this, typos like `lengt_scale=0.1` or `nyu=5/2` are silently
4269/// dropped, the term uses the default, and the user has no idea why their
4270/// option had no effect.
4271pub fn validate_known_options(
4272 term_name: &str,
4273 options: &BTreeMap<String, String>,
4274 known: &[&str],
4275) -> Result<(), String> {
4276 let known_set: std::collections::BTreeSet<&&str> = known.iter().collect();
4277 for key in options.keys() {
4278 if !known_set.contains(&key.as_str()) {
4279 if term_name == "tensor" && is_tensor_k_axis_option_key(key) {
4280 continue;
4281 }
4282 // Suggest near-matches (substring or shared prefix ≥ 3).
4283 let key_l = key.to_ascii_lowercase();
4284 let mut suggestions: Vec<&str> = known
4285 .iter()
4286 .filter(|k| {
4287 let kl = k.to_ascii_lowercase();
4288 kl.contains(&key_l) || key_l.contains(&kl) || {
4289 let n = kl
4290 .chars()
4291 .zip(key_l.chars())
4292 .take_while(|(a, b)| a == b)
4293 .count();
4294 n >= 3
4295 }
4296 })
4297 .copied()
4298 .collect();
4299 suggestions.sort_unstable();
4300 suggestions.dedup();
4301 let hint = if suggestions.is_empty() {
4302 String::new()
4303 } else {
4304 format!(" — did you mean one of [{}]?", suggestions.join(", "))
4305 };
4306 return Err(TermBuilderError::invalid_option(format!(
4307 "{term_name}() does not accept option `{key}`{hint}. Valid options: [{}]",
4308 {
4309 let mut sorted = known.to_vec();
4310 sorted.sort_unstable();
4311 sorted.join(", ")
4312 }
4313 ))
4314 .to_string());
4315 }
4316 }
4317 Ok(())
4318}
4319
4320/// Private (engine-injected) option that caps the *default* spatial center
4321/// count for a secondary (distributional) predictor's smooth — see
4322/// `solver::fit_orchestration::apply_secondary_predictor_basis_parsimony` and #501.
4323///
4324/// It is deliberately NOT one of the user-facing count aliases recognised by
4325/// [`has_explicit_countwith_basis_alias`], so it never flips the spatial basis
4326/// onto the explicit (hard) center-placement strategy: the cap lowers the
4327/// *default* count while the `Auto` strategy is retained, so the count is still
4328/// softly reduced when the data can't support it.
4329pub const SECONDARY_CENTER_CAP_OPTION: &str = "__secondary_center_cap";
4330
4331/// Apply the secondary-predictor center cap to a *default* spatial center
4332/// count. A no-op when the cap option is absent (the common case) or when the
4333/// user supplied an explicit count (then `default_count` is ignored downstream
4334/// by [`parse_countwith_basis_alias`] anyway).
4335pub(crate) fn cap_default_spatial_centers(
4336 options: &BTreeMap<String, String>,
4337 default_count: usize,
4338) -> usize {
4339 match option_usize(options, SECONDARY_CENTER_CAP_OPTION) {
4340 Some(cap) => default_count.min(cap),
4341 None => default_count,
4342 }
4343}
4344
4345fn default_matern_center_count(n: usize, d: usize, planned_count: usize) -> usize {
4346 // #1074: the mgcv-sized basis cap (`k = 10·3^(d-1)`) was DELETED here too — it
4347 // masked the same over-sizing/under-penalization defect by shrinking the basis
4348 // rather than fixing the optimizer. The default now uses the generic n-scaling
4349 // plan. A small-n floor against a numerically-fragile two-column kernel block
4350 // is a legitimate degenerate guard and is kept. Explicit `k`/`centers` still
4351 // take full effect upstream.
4352 let low_n_floor = (d + 4).min(n);
4353 planned_count.max(low_n_floor).max(1)
4354}
4355
4356pub fn parse_countwith_basis_alias(
4357 options: &BTreeMap<String, String>,
4358 primarykey: &str,
4359 default_count: usize,
4360) -> Result<usize, String> {
4361 // Strict: reject unparseable values (e.g. `centers=many`, `centers=-1`,
4362 // `centers=1.5`) instead of silently dropping them and falling through
4363 // to the default. Without this the user gets the auto-inferred count
4364 // silently and never realizes their explicit option was ignored.
4365 let primary = option_usize_strict(options, primarykey)?;
4366 let basis_dim = option_usize_any_strict(
4367 options,
4368 &["k", "basis_dim", "basis-dim", "basisdim", "knots"],
4369 )?;
4370 if primary.is_some() && basis_dim.is_some() {
4371 return Err(TermBuilderError::incompatible_config(format!(
4372 "specify either {}=<count> or k=<basis_dim> (not both)",
4373 primarykey
4374 ))
4375 .to_string());
4376 }
4377 Ok(primary.or(basis_dim).unwrap_or(default_count))
4378}
4379
4380pub fn has_explicit_countwith_basis_alias(
4381 options: &BTreeMap<String, String>,
4382 primarykey: &str,
4383) -> bool {
4384 options.contains_key(primarykey)
4385 || ["k", "basis_dim", "basis-dim", "basisdim", "knots"]
4386 .iter()
4387 .any(|alias| options.contains_key(*alias))
4388}
4389
4390pub fn parse_cyclic_boundary(
4391 options: &BTreeMap<String, String>,
4392 minv: f64,
4393 maxv: f64,
4394) -> Result<OneDimensionalBoundary, String> {
4395 let cyclic = option_bool(options, "cyclic")
4396 .or_else(|| option_bool(options, "periodic"))
4397 .unwrap_or(false);
4398 if !cyclic {
4399 return Ok(OneDimensionalBoundary::Open);
4400 }
4401 let start = match option_numeric_expr(options, "period_start")? {
4402 Some(v) => v,
4403 None => option_numeric_expr(options, "start")?.unwrap_or(minv),
4404 };
4405 let end = match option_numeric_expr(options, "period_end")? {
4406 Some(v) => v,
4407 None => option_numeric_expr(options, "end")?.unwrap_or(maxv),
4408 };
4409 if end <= start {
4410 return Err(format!(
4411 "cyclic smooth requires period_end/end ({end}) > period_start/start ({start})"
4412 ));
4413 }
4414 Ok(OneDimensionalBoundary::Cyclic { start, end })
4415}
4416
4417/// Parse the periodic-uniform domain for a one-dimensional cyclic smooth.
4418///
4419/// Returns the `(domain_start, period)` pair derived from
4420/// `period_start` / `start`, `period_end` / `end`, falling back to the
4421/// data range `[minv, maxv)` when neither bound is provided. The period
4422/// must be strictly positive.
4423pub fn parse_periodic_domain_1d(
4424 options: &BTreeMap<String, String>,
4425 minv: f64,
4426 maxv: f64,
4427) -> Result<(f64, f64), String> {
4428 let start_opt = match option_numeric_expr(options, "period_start")? {
4429 Some(v) => Some(v),
4430 None => option_numeric_expr(options, "start")?,
4431 };
4432 let end_opt = match option_numeric_expr(options, "period_end")? {
4433 Some(v) => Some(v),
4434 None => option_numeric_expr(options, "end")?,
4435 };
4436 // Reject the pure data-range fallback. A B-spline periodic smooth that takes
4437 // its wrap from the observed [min, max] is sample-dependent and silently
4438 // wrong: uniform draws on a true period of 2π land on [ε, 2π−ε], so using
4439 // (max−min) as the period seams the curve with an off-by-ε discontinuity and
4440 // the fit drifts with the sample. (Unlike the radial closed-lattice Duchon
4441 // path, whose centers DO tile a full period, so its span-derive is exact —
4442 // see `parse_periodic_axes_option`.) Require the caller to name the period
4443 // explicitly via `period=`/`period_end`. The end is only defaulted to `maxv`
4444 // when a `period_start`/`start` was given (a half-open declaration); a bare
4445 // periodic smooth with neither bound is an error.
4446 if end_opt.is_none() && start_opt.is_none() {
4447 return Err(
4448 "periodic B-spline smooth requires an explicit period: pass period=<value> \
4449 (e.g. period=2*pi) or period_start=/period_end=. Deriving the period from the \
4450 observed data range is sample-dependent and produces an off-by-ε seam, so it is \
4451 not inferred."
4452 .to_string(),
4453 );
4454 }
4455 let start = start_opt.unwrap_or(minv);
4456 let end = end_opt.unwrap_or(maxv);
4457 if !(start.is_finite() && end.is_finite()) {
4458 return Err(format!(
4459 "periodic smooth domain requires finite endpoints, got ({start}, {end})"
4460 ));
4461 }
4462 if end <= start {
4463 return Err(format!(
4464 "periodic smooth requires period_end/end ({end}) > period_start/start ({start})"
4465 ));
4466 }
4467 Ok((start, end - start))
4468}
4469
4470fn parse_matern_nu(raw: &str) -> Result<MaternNu, String> {
4471 let trimmed = raw.trim();
4472 let lowered = trimmed.to_ascii_lowercase();
4473 match lowered.as_str() {
4474 "1/2" | "0.5" | "half" => return Ok(MaternNu::Half),
4475 "3/2" | "1.5" => return Ok(MaternNu::ThreeHalves),
4476 "5/2" | "2.5" => return Ok(MaternNu::FiveHalves),
4477 "7/2" | "3.5" => return Ok(MaternNu::SevenHalves),
4478 "9/2" | "4.5" => return Ok(MaternNu::NineHalves),
4479 _ => {}
4480 }
4481
4482 let value = if let Some((num, den)) = trimmed.split_once('/') {
4483 let num = num
4484 .trim()
4485 .parse::<f64>()
4486 .map_err(|err| format!("{}: {err}", unsupported_matern_nu_message(raw)))?;
4487 let den = den
4488 .trim()
4489 .parse::<f64>()
4490 .map_err(|err| format!("{}: {err}", unsupported_matern_nu_message(raw)))?;
4491 if den == 0.0 || !num.is_finite() || !den.is_finite() {
4492 return Err(unsupported_matern_nu_message(raw));
4493 }
4494 num / den
4495 } else {
4496 trimmed
4497 .parse::<f64>()
4498 .map_err(|err| format!("{}: {err}", unsupported_matern_nu_message(raw)))?
4499 };
4500
4501 const TOL: f64 = 1e-12;
4502 if (value - 0.5).abs() <= TOL {
4503 Ok(MaternNu::Half)
4504 } else if (value - 1.5).abs() <= TOL {
4505 Ok(MaternNu::ThreeHalves)
4506 } else if (value - 2.5).abs() <= TOL {
4507 Ok(MaternNu::FiveHalves)
4508 } else if (value - 3.5).abs() <= TOL {
4509 Ok(MaternNu::SevenHalves)
4510 } else if (value - 4.5).abs() <= TOL {
4511 Ok(MaternNu::NineHalves)
4512 } else {
4513 Err(unsupported_matern_nu_message(raw))
4514 }
4515}
4516
4517fn unsupported_matern_nu_message(raw: &str) -> String {
4518 TermBuilderError::unsupported_feature(format!(
4519 "unsupported Matern nu '{raw}'; supported half-integer values are 1/2, 3/2, 5/2, 7/2, and 9/2"
4520 ))
4521 .to_string()
4522}
4523
4524#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
4525pub enum DuchonPowerPolicy {
4526 Explicit(f64),
4527 /// No explicit `power=` given: defer to the cubic structural default, which
4528 /// the builder resolves dimension-aware as `s = (d − 1)/2` (so `φ(r) = r³`
4529 /// in every dimension). There is no triple-operator minimum any more.
4530 CubicStructuralDefault,
4531}
4532
4533pub fn parse_duchon_power_policy(
4534 options: &BTreeMap<String, String>,
4535) -> Result<DuchonPowerPolicy, String> {
4536 if let Some(raw_nu) = options.get("nu") {
4537 return Err(TermBuilderError::incompatible_config(format!(
4538 "Duchon smooths use power=<number>, not nu='{}'. Use power=1.5, power=2, etc.",
4539 raw_nu
4540 ))
4541 .to_string());
4542 }
4543 match options.get("power") {
4544 Some(raw) => {
4545 let value = raw.parse::<f64>().map_err(|err| {
4546 TermBuilderError::invalid_option(format!(
4547 "invalid Duchon power '{}'; expected a non-negative number such as power=1.5 or power=2: {}",
4548 raw, err
4549 ))
4550 .to_string()
4551 })?;
4552 if !value.is_finite() || value < 0.0 {
4553 return Err(TermBuilderError::invalid_option(format!(
4554 "invalid Duchon power '{}'; expected a finite non-negative number such as power=1.5 or power=2",
4555 raw
4556 ))
4557 .to_string());
4558 }
4559 Ok(DuchonPowerPolicy::Explicit(value))
4560 }
4561 None => Ok(DuchonPowerPolicy::CubicStructuralDefault),
4562 }
4563}
4564
4565pub fn parse_duchon_power(options: &BTreeMap<String, String>) -> Result<f64, String> {
4566 match parse_duchon_power_policy(options)? {
4567 DuchonPowerPolicy::Explicit(power) => Ok(power),
4568 // Context-free placeholder: the bare option parser has no column count,
4569 // so it cannot compute the dimension-aware cubic power `s = (d − 1)/2`.
4570 // The dimension-aware resolution happens later in `build_smooth_basis`;
4571 // this 1.5 is only a stand-in for callers that need a concrete number
4572 // without data context (e.g. round-trip parser tests).
4573 DuchonPowerPolicy::CubicStructuralDefault => Ok(1.5),
4574 }
4575}
4576
4577pub fn parse_duchon_order(
4578 options: &BTreeMap<String, String>,
4579) -> Result<DuchonNullspaceOrder, String> {
4580 match options.get("order") {
4581 // Structural cubic Duchon is affine-by-default: an unspecified order is
4582 // the `Linear` (constant + linear) null space, matching the magic
4583 // default. An explicit `order=0` still selects the constant-only space.
4584 None => Ok(DuchonNullspaceOrder::Linear),
4585 Some(raw) => match raw.parse::<usize>() {
4586 Ok(0) => Ok(DuchonNullspaceOrder::Zero),
4587 Ok(1) => Ok(DuchonNullspaceOrder::Linear),
4588 Ok(other) => Ok(DuchonNullspaceOrder::Degree(other)),
4589 Err(_) => Err(TermBuilderError::invalid_option(format!(
4590 "invalid Duchon order '{}'; expected a non-negative integer such as order=0, order=1, or order=2",
4591 raw
4592 ))
4593 .to_string()),
4594 },
4595 }
4596}
4597
4598fn parse_matern_identifiability(
4599 options: &BTreeMap<String, String>,
4600) -> Result<MaternIdentifiability, TermBuilderError> {
4601 let Some(raw) = options.get("identifiability").map(String::as_str) else {
4602 return Ok(MaternIdentifiability::default());
4603 };
4604 match raw.trim().to_ascii_lowercase().as_str() {
4605 "none" => Ok(MaternIdentifiability::None),
4606 "sum_tozero" | "sum-to-zero" | "center_sum_tozero" | "center-sum-to-zero" | "centered" => {
4607 Ok(MaternIdentifiability::CenterSumToZero)
4608 }
4609 "linear" | "center_linear_orthogonal" | "center-linear-orthogonal" => {
4610 Ok(MaternIdentifiability::CenterLinearOrthogonal)
4611 }
4612 other => Err(TermBuilderError::unsupported_feature(format!(
4613 "invalid Matérn identifiability '{other}'; expected one of: none, sum_tozero, linear"
4614 ))),
4615 }
4616}
4617
4618fn parse_spatial_identifiability(
4619 options: &BTreeMap<String, String>,
4620) -> Result<SpatialIdentifiability, TermBuilderError> {
4621 let Some(raw) = options.get("identifiability").map(String::as_str) else {
4622 return Ok(SpatialIdentifiability::default());
4623 };
4624 match raw.trim().to_ascii_lowercase().as_str() {
4625 "none" => Ok(SpatialIdentifiability::None),
4626 "orthogonal"
4627 | "orthogonal_to_parametric"
4628 | "orthogonal-to-parametric"
4629 | "parametric_orthogonal" => Ok(SpatialIdentifiability::OrthogonalToParametric),
4630 "frozen" => Err(TermBuilderError::unsupported_feature(
4631 "spatial identifiability 'frozen' is internal-only; use none or orthogonal_to_parametric",
4632 )),
4633 other => Err(TermBuilderError::unsupported_feature(format!(
4634 "invalid spatial identifiability '{other}'; expected one of: none, orthogonal_to_parametric"
4635 ))),
4636 }
4637}
4638
4639#[cfg(test)]
4640mod tests {
4641 use super::*;
4642 use crate::inference::formula_dsl::parse_formula;
4643 use gam_data::{DataSchema, SchemaColumn};
4644 use ndarray::Array2;
4645 use std::collections::BTreeMap;
4646
4647 fn continuous_dataset(headers: &[&str], rows: Vec<Vec<f64>>) -> Dataset {
4648 let nrows = rows.len();
4649 let ncols = headers.len();
4650 let values = Array2::from_shape_vec(
4651 (nrows, ncols),
4652 rows.into_iter().flat_map(|row| row.into_iter()).collect(),
4653 )
4654 .expect("rectangular test data");
4655 Dataset {
4656 headers: headers.iter().map(|name| name.to_string()).collect(),
4657 values,
4658 schema: DataSchema {
4659 columns: headers
4660 .iter()
4661 .map(|name| SchemaColumn {
4662 name: name.to_string(),
4663 kind: ColumnKindTag::Continuous,
4664 levels: vec![],
4665 })
4666 .collect(),
4667 },
4668 column_kinds: vec![ColumnKindTag::Continuous; ncols],
4669 }
4670 }
4671
4672 fn factor_dataset() -> Dataset {
4673 let rows = (0..24)
4674 .map(|i| {
4675 let x = i as f64 / 23.0;
4676 let g = (i % 2) as f64;
4677 vec![x + g, x, g]
4678 })
4679 .collect::<Vec<_>>();
4680 Dataset {
4681 headers: vec!["y".into(), "x".into(), "g".into()],
4682 values: Array2::from_shape_vec(
4683 (rows.len(), 3),
4684 rows.into_iter().flat_map(|row| row.into_iter()).collect(),
4685 )
4686 .expect("rectangular factor test data"),
4687 schema: DataSchema {
4688 columns: vec![
4689 SchemaColumn {
4690 name: "y".into(),
4691 kind: ColumnKindTag::Continuous,
4692 levels: vec![],
4693 },
4694 SchemaColumn {
4695 name: "x".into(),
4696 kind: ColumnKindTag::Continuous,
4697 levels: vec![],
4698 },
4699 SchemaColumn {
4700 name: "g".into(),
4701 kind: ColumnKindTag::Categorical,
4702 levels: vec!["a".into(), "b".into()],
4703 },
4704 ],
4705 },
4706 column_kinds: vec![
4707 ColumnKindTag::Continuous,
4708 ColumnKindTag::Continuous,
4709 ColumnKindTag::Categorical,
4710 ],
4711 }
4712 }
4713
4714 /// #1378: the DEFAULT univariate `s(x, bs="tp")` must build a *modest*
4715 /// mgcv-sized basis, not the n-scaled spatial heuristic. The oversized
4716 /// default basis left the two-penalty REML ρ-surface with a flat valley
4717 /// whose optimizer landing point depended on row order, breaking
4718 /// row-permutation invariance. Pin the default 1-D center count so a
4719 /// regression that reinstates the n-scaled default trips here, fast, with
4720 /// no fit/optimizer in the loop.
4721 #[test]
4722 fn default_univariate_thinplate_basis_dim_is_modest() {
4723 // n = 300 (the #1378 scenario): the n-scaled spatial heuristic would
4724 // request ~75 centers here. The modest default must stay near k = 10.
4725 let n = 300usize;
4726 let rows: Vec<Vec<f64>> = (0..n)
4727 .map(|i| {
4728 let x = -3.0 + 6.0 * (i as f64) / ((n - 1) as f64);
4729 vec![x.sin(), x]
4730 })
4731 .collect();
4732 let ds = continuous_dataset(&["y", "x"], rows);
4733
4734 let mut options = BTreeMap::new();
4735 options.insert("bs".to_string(), "tp".to_string());
4736
4737 let mut notes = Vec::new();
4738 let basis = build_smooth_basis(
4739 SmoothKind::S,
4740 &["x".to_string()],
4741 &[1],
4742 &options,
4743 &ds,
4744 &mut notes,
4745 &ResourcePolicy::default_library(),
4746 1,
4747 )
4748 .expect("build default univariate tp smooth");
4749
4750 let centers = match &basis {
4751 SmoothBasisSpec::ThinPlate { spec, .. } => match &spec.center_strategy {
4752 CenterStrategy::Auto(inner) => match inner.as_ref() {
4753 CenterStrategy::FarthestPoint { num_centers }
4754 | CenterStrategy::EqualMass { num_centers }
4755 | CenterStrategy::EqualMassCovarRepresentative { num_centers }
4756 | CenterStrategy::KMeans { num_centers, .. } => *num_centers,
4757 other => panic!("unexpected auto inner center strategy: {other:?}"),
4758 },
4759 CenterStrategy::FarthestPoint { num_centers }
4760 | CenterStrategy::EqualMass { num_centers }
4761 | CenterStrategy::EqualMassCovarRepresentative { num_centers }
4762 | CenterStrategy::KMeans { num_centers, .. } => *num_centers,
4763 other => panic!("unexpected center strategy: {other:?}"),
4764 },
4765 other => panic!("expected ThinPlate basis, got {other:?}"),
4766 };
4767
4768 // #1074: the mgcv-sized basis-dim ceiling assertion was removed with the
4769 // cap it tested. The default tp basis is now n-scaled; we only assert it
4770 // still builds a usable basis.
4771 assert!(
4772 centers >= 1,
4773 "default univariate tp must still build a usable basis (centers={centers})",
4774 );
4775 }
4776
4777 /// gam#1629: a default 2-D `matern(x1, x2)` (no explicit `length_scale`)
4778 /// must leave the length-scale at the `0.0` auto sentinel — NOT the full
4779 /// data diameter — so the planner's `auto_init_length_scale_in_place` seeds
4780 /// it on the wiggly/resolving side (`max_range / sqrt(n)`), the same regime
4781 /// thin-plate uses. The previous `default_matern_length_scale` returned the
4782 /// full diameter, which is non-zero, so the `0.0`-gated auto-init was a
4783 /// no-op and the κ-optimizer started in the over-smoothed corner and parked
4784 /// there (truth-RMSE ~6× worse than thin-plate/tensor on identical
4785 /// high-frequency 2-D surfaces, insensitive to `k`). This pins the corrected
4786 /// seed geometry without a fit/optimizer in the loop.
4787 #[test]
4788 fn default_matern_2d_seeds_resolving_length_scale_not_overscaled_diameter() {
4789 // A fine multi-frequency 2-D grid (the #1629 reproduction shape): the
4790 // data diameter is O(1.4) in each axis; the resolving seed must be far
4791 // smaller than the diameter so high-frequency structure stays reachable.
4792 let side = 24usize; // n = 576
4793 let mut rows: Vec<Vec<f64>> = Vec::with_capacity(side * side);
4794 for i in 0..side {
4795 for j in 0..side {
4796 let x1 = i as f64 / (side - 1) as f64; // [0, 1]
4797 let x2 = j as f64 / (side - 1) as f64; // [0, 1]
4798 let y = (6.0 * x1).sin() * (6.0 * x2).cos();
4799 rows.push(vec![y, x1, x2]);
4800 }
4801 }
4802 let n = rows.len();
4803 let ds = continuous_dataset(&["y", "x1", "x2"], rows);
4804
4805 let mut options = BTreeMap::new();
4806 options.insert("bs".to_string(), "gp".to_string()); // gp ⇒ Matérn
4807 let mut notes = Vec::new();
4808 let mut basis = build_smooth_basis(
4809 SmoothKind::S,
4810 &["x1".to_string(), "x2".to_string()],
4811 &[1, 2],
4812 &options,
4813 &ds,
4814 &mut notes,
4815 &ResourcePolicy::default_library(),
4816 1,
4817 )
4818 .expect("build default 2-D matern smooth");
4819
4820 // (1) The builder must emit the auto sentinel, not a baked-in diameter.
4821 let (feature_cols, seeded_length_scale) = match &basis {
4822 SmoothBasisSpec::Matern {
4823 feature_cols, spec, ..
4824 } => (feature_cols.clone(), spec.length_scale),
4825 other => panic!("expected Matern basis, got {other:?}"),
4826 };
4827 assert_eq!(
4828 seeded_length_scale, 0.0,
4829 "default matern() must leave length_scale at the 0.0 auto sentinel \
4830 (got {seeded_length_scale}); a non-zero diameter default re-enters the \
4831 over-smoothed basin and disables the planner's wiggly-side auto-init",
4832 );
4833
4834 // (2) After the shared auto-init runs, the realized length-scale must
4835 // land in the resolving regime: `max_range / sqrt(n)`, far below the
4836 // data diameter. This is the seed the κ-optimizer starts REML from.
4837 crate::smooth::auto_init_length_scale_in_basis(ds.values.view(), &mut basis);
4838 let realized = match &basis {
4839 SmoothBasisSpec::Matern { spec, .. } => spec.length_scale,
4840 other => panic!("expected Matern basis after auto-init, got {other:?}"),
4841 };
4842 let expected =
4843 crate::smooth::auto_initial_length_scale(ds.values.view(), &feature_cols);
4844 assert!(
4845 (realized - expected).abs() <= 1e-12,
4846 "auto-init must seed the wiggly-side length scale max_range/sqrt(n) \
4847 (expected {expected}, got {realized})",
4848 );
4849
4850 // Sanity: the resolving seed is well below the per-axis range (≈1.0).
4851 // Before the fix the seed was the full diameter (≈√2 ≈ 1.414); the
4852 // resolving seed here is ≈ 1.0 / sqrt(576) ≈ 0.042, ~30× smaller.
4853 let max_range = 1.0_f64; // each axis spans [0, 1]
4854 assert!(
4855 realized < max_range / 4.0,
4856 "matern seed length_scale {realized} must be in the resolving regime, \
4857 not the over-smoothed diameter corner (n={n}, max_range≈{max_range})",
4858 );
4859 }
4860
4861 /// gam#1778: `matern(..., periodic=true)` and `thinplate(..., periodic=true)`
4862 /// must be ACCEPTED. The squash-merge that wired periodic support into the
4863 /// matern/thinplate basis specs forgot to add the periodic option keys to
4864 /// those two builders' `validate_known_options` whitelists (only `duchon`
4865 /// got both), so `periodic=`/`period=`/`cyclic=`/`period_start=`/`period_end=`
4866 /// were rejected as unknown options even though the spec/builder consume them.
4867 /// Before the whitelist fix this returned an "unknown option" error.
4868 #[test]
4869 fn matern_and_thinplate_accept_periodic_option() {
4870 let n = 200usize;
4871 let rows: Vec<Vec<f64>> = (0..n)
4872 .map(|i| {
4873 let x = -3.0 + 6.0 * (i as f64) / ((n - 1) as f64);
4874 vec![x.sin(), x]
4875 })
4876 .collect();
4877 let ds = continuous_dataset(&["y", "x"], rows);
4878
4879 // matern() with periodic=true must build without an unknown-option error.
4880 let mut matern_opts = BTreeMap::new();
4881 matern_opts.insert("bs".to_string(), "gp".to_string()); // gp ⇒ Matérn
4882 matern_opts.insert("periodic".to_string(), "true".to_string());
4883 let mut notes = Vec::new();
4884 let matern_basis = build_smooth_basis(
4885 SmoothKind::S,
4886 &["x".to_string()],
4887 &[1],
4888 &matern_opts,
4889 &ds,
4890 &mut notes,
4891 &ResourcePolicy::default_library(),
4892 1,
4893 )
4894 .expect("matern(x, periodic=true) must be accepted");
4895 match &matern_basis {
4896 SmoothBasisSpec::Matern { spec, .. } => assert!(
4897 spec.periodic.is_some(),
4898 "periodic=true must thread a Some(periodic) into the matern spec",
4899 ),
4900 other => panic!("expected Matern basis, got {other:?}"),
4901 }
4902
4903 // thinplate()/tps() with periodic=true must likewise be accepted.
4904 let mut tps_opts = BTreeMap::new();
4905 tps_opts.insert("bs".to_string(), "tp".to_string());
4906 tps_opts.insert("periodic".to_string(), "true".to_string());
4907 let mut notes = Vec::new();
4908 let tps_basis = build_smooth_basis(
4909 SmoothKind::S,
4910 &["x".to_string()],
4911 &[1],
4912 &tps_opts,
4913 &ds,
4914 &mut notes,
4915 &ResourcePolicy::default_library(),
4916 1,
4917 )
4918 .expect("thinplate(x, periodic=true) must be accepted");
4919 match &tps_basis {
4920 SmoothBasisSpec::ThinPlate { spec, .. } => assert!(
4921 spec.periodic.is_some(),
4922 "periodic=true must thread a Some(periodic) into the thinplate spec",
4923 ),
4924 other => panic!("expected ThinPlate basis, got {other:?}"),
4925 }
4926 }
4927
4928 fn inferred_tensor_basis_product(ds: &Dataset) -> usize {
4929 let parsed = parse_formula("y ~ te(theta, h)").expect("parse tensor formula");
4930 let col_map = ds.column_map();
4931 let mut notes = Vec::new();
4932 let terms = build_termspec(
4933 &parsed.terms,
4934 ds,
4935 &col_map,
4936 &mut notes,
4937 &ResourcePolicy::default_library(),
4938 )
4939 .expect("build tensor termspec");
4940 let SmoothBasisSpec::TensorBSpline { spec, .. } = &terms.smooth_terms[0].basis else {
4941 panic!("expected tensor smooth");
4942 };
4943 spec.marginalspecs
4944 .iter()
4945 .map(|marginal| match marginal.knotspec {
4946 BSplineKnotSpec::Generate {
4947 num_internal_knots, ..
4948 } => num_internal_knots + marginal.degree + 1,
4949 BSplineKnotSpec::PeriodicUniform { num_basis, .. } => num_basis,
4950 BSplineKnotSpec::Automatic {
4951 num_internal_knots: Some(num_internal_knots),
4952 ..
4953 } => num_internal_knots + marginal.degree + 1,
4954 BSplineKnotSpec::Automatic {
4955 num_internal_knots: None,
4956 ..
4957 } => panic!("test helper cannot infer automatic knot count"),
4958 BSplineKnotSpec::Provided(ref knots) => {
4959 knots.len().saturating_sub(marginal.degree + 1)
4960 }
4961 // cr basis dimension equals the knot count (no degree offset).
4962 BSplineKnotSpec::NaturalCubicRegression { ref knots } => knots.len(),
4963 })
4964 .product()
4965 }
4966
4967 fn tensor_margin_basis_sizes(ds: &Dataset, formula: &str) -> Vec<usize> {
4968 let parsed = parse_formula(formula).expect("parse tensor formula");
4969 let col_map = ds.column_map();
4970 let mut notes = Vec::new();
4971 let terms = build_termspec(
4972 &parsed.terms,
4973 ds,
4974 &col_map,
4975 &mut notes,
4976 &ResourcePolicy::default_library(),
4977 )
4978 .expect("build tensor termspec");
4979 let SmoothBasisSpec::TensorBSpline { spec, .. } = &terms.smooth_terms[0].basis else {
4980 panic!("expected tensor smooth");
4981 };
4982 spec.marginalspecs
4983 .iter()
4984 .map(|marginal| match marginal.knotspec {
4985 BSplineKnotSpec::Generate {
4986 num_internal_knots, ..
4987 } => num_internal_knots + marginal.degree + 1,
4988 BSplineKnotSpec::PeriodicUniform { num_basis, .. } => num_basis,
4989 BSplineKnotSpec::Automatic {
4990 num_internal_knots: Some(num_internal_knots),
4991 ..
4992 } => num_internal_knots + marginal.degree + 1,
4993 BSplineKnotSpec::Automatic {
4994 num_internal_knots: None,
4995 ..
4996 } => panic!("test helper cannot infer automatic knot count"),
4997 BSplineKnotSpec::Provided(ref knots) => {
4998 knots.len().saturating_sub(marginal.degree + 1)
4999 }
5000 // cr basis dimension equals the knot count (no degree offset).
5001 BSplineKnotSpec::NaturalCubicRegression { ref knots } => knots.len(),
5002 })
5003 .collect()
5004 }
5005
5006 #[test]
5007 fn validate_known_options_lists_valid_option_names_for_unknown_parameter() {
5008 let mut options = BTreeMap::new();
5009 options.insert("lengt_scale".to_string(), "0.25".to_string());
5010 let err = validate_known_options(
5011 "matern",
5012 &options,
5013 &["type", "bs", "length_scale", "centers", "k", "nu"],
5014 )
5015 .expect_err("unknown smooth option should be rejected");
5016 assert!(
5017 err.contains("matern() does not accept option `lengt_scale`"),
5018 "error should name the invalid option, got: {err}"
5019 );
5020 assert!(
5021 err.contains("did you mean one of [length_scale]"),
5022 "error should suggest the closest valid option, got: {err}"
5023 );
5024 assert!(
5025 err.contains("Valid options: ["),
5026 "error should list valid option names, got: {err}"
5027 );
5028 }
5029
5030 #[test]
5031 fn tensor_k_accepts_square_bracket_per_margin_list() {
5032 let ds = continuous_dataset(
5033 &["y", "x", "z"],
5034 (0..40)
5035 .map(|i| {
5036 let x = i as f64 / 39.0;
5037 let z = ((i * 7) % 40) as f64 / 39.0;
5038 vec![x.sin() + z.cos(), x, z]
5039 })
5040 .collect(),
5041 );
5042
5043 assert_eq!(
5044 tensor_margin_basis_sizes(&ds, "y ~ te(x, z, k=[5, 6])"),
5045 vec![5, 6],
5046 "square-bracket k lists should materialize the requested per-margin values"
5047 );
5048 }
5049
5050 /// #1776 / #1752: a bare doubly-cyclic tensor `te(x, z, bs=c('cc','cc'))`
5051 /// with NO explicit `period=` must build — each cyclic margin wraps on its
5052 /// own observed `[min, max]` data span (mirroring mgcv's `bs="cc"` and the
5053 /// 1-D cyclic fallback), instead of hard-erroring "periodic but requires an
5054 /// explicit period". The periodic-radial refactor (c8c3192fa) replaced that
5055 /// fallback with an unconditional `period=`-required error and orphaned the
5056 /// `margin_is_cc` binding that drives it (the #1776 dead-binding `-D
5057 /// warnings` build break). This pins the restored data-range derivation so a
5058 /// regression that drops the `None if margin_is_cc` branch trips here, fast,
5059 /// with no fit/optimizer in the loop.
5060 #[test]
5061 fn bare_doubly_cyclic_tensor_derives_period_from_data_range_1776() {
5062 let ds = continuous_dataset(
5063 &["y", "x", "z"],
5064 (0..40)
5065 .map(|i| {
5066 let x = i as f64 / 39.0;
5067 let z = ((i * 7) % 40) as f64 / 39.0;
5068 vec![x.sin() + z.cos(), x, z]
5069 })
5070 .collect(),
5071 );
5072
5073 let parsed = parse_formula("y ~ te(x, z, bs=c('cc','cc'))")
5074 .expect("parse doubly-cyclic tensor formula");
5075 let col_map = ds.column_map();
5076 let mut notes = Vec::new();
5077 // Must NOT hard-error: the bare cyclic margins derive their period from
5078 // the observed data range (the restored #1752 fallback).
5079 let terms = build_termspec(
5080 &parsed.terms,
5081 &ds,
5082 &col_map,
5083 &mut notes,
5084 &ResourcePolicy::default_library(),
5085 )
5086 .expect(
5087 "bare cc-cc tensor must build via the data-range period fallback (#1776/#1752), \
5088 not hard-error on a missing explicit period",
5089 );
5090 let SmoothBasisSpec::TensorBSpline { spec, .. } = &terms.smooth_terms[0].basis else {
5091 panic!("expected tensor smooth");
5092 };
5093 assert_eq!(
5094 spec.marginalspecs.len(),
5095 2,
5096 "te(x, z) builds exactly two tensor margins"
5097 );
5098 for (axis, marginal) in spec.marginalspecs.iter().enumerate() {
5099 assert!(
5100 matches!(marginal.knotspec, BSplineKnotSpec::PeriodicUniform { .. }),
5101 "cyclic margin {axis} must build a periodic (wrapped) knotspec from the \
5102 data range, got {:?}",
5103 marginal.knotspec
5104 );
5105 }
5106 }
5107
5108 #[test]
5109 fn parse_cylinder_periodic_options_match_requested_forms() {
5110 let mut opts = BTreeMap::new();
5111 opts.insert("periodic".to_string(), "[0]".to_string());
5112 opts.insert("period".to_string(), "[2*pi, None]".to_string());
5113 let axes = parse_periodic_axes(&opts, 2).expect("axes");
5114 let periods = parse_periods(&opts, &axes).expect("periods");
5115 assert_eq!(axes, vec![true, false]);
5116 assert!((periods[0].unwrap() - 2.0 * std::f64::consts::PI).abs() < 1e-12);
5117 assert_eq!(periods[1], None);
5118
5119 let mut boundary_opts = BTreeMap::new();
5120 boundary_opts.insert(
5121 "boundary".to_string(),
5122 "['periodic', 'natural']".to_string(),
5123 );
5124 boundary_opts.insert("period".to_string(), "[2*pi, None]".to_string());
5125 let boundary_axes = parse_periodic_axes(&boundary_opts, 2).expect("boundary axes");
5126 let boundary_periods =
5127 parse_periods(&boundary_opts, &boundary_axes).expect("boundary periods");
5128 assert_eq!(boundary_axes, vec![true, false]);
5129 assert!((boundary_periods[0].unwrap() - 2.0 * std::f64::consts::PI).abs() < 1e-12);
5130 assert_eq!(boundary_periods[1], None);
5131
5132 let mut unicode_opts = BTreeMap::new();
5133 unicode_opts.insert("periodic".to_string(), "[0,1]".to_string());
5134 unicode_opts.insert("period".to_string(), "[2π, τ]".to_string());
5135 let unicode_axes = parse_periodic_axes(&unicode_opts, 2).expect("unicode axes");
5136 let unicode_periods = parse_periods(&unicode_opts, &unicode_axes).expect("unicode periods");
5137 assert_eq!(unicode_axes, vec![true, true]);
5138 assert!((unicode_periods[0].unwrap() - 2.0 * std::f64::consts::PI).abs() < 1e-12);
5139 assert!((unicode_periods[1].unwrap() - std::f64::consts::TAU).abs() < 1e-12);
5140 }
5141
5142 /// The tensor boundary-token guard must ACCEPT `clamped`/`open` (the
5143 /// B-spline-clamped, non-periodic margin spelling) alongside the periodic
5144 /// selectors and the other inert non-periodic markers, and still REJECT a
5145 /// genuine endpoint constraint like `anchored`. This locks the #415 /
5146 /// cylinder fix (`te(theta, z, boundary=['periodic','clamped'])`, mgcv
5147 /// `te(bs=c("cc","ps"))`) in the fast unit lane — the end-to-end cylinder
5148 /// recovery test is R-gated (`run_r` + mgcv), so without this the guard
5149 /// regressing back to rejecting `clamped` would slip through CPU CI.
5150 #[test]
5151 fn tensor_boundary_tokens_accept_clamped_open_reject_anchored() {
5152 fn boundary(raw: &str, dim: usize) -> Result<(), String> {
5153 let mut opts = BTreeMap::new();
5154 opts.insert("boundary".to_string(), raw.to_string());
5155 validate_tensor_boundary_tokens(&opts, dim)
5156 }
5157
5158 // Mixed periodic + clamped (the cylinder) and its bare/case/quote
5159 // variants are all accepted.
5160 for raw in [
5161 "['periodic', 'clamped']",
5162 "['periodic', 'open']",
5163 "['cc', 'clamped']",
5164 "['clamped', 'natural']",
5165 "[Periodic, CLAMPED]",
5166 "c('cc', 'clamped')", // mgcv-style c(...) vector form round-trips
5167 ] {
5168 assert!(
5169 boundary(raw, 2).is_ok(),
5170 "boundary={raw:?} must be accepted (clamped/open/inert non-periodic markers)"
5171 );
5172 }
5173
5174 // `bc=` is an accepted alias for `boundary=`.
5175 let mut bc_opts = BTreeMap::new();
5176 bc_opts.insert("bc".to_string(), "['periodic', 'clamped']".to_string());
5177 assert!(validate_tensor_boundary_tokens(&bc_opts, 2).is_ok());
5178
5179 // A genuine endpoint constraint has no ordinary-margin meaning on a
5180 // tensor and must still be surfaced as a clean unsupported-feature error
5181 // rather than silently dropped.
5182 let err = boundary("['periodic', 'anchored']", 2)
5183 .expect_err("anchored endpoint constraint must be rejected on a tensor margin");
5184 assert!(
5185 err.contains("anchored") && err.contains("not supported"),
5186 "rejection must name the offending token and be an unsupported-feature error: {err}"
5187 );
5188
5189 // Absent boundary/bc is a no-op success.
5190 assert!(validate_tensor_boundary_tokens(&BTreeMap::new(), 2).is_ok());
5191 }
5192
5193 #[test]
5194 fn parse_single_axis_periodic_zero_as_axis_not_false() {
5195 let mut opts = BTreeMap::new();
5196 opts.insert("periodic".to_string(), "[0]".to_string());
5197 opts.insert("period".to_string(), "2*pi".to_string());
5198 opts.insert("origin".to_string(), "0".to_string());
5199 let axes = parse_periodic_axes(&opts, 1).expect("axes");
5200 let periods = parse_periods(&opts, &axes).expect("periods");
5201 let origins = parse_period_origins(&opts, &axes).expect("origins");
5202 assert_eq!(axes, vec![true]);
5203 assert!((periods[0].unwrap() - 2.0 * std::f64::consts::PI).abs() < 1e-12);
5204 assert_eq!(origins[0], Some(0.0));
5205 }
5206
5207 #[test]
5208 fn one_dimensional_bspline_accepts_boundary_periodic() {
5209 let ds = continuous_dataset(
5210 &["y", "theta"],
5211 (0..16)
5212 .map(|i| {
5213 let theta = std::f64::consts::TAU * i as f64 / 16.0;
5214 vec![theta.sin(), theta]
5215 })
5216 .collect(),
5217 );
5218 let parsed = parse_formula("y ~ s(theta, boundary=periodic, period=2*pi, origin=0, k=8)")
5219 .expect("parse");
5220 let col_map = ds.column_map();
5221 let mut notes = Vec::new();
5222 let terms = build_termspec(
5223 &parsed.terms,
5224 &ds,
5225 &col_map,
5226 &mut notes,
5227 &gam_runtime::resource::ResourcePolicy::default_library(),
5228 )
5229 .expect("periodic boundary should build");
5230 let SmoothBasisSpec::BSpline1D { spec, .. } = &terms.smooth_terms[0].basis else {
5231 panic!("expected 1D B-spline");
5232 };
5233 assert!(matches!(
5234 &spec.knotspec,
5235 BSplineKnotSpec::PeriodicUniform {
5236 data_range,
5237 num_basis: 8
5238 } if *data_range == (0.0, std::f64::consts::TAU)
5239 ));
5240 }
5241
5242 #[test]
5243 fn univariate_smooth_accepts_mgcv_cubic_regression_aliases() {
5244 let ds = continuous_dataset(
5245 &["y", "x"],
5246 (0..32)
5247 .map(|i| {
5248 let x = i as f64 / 31.0;
5249 vec![x * x, x]
5250 })
5251 .collect(),
5252 );
5253 let col_map = ds.column_map();
5254
5255 for (selector, expect_double_penalty) in [("cr", false), ("cs", true)] {
5256 let formula = format!("y ~ s(x, bs='{selector}')");
5257 let parsed = parse_formula(&formula).expect("parse cr/cs smooth");
5258 let mut notes = Vec::new();
5259 let terms = build_termspec(
5260 &parsed.terms,
5261 &ds,
5262 &col_map,
5263 &mut notes,
5264 &gam_runtime::resource::ResourcePolicy::default_library(),
5265 )
5266 .unwrap_or_else(|err| panic!("bs='{selector}' must build a 1-D smooth, got: {err:?}"));
5267 let SmoothBasisSpec::BSpline1D { spec, .. } = &terms.smooth_terms[0].basis else {
5268 panic!(
5269 "bs='{selector}' must lower to a BSpline1D; got {:?}",
5270 terms.smooth_terms[0].basis
5271 );
5272 };
5273 assert_eq!(
5274 spec.double_penalty, expect_double_penalty,
5275 "bs='{selector}' must default double_penalty to mgcv's convention \
5276 (cr=no-shrinkage, cs=shrinkage); got double_penalty={}",
5277 spec.double_penalty
5278 );
5279 }
5280 }
5281
5282 #[test]
5283 fn univariate_ps_small_k_degree_reduces_through_build(/* gam#1130 */) {
5284 // mgcv accepts `s(x, bs="ps", k=3)` (and the default cubic-regression
5285 // `s(x, k=3)`) by silently reducing the cubic basis to a quadratic.
5286 // The univariate ps/bspline build path used to reject this with
5287 // "k too small for degree 3"; it must now lower to a degree-2 basis
5288 // with zero internal knots (num_basis = k = 3), matching the te(...)
5289 // margin behaviour fixed in b75f55a91. Verified across the ps alias
5290 // and the default (cr) selector that both route through
5291 // parse_ps_internal_knots.
5292 let ds = continuous_dataset(
5293 &["y", "x"],
5294 (0..32)
5295 .map(|i| {
5296 let x = i as f64 / 31.0;
5297 vec![x * x, x]
5298 })
5299 .collect(),
5300 );
5301 let col_map = ds.column_map();
5302
5303 for formula in ["y ~ s(x, bs='ps', k=3)", "y ~ s(x, k=3)"] {
5304 let parsed = parse_formula(formula).expect("parse small-k ps/cr smooth");
5305 let mut notes = Vec::new();
5306 let terms = build_termspec(
5307 &parsed.terms,
5308 &ds,
5309 &col_map,
5310 &mut notes,
5311 &gam_runtime::resource::ResourcePolicy::default_library(),
5312 )
5313 .unwrap_or_else(|err| {
5314 panic!("`{formula}` must degree-reduce, not error; got: {err:?}")
5315 });
5316 let SmoothBasisSpec::BSpline1D { spec, .. } = &terms.smooth_terms[0].basis else {
5317 panic!(
5318 "`{formula}` must lower to a BSpline1D; got {:?}",
5319 terms.smooth_terms[0].basis
5320 );
5321 };
5322 assert_eq!(
5323 spec.degree, 2,
5324 "`{formula}` must drop the cubic default to a quadratic basis"
5325 );
5326 let num_internal = match &spec.knotspec {
5327 BSplineKnotSpec::Generate {
5328 num_internal_knots, ..
5329 } => *num_internal_knots,
5330 BSplineKnotSpec::Automatic {
5331 num_internal_knots: Some(n),
5332 ..
5333 } => *n,
5334 other => panic!("`{formula}` unexpected knotspec: {other:?}"),
5335 };
5336 assert_eq!(
5337 num_internal, 0,
5338 "`{formula}` must have zero internal knots (num_basis = k = 3)"
5339 );
5340 // Resulting basis dimension is num_internal + degree + 1 = 3 = k.
5341 assert!(
5342 spec.penalty_order >= 1 && spec.penalty_order <= spec.degree,
5343 "`{formula}` penalty_order {} must satisfy 1 <= order <= degree={}",
5344 spec.penalty_order,
5345 spec.degree
5346 );
5347 }
5348 }
5349
5350 #[test]
5351 fn formula_shape_constraint_round_trips_and_rejects_bogus() {
5352 let ds = continuous_dataset(
5353 &["y", "x"],
5354 (0..32)
5355 .map(|i| {
5356 let x = i as f64 / 31.0;
5357 vec![x * x, x]
5358 })
5359 .collect(),
5360 );
5361 let col_map = ds.column_map();
5362
5363 let parsed =
5364 parse_formula("y ~ s(x, shape=monotone_increasing)").expect("parse monotone smooth");
5365 let mut notes = Vec::new();
5366 let terms = build_termspec(
5367 &parsed.terms,
5368 &ds,
5369 &col_map,
5370 &mut notes,
5371 &gam_runtime::resource::ResourcePolicy::default_library(),
5372 )
5373 .expect("monotone smooth should build");
5374 assert_eq!(
5375 terms.smooth_terms[0].shape,
5376 ShapeConstraint::MonotoneIncreasing
5377 );
5378
5379 let parsed_bad = parse_formula("y ~ s(x, shape=bogus)").expect("parse bogus shape");
5380 let mut notes_bad = Vec::new();
5381 let err = build_termspec(
5382 &parsed_bad.terms,
5383 &ds,
5384 &col_map,
5385 &mut notes_bad,
5386 &gam_runtime::resource::ResourcePolicy::default_library(),
5387 )
5388 .expect_err("bogus shape must error");
5389 assert!(
5390 format!("{err:?}").contains("unknown shape constraint"),
5391 "got: {err:?}"
5392 );
5393 }
5394
5395 #[test]
5396 fn default_sphere_smooth_uses_spherical_farthest_point_centers() {
5397 let ds = continuous_dataset(
5398 &["y", "lat", "lon"],
5399 (0..24)
5400 .map(|i| {
5401 let t = i as f64 / 24.0;
5402 let lat = -60.0 + 120.0 * t;
5403 let lon = -180.0 + 360.0 * ((7 * i) % 24) as f64 / 24.0;
5404 vec![lat.to_radians().sin(), lat, lon]
5405 })
5406 .collect(),
5407 );
5408 let parsed = parse_formula("y ~ sphere(lat, lon)").expect("parse");
5409 let col_map = ds.column_map();
5410 let mut notes = Vec::new();
5411 let terms = build_termspec(
5412 &parsed.terms,
5413 &ds,
5414 &col_map,
5415 &mut notes,
5416 &gam_runtime::resource::ResourcePolicy::default_library(),
5417 )
5418 .expect("build sphere termspec");
5419 let SmoothBasisSpec::Sphere { spec, .. } = &terms.smooth_terms[0].basis else {
5420 panic!("expected sphere term");
5421 };
5422 assert!(matches!(
5423 spec.center_strategy,
5424 CenterStrategy::FarthestPoint { .. }
5425 ));
5426 }
5427
5428 #[test]
5429 fn one_dimensional_duchon_defaults_to_scale_free_length_scale() {
5430 let ds = continuous_dataset(
5431 &["y", "x"],
5432 (0..32)
5433 .map(|i| {
5434 let x = i as f64 / 31.0;
5435 vec![(std::f64::consts::TAU * x).sin(), x]
5436 })
5437 .collect(),
5438 );
5439 let parsed = parse_formula("y ~ duchon(x)").expect("parse");
5440 let col_map = ds.column_map();
5441 let mut notes = Vec::new();
5442 let terms = build_termspec(
5443 &parsed.terms,
5444 &ds,
5445 &col_map,
5446 &mut notes,
5447 &gam_runtime::resource::ResourcePolicy::default_library(),
5448 )
5449 .expect("build default duchon termspec");
5450 let SmoothBasisSpec::Duchon { spec, .. } = &terms.smooth_terms[0].basis else {
5451 panic!("expected Duchon term");
5452 };
5453 assert_eq!(spec.length_scale, None);
5454 }
5455
5456 #[test]
5457 fn one_dimensional_duchon_length_scale_opts_into_hybrid_mode() {
5458 let ds = continuous_dataset(
5459 &["y", "x"],
5460 (0..32)
5461 .map(|i| {
5462 let x = i as f64 / 31.0;
5463 vec![(std::f64::consts::TAU * x).sin(), x]
5464 })
5465 .collect(),
5466 );
5467 let parsed = parse_formula("y ~ duchon(x, length_scale=0.25)").expect("parse");
5468 let col_map = ds.column_map();
5469 let mut notes = Vec::new();
5470 let terms = build_termspec(
5471 &parsed.terms,
5472 &ds,
5473 &col_map,
5474 &mut notes,
5475 &gam_runtime::resource::ResourcePolicy::default_library(),
5476 )
5477 .expect("build hybrid duchon termspec");
5478 let SmoothBasisSpec::Duchon { spec, .. } = &terms.smooth_terms[0].basis else {
5479 panic!("expected Duchon term");
5480 };
5481 assert_eq!(spec.length_scale, Some(0.25));
5482 }
5483
5484 #[test]
5485 fn parse_matern_nu_accepts_equivalent_half_integer_forms() {
5486 let cases = [
5487 ("1/2", MaternNu::Half),
5488 (" 1 / 2 ", MaternNu::Half),
5489 (".5", MaternNu::Half),
5490 ("0.50", MaternNu::Half),
5491 ("half", MaternNu::Half),
5492 ("3 / 2", MaternNu::ThreeHalves),
5493 ("1.50", MaternNu::ThreeHalves),
5494 ("5 / 2", MaternNu::FiveHalves),
5495 ("2.500000000000", MaternNu::FiveHalves),
5496 ("7 / 2", MaternNu::SevenHalves),
5497 ("3.50", MaternNu::SevenHalves),
5498 ("9 / 2", MaternNu::NineHalves),
5499 ("4.50", MaternNu::NineHalves),
5500 ];
5501 for (raw, expected) in cases {
5502 let parsed = parse_matern_nu(raw).expect(raw);
5503 assert!(
5504 matches!(
5505 (parsed, expected),
5506 (MaternNu::Half, MaternNu::Half)
5507 | (MaternNu::ThreeHalves, MaternNu::ThreeHalves)
5508 | (MaternNu::FiveHalves, MaternNu::FiveHalves)
5509 | (MaternNu::SevenHalves, MaternNu::SevenHalves)
5510 | (MaternNu::NineHalves, MaternNu::NineHalves)
5511 ),
5512 "parsed {raw:?} as {parsed:?}, expected {expected:?}"
5513 );
5514 }
5515 }
5516
5517 #[test]
5518 fn parse_matern_nu_rejects_unsupported_or_invalid_values() {
5519 for raw in ["1", "2", "11/2", "1/0", "nan", "fast"] {
5520 let err = parse_matern_nu(raw).expect_err(raw);
5521 assert!(
5522 err.contains("supported half-integer values"),
5523 "unexpected error for {raw:?}: {err}"
5524 );
5525 }
5526 }
5527
5528 #[test]
5529 fn parse_ps_k_promotes_underexpressive_cubic_basis() {
5530 let mut opts = BTreeMap::new();
5531 opts.insert("k".to_string(), "4".to_string());
5532 let (internal, inferred, eff_degree) = parse_ps_internal_knots(&opts, 3, 20).expect("k=4");
5533 assert_eq!(internal, 2);
5534 assert_eq!(eff_degree, 3);
5535 assert!(!inferred);
5536
5537 opts.insert("k".to_string(), "6".to_string());
5538 let (internal, inferred, eff_degree) = parse_ps_internal_knots(&opts, 3, 20).expect("k=6");
5539 assert_eq!(internal, 2);
5540 assert_eq!(eff_degree, 3);
5541 assert!(!inferred);
5542
5543 opts.insert("k".to_string(), "10".to_string());
5544 let (internal, inferred, eff_degree) = parse_ps_internal_knots(&opts, 3, 20).expect("k=10");
5545 assert_eq!(internal, 6);
5546 assert_eq!(eff_degree, 3);
5547 assert!(!inferred);
5548 }
5549
5550 #[test]
5551 fn parse_ps_internal_knots_drops_degree_for_small_k() {
5552 // mgcv's `s(x, bs="ps", k=3)` with the default cubic basis silently
5553 // reduces to a quadratic (`degree=2`) marginal. `k=3, degree=3`
5554 // should yield a quadratic basis with zero internal knots
5555 // (`num_basis = k = 3`).
5556 let mut opts = BTreeMap::new();
5557 opts.insert("k".to_string(), "3".to_string());
5558 let (internal, inferred, eff_degree) = parse_ps_internal_knots(&opts, 3, 20).expect("k=3");
5559 assert_eq!(eff_degree, 2);
5560 assert_eq!(internal, 0);
5561 assert!(!inferred);
5562
5563 // `k=2` reduces to a linear (`degree=1`) marginal — the smallest
5564 // non-trivial spline basis.
5565 opts.insert("k".to_string(), "2".to_string());
5566 let (internal, inferred, eff_degree) = parse_ps_internal_knots(&opts, 3, 20).expect("k=2");
5567 assert_eq!(eff_degree, 1);
5568 assert_eq!(internal, 0);
5569 assert!(!inferred);
5570
5571 // The under-2 case is structurally under-specified and rejected even
5572 // by the degree-reducing variant: no B-spline basis has fewer than
5573 // two functions.
5574 opts.insert("k".to_string(), "1".to_string());
5575 let err = parse_ps_internal_knots(&opts, 3, 20)
5576 .expect_err("k=1 is below the irreducible spline floor");
5577 assert!(err.contains("requires k >= 2"), "unexpected error: {err}");
5578
5579 // When the user already passed `k >= degree+1`, the helper must
5580 // preserve the existing knot geometry exactly.
5581 opts.insert("k".to_string(), "4".to_string());
5582 let (internal, inferred, eff_degree) = parse_ps_internal_knots(&opts, 3, 20).expect("k=4");
5583 assert_eq!(eff_degree, 3);
5584 assert_eq!(internal, 2);
5585 assert!(!inferred);
5586 }
5587
5588 #[test]
5589 fn factor_smooth_marginal_degree_reduces_for_small_k() {
5590 let ds = factor_dataset();
5591 let col_map = ds.column_map();
5592
5593 for (k, expected_degree) in [(3usize, 2usize), (2usize, 1usize)] {
5594 let parsed =
5595 parse_formula(&format!("y ~ s(x, g, bs=fs, k={k})")).expect("parse factor smooth");
5596 let mut notes = Vec::new();
5597 let terms = build_termspec(
5598 &parsed.terms,
5599 &ds,
5600 &col_map,
5601 &mut notes,
5602 &gam_runtime::resource::ResourcePolicy::default_library(),
5603 )
5604 .unwrap_or_else(|err| panic!("fs k={k} should degree-reduce, got: {err:?}"));
5605 let SmoothBasisSpec::FactorSmooth { spec } = &terms.smooth_terms[0].basis else {
5606 panic!(
5607 "expected factor smooth, got {:?}",
5608 terms.smooth_terms[0].basis
5609 );
5610 };
5611 assert_eq!(spec.marginal.degree, expected_degree);
5612 assert!(
5613 spec.marginal.penalty_order <= spec.marginal.degree,
5614 "penalty_order {} must be clamped to degree {}",
5615 spec.marginal.penalty_order,
5616 spec.marginal.degree
5617 );
5618 let basis_size = match spec.marginal.knotspec {
5619 BSplineKnotSpec::Generate {
5620 num_internal_knots, ..
5621 } => num_internal_knots + spec.marginal.degree + 1,
5622 BSplineKnotSpec::Automatic {
5623 num_internal_knots: Some(num_internal_knots),
5624 ..
5625 } => num_internal_knots + spec.marginal.degree + 1,
5626 ref other => panic!("unexpected factor-smooth knotspec: {other:?}"),
5627 };
5628 assert_eq!(basis_size, k);
5629 }
5630 }
5631
5632 /// Build a dataset with a ternary continuous covariate `x ∈ {0,1,2}` and a
5633 /// 2-level categorical group `g`, for the low-cardinality cr-cap tests.
5634 fn ternary_factor_dataset() -> Dataset {
5635 let rows = (0..120)
5636 .map(|i| {
5637 let x = (i % 3) as f64;
5638 let g = (i % 2) as f64;
5639 vec![x + g, x, g]
5640 })
5641 .collect::<Vec<_>>();
5642 Dataset {
5643 headers: vec!["y".into(), "x".into(), "g".into()],
5644 values: Array2::from_shape_vec(
5645 (rows.len(), 3),
5646 rows.into_iter().flat_map(|row| row.into_iter()).collect(),
5647 )
5648 .expect("rectangular ternary factor test data"),
5649 schema: DataSchema {
5650 columns: vec![
5651 SchemaColumn {
5652 name: "y".into(),
5653 kind: ColumnKindTag::Continuous,
5654 levels: vec![],
5655 },
5656 SchemaColumn {
5657 name: "x".into(),
5658 kind: ColumnKindTag::Continuous,
5659 levels: vec![],
5660 },
5661 SchemaColumn {
5662 name: "g".into(),
5663 kind: ColumnKindTag::Categorical,
5664 levels: vec!["a".into(), "b".into()],
5665 },
5666 ],
5667 },
5668 column_kinds: vec![
5669 ColumnKindTag::Continuous,
5670 ColumnKindTag::Continuous,
5671 ColumnKindTag::Categorical,
5672 ],
5673 }
5674 }
5675
5676 #[test]
5677 fn univariate_cr_smooth_caps_knots_to_data_support() {
5678 // #1541: `s(x, bs=cr, k=10)` on a ternary covariate (3 distinct values)
5679 // must NOT hard-fail in cr-knot selection ("cubic regression spline with
5680 // k=10 requires at least 10 distinct values, got 3"). The cr basis is
5681 // capped to the data support — exactly 3 value-knots at {0,1,2} — which
5682 // is full-rank for the data, so it can still represent any 3 group means.
5683 let ds = continuous_dataset(
5684 &["y", "x"],
5685 (0..90)
5686 .map(|i| vec![(i % 3) as f64, (i % 3) as f64])
5687 .collect(),
5688 );
5689 let col_map = ds.column_map();
5690 let parsed = parse_formula("y ~ s(x, bs=cr, k=10)").expect("parse cr smooth");
5691 let mut notes = Vec::new();
5692 let terms = build_termspec(
5693 &parsed.terms,
5694 &ds,
5695 &col_map,
5696 &mut notes,
5697 &gam_runtime::resource::ResourcePolicy::default_library(),
5698 )
5699 .expect("cr k=10 must cap to data support instead of erroring");
5700 let SmoothBasisSpec::BSpline1D { spec, .. } = &terms.smooth_terms[0].basis else {
5701 panic!("expected BSpline1D for s(x, bs=cr)");
5702 };
5703 let BSplineKnotSpec::NaturalCubicRegression { knots } = &spec.knotspec else {
5704 panic!("expected cr knotspec, got {:?}", spec.knotspec);
5705 };
5706 // Capped to exactly the 3 distinct covariate values.
5707 assert_eq!(knots.len(), 3, "cr basis not capped to 3 distinct values");
5708 assert_eq!(knots.as_slice().unwrap(), &[0.0, 1.0, 2.0]);
5709 // The reduction is surfaced to the user (mgcv warns in the same case).
5710 assert!(
5711 notes.iter().any(|n| n.contains("data-support cap")),
5712 "cap not reported in inference notes: {notes:?}"
5713 );
5714 }
5715
5716 #[test]
5717 fn univariate_cr_smooth_binary_covariate_degrades_to_bspline() {
5718 // #1541: a BINARY covariate has too few distinct values (2) for ANY cr
5719 // spline (needs >= 3 distinct). `s(x, bs=cr)` must degrade to a B-spline
5720 // marginal — the default basis the same data already fits — NOT hard-fail.
5721 let ds = continuous_dataset(
5722 &["y", "x"],
5723 (0..80)
5724 .map(|i| vec![(i % 2) as f64, (i % 2) as f64])
5725 .collect(),
5726 );
5727 let col_map = ds.column_map();
5728 let parsed = parse_formula("y ~ s(x, bs=cr, k=10)").expect("parse cr smooth");
5729 let mut notes = Vec::new();
5730 let terms = build_termspec(
5731 &parsed.terms,
5732 &ds,
5733 &col_map,
5734 &mut notes,
5735 &gam_runtime::resource::ResourcePolicy::default_library(),
5736 )
5737 .expect("binary cr must degrade to B-spline instead of erroring");
5738 let SmoothBasisSpec::BSpline1D { spec, .. } = &terms.smooth_terms[0].basis else {
5739 panic!("expected BSpline1D for s(x, bs=cr)");
5740 };
5741 assert!(
5742 !matches!(
5743 spec.knotspec,
5744 BSplineKnotSpec::NaturalCubicRegression { .. }
5745 ),
5746 "binary covariate must NOT build a cr basis, got {:?}",
5747 spec.knotspec
5748 );
5749 assert!(
5750 notes
5751 .iter()
5752 .any(|n| n.contains("Degraded to the linear B-spline")),
5753 "degradation not reported in inference notes: {notes:?}"
5754 );
5755 }
5756
5757 #[test]
5758 fn sz_factor_smooth_low_cardinality_uses_bspline_marginal() {
5759 // #1605: the `sz` factor-smooth marginal is the SAME penalized B-spline
5760 // the `fs` sibling uses — NOT a natural cubic regression (`cr`) marginal,
5761 // whose hard natural boundary conditions f''=0 bias curved deviations
5762 // (a consistency failure). #1542 (the reason this test exists) is
5763 // subsumed: with a B-spline marginal a low-cardinality covariate no
5764 // longer needs a special cr data-support cap and can never hard-fail the
5765 // way the old cr-marginal `sz` spelling did — the build just succeeds,
5766 // exactly as `fs` already does on the identical data.
5767 let ds = ternary_factor_dataset();
5768 let col_map = ds.column_map();
5769 let parsed = parse_formula("y ~ s(x, g, bs=sz, k=10)").expect("parse sz factor smooth");
5770 let mut notes = Vec::new();
5771 let terms = build_termspec(
5772 &parsed.terms,
5773 &ds,
5774 &col_map,
5775 &mut notes,
5776 &gam_runtime::resource::ResourcePolicy::default_library(),
5777 )
5778 .expect("sz on a ternary covariate must build (B-spline marginal), not hard-fail");
5779 let SmoothBasisSpec::FactorSmooth { spec } = &terms.smooth_terms[0].basis else {
5780 panic!("expected FactorSmooth for s(x, g, bs=sz)");
5781 };
5782 assert!(
5783 !matches!(
5784 spec.marginal.knotspec,
5785 BSplineKnotSpec::NaturalCubicRegression { .. }
5786 ),
5787 "sz marginal must be a B-spline (curvature-capable), not the \
5788 natural-BC cr basis; got {:?}",
5789 spec.marginal.knotspec
5790 );
5791 }
5792
5793 /// A dataset with a genuinely continuous covariate `x` (many distinct
5794 /// values) and a `L`-level grouping factor `g`, suitable for building a
5795 /// real factor-smooth marginal with a non-trivial {const, linear} null
5796 /// space. `y` is unused by the structural penalty checks below.
5797 fn continuous_x_factor_dataset(n: usize, n_groups: usize) -> Dataset {
5798 let rows = (0..n)
5799 .map(|i| {
5800 let x = i as f64 / (n as f64 - 1.0);
5801 let g = (i % n_groups) as f64;
5802 vec![x + g, x, g]
5803 })
5804 .collect::<Vec<_>>();
5805 let levels: Vec<String> = (0..n_groups).map(|k| format!("g{k}")).collect();
5806 Dataset {
5807 headers: vec!["y".into(), "x".into(), "g".into()],
5808 values: Array2::from_shape_vec(
5809 (rows.len(), 3),
5810 rows.into_iter().flat_map(|row| row.into_iter()).collect(),
5811 )
5812 .expect("rectangular continuous-x factor data"),
5813 schema: DataSchema {
5814 columns: vec![
5815 SchemaColumn {
5816 name: "y".into(),
5817 kind: ColumnKindTag::Continuous,
5818 levels: vec![],
5819 },
5820 SchemaColumn {
5821 name: "x".into(),
5822 kind: ColumnKindTag::Continuous,
5823 levels: vec![],
5824 },
5825 SchemaColumn {
5826 name: "g".into(),
5827 kind: ColumnKindTag::Categorical,
5828 levels,
5829 },
5830 ],
5831 },
5832 column_kinds: vec![
5833 ColumnKindTag::Continuous,
5834 ColumnKindTag::Continuous,
5835 ColumnKindTag::Categorical,
5836 ],
5837 }
5838 }
5839
5840 fn factor_smooth_spec_for(formula: &str, ds: &Dataset) -> FactorSmoothSpec {
5841 let col_map = ds.column_map();
5842 let parsed = parse_formula(formula).expect("parse factor smooth formula");
5843 let mut notes = Vec::new();
5844 let terms = build_termspec(
5845 &parsed.terms,
5846 ds,
5847 &col_map,
5848 &mut notes,
5849 &gam_runtime::resource::ResourcePolicy::default_library(),
5850 )
5851 .expect("build factor smooth term");
5852 let SmoothBasisSpec::FactorSmooth { spec } = &terms.smooth_terms[0].basis else {
5853 panic!("expected FactorSmooth basis for `{formula}`");
5854 };
5855 spec.clone()
5856 }
5857
5858 /// #1605: the sum-to-zero factor smooth `s(x, g, bs="sz")` under-fit data
5859 /// drawn from its own model class because its deviation blocks carried ONLY
5860 /// the marginal wiggliness penalty — the {const, linear} null space of every
5861 /// deviation curve was left completely unpenalized, so the single combined
5862 /// wiggliness λ could not separate per-group intercept/slope variance from
5863 /// curvature variance and REML parked it over-smoothed (same defect class as
5864 /// the closed #700, more severe). mgcv's `bs="fs"` sibling avoids the gap by
5865 /// adding a SEPARATE per-null-dimension ridge (one λ each), the
5866 /// double-penalty `I_L ⊗ S_j` structure. The fix gives `sz` the same
5867 /// null-space-ridge structure, mapped into the zero-sum CONTRAST space so the
5868 /// constraint (and `sz`'s distinctness from `fs`) is preserved.
5869 ///
5870 /// This pins the structural defect: after the fix the `sz` deviation build
5871 /// must carry MORE than just its wiggliness penalty(s) — exactly one extra
5872 /// null-space-ridge penalty per marginal null direction, matching the count
5873 /// that `fs` carries — while keeping the narrower `(L-1)·p` zero-sum design
5874 /// (NOT the `L·p` full-rank `fs` design). Before the fix `sz` carried only
5875 /// the wiggliness penalties and this fails.
5876 #[test]
5877 fn sz_factor_smooth_carries_null_space_ridge_like_fs() {
5878 let ds = continuous_x_factor_dataset(180, 4);
5879 let mut workspace = crate::basis::BasisWorkspace::new();
5880
5881 let sz_spec = factor_smooth_spec_for("y ~ s(x, g, bs=sz, k=8)", &ds);
5882 let sz_built = crate::smooth::build_factor_smooth(
5883 ds.values.view(),
5884 &sz_spec,
5885 "sz_term",
5886 &mut workspace,
5887 )
5888 .expect("build sz factor smooth");
5889
5890 let fs_spec = factor_smooth_spec_for("y ~ s(x, g, bs=fs, k=8)", &ds);
5891 let fs_built = crate::smooth::build_factor_smooth(
5892 ds.values.view(),
5893 &fs_spec,
5894 "fs_term",
5895 &mut workspace,
5896 )
5897 .expect("build fs factor smooth");
5898
5899 // Penalty structure (#1074 + #1605). `fs` is the exchangeable
5900 // random-effect smooth: all `L` level blocks share ONE wiggliness λ per
5901 // marginal penalty, plus one rank-1 null-space ridge per marginal null
5902 // direction (the #1605 double penalty). `sz` is the sum-to-zero factor
5903 // smooth and mgcv's `smooth.construct.sz` emits ONE penalty matrix PER
5904 // LEVEL — `L` independent curvature smoothing parameters — so REML can
5905 // shrink a low-amplitude group's deviation hard while leaving a busy
5906 // group nearly unpenalized. We mirror that: the single marginal
5907 // wiggliness penalty is split into its `L` independent zero-sum-contrast
5908 // summands (`L-1` free per-group blocks `(e_k e_kᵀ)⊗S` + the reference
5909 // coupling block `(11ᵀ)⊗S`), each carrying its own λ, and the null-space
5910 // ridges stay POOLED (the per-group intercept/slope shrinkage mgcv pools
5911 // under one variance even for `sz`).
5912 //
5913 // So with `nw` marginal wiggliness penalties and `nn` marginal null
5914 // directions: fs has `nw + nn` penalties; sz has `L·nw + nn`. sz must
5915 // therefore carry strictly MORE penalties than fs (the per-group split),
5916 // and the surplus must be exactly `(L-1)·nw`.
5917 let n_levels = sz_spec
5918 .group_frozen_levels
5919 .as_ref()
5920 .map(|l| l.len())
5921 .unwrap_or(4);
5922 assert!(n_levels >= 3, "test needs >=3 groups, got {n_levels}");
5923
5924 // fs = nw + nn ⇒ nn = fs_penalties - nw. The marginal has nw==1
5925 // wiggliness penalty (a single difference/curvature operator), so the
5926 // per-group split adds exactly (L-1)·nw = (L-1) extra penalties on top of
5927 // fs's count.
5928 let nw = 1usize; // one marginal wiggliness penalty for the B-spline marginal
5929 let expected_sz = fs_built.penalties.len() + (n_levels - 1) * nw;
5930 assert_eq!(
5931 sz_built.penalties.len(),
5932 expected_sz,
5933 "sz must split its wiggliness penalty per level (#1074): expected \
5934 fs_count {} + (L-1)·nw {} = {}, but sz had {}",
5935 fs_built.penalties.len(),
5936 (n_levels - 1) * nw,
5937 expected_sz,
5938 sz_built.penalties.len(),
5939 );
5940 assert!(
5941 sz_built.penalties.len() > fs_built.penalties.len(),
5942 "sz must carry strictly more penalties than fs after the per-group \
5943 split (sz={}, fs={})",
5944 sz_built.penalties.len(),
5945 fs_built.penalties.len(),
5946 );
5947
5948 // The null-space ridges must still be present (the #1605 property that
5949 // keeps the deviation curvature un-over-smoothed). After removing the `L`
5950 // per-group wiggliness blocks, the remainder are the pooled null ridges,
5951 // and there must be at least one (a B-spline marginal has a non-empty
5952 // {const, linear} null space).
5953 let n_wiggliness = n_levels * nw; // L per-group blocks
5954 assert!(
5955 sz_built.penalties.len() > n_wiggliness,
5956 "sz deviation block carries no null-space ridge (penalties={}, \
5957 wiggliness blocks={}); the null space is unpenalized and REML \
5958 over-smooths the deviations",
5959 sz_built.penalties.len(),
5960 n_wiggliness,
5961 );
5962
5963 // The zero-sum constraint must be preserved: the sz design must stay the
5964 // NARROWER `(L-1)·p` contrast design, strictly narrower than the fs
5965 // full-rank `L·p` design. This guards against "fixing" sz by making it
5966 // identical to fs (which would break identifiability / sum-to-zero).
5967 assert!(
5968 sz_built.dim < fs_built.dim,
5969 "sz design width {} must be strictly less than fs width {} \
5970 (zero-sum contrast drops one level block)",
5971 sz_built.dim,
5972 fs_built.dim,
5973 );
5974
5975 // Every penalty/metadata vector must stay parallel (length invariant the
5976 // downstream REML assembly relies on).
5977 assert_eq!(sz_built.penalties.len(), sz_built.nullspaces.len());
5978 assert_eq!(sz_built.penalties.len(), sz_built.penaltyinfo.len());
5979 assert_eq!(sz_built.penalties.len(), sz_built.null_eigenvectors.len());
5980 }
5981
5982 /// #1457: `y ~ s(x, by=g) + g` with a BARE categorical `g` must NOT lower to
5983 /// two `g` design blocks. The bare `+ g` is auto-promoted to a single
5984 /// penalized random-effect block owning the factor's full level offsets; the
5985 /// `by=` branch must then recognize that owner and skip adding its own
5986 /// unpenalized treatment-coded main effect. Before the fix the dedup guard
5987 /// recognized only explicit `group(g)` (a `ParsedTerm::RandomEffect`), so the
5988 /// auto-promoted bare-`+ g` block slipped past and a spurious second `g`
5989 /// block (plus an extra smoothing parameter) was added. Assert exactly ONE
5990 /// `g` random/categorical block, and that adding the bare `+ g` introduces no
5991 /// extra `g` blocks beyond `y ~ s(x, by=g)` alone.
5992 fn factor_dataset_l3() -> Dataset {
5993 // `g` is categorical with THREE levels (encoded 0.0/1.0/2.0).
5994 let rows = (0..30)
5995 .map(|i| {
5996 let x = i as f64 / 29.0;
5997 let g = (i % 3) as f64;
5998 vec![x + g, x, g]
5999 })
6000 .collect::<Vec<_>>();
6001 Dataset {
6002 headers: vec!["y".into(), "x".into(), "g".into()],
6003 values: Array2::from_shape_vec(
6004 (rows.len(), 3),
6005 rows.into_iter().flat_map(|row| row.into_iter()).collect(),
6006 )
6007 .expect("rectangular L=3 factor test data"),
6008 schema: DataSchema {
6009 columns: vec![
6010 SchemaColumn {
6011 name: "y".into(),
6012 kind: ColumnKindTag::Continuous,
6013 levels: vec![],
6014 },
6015 SchemaColumn {
6016 name: "x".into(),
6017 kind: ColumnKindTag::Continuous,
6018 levels: vec![],
6019 },
6020 SchemaColumn {
6021 name: "g".into(),
6022 kind: ColumnKindTag::Categorical,
6023 levels: vec!["a".into(), "b".into(), "c".into()],
6024 },
6025 ],
6026 },
6027 column_kinds: vec![
6028 ColumnKindTag::Continuous,
6029 ColumnKindTag::Continuous,
6030 ColumnKindTag::Categorical,
6031 ],
6032 }
6033 }
6034
6035 #[test]
6036 fn factor_by_smooth_plus_bare_categorical_does_not_duplicate_factor_block() {
6037 let ds = factor_dataset_l3();
6038 let col_map = ds.column_map();
6039
6040 let g_blocks = |formula: &str| -> usize {
6041 let parsed = parse_formula(formula).expect("parse by-smooth formula");
6042 let mut notes = Vec::new();
6043 let terms = build_termspec(
6044 &parsed.terms,
6045 &ds,
6046 &col_map,
6047 &mut notes,
6048 &ResourcePolicy::default_library(),
6049 )
6050 .unwrap_or_else(|err| panic!("`{formula}` must build, got: {err:?}"));
6051 terms
6052 .random_effect_terms
6053 .iter()
6054 .filter(|rt| rt.name == "g")
6055 .count()
6056 };
6057
6058 // Baseline: the standalone factor-by smooth carries exactly ONE `g`
6059 // block (the unpenalized treatment-coded factor main effect added by the
6060 // `by=` branch).
6061 let by_only = g_blocks("y ~ s(x, by=g, k=10)");
6062 assert_eq!(
6063 by_only, 1,
6064 "`y ~ s(x, by=g)` must produce exactly one `g` design block"
6065 );
6066
6067 // The bug: adding a bare `+ g` (auto-promoted to a penalized random
6068 // block owning the same level offsets) must NOT introduce a second `g`
6069 // block. Before the fix this was 2.
6070 let by_plus_bare = g_blocks("y ~ s(x, by=g, k=10) + g");
6071 assert_eq!(
6072 by_plus_bare, 1,
6073 "`y ~ s(x, by=g) + g` must collapse to ONE `g` block (#1457): the bare \
6074 `+ g` already owns the factor's level offsets, so the `by=` branch \
6075 must not add a second, treatment-coded main effect"
6076 );
6077
6078 // The bare `+ g` adds no spurious extra `g` block versus the baseline.
6079 assert_eq!(
6080 by_plus_bare, by_only,
6081 "the bare `+ g` collision must add zero extra `g` blocks (#1457)"
6082 );
6083 }
6084
6085 #[test]
6086 fn parse_tensor_periods_and_origins_aliases() {
6087 let mut opts = BTreeMap::new();
6088 opts.insert(
6089 "boundary".to_string(),
6090 "['periodic', 'periodic']".to_string(),
6091 );
6092 opts.insert("periods".to_string(), "[7, 24]".to_string());
6093 opts.insert("origins".to_string(), "[0, -12]".to_string());
6094 let axes = parse_periodic_axes(&opts, 2).expect("axes");
6095 let periods = parse_periods(&opts, &axes).expect("periods");
6096 let origins = parse_period_origins(&opts, &axes).expect("origins");
6097 assert_eq!(axes, vec![true, true]);
6098 assert_eq!(periods, vec![Some(7.0), Some(24.0)]);
6099 assert_eq!(origins, vec![Some(0.0), Some(-12.0)]);
6100 }
6101
6102 #[test]
6103 fn tensor_smooth_honors_per_margin_k_list() {
6104 let ds = continuous_dataset(
6105 &["y", "theta", "h"],
6106 (0..20)
6107 .map(|i| {
6108 let theta = std::f64::consts::TAU * i as f64 / 20.0;
6109 let h = -1.0 + 2.0 * (i % 5) as f64 / 4.0;
6110 vec![theta.cos() + h, theta, h]
6111 })
6112 .collect(),
6113 );
6114 let parsed = parse_formula(
6115 "y ~ te(theta, h, periodic=[0], period=[2*pi, None], origin=[0, None], k=[9,5])",
6116 )
6117 .expect("parse tensor formula");
6118 let col_map = ds.column_map();
6119 let mut notes = Vec::new();
6120 let terms = build_termspec(
6121 &parsed.terms,
6122 &ds,
6123 &col_map,
6124 &mut notes,
6125 &gam_runtime::resource::ResourcePolicy::default_library(),
6126 )
6127 .expect("build tensor terms");
6128 let SmoothBasisSpec::TensorBSpline { spec, .. } = &terms.smooth_terms[0].basis else {
6129 panic!("expected tensor B-spline");
6130 };
6131 let dims = spec
6132 .marginalspecs
6133 .iter()
6134 .map(|m| match m.knotspec {
6135 BSplineKnotSpec::PeriodicUniform { num_basis, .. } => num_basis,
6136 BSplineKnotSpec::Generate {
6137 num_internal_knots, ..
6138 } => num_internal_knots + m.degree + 1,
6139 // The mgcv-default `cr` margin (#1074) reports its basis size as
6140 // the number of value-knots placed.
6141 BSplineKnotSpec::NaturalCubicRegression { ref knots } => knots.len(),
6142 _ => panic!("unexpected tensor marginal knotspec"),
6143 })
6144 .collect::<Vec<_>>();
6145 assert_eq!(dims, vec![9, 5]);
6146 }
6147
6148 #[test]
6149 fn tensor_smooth_honors_per_margin_k_axis_aliases() {
6150 let ds = continuous_dataset(
6151 &["resp", "x", "y"],
6152 (0..12)
6153 .map(|i| {
6154 let t = i as f64 / 11.0;
6155 vec![t, t, 1.0 - t]
6156 })
6157 .collect(),
6158 );
6159 assert_eq!(
6160 tensor_margin_basis_sizes(&ds, "resp ~ te(x, y, k_x=9, k_y=5)"),
6161 vec![9, 5],
6162 "k_<margin> aliases should materialize requested per-margin values"
6163 );
6164 }
6165
6166 #[test]
6167 fn tensor_smooth_low_cardinality_axis_falls_back_to_lower_degree_basis() {
6168 // mgcv-style: `te(x, b, k=c(5, 2))` with a BINARY second margin (only
6169 // values {0, 1}) is a legitimate request — the binary axis can hold at
6170 // most a 2-function linear basis. We must NOT reject k=2 with a
6171 // "k too small for degree 3" config error; instead, drop the spline
6172 // degree on the binary axis to k_axis - 1 (here 1, linear) while
6173 // keeping the continuous margin at the requested degree=3, k=5.
6174 let ds = continuous_dataset(
6175 &["y", "x", "b"],
6176 (0..40)
6177 .map(|i| {
6178 let x = i as f64 / 39.0;
6179 let b = (i % 2) as f64;
6180 vec![x.sin() + 0.5 * b, x, b]
6181 })
6182 .collect(),
6183 );
6184 let parsed = parse_formula("y ~ te(x, b, k=[5, 2])").expect("parse tensor with k=[5,2]");
6185 let col_map = ds.column_map();
6186 let mut notes = Vec::new();
6187 let terms = build_termspec(
6188 &parsed.terms,
6189 &ds,
6190 &col_map,
6191 &mut notes,
6192 &gam_runtime::resource::ResourcePolicy::default_library(),
6193 )
6194 .expect("build tensor with binary margin");
6195 let SmoothBasisSpec::TensorBSpline { spec, .. } = &terms.smooth_terms[0].basis else {
6196 panic!("expected tensor B-spline for te(x, b)");
6197 };
6198 // Continuous margin keeps requested degree=3 and k=5; binary margin
6199 // drops to degree=1 (linear) so the requested k=2 yields exactly two
6200 // basis functions before tensor-product identifiability is applied.
6201 let continuous = &spec.marginalspecs[0];
6202 let binary = &spec.marginalspecs[1];
6203 assert_eq!(continuous.degree, 3);
6204 assert_eq!(binary.degree, 1);
6205 assert!(
6206 binary.penalty_order >= 1 && binary.penalty_order <= binary.degree,
6207 "binary margin penalty_order {} must satisfy 1 <= order <= degree={}",
6208 binary.penalty_order,
6209 binary.degree
6210 );
6211 let basis_size = |m: &BSplineBasisSpec| match m.knotspec {
6212 BSplineKnotSpec::PeriodicUniform { num_basis, .. } => num_basis,
6213 BSplineKnotSpec::Generate {
6214 num_internal_knots, ..
6215 } => num_internal_knots + m.degree + 1,
6216 BSplineKnotSpec::Automatic {
6217 num_internal_knots: Some(n),
6218 ..
6219 } => n + m.degree + 1,
6220 // The mgcv-default `cr` margin (#1074) reports its basis size as the
6221 // number of value-knots placed.
6222 BSplineKnotSpec::NaturalCubicRegression { ref knots } => knots.len(),
6223 _ => panic!("unexpected tensor marginal knotspec"),
6224 };
6225 assert_eq!(basis_size(continuous), 5);
6226 assert_eq!(basis_size(binary), 2);
6227 }
6228
6229 #[test]
6230 fn tensor_smooth_uniform_k_is_capped_to_a_low_cardinality_margins_distinct_values() {
6231 // Regression: a SINGLE `k=5` applied to every axis of `te(x, b, k=5)`
6232 // with a BINARY second margin (`b ∈ {0, 1}`) must build a valid tensor,
6233 // NOT hard-fail in cr-knot selection ("cubic regression spline with k=5
6234 // requires at least 5 distinct values, got 2"). mgcv caps a margin's
6235 // basis to its data support; the binary axis becomes the 2-function
6236 // (linear) margin, while the continuous axis keeps the requested k=5.
6237 // This is the `te(age, badh, k=5)` real-data case that previously errored.
6238 let ds = continuous_dataset(
6239 &["y", "x", "b"],
6240 (0..40)
6241 .map(|i| {
6242 let x = i as f64 / 39.0;
6243 let b = (i % 2) as f64;
6244 vec![x.sin() + 0.5 * b, x, b]
6245 })
6246 .collect(),
6247 );
6248 let parsed = parse_formula("y ~ te(x, b, k=5)").expect("parse tensor with uniform k=5");
6249 let col_map = ds.column_map();
6250 let mut notes = Vec::new();
6251 let terms = build_termspec(
6252 &parsed.terms,
6253 &ds,
6254 &col_map,
6255 &mut notes,
6256 &gam_runtime::resource::ResourcePolicy::default_library(),
6257 )
6258 .expect("uniform k=5 must auto-cap the binary margin instead of erroring");
6259 let SmoothBasisSpec::TensorBSpline { spec, .. } = &terms.smooth_terms[0].basis else {
6260 panic!("expected tensor B-spline for te(x, b)");
6261 };
6262 let basis_size = |m: &BSplineBasisSpec| match &m.knotspec {
6263 BSplineKnotSpec::PeriodicUniform { num_basis, .. } => *num_basis,
6264 BSplineKnotSpec::Generate {
6265 num_internal_knots, ..
6266 } => num_internal_knots + m.degree + 1,
6267 BSplineKnotSpec::Automatic {
6268 num_internal_knots: Some(n),
6269 ..
6270 } => n + m.degree + 1,
6271 BSplineKnotSpec::NaturalCubicRegression { knots } => knots.len(),
6272 other => panic!("unexpected tensor marginal knotspec: {other:?}"),
6273 };
6274 let binary = &spec.marginalspecs[1];
6275 // Binary margin is reduced to the 2-function linear basis its data
6276 // supports (k capped from 5 to 2, degree dropped to 1).
6277 assert_eq!(basis_size(binary), 2);
6278 assert_eq!(binary.degree, 1);
6279 // The continuous margin is unaffected by the cap (40 distinct values).
6280 assert_eq!(basis_size(&spec.marginalspecs[0]), 5);
6281 }
6282
6283 #[test]
6284 fn tensor_all_tp_margins_with_per_margin_k_routes_to_bspline_tensor() {
6285 // `te(x1, x2, bs=c('tp','tp'), k=c(5,5))` is mgcv's per-margin tp tensor
6286 // with per-margin basis sizes — a tensor product of two 1-D bases, each
6287 // of dimension 5. The list-valued `k=c(5,5)` is honored by
6288 // `parse_tensor_k_list`, producing one penalized B-spline margin per axis
6289 // (each spanning the requested per-axis thin-plate function space). This
6290 // is the same anisotropic-tensor routing the scalar/no-`k` case takes —
6291 // a `te()` request is ALWAYS a tensor product, never a silent isotropic
6292 // thin-plate substitution.
6293 let ds = continuous_dataset(
6294 &["y", "x1", "x2"],
6295 (0..32)
6296 .map(|i| {
6297 let t = i as f64 / 31.0;
6298 vec![t.sin(), t, 1.0 - t]
6299 })
6300 .collect(),
6301 );
6302 let parsed =
6303 parse_formula("y ~ te(x1, x2, bs=c('tp','tp'), k=c(5,5))").expect("parse tensor");
6304 let col_map = ds.column_map();
6305 let mut notes = Vec::new();
6306 let terms = build_termspec(
6307 &parsed.terms,
6308 &ds,
6309 &col_map,
6310 &mut notes,
6311 &gam_runtime::resource::ResourcePolicy::default_library(),
6312 )
6313 .expect("build tensor terms with per-margin k");
6314 let SmoothBasisSpec::TensorBSpline { spec, .. } = &terms.smooth_terms[0].basis else {
6315 panic!(
6316 "expected B-spline tensor when k=c(5,5) is supplied with bs=c('tp','tp'), got {:?}",
6317 terms.smooth_terms[0].basis
6318 );
6319 };
6320 // Since #1074 a `tp` tensor margin (k >= 3) is realized as a
6321 // Lancaster–Salkauskas natural cubic-regression margin (cr basis
6322 // dimension == knot count), not an open `Generate` B-spline. It is
6323 // still a `TensorBSpline` spec with one penalized 1-D margin per axis,
6324 // so the routing assertion above still holds; only the per-margin
6325 // knotspec variant changed. The earlier `_ => panic!` arm pinned the
6326 // pre-#1074 `Generate`-only representation and is stale. Decode every
6327 // margin variant to its basis dimension (mirroring the
6328 // `tensor_margin_basis_sizes` helper).
6329 let dims = spec
6330 .marginalspecs
6331 .iter()
6332 .map(|m| match m.knotspec {
6333 BSplineKnotSpec::Generate {
6334 num_internal_knots, ..
6335 } => num_internal_knots + m.degree + 1,
6336 BSplineKnotSpec::Automatic {
6337 num_internal_knots: Some(num_internal_knots),
6338 ..
6339 } => num_internal_knots + m.degree + 1,
6340 BSplineKnotSpec::PeriodicUniform { num_basis, .. } => num_basis,
6341 BSplineKnotSpec::Provided(ref knots) => {
6342 knots.len().saturating_sub(m.degree + 1)
6343 }
6344 BSplineKnotSpec::NaturalCubicRegression { ref knots } => knots.len(),
6345 BSplineKnotSpec::Automatic {
6346 num_internal_knots: None,
6347 ..
6348 } => panic!("test cannot infer automatic knot count"),
6349 })
6350 .collect::<Vec<_>>();
6351 assert_eq!(dims, vec![5, 5]);
6352 }
6353
6354 #[test]
6355 fn tensor_all_tp_margins_without_per_margin_k_builds_anisotropic_tensor() {
6356 // `te(x1, x2, bs=c('tp','tp'))` is a tensor-product request and must
6357 // build a genuine anisotropic tensor product (one smoothing parameter
6358 // per margin), NOT a silently-substituted multi-D isotropic thin-plate
6359 // radial smooth — that would be a different model (`s(x1,x2,bs='tp')`).
6360 // The routing is now consistent whether or not `k` is list-valued: a tp
6361 // margin vector always realizes each axis as a 1-D penalized B-spline
6362 // margin spanning the same per-axis thin-plate function space (#1082).
6363 let ds = continuous_dataset(
6364 &["y", "x1", "x2"],
6365 (0..32)
6366 .map(|i| {
6367 let t = i as f64 / 31.0;
6368 vec![t.sin(), t, 1.0 - t]
6369 })
6370 .collect(),
6371 );
6372 let parsed = parse_formula("y ~ te(x1, x2, bs=c('tp','tp'))").expect("parse tensor");
6373 let col_map = ds.column_map();
6374 let mut notes = Vec::new();
6375 let terms = build_termspec(
6376 &parsed.terms,
6377 &ds,
6378 &col_map,
6379 &mut notes,
6380 &gam_runtime::resource::ResourcePolicy::default_library(),
6381 )
6382 .expect("build tensor terms without per-margin k");
6383 let SmoothBasisSpec::TensorBSpline { spec, .. } = &terms.smooth_terms[0].basis else {
6384 panic!(
6385 "te(...,bs=c('tp','tp')) must route to an anisotropic tensor product, not a \
6386 silent isotropic thin-plate substitution; got {:?}",
6387 terms.smooth_terms[0].basis
6388 );
6389 };
6390 assert_eq!(
6391 spec.marginalspecs.len(),
6392 2,
6393 "tp tensor must carry one penalized B-spline margin per axis"
6394 );
6395 }
6396
6397 #[test]
6398 fn explicit_basis_sizes_are_not_small_n_clamped() {
6399 let ds = continuous_dataset(
6400 &["y", "x1", "x2", "x3", "x4", "x5"],
6401 (0..12)
6402 .map(|i| {
6403 let x = i as f64 / 11.0;
6404 vec![x.sin(), x, x * x, x + 0.1, 1.0 - x, (2.0 * x).sin()]
6405 })
6406 .collect(),
6407 );
6408 let parsed = parse_formula("y ~ s(x1, k=10) + s(x2) + s(x3) + s(x4) + s(x5)")
6409 .expect("parse multi-smooth formula");
6410 let col_map = ds.column_map();
6411 let mut notes = Vec::new();
6412 let terms = build_termspec(
6413 &parsed.terms,
6414 &ds,
6415 &col_map,
6416 &mut notes,
6417 &gam_runtime::resource::ResourcePolicy::default_library(),
6418 )
6419 .expect("build multi-smooth terms");
6420 let SmoothBasisSpec::BSpline1D { spec, .. } = &terms.smooth_terms[0].basis else {
6421 panic!("expected first smooth to be B-spline");
6422 };
6423 assert!(matches!(
6424 &spec.knotspec,
6425 BSplineKnotSpec::Generate {
6426 num_internal_knots: 6,
6427 ..
6428 }
6429 ));
6430 }
6431
6432 #[test]
6433 fn explicit_duchon_centers_are_not_small_n_bumped() {
6434 let ds = continuous_dataset(
6435 &["y", "x1", "x2", "x3", "x4", "x5"],
6436 (0..12)
6437 .map(|i| {
6438 let x = i as f64 / 11.0;
6439 vec![x.sin(), x, x * x, x + 0.1, 1.0 - x, (2.0 * x).sin()]
6440 })
6441 .collect(),
6442 );
6443 // Pure 1D Duchon at default options resolves the nullspace to Linear
6444 // (2s < d forces escalation), giving 2 polynomial nullspace columns;
6445 // the well-posedness gate requires num_centers > polynomial_cols, so
6446 // 3 is the smallest valid count. It is still well below the small-N
6447 // bump target of polynomial_cols + 4 = 6, so this exercises the
6448 // "explicit value is honored" path the test name advertises.
6449 let parsed = parse_formula("y ~ duchon(x1, centers=3) + s(x2) + s(x3) + s(x4) + s(x5)")
6450 .expect("parse multi-smooth formula");
6451 let col_map = ds.column_map();
6452 let mut notes = Vec::new();
6453 let terms = build_termspec(
6454 &parsed.terms,
6455 &ds,
6456 &col_map,
6457 &mut notes,
6458 &gam_runtime::resource::ResourcePolicy::default_library(),
6459 )
6460 .expect("build multi-smooth terms");
6461 let SmoothBasisSpec::Duchon { spec, .. } = &terms.smooth_terms[0].basis else {
6462 panic!("expected first smooth to be Duchon");
6463 };
6464 assert!(matches!(
6465 spec.center_strategy,
6466 CenterStrategy::FarthestPoint { num_centers: 3 }
6467 ));
6468 }
6469
6470 #[test]
6471 fn inferred_tensor_basis_cap_uses_coordinate_support_not_duplicate_rows() {
6472 let mut unique_rows = Vec::new();
6473 for i in 0..50 {
6474 let theta = i as f64 / 50.0;
6475 for j in 0..16 {
6476 let h = -1.0 + 2.0 * (j as f64) / 15.0;
6477 let y = theta.cos() + h;
6478 unique_rows.push(vec![y, theta, h]);
6479 }
6480 }
6481 let mut repeated_rows = Vec::new();
6482 for _ in 0..12 {
6483 repeated_rows.extend(unique_rows.iter().cloned());
6484 }
6485
6486 let unique = continuous_dataset(&["y", "theta", "h"], unique_rows);
6487 let repeated = continuous_dataset(&["y", "theta", "h"], repeated_rows);
6488
6489 let unique_basis = inferred_tensor_basis_product(&unique);
6490 let repeated_basis = inferred_tensor_basis_product(&repeated);
6491
6492 assert_eq!(
6493 unique_basis, repeated_basis,
6494 "duplicating existing tensor coordinates must not inflate inferred basis width"
6495 );
6496 }
6497
6498 #[test]
6499 fn inferred_three_dim_tensor_basis_stays_bounded_for_reml_selection() {
6500 // Regression for gam#813: the inferred per-margin k must be
6501 // dimension-aware so the 3-D tensor width p = ∏ k_d does not explode.
6502 // With the old 1-D-per-margin rule a 3-D `te` defaulted to 7³=343 at
6503 // small n and 20³=8000 at larger n, making the (non-Kronecker-factorable)
6504 // full-tensor sum-to-zero penalty's O(p³) REML reparameterization a
6505 // multi-minute stall. The dimension-aware budget keeps the product near
6506 // mgcv's te default (≈5³=125) regardless of n.
6507 let make = |n: usize| -> usize {
6508 let mut rows = Vec::with_capacity(n);
6509 for i in 0..n {
6510 let f = i as f64 / n as f64;
6511 rows.push(vec![f.sin(), f, (2.0 * f).cos(), (3.0 * f) % 1.0]);
6512 }
6513 let ds = continuous_dataset(&["y", "x1", "x2", "x3"], rows);
6514 let parsed = parse_formula("y ~ te(x1, x2, x3)").expect("parse 3-D tensor");
6515 let col_map = ds.column_map();
6516 let mut notes = Vec::new();
6517 let terms = build_termspec(
6518 &parsed.terms,
6519 &ds,
6520 &col_map,
6521 &mut notes,
6522 &ResourcePolicy::default_library(),
6523 )
6524 .expect("build 3-D tensor termspec");
6525 let SmoothBasisSpec::TensorBSpline { spec, .. } = &terms.smooth_terms[0].basis else {
6526 panic!("expected tensor smooth");
6527 };
6528 spec.marginalspecs
6529 .iter()
6530 .map(|m| match m.knotspec {
6531 BSplineKnotSpec::Generate {
6532 num_internal_knots, ..
6533 } => num_internal_knots + m.degree + 1,
6534 BSplineKnotSpec::Automatic {
6535 num_internal_knots: Some(num_internal_knots),
6536 ..
6537 } => num_internal_knots + m.degree + 1,
6538 // The mgcv-default `cr` margin (#1074) reports its basis size
6539 // as the number of value-knots placed.
6540 BSplineKnotSpec::NaturalCubicRegression { ref knots } => knots.len(),
6541 _ => panic!("unexpected tensor margin knotspec"),
6542 })
6543 .product()
6544 };
6545
6546 // n=30 (the issue's data): was 7³=343, must now be modest.
6547 assert!(
6548 make(60) <= 216,
6549 "3-D te at small n must stay near the mgcv te default, got {}",
6550 make(60)
6551 );
6552 // Larger n must NOT grow the product toward n³ (was 20³=8000).
6553 assert!(
6554 make(2000) <= 216,
6555 "3-D te at large n must not blow ∏k toward the data size, got {}",
6556 make(2000)
6557 );
6558 }
6559
6560 #[test]
6561 fn parse_bspline_boundary_conditions_and_side_selector() {
6562 // Non-zero anchors are rejected at parse time; the diagnostic must
6563 // name the side and value, which doubles as a check that the
6564 // `side=left` filter routes the global `anchor=` value to the
6565 // left endpoint (not the right).
6566 let mut opts = BTreeMap::new();
6567 opts.insert("boundary_conditions".to_string(), "anchored".to_string());
6568 opts.insert("side".to_string(), "left".to_string());
6569 opts.insert("anchor".to_string(), "2.5".to_string());
6570 let err = parse_bspline_boundary_conditions(&opts)
6571 .expect_err("non-zero left anchor must be rejected")
6572 .to_string();
6573 assert!(
6574 err.contains("left") && err.contains("2.5"),
6575 "rejection should name the affected side and value: {err}"
6576 );
6577
6578 // Side-specific aliases (`start_bc`/`end_bc`) plus the side-specific
6579 // anchor key (`right_anchor`) must funnel the value onto the right
6580 // endpoint — verified through the rejection diagnostic.
6581 let mut opts = BTreeMap::new();
6582 opts.insert("start_bc".to_string(), "clamped".to_string());
6583 opts.insert("end_bc".to_string(), "zero".to_string());
6584 opts.insert("right_anchor".to_string(), "-1.0".to_string());
6585 let err = parse_bspline_boundary_conditions(&opts)
6586 .expect_err("non-zero right anchor must be rejected")
6587 .to_string();
6588 assert!(
6589 err.contains("right") && err.contains("-1"),
6590 "rejection should name the affected side and value: {err}"
6591 );
6592
6593 // With anchors at zero the basis builder accepts the configuration,
6594 // so the same alias plumbing yields a clean `Anchored { value: 0.0 }`
6595 // on the right and `Clamped` on the left.
6596 let mut opts = BTreeMap::new();
6597 opts.insert("start_bc".to_string(), "clamped".to_string());
6598 opts.insert("end_bc".to_string(), "zero".to_string());
6599 let parsed = parse_bspline_boundary_conditions(&opts).expect("boundary conditions");
6600 assert!(matches!(
6601 parsed.left,
6602 BSplineEndpointBoundaryCondition::Clamped
6603 ));
6604 assert!(matches!(
6605 parsed.right,
6606 BSplineEndpointBoundaryCondition::Anchored { value } if value.abs() < 1e-12
6607 ));
6608 }
6609
6610 #[test]
6611 fn categorical_by_numeric_interaction_expands_treatment_coded_cells() {
6612 // `y ~ x:g` is an INTERACTION-ONLY numeric-by-factor model: there is no
6613 // `x` main effect, so the marginal parent that would identify a dropped
6614 // reference level is ABSENT. The expansion must therefore be marginality-
6615 // aware (gam#1158) and DUMMY-code `g` — keep ALL levels — yielding the
6616 // "common intercept, separate slopes" design (one x-slope column per
6617 // group). Treatment-coding here (dropping the reference level) would pin
6618 // the reference group's slope to zero, a rank-deficient fit; that wrong
6619 // behaviour is what this test now guards against. (The treatment-coded
6620 // path is exercised when the `x` parent is present — see
6621 // `categorical_by_numeric_interaction_keeps_treatment_coding_with_parent`.)
6622 let ds = factor_dataset();
6623 // `g` is categorical with two levels (encoded 0.0 → "a", 1.0 → "b").
6624 let parsed = parse_formula("y ~ x:g").expect("parse `y ~ x:g`");
6625 let col_map = ds.column_map();
6626 let mut notes = Vec::new();
6627 let terms = build_termspec(
6628 &parsed.terms,
6629 &ds,
6630 &col_map,
6631 &mut notes,
6632 &ResourcePolicy::default_library(),
6633 )
6634 .expect("factor-aware `x:g` interaction must build, not error");
6635
6636 assert_eq!(
6637 terms.linear_terms.len(),
6638 2,
6639 "interaction-only `x:g` keeps ALL factor levels (full dummy coding): one slope column per group"
6640 );
6641
6642 let x_col = *col_map.get("x").expect("x column");
6643 let g_col = *col_map.get("g").expect("g column");
6644
6645 // Both level gates must appear exactly once across the two cell columns,
6646 // and each cell carries `x` as a product factor (not a raw column for g).
6647 let mut seen_bits = std::collections::HashSet::new();
6648 for term in &terms.linear_terms {
6649 assert!(
6650 term.is_interaction(),
6651 "the categorical-by-numeric cell is a Wilkinson-Rogers interaction"
6652 );
6653 assert_eq!(term.feature_cols, vec![x_col]);
6654 assert_eq!(term.categorical_levels.len(), 1);
6655 let (gate_col, gate_bits) = term.categorical_levels[0];
6656 assert_eq!(gate_col, g_col);
6657 assert!(seen_bits.insert(gate_bits), "each level appears once");
6658
6659 // Realize and check it equals `1[g == gate_bits] * x` row by row.
6660 let column = term
6661 .realized_design_column(ds.values.view())
6662 .expect("realize cell column");
6663 let n = ds.values.nrows();
6664 assert_eq!(column.len(), n);
6665 for row in 0..n {
6666 let x = ds.values[[row, x_col]];
6667 let g = ds.values[[row, g_col]];
6668 let expected = if g.to_bits() == gate_bits { x } else { 0.0 };
6669 assert!(
6670 (column[row] - expected).abs() < 1e-12,
6671 "row {row}: g={g}, x={x}, expected {expected}, got {}",
6672 column[row]
6673 );
6674 }
6675 }
6676 // Both the reference level "a" (0.0) and the non-reference "b" (1.0) are
6677 // kept — the reference level is NOT dropped in the interaction-only form.
6678 assert!(seen_bits.contains(&0.0_f64.to_bits()));
6679 assert!(seen_bits.contains(&1.0_f64.to_bits()));
6680 }
6681
6682 #[test]
6683 fn categorical_by_numeric_interaction_keeps_treatment_coding_with_parent() {
6684 // With the `x` main effect PRESENT (`y ~ x + x:g`), the marginal parent
6685 // that identifies a dropped reference level exists, so `x:g` keeps its
6686 // historical treatment coding: the reference level "a" is dropped and
6687 // only the non-reference slope-deviation column for "b" is emitted. This
6688 // guards that the marginality-aware fix (gam#1158) does NOT regress the
6689 // parent-present form, which must stay column-space-identical to mgcv's
6690 // `x + x:g`.
6691 let ds = factor_dataset();
6692 let parsed = parse_formula("y ~ x + x:g").expect("parse `y ~ x + x:g`");
6693 let col_map = ds.column_map();
6694 let mut notes = Vec::new();
6695 let terms = build_termspec(
6696 &parsed.terms,
6697 &ds,
6698 &col_map,
6699 &mut notes,
6700 &ResourcePolicy::default_library(),
6701 )
6702 .expect("`x + x:g` must build");
6703
6704 // One main-effect `x` column plus one treatment-coded interaction cell.
6705 let x_col = *col_map.get("x").expect("x column");
6706 let g_col = *col_map.get("g").expect("g column");
6707 let interaction_cells: Vec<_> = terms
6708 .linear_terms
6709 .iter()
6710 .filter(|t| t.is_interaction())
6711 .collect();
6712 assert_eq!(
6713 interaction_cells.len(),
6714 1,
6715 "with `x` present, `x:g` is treatment-coded → one cell (reference dropped)"
6716 );
6717 let term = interaction_cells[0];
6718 assert_eq!(term.feature_cols, vec![x_col]);
6719 assert_eq!(term.categorical_levels.len(), 1);
6720 let (gate_col, gate_bits) = term.categorical_levels[0];
6721 assert_eq!(gate_col, g_col);
6722 // The dropped reference is "a" (0.0); the kept gate is "b" (1.0).
6723 assert_eq!(gate_bits, 1.0_f64.to_bits());
6724 }
6725
6726 #[test]
6727 fn categorical_by_categorical_interaction_expands_full_cross_cells() {
6728 // `y ~ f:g` is an INTERACTION-ONLY factor-by-factor model: neither `f`
6729 // nor `g` appears as a main effect, so neither marginal parent is
6730 // present and BOTH factors must be dummy-coded (gam#1159). The correct
6731 // design is the SATURATED cell-means model: the full cross of ALL levels
6732 // (3 * 2 = 6 cells) minus ONE reference cell (the lexicographically-first
6733 // level of every factor, here f0:g0) absorbed by the intercept — rank
6734 // 6-1 = 5 cell columns + intercept, column-space-identical to `f*g`.
6735 // Treatment-coding both factors (the old behaviour) kept only
6736 // (3-1)*(2-1) = 2 cells and collapsed the rest onto the intercept, a
6737 // rank-deficient fit; that is the bug this test now guards against.
6738 let n = 30usize;
6739 let mut rows = Vec::with_capacity(n);
6740 for i in 0..n {
6741 let y = (i as f64).sin();
6742 let f = (i % 3) as f64; // 3 levels: 0,1,2
6743 let g = (i % 2) as f64; // 2 levels: 0,1
6744 rows.push(vec![y, f, g]);
6745 }
6746 let values = Array2::from_shape_vec(
6747 (n, 3),
6748 rows.into_iter().flat_map(|row| row.into_iter()).collect(),
6749 )
6750 .expect("rectangular cross-factor data");
6751 let ds = Dataset {
6752 headers: vec!["y".into(), "f".into(), "g".into()],
6753 values,
6754 schema: DataSchema {
6755 columns: vec![
6756 SchemaColumn {
6757 name: "y".into(),
6758 kind: ColumnKindTag::Continuous,
6759 levels: vec![],
6760 },
6761 SchemaColumn {
6762 name: "f".into(),
6763 kind: ColumnKindTag::Categorical,
6764 levels: vec!["f0".into(), "f1".into(), "f2".into()],
6765 },
6766 SchemaColumn {
6767 name: "g".into(),
6768 kind: ColumnKindTag::Categorical,
6769 levels: vec!["g0".into(), "g1".into()],
6770 },
6771 ],
6772 },
6773 column_kinds: vec![
6774 ColumnKindTag::Continuous,
6775 ColumnKindTag::Categorical,
6776 ColumnKindTag::Categorical,
6777 ],
6778 };
6779
6780 let parsed = parse_formula("y ~ f:g").expect("parse `y ~ f:g`");
6781 let col_map = ds.column_map();
6782 let mut notes = Vec::new();
6783 let terms = build_termspec(
6784 &parsed.terms,
6785 &ds,
6786 &col_map,
6787 &mut notes,
6788 &ResourcePolicy::default_library(),
6789 )
6790 .expect("factor-by-factor `f:g` interaction must build, not error");
6791
6792 assert_eq!(
6793 terms.linear_terms.len(),
6794 5,
6795 "saturated 3*2 = 6 cross cells minus one reference cell (f0:g0) = 5"
6796 );
6797
6798 let f_col = *col_map.get("f").expect("f column");
6799 let g_col = *col_map.get("g").expect("g column");
6800 // The dropped reference cell pairs each factor's lexicographically-first
6801 // level: f0 (0.0) and g0 (0.0). It must NOT appear among the emitted
6802 // cells; every OTHER cross cell must.
6803 let f0 = 0.0_f64.to_bits();
6804 let g0 = 0.0_f64.to_bits();
6805 let mut emitted = std::collections::HashSet::new();
6806 for term in &terms.linear_terms {
6807 // No numeric operand: the realized column is a pure cell indicator.
6808 assert!(term.feature_cols.is_empty());
6809 assert_eq!(term.categorical_levels.len(), 2);
6810 let mut gates = std::collections::HashMap::new();
6811 for &(col, bits) in &term.categorical_levels {
6812 gates.insert(col, bits);
6813 }
6814 let f_bits = *gates.get(&f_col).expect("f gate present");
6815 let g_bits = *gates.get(&g_col).expect("g gate present");
6816 // The reference cell f0:g0 must have been dropped.
6817 assert!(
6818 !(f_bits == f0 && g_bits == g0),
6819 "the reference cell f0:g0 must be absorbed by the intercept, not emitted"
6820 );
6821 emitted.insert((f_bits, g_bits));
6822
6823 let column = term
6824 .realized_design_column(ds.values.view())
6825 .expect("realize cross cell");
6826 for row in 0..n {
6827 let f = ds.values[[row, f_col]];
6828 let g = ds.values[[row, g_col]];
6829 let expected = if f.to_bits() == f_bits && g.to_bits() == g_bits {
6830 1.0
6831 } else {
6832 0.0
6833 };
6834 assert!(
6835 (column[row] - expected).abs() < 1e-12,
6836 "row {row}: expected {expected}, got {}",
6837 column[row]
6838 );
6839 }
6840 assert!(
6841 column.iter().any(|&v| v == 1.0),
6842 "each cross cell must be observed in the data"
6843 );
6844 }
6845 // Every non-reference cross cell is present exactly once: all 6 cells
6846 // except f0:g0.
6847 let f_levels = [0.0_f64.to_bits(), 1.0_f64.to_bits(), 2.0_f64.to_bits()];
6848 let g_levels = [0.0_f64.to_bits(), 1.0_f64.to_bits()];
6849 for &fb in &f_levels {
6850 for &gb in &g_levels {
6851 if fb == f0 && gb == g0 {
6852 continue;
6853 }
6854 assert!(
6855 emitted.contains(&(fb, gb)),
6856 "saturated cross cell must be present"
6857 );
6858 }
6859 }
6860 }
6861}