gam_terms/term_builder.rs
1//! Term construction: bridge from parsed formula terms to `TermCollectionSpec`.
2//!
3//! This module takes the AST produced by `inference::formula_dsl` and a loaded
4//! dataset, resolves column references, infers knot counts and center strategies,
5//! and produces a `TermCollectionSpec` ready for `build_term_collection_design`.
6
7use std::collections::{BTreeMap, BTreeSet, HashMap};
8use std::path::PathBuf;
9
10use ndarray::{Array2, ArrayView1};
11
12use crate::basis::{
13 BSplineBasisSpec, BSplineBoundaryConditions, BSplineEndpointBoundaryCondition,
14 BSplineIdentifiability, BSplineKnotSpec, CenterCountRequest, CenterStrategy,
15 ConstantCurvatureBasisSpec, ConstantCurvatureIdentifiability, DuchonBasisSpec,
16 DuchonNullspaceOrder, DuchonOperatorPenaltySpec, MaternBasisSpec, MaternIdentifiability,
17 MaternNu, MeasureJetBasisSpec, MeasureJetIdentifiability, OneDimensionalBoundary,
18 SpatialIdentifiability, SphereMethod, SphereWahbaKernel, SphericalSplineBasisSpec,
19 SphericalSplineIdentifiability, ThinPlateBasisSpec, auto_spatial_center_strategy,
20 default_num_centers, default_spatial_center_strategy, default_spherical_harmonic_degree,
21 plan_spatial_basis, thin_plate_penalty_order,
22};
23use crate::inference::formula_dsl::{
24 ParsedTerm, SmoothKind, option_bool, option_f64, option_f64_strict, option_usize,
25 option_usize_any, option_usize_any_strict, option_usize_strict, strip_quotes,
26};
27use crate::smooth::{
28 ByVarKind, FactorSmoothFlavour, FactorSmoothSpec, LinearCoefficientGeometry, LinearTermSpec,
29 RandomEffectTermSpec, ShapeConstraint, SmoothBasisSpec, SmoothTermSpec,
30 TensorBSplineIdentifiability, TensorBSplinePenaltyDecomposition, TensorBSplineSpec,
31 TermCollectionSpec,
32};
33use gam_problem::types::ColIdx;
34use gam_data::{ColumnKindTag, DataError, EncodedDataset as Dataset};
35use gam_runtime::resource::ResourcePolicy;
36
/// B-spline degree used when a smooth carries no `degree=` option.
///
/// Cubic (degree 3) is the usual GAM choice: it gives C² continuity while
/// keeping the knot count low.
const DEFAULT_BSPLINE_DEGREE: usize = 3;

/// Difference-penalty order used when a smooth carries no `penalty_order=`
/// option (alias `m=`).
///
/// Order 2 penalizes curvature, which is the standard P-spline convention.
const DEFAULT_PENALTY_ORDER: usize = 2;

/// Basis dimension used by default for one-dimensional cyclic cubic P-splines.
///
/// A periodic smooth has no free endpoints to spend coefficients on, so it
/// deliberately does not inherit the larger knot ceiling of an open B-spline.
/// It is only a default — `k=` requests a richer periodic space.
const CYCLIC_DEFAULT_BASIS_DIM: usize = 12;

/// Shared-marginal basis dimension used by default for `bs="fs"` / `bs="sz"`
/// factor smooths; an explicit `k` / `basis_dim` overrides it.
///
/// The value follows mgcv's factor-smooth default of `k=10`. One marginal is
/// shared across all factor levels, so a modest basis recovers the shared
/// signal without over-fitting each group's within-group noise (gam#903).
const FACTOR_SMOOTH_DEFAULT_BASIS_DIM: usize = 10;

/// Row-chunk size used by the out-of-core PCA-basis smooth when no
/// `chunk_size=` option is given.
///
/// The design is streamed in row blocks of this size so that peak memory stays
/// bounded regardless of how many rows the dataset has.
const DEFAULT_PCA_CHUNK_SIZE: usize = 4096;
65
66// ---------------------------------------------------------------------------
67// Typed errors
68// ---------------------------------------------------------------------------
69
/// Error type produced by the term-builder helpers.
///
/// Every variant except `ColumnNotFound` wraps a single human-readable
/// `reason`; the `Display` impl writes that text verbatim, so callers that
/// string-match on messages (tests, log assertions) see the same bytes as the
/// pre-refactor `format!(...)` output. Public-API functions that still return
/// `Result<_, String>` convert at their boundary with `.to_string()`.
#[derive(Clone, Debug)]
pub enum TermBuilderError {
    /// An *internal* column-resolution or column-kind lookup failed (for
    /// example the column-kind table is out of sync, or an alias map lacks an
    /// entry).
    ///
    /// A user-facing "this formula names a column that does not exist"
    /// diagnostic belongs in `ColumnNotFound` instead, so the FFI boundary can
    /// raise a Python `ColumnNotFoundError` from structured data rather than
    /// by parsing prose.
    MissingColumn { reason: String },
    /// The formula referenced a column that is absent from the input data.
    ///
    /// The fields mirror `DataError::ColumnNotFound` one-to-one, which makes
    /// the conversion across module boundaries a plain move with nothing
    /// re-derived or re-parsed. `Display` output is identical to the legacy
    /// `missing_column_message` text.
    ColumnNotFound {
        name: String,
        role: Option<String>,
        available: Vec<String>,
        similar: Vec<String>,
        tsv_hint: bool,
    },
    /// The user's configuration contradicts itself: too few variables for the
    /// smooth type, conflicting size options, a requested basis dimension
    /// below the polynomial nullspace, and similar.
    IncompatibleConfig { reason: String },
    /// An option could not be parsed: malformed numeric expression, unknown
    /// option key, out-of-range integer, list-length mismatch, and similar.
    InvalidOption { reason: String },
    /// The request names something that is deliberately unsupported: an
    /// unknown smooth type / method / kernel / identifiability, a non-zero
    /// anchor, an internal-only token, and similar.
    UnsupportedFeature { reason: String },
    /// The data is degenerate for the requested term (a constant column,
    /// non-finite categorical entries, ...).
    DegenerateData { reason: String },
    /// A formula node that the caller should have resolved upstream reached
    /// the term-collection stage.
    MalformedFormula { reason: String },
}
114
115impl std::fmt::Display for TermBuilderError {
116 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
117 match self {
118 TermBuilderError::MissingColumn { reason }
119 | TermBuilderError::IncompatibleConfig { reason }
120 | TermBuilderError::InvalidOption { reason }
121 | TermBuilderError::UnsupportedFeature { reason }
122 | TermBuilderError::DegenerateData { reason }
123 | TermBuilderError::MalformedFormula { reason } => f.write_str(reason),
124 // Delegate to the canonical `DataError::ColumnNotFound` formatter
125 // so a single source of truth defines the human text. The
126 // intermediate `DataError` constructed here owns its strings only
127 // for the duration of the Display call — no allocation cost
128 // beyond the original payload that this variant already holds.
129 TermBuilderError::ColumnNotFound {
130 name,
131 role,
132 available,
133 similar,
134 tsv_hint,
135 } => {
136 let canonical = DataError::ColumnNotFound {
137 name: name.clone(),
138 role: role.clone(),
139 available: available.clone(),
140 similar: similar.clone(),
141 tsv_hint: *tsv_hint,
142 };
143 std::fmt::Display::fmt(&canonical, f)
144 }
145 }
146 }
147}
148
149impl From<TermBuilderError> for String {
150 fn from(err: TermBuilderError) -> String {
151 err.to_string()
152 }
153}
154
155/// Catchall lift for the term-builder's internal `Result<_, String>` helpers
156/// (numeric expression parsing, option lookup, boundary-condition parsing,
157/// ...) that flow into `build_termspec` via `?`. Maps to
158/// `IncompatibleConfig`, which is the most appropriate generic bucket for
159/// option/config-style failures — leaf sites that emit structured payloads
160/// (`From<DataError>` for column-not-found) bypass this fallback.
161impl From<String> for TermBuilderError {
162 fn from(reason: String) -> Self {
163 Self::IncompatibleConfig { reason }
164 }
165}
166
167/// Typed lift from data-layer errors. `DataError::ColumnNotFound` becomes
168/// `TermBuilderError::ColumnNotFound` field-for-field — no stringification,
169/// no information loss — so the FFI boundary downstream can dispatch on
170/// the typed variant. Other `DataError` variants degrade into
171/// `MissingColumn` since they describe column-resolution-time failures
172/// without a dedicated structured destination.
173impl From<DataError> for TermBuilderError {
174 fn from(err: DataError) -> Self {
175 match err {
176 DataError::ColumnNotFound {
177 name,
178 role,
179 available,
180 similar,
181 tsv_hint,
182 } => Self::ColumnNotFound {
183 name,
184 role,
185 available,
186 similar,
187 tsv_hint,
188 },
189 DataError::SchemaMismatch { reason }
190 | DataError::ParseError { reason }
191 | DataError::EncodingFailure { reason }
192 | DataError::EmptyInput { reason }
193 | DataError::InvalidValue { reason } => Self::MissingColumn { reason },
194 }
195 }
196}
197
198// Constructor helpers — keep error-site code compact and consistent.
199impl TermBuilderError {
200 #[inline]
201 fn missing_column(reason: impl Into<String>) -> Self {
202 TermBuilderError::MissingColumn {
203 reason: reason.into(),
204 }
205 }
206 #[inline]
207 fn incompatible_config(reason: impl Into<String>) -> Self {
208 TermBuilderError::IncompatibleConfig {
209 reason: reason.into(),
210 }
211 }
212 #[inline]
213 fn invalid_option(reason: impl Into<String>) -> Self {
214 TermBuilderError::InvalidOption {
215 reason: reason.into(),
216 }
217 }
218 #[inline]
219 fn unsupported_feature(reason: impl Into<String>) -> Self {
220 TermBuilderError::UnsupportedFeature {
221 reason: reason.into(),
222 }
223 }
224 #[inline]
225 fn degenerate_data(reason: impl Into<String>) -> Self {
226 TermBuilderError::DegenerateData {
227 reason: reason.into(),
228 }
229 }
230 #[inline]
231 fn malformed_formula(reason: impl Into<String>) -> Self {
232 TermBuilderError::MalformedFormula {
233 reason: reason.into(),
234 }
235 }
236}
237
238// ---------------------------------------------------------------------------
239// Column resolution
240// ---------------------------------------------------------------------------
241
242/// Resolve a bare column name to its index, returning a typed
243/// `DataError::ColumnNotFound` on miss so the FFI boundary can surface a
244/// structured `gamfit.ColumnNotFoundError(column=…, available=…)` rather
245/// than rely on string-classification of human prose. Internal callers that
246/// still flow `Result<_, String>` get byte-identical text via
247/// `From<DataError> for String`.
248pub fn resolve_col(col_map: &HashMap<String, usize>, name: &str) -> Result<usize, DataError> {
249 col_map
250 .get(name)
251 .copied()
252 .ok_or_else(|| DataError::column_not_found(col_map, name, None))
253}
254
255/// Like `resolve_col` but tags the missing-column payload with a role label
256/// (`"response"`, `"entry"`, `"exit"`, `"event"`, `"z"`, `"id"`, …) so the
257/// boundary-side Python exception can disambiguate which formula slot held
258/// the bad reference.
259pub fn resolve_role_col(
260 col_map: &HashMap<String, usize>,
261 name: &str,
262 role: &str,
263) -> Result<usize, DataError> {
264 col_map
265 .get(name)
266 .copied()
267 .ok_or_else(|| DataError::column_not_found(col_map, name, Some(role)))
268}
269
270fn encoded_levels_for_column(ds: &Dataset, col: ColIdx) -> Vec<(u64, String)> {
271 let mut seen = BTreeSet::<u64>::new();
272 for value in ds.values.column(col.get()) {
273 if value.is_finite() {
274 seen.insert(value.to_bits());
275 }
276 }
277 let schema_levels = ds
278 .schema
279 .columns
280 .get(col.get())
281 .map(|column| column.levels.as_slice())
282 .unwrap_or(&[]);
283 seen.into_iter()
284 .enumerate()
285 .map(|(idx, bits)| {
286 let fallback = format!("level{}", idx + 1);
287 let label = schema_levels.get(idx).cloned().unwrap_or(fallback);
288 (bits, label)
289 })
290 .collect()
291}
292
/// Return a copy of `col_map` in which `alias` also resolves to the index of
/// `target_column`.
///
/// The alias is added only when `target_column` exists in `col_map` and
/// `alias` is not already a key; an existing entry under the alias name is
/// never overwritten. In every other case the result is a plain clone.
pub fn column_map_with_alias(
    col_map: &HashMap<String, usize>,
    alias: &str,
    target_column: &str,
) -> HashMap<String, usize> {
    let mut extended = col_map.clone();
    match col_map.get(target_column) {
        Some(&target_index) if !extended.contains_key(alias) => {
            extended.insert(alias.to_owned(), target_index);
        }
        _ => {}
    }
    extended
}
304
305// ---------------------------------------------------------------------------
306// ParsedTerm[] + Dataset → TermCollectionSpec
307// ---------------------------------------------------------------------------
308
309pub fn build_termspec(
310 terms: &[ParsedTerm],
311 ds: &Dataset,
312 col_map: &HashMap<String, usize>,
313 inference_notes: &mut Vec<String>,
314 policy: &ResourcePolicy,
315) -> Result<TermCollectionSpec, TermBuilderError> {
316 let mut linear_terms = Vec::<LinearTermSpec>::new();
317 let mut random_terms = Vec::<RandomEffectTermSpec>::new();
318 let mut smooth_terms = Vec::<SmoothTermSpec>::new();
319 let smooth_coordinate_count = terms
320 .iter()
321 .map(|term| match term {
322 ParsedTerm::Smooth { vars, .. } => vars.len(),
323 _ => 0,
324 })
325 .sum::<usize>();
326
327 for t in terms {
328 match t {
329 ParsedTerm::Linear {
330 name,
331 explicit,
332 coefficient_min,
333 coefficient_max,
334 } => {
335 let col = resolve_col(col_map, name)?;
336 let auto_kind = ds.column_kinds.get(col).copied().ok_or_else(|| {
337 TermBuilderError::missing_column(format!(
338 "internal column-kind lookup failed for '{name}'"
339 ))
340 .to_string()
341 })?;
342 if *explicit {
343 linear_terms.push(LinearTermSpec {
344 name: name.clone(),
345 feature_col: col,
346 feature_cols: vec![col],
347 categorical_levels: vec![],
348 // Parametric linear terms are unpenalized by default
349 // (MLE, matching mgcv/glm); see #749.
350 double_penalty: false,
351 coefficient_geometry: LinearCoefficientGeometry::Unconstrained,
352 coefficient_min: *coefficient_min,
353 coefficient_max: *coefficient_max,
354 });
355 } else {
356 match auto_kind {
357 ColumnKindTag::Continuous | ColumnKindTag::Binary => {
358 linear_terms.push(LinearTermSpec {
359 name: name.clone(),
360 feature_col: col,
361 feature_cols: vec![col],
362 categorical_levels: vec![],
363 // Unpenalized parametric effect by default (#749).
364 double_penalty: false,
365 coefficient_geometry: LinearCoefficientGeometry::Unconstrained,
366 coefficient_min: *coefficient_min,
367 coefficient_max: *coefficient_max,
368 });
369 }
370 ColumnKindTag::Categorical => {
371 if coefficient_min.is_some() || coefficient_max.is_some() {
372 return Err(TermBuilderError::incompatible_config(format!(
373 "coefficient constraints are not supported for categorical auto-random-effect term '{name}'; use group({name}) or an unconstrained numeric term"
374 )));
375 }
376 random_terms.push(RandomEffectTermSpec {
377 name: name.clone(),
378 feature_col: col,
379 drop_first_level: false,
380 penalized: true,
381 frozen_levels: None,
382 });
383 }
384 }
385 }
386 }
387 ParsedTerm::BoundedLinear {
388 name,
389 min,
390 max,
391 prior,
392 } => {
393 let col = resolve_col(col_map, name)?;
394 let auto_kind = ds.column_kinds.get(col).copied().ok_or_else(|| {
395 TermBuilderError::missing_column(format!(
396 "internal column-kind lookup failed for '{name}'"
397 ))
398 .to_string()
399 })?;
400 if !matches!(auto_kind, ColumnKindTag::Continuous | ColumnKindTag::Binary) {
401 return Err(TermBuilderError::incompatible_config(format!(
402 "bounded() currently supports only numeric columns, got categorical '{name}'"
403 )));
404 }
405 linear_terms.push(LinearTermSpec {
406 name: name.clone(),
407 feature_col: col,
408 feature_cols: vec![col],
409 categorical_levels: vec![],
410 double_penalty: false,
411 coefficient_geometry: LinearCoefficientGeometry::Bounded {
412 min: *min,
413 max: *max,
414 prior: prior.clone(),
415 },
416 coefficient_min: None,
417 coefficient_max: None,
418 });
419 }
420 ParsedTerm::RandomEffect { name } => {
421 let col = resolve_col(col_map, name)?;
422 random_terms.push(RandomEffectTermSpec {
423 name: name.clone(),
424 feature_col: col,
425 drop_first_level: false,
426 penalized: true,
427 frozen_levels: None,
428 });
429 }
430 ParsedTerm::Smooth {
431 label,
432 vars,
433 kind,
434 options,
435 } => {
436 let smooth_vars = vars.clone();
437 let by_name = options.get("by").cloned();
438 // `bs="sz"` (sum-to-zero), like `bs="fs"`/`bs="re"`, is a
439 // factor-smooth family handled natively by `build_smooth_basis`'s
440 // fs/sz/re path: it detects the categorical factor among the
441 // variables and emits a `SmoothBasisSpec::FactorSmooth { Sz }`
442 // with the correct single-penalty marginal and modest default
443 // basis. Route sz straight through `build_smooth_basis` rather
444 // than intercepting it into a legacy `FactorSumToZero` envelope
445 // here (which left `sz(fac, x)` mis-typed as `FactorSumToZero`
446 // instead of the expected `FactorSmooth { Sz }`).
447 let cols = smooth_vars
448 .iter()
449 .map(|v| resolve_col(col_map, v))
450 .collect::<Result<Vec<_>, _>>()?;
451 let mut inner_options = options.clone();
452 inner_options.remove("by");
453 // `ordered=` is consumed here (ByVarKind::Factor routing) and
454 // must not propagate to the inner basis builder, which has no
455 // allow-list entry for it and would reject it as an unknown option.
456 inner_options.remove("ordered");
457 // Pop the shape constraint before `build_smooth_basis` runs so
458 // it never reaches the per-kind `validate_known_options`
459 // allow-lists (the constraint is a property of the smooth term,
460 // not of any one basis kind). Basis-incompatible requests still
461 // fail loudly downstream via `shape_supports_basis`.
462 let shape = match inner_options.remove("shape") {
463 None => ShapeConstraint::None,
464 Some(raw) => crate::smooth::parse_shape_constraint(&raw)
465 .map_err(TermBuilderError::invalid_option)?,
466 };
467 let inner_basis = build_smooth_basis(
468 *kind,
469 &smooth_vars,
470 &cols,
471 &inner_options,
472 ds,
473 inference_notes,
474 policy,
475 smooth_coordinate_count,
476 )?;
477 if let Some(by_name) = by_name {
478 let by_col = resolve_col(col_map, &by_name)?;
479 match ds.column_kinds.get(by_col).copied().ok_or_else(|| {
480 format!("internal column-kind lookup failed for by variable '{by_name}'")
481 })? {
482 ColumnKindTag::Categorical => {
483 let levels = encoded_levels_for_column(ds, ColIdx::new(by_col));
484 // A penalized random block for this factor already
485 // owns its full level offsets when EITHER an explicit
486 // `group(factor)` appears, OR a *bare* categorical
487 // `+ factor` does — the latter is auto-promoted to a
488 // penalized random-effect block (see the
489 // `ParsedTerm::Linear` / `ColumnKindTag::Categorical`
490 // arm above, `penalized: true`). Both representations
491 // carry the same per-level offsets, so #1457: the
492 // `by=` branch must NOT additionally add its own
493 // unpenalized treatment-coded main effect, which would
494 // double-represent the factor (two `g` design blocks +
495 // a spurious extra smoothing parameter).
496 let penalized_group_owner_present =
497 terms.iter().any(|other| match other {
498 ParsedTerm::RandomEffect { name } => name == &by_name,
499 ParsedTerm::Linear {
500 name,
501 explicit: false,
502 ..
503 } if name == &by_name => col_map
504 .get(name)
505 .and_then(|c| ds.column_kinds.get(*c).copied())
506 .map(|kind| matches!(kind, ColumnKindTag::Categorical))
507 .unwrap_or(false),
508 _ => false,
509 });
510 // Add an unpenalized treatment-coded fixed main
511 // effect for a standalone factor-by smooth, unless
512 // the same factor already has an explicit
513 // `group(factor)` term OR a bare categorical `+
514 // factor` that was auto-promoted to a penalized
515 // random block (#1457). In those mixed-model forms
516 // the penalized random intercept is the coherent
517 // owner of level offsets; adding a no-pooling fixed
518 // factor effect would bypass random-effect
519 // shrinkage and degrade BLUP-style predictions.
520 if !random_terms.iter().any(|rt| rt.name == by_name)
521 && !penalized_group_owner_present
522 {
523 random_terms.push(RandomEffectTermSpec {
524 name: by_name.clone(),
525 feature_col: by_col,
526 drop_first_level: true,
527 penalized: false,
528 frozen_levels: None,
529 });
530 }
531 // Route to a single BySmooth::Factor term with
532 // frozen levels pre-populated from the training data.
533 // Design building later gates each level into its own
534 // column block (see build_by_smooth_local in term_specs).
535 let frozen_levels: Vec<u64> =
536 levels.iter().map(|(bits, _)| *bits).collect();
537 smooth_terms.push(SmoothTermSpec {
538 name: label.clone(),
539 basis: SmoothBasisSpec::BySmooth {
540 smooth: Box::new(inner_basis),
541 by_kind: ByVarKind::Factor {
542 feature_col: by_col,
543 ordered: option_bool(options, "ordered").unwrap_or(false),
544 frozen_levels: Some(frozen_levels),
545 },
546 },
547 shape,
548 joint_null_rotation: None,
549 });
550 }
551 ColumnKindTag::Binary | ColumnKindTag::Continuous => {
552 smooth_terms.push(SmoothTermSpec {
553 name: label.clone(),
554 basis: SmoothBasisSpec::BySmooth {
555 smooth: Box::new(inner_basis),
556 by_kind: ByVarKind::Numeric {
557 feature_col: by_col,
558 },
559 },
560 shape,
561 joint_null_rotation: None,
562 });
563 }
564 }
565 } else {
566 smooth_terms.push(SmoothTermSpec {
567 name: label.clone(),
568 basis: inner_basis,
569 shape,
570 joint_null_rotation: None,
571 });
572 }
573 }
574 ParsedTerm::LinkWiggle { .. }
575 | ParsedTerm::TimeWiggle { .. }
576 | ParsedTerm::LinkConfig { .. }
577 | ParsedTerm::SurvivalConfig { .. } => {
578 // Consumed at formula level, not design terms.
579 }
580 ParsedTerm::LogSlopeSurface { .. } => {
581 return Err(TermBuilderError::malformed_formula(
582 "logslope(...) declarations must be resolved by the marginal-slope formula path before building a term spec",
583 ));
584 }
585 ParsedTerm::Interaction { vars } => {
586 // A linear `:` interaction realizes one design column equal to
587 // the elementwise product of its operands. Numeric (continuous/
588 // binary) operands multiply directly; a categorical operand is
589 // a factor, so the product is expanded factor-aware: one design
590 // column per surviving cell of the factor(s), each an indicator
591 // `1[factor == level]` gating the numeric product.
592 //
593 // Coding is MARGINALITY-AWARE (gam#1158, gam#1159). A categorical
594 // operand `g` is treatment-coded (its lexicographically first
595 // reference level dropped) ONLY when the lower-order term obtained
596 // by removing `g` from this interaction is also present in the
597 // model — that lower-order term is what makes the dropped level
598 // identifiable, exactly mgcv's marginality rule. When that parent
599 // is ABSENT (the interaction-only form), dropping the reference
600 // level instead pins a group to the reference fit (a rank-deficient
601 // design), so we keep ALL levels (full dummy coding) and rely on a
602 // single intercept cell-drop below for identifiability:
603 // * `y ~ x:g` with no `x` main effect → "common intercept,
604 // separate slopes": every group keeps its own x-slope.
605 // * `y ~ g:h` with no `g`/`h` main effects → the saturated
606 // cell-means model: full cross of all levels minus one
607 // reference cell absorbed by the intercept.
608 // When the parents ARE present (`x + x:g`, or `g*h` = `g + h +
609 // g:h`), the historical treatment coding is preserved so those
610 // forms stay correct.
611 //
612 // A main effect for var V is a `Linear`/`BoundedLinear`/
613 // `RandomEffect` ParsedTerm whose referenced name is V (an
614 // auto-detected categorical `Linear` becomes a RandomEffect main
615 // effect; either spelling counts). We only treat such standalone
616 // main-effect terms as parents — not V appearing inside another
617 // interaction.
618 let main_effect_present = |target: &str| -> bool {
619 terms.iter().any(|other| match other {
620 ParsedTerm::Linear { name, .. }
621 | ParsedTerm::BoundedLinear { name, .. }
622 | ParsedTerm::RandomEffect { name } => name == target,
623 _ => false,
624 })
625 };
626 // The lower-order parent of dropping operand `drop_var` from this
627 // interaction is present iff EVERY other operand is a main effect.
628 // For the two cases we care about (`x:g`, `g:h`) the interaction
629 // has two operands, so this reduces to "is the single remaining
630 // operand a main effect"; the general form handles any arity.
631 let parent_present = |drop_var: &str| -> bool {
632 vars.iter()
633 .filter(|v| v.as_str() != drop_var)
634 .all(|v| main_effect_present(v))
635 };
636
637 let mut numeric_cols = Vec::<usize>::new();
638 // Per categorical operand: (var name, col, kept levels, was the
639 // reference level dropped / treatment-coded?).
640 let mut categorical_factors =
641 Vec::<(String, usize, Vec<(u64, String)>, bool)>::new();
642 for var in vars {
643 let col = resolve_col(col_map, var)?;
644 let kind = ds.column_kinds.get(col).copied().ok_or_else(|| {
645 TermBuilderError::missing_column(format!(
646 "internal column-kind lookup failed for '{var}'"
647 ))
648 .to_string()
649 })?;
650 match kind {
651 ColumnKindTag::Continuous | ColumnKindTag::Binary => numeric_cols.push(col),
652 ColumnKindTag::Categorical => {
653 let mut levels = encoded_levels_for_column(ds, ColIdx::new(col));
654 // Treatment-code (drop the reference level) only when
655 // the marginal parent that identifies it is present;
656 // otherwise keep every level (full dummy coding).
657 let treatment_coded = parent_present(var);
658 if treatment_coded && levels.len() > 1 {
659 levels.remove(0);
660 }
661 if levels.is_empty() {
662 return Err(TermBuilderError::incompatible_config(format!(
663 "interaction `{}` references categorical column `{var}` with no usable levels",
664 vars.join(":")
665 )));
666 }
667 categorical_factors.push((var.clone(), col, levels, treatment_coded));
668 }
669 }
670 }
671
672 let label = vars.join(":");
673
674 if categorical_factors.is_empty() {
675 // Pure numeric `:` interaction — single product column,
676 // identical to the historical behaviour.
677 linear_terms.push(LinearTermSpec {
678 name: label,
679 feature_col: numeric_cols[0],
680 feature_cols: numeric_cols,
681 categorical_levels: vec![],
682 // Parametric `:` interaction column is unpenalized by
683 // default, same as any other linear term (#749).
684 double_penalty: false,
685 coefficient_geometry: LinearCoefficientGeometry::Unconstrained,
686 coefficient_min: None,
687 coefficient_max: None,
688 });
689 inference_notes.push(format!(
690 "wired linear interaction `{}` as product of numeric columns",
691 vars.join(":")
692 ));
693 } else {
694 // Factor-aware expansion: cartesian product over the kept
695 // levels of every categorical operand. Each cell yields one
696 // column gating the numeric product (or, with no numeric
697 // operand, a pure cell indicator).
698 let mut cells: Vec<Vec<(usize, u64, String)>> = vec![Vec::new()];
699 for (_var, col, levels, _treatment_coded) in &categorical_factors {
700 let mut next = Vec::with_capacity(cells.len() * levels.len());
701 for cell in &cells {
702 for (bits, level_label) in levels {
703 let mut extended = cell.clone();
704 extended.push((*col, *bits, level_label.clone()));
705 next.push(extended);
706 }
707 }
708 cells = next;
709 }
710
711 // Intercept-identifiability cell drop. When the cells are PURE
712 // INDICATORS (no numeric operand) and at least one factor was
713 // dummy-coded (kept all its levels), the full set of cell
714 // columns sums to the all-ones intercept and is rank-deficient
715 // against it. Drop exactly ONE reference cell — the cell where
716 // every factor sits at its reference (lexicographically first)
717 // level — so the remaining saturated cells are identifiable
718 // (rank n_g*n_h - 1 cells + intercept). With a numeric operand
719 // the cells gate `x` and sum to `x`, not the intercept, so no
720 // cell is dropped (the collinearity there is with the absent
721 // `x` main effect, which is exactly why full coding is right).
722 let any_dummy_coded = categorical_factors
723 .iter()
724 .any(|(_, _, _, treatment_coded)| !*treatment_coded);
725 if numeric_cols.is_empty() && any_dummy_coded {
726 // The reference cell pairs each factor's column with the
727 // bits of its lexicographically-first (index 0) level.
728 let reference_cell: Vec<(usize, u64)> = categorical_factors
729 .iter()
730 .map(|(_, col, _, _)| {
731 let levels = encoded_levels_for_column(ds, ColIdx::new(*col));
732 (*col, levels[0].0)
733 })
734 .collect();
735 cells.retain(|cell| {
736 !reference_cell.iter().all(|(rcol, rbits)| {
737 cell.iter()
738 .any(|(col, bits, _)| col == rcol && bits == rbits)
739 })
740 });
741 }
742
743 let n_cells = cells.len();
744 for cell in cells {
745 let cell_suffix = cell
746 .iter()
747 .map(|(_, _, level_label)| level_label.as_str())
748 .collect::<Vec<_>>()
749 .join(":");
750 let categorical_levels =
751 cell.iter().map(|(col, bits, _)| (*col, *bits)).collect();
752 // `feature_col` is required to point at a real column;
753 // use the first numeric operand when present, otherwise
754 // the first categorical column (its raw value is never
755 // multiplied — `realized_design_column` starts from ones
756 // and only gates by the level indicators).
757 let feature_col = numeric_cols
758 .first()
759 .copied()
760 .unwrap_or(categorical_factors[0].1);
761 linear_terms.push(LinearTermSpec {
762 name: format!("{label}:{cell_suffix}"),
763 feature_col,
764 feature_cols: numeric_cols.clone(),
765 categorical_levels,
766 double_penalty: false,
767 coefficient_geometry: LinearCoefficientGeometry::Unconstrained,
768 coefficient_min: None,
769 coefficient_max: None,
770 });
771 }
772 let all_treatment_coded = !any_dummy_coded;
773 let coding = if all_treatment_coded {
774 "treatment-coded"
775 } else {
776 "marginality-aware (full dummy / saturated)"
777 };
778 inference_notes.push(format!(
779 "wired factor-aware linear interaction `{}` as {} {} cell column(s)",
780 vars.join(":"),
781 n_cells,
782 coding
783 ));
784 }
785 }
786 }
787 }
788
789 Ok(TermCollectionSpec {
790 linear_terms,
791 random_effect_terms: random_terms,
792 smooth_terms,
793 })
794}
795
/// Split a list-valued option into its trimmed, non-empty entries.
///
/// One enclosing wrapper is peeled off before splitting on commas: the
/// Python/JSON form `[a, b]`, or one of mgcv's R-vector forms `c(a, b)`,
/// `C(a, b)`, `(a, b)`. A wrapper is only removed when both its opening and
/// closing delimiter are present; otherwise the text is split as written.
/// mgcv-style formulas pass per-margin options as `k=c(5,5)` or
/// `period=c(2*pi, pi)`; without this step such a value would split into
/// `["c(5", "5)"]` and the numeric parser would report the stray prefix as the
/// bad digit.
fn split_list_option(raw: &str) -> Vec<String> {
    // The opening delimiters start with distinct characters, so at most one
    // wrapper can match a given input.
    const WRAPPERS: [(&str, &str); 4] = [("[", "]"), ("c(", ")"), ("C(", ")"), ("(", ")")];

    let trimmed = raw.trim();
    let body = WRAPPERS
        .iter()
        .find_map(|&(open, close)| trimmed.strip_prefix(open)?.strip_suffix(close))
        .unwrap_or(trimmed);

    let mut entries = Vec::new();
    for piece in body.split(',') {
        let piece = piece.trim();
        if !piece.is_empty() {
            entries.push(piece.to_string());
        }
    }
    entries
}
820
/// Evaluate a small numeric expression: a `*`-separated product of factors.
///
/// Each factor is one of
///
/// * a float literal accepted by `f64::from_str` (`2`, `-0.5`, `1e-3`, ...),
/// * a named constant — `pi` / `π` or `tau` / `τ` (ASCII names are matched
///   case-insensitively), or
/// * a constant with a leading coefficient or bare sign — `2pi`, `0.5tau`,
///   `-pi`, `+τ`, `2PI`.
///
/// ASCII spaces are ignored, so `2 * pi` equals `2*pi`.
///
/// # Errors
///
/// Returns `"None is not numeric"` when the whole input is `none` (any case),
/// and an `invalid numeric expression '<raw>'` message when a factor is empty
/// (`""`, `2*`, `2**pi`) or is not a valid literal/constant.
fn parse_numeric_expr(raw: &str) -> Result<f64, String> {
    /// When `factor` ends in a constant name, return the text in front of it
    /// together with the constant's value.
    fn split_constant_suffix(factor: &str) -> Option<(&str, f64)> {
        const CONSTANTS: [(&str, f64); 4] = [
            ("pi", std::f64::consts::PI),
            ("π", std::f64::consts::PI),
            ("tau", std::f64::consts::TAU),
            ("τ", std::f64::consts::TAU),
        ];
        CONSTANTS.iter().find_map(|&(symbol, value)| {
            let split_at = factor.len().checked_sub(symbol.len())?;
            // `get` yields `None` when `split_at` is not a char boundary, so
            // multi-byte input can never cause a slicing panic.
            let tail = factor.get(split_at..)?;
            if tail.eq_ignore_ascii_case(symbol) {
                Some((&factor[..split_at], value))
            } else {
                None
            }
        })
    }

    let normalized = raw.replace(' ', "");
    if normalized.eq_ignore_ascii_case("none") {
        return Err("None is not numeric".to_string());
    }

    let invalid =
        |err: std::num::ParseFloatError| format!("invalid numeric expression '{raw}': {err}");

    let mut product = 1.0f64;
    for factor in normalized.split('*') {
        if factor.is_empty() {
            return Err(format!("invalid numeric expression '{raw}'"));
        }
        let value = match split_constant_suffix(factor) {
            Some((coefficient, constant)) => {
                // A missing coefficient or a bare sign scales the constant by
                // ±1; `"-".parse::<f64>()` would fail, so `-pi` needs this
                // explicit case.
                let scale = match coefficient {
                    "" | "+" => 1.0,
                    "-" => -1.0,
                    text => text.parse::<f64>().map_err(&invalid)?,
                };
                scale * constant
            }
            None => factor.parse::<f64>().map_err(&invalid)?,
        };
        product *= value;
    }
    Ok(product)
}
868
869/// Read an endpoint/period option as a numeric *expression* (`2*pi`, `tau`,
870/// `0.5*tau`, `6.283185307179586`, ...) — the same grammar that `period=` and
871/// `origin=` already accept via [`parse_numeric_expr`].
872///
873/// Returns `Ok(None)` when the key is absent, `Ok(Some(v))` when it parses, and
874/// a hard `Err` when the key is *present but unparseable*. The crucial contrast
875/// is with the lenient [`option_f64`], which collapses an unparseable value to
876/// `None` and lets the caller silently substitute the data range — wrapping a
877/// cyclic smooth at the wrong period with no diagnostic (the #815 failure mode).
878fn option_numeric_expr(
879 options: &BTreeMap<String, String>,
880 key: &str,
881) -> Result<Option<f64>, String> {
882 match options.get(key) {
883 None => Ok(None),
884 Some(raw) => parse_numeric_expr(raw)
885 .map(Some)
886 .map_err(|err| format!("option `{key}={raw}` is not a valid numeric value: {err}")),
887 }
888}
889
890fn parse_periods_option(
891 options: &BTreeMap<String, String>,
892 dim: usize,
893) -> Result<Option<Vec<Option<f64>>>, String> {
894 let Some(raw) = options.get("period") else {
895 return Ok(None);
896 };
897 let values = split_list_option(raw);
898 let mut periods = vec![None; dim];
899 if values.len() == 1 && dim == 1 {
900 periods[0] = Some(parse_numeric_expr(&values[0])?);
901 } else {
902 if values.len() != dim {
903 return Err(format!(
904 "period list length {} must match smooth dimension {}",
905 values.len(),
906 dim
907 ));
908 }
909 for (i, v) in values.iter().enumerate() {
910 if v.eq_ignore_ascii_case("none") {
911 continue;
912 }
913 periods[i] = Some(parse_numeric_expr(v)?);
914 }
915 }
916 Ok(Some(periods))
917}
918
919fn parse_periodic_axes_option(
920 options: &BTreeMap<String, String>,
921 dim: usize,
922) -> Result<Option<Vec<Option<f64>>>, String> {
923 let Some(raw_axes) = options.get("periodic") else {
924 return Ok(None);
925 };
926 let mut periods = parse_periods_option(options, dim)?.unwrap_or_else(|| vec![None; dim]);
927 // Scalar boolean form (`periodic=true` / `false`, `yes` / `no`) applies to
928 // every axis — the documented per-axis-flag broadcast (see the doc on
929 // `parse_periodic_axes`, the tensor sibling that already accepts it). A
930 // 1-D `duchon(x, periodic=true)` lands here: the cyclic *domain* is then
931 // resolved from the data range by `parse_cyclic_boundary` (the 1-D builder
932 // consults `boundary` first), so a finite explicit period is NOT required —
933 // we only need to NOT mis-read "true" as an axis index (#1074). `false`
934 // means no axis is periodic.
935 let lowered = raw_axes.trim().to_ascii_lowercase();
936 match lowered.as_str() {
937 "true" | "yes" | "y" => return Ok(Some(periods)),
938 // `false` means NO axis is periodic. Return `None` — NOT
939 // `Some(vec![None; dim])` — because the radial 1-D consumer treats a
940 // `Some([None])` as "periodicity requested, derive the wrap period from
941 // the data range" (see the Duchon builder arm below, which back-fills
942 // `axes[0] = data_span` for a lone `None`) and the 1-D builder routes on
943 // `spec.periodic.is_some()`. Emitting `Some([None])` here therefore
944 // silently produced a *periodic* smooth for an explicit `periodic=false`
945 // — the exact regression this arm now avoids, matching the bracketed
946 // `[false]` form handled by the per-axis boolean block below.
947 "false" | "no" | "n" => return Ok(None),
948 _ => {}
949 }
950 let axes = split_list_option(raw_axes);
951 if axes.is_empty() {
952 return Ok(Some(periods));
953 }
954
955 // Boolean forms `periodic=true` / `periodic=[true, false, ...]`, mirroring
956 // `parse_tensor_periodic_axes`. The radial 1-D builders (`duchon`/`tps`/
957 // `matern`) intentionally DERIVE the wrap period from the closed center
958 // lattice when none is supplied (`prepare_periodic_duchon_centers_1d_with_period`,
959 // gam#580: `None => span`), so a boolean-selected periodic axis legitimately
960 // omits `period`. Without this branch, `duchon(x, periodic=true)`-style
961 // radial formulas failed with the misleading "invalid periodic axis 'true'".
962 let is_bool = |t: &str| {
963 matches!(
964 t.to_ascii_lowercase().as_str(),
965 "true" | "yes" | "y" | "false" | "no" | "n"
966 )
967 };
968 let is_truthy = |t: &str| matches!(t.to_ascii_lowercase().as_str(), "true" | "yes" | "y");
969
970 // Scalar boolean: `periodic=true` / `periodic=false`.
971 if axes.len() == 1 && is_bool(&axes[0]) {
972 if !is_truthy(&axes[0]) {
973 // Non-periodic: return None so the 1-D builder (which routes on
974 // `spec.periodic.is_some()`) does NOT take the periodic path.
975 return Ok(None);
976 }
977 // Every axis periodic; honor any explicit per-axis period, else leave
978 // `None` for the caller (formula arm) / builder to derive the span.
979 return Ok(Some(periods));
980 }
981
982 // Per-axis boolean list: `periodic=[true, false, ...]` (length must match dim).
983 if axes.iter().all(|a| is_bool(a)) {
984 if axes.len() != dim {
985 return Err(format!(
986 "periodic flag list length {} must match smooth dimension {dim}",
987 axes.len()
988 ));
989 }
990 if !axes.iter().any(|a| is_truthy(a)) {
991 return Ok(None);
992 }
993 for (i, a) in axes.iter().enumerate() {
994 if !is_truthy(a) {
995 periods[i] = None;
996 }
997 }
998 return Ok(Some(periods));
999 }
1000
1001 // Index-list form: `periodic=[0, 2]`. Each listed axis must carry an
1002 // explicit finite period — an index gives no per-axis span-derive hint.
1003 for a in &axes {
1004 let axis = a
1005 .parse::<usize>()
1006 .map_err(|err| format!("invalid periodic axis '{a}': {err}"))?;
1007 if axis >= dim {
1008 return Err(format!(
1009 "periodic axis {axis} out of range for {dim}D smooth"
1010 ));
1011 }
1012 if periods[axis].is_none() {
1013 return Err(format!(
1014 "periodic axis {axis} requires period[{axis}] to be finite"
1015 ));
1016 }
1017 }
1018 // Axes not listed are non-periodic even if period list has a finite placeholder.
1019 let listed: std::collections::BTreeSet<usize> = axes
1020 .iter()
1021 .filter_map(|a| a.parse::<usize>().ok())
1022 .collect();
1023 for i in 0..dim {
1024 if !listed.contains(&i) {
1025 periods[i] = None;
1026 }
1027 }
1028 Ok(Some(periods))
1029}
1030
1031// ---------------------------------------------------------------------------
1032// Smooth basis spec construction
1033// ---------------------------------------------------------------------------
1034
/// Split a raw option value into lowercase, unquoted, non-empty tokens.
///
/// The value may be wrapped as the Python/JSON list `[a, b]`, as mgcv's R
/// vector `c(a, b)` / `C(a, b)`, or as a bare `(a, b)`; mgcv writes per-margin
/// options as `bs=c('tp','tp')` / `m=c(2,2)`, so all wrappers go through the
/// same splitter. An unclosed wrapper is not peeled. Each comma-separated
/// entry is trimmed, stripped of surrounding double and then single quotes,
/// and ASCII-lowercased.
fn parse_option_list(raw: &str) -> Vec<String> {
    let text = raw.trim();
    let square = text
        .strip_prefix('[')
        .and_then(|rest| rest.strip_suffix(']'));
    let round = || {
        let rest = text
            .strip_prefix("c(")
            .or_else(|| text.strip_prefix("C("))
            .or_else(|| text.strip_prefix('('))?;
        rest.strip_suffix(')')
    };
    let body = square.or_else(round).unwrap_or(text);

    let mut tokens = Vec::new();
    for piece in body.split(',') {
        let token = piece
            .trim()
            .trim_matches('"')
            .trim_matches('\'')
            .to_ascii_lowercase();
        if !token.is_empty() {
            tokens.push(token);
        }
    }
    tokens
}
1064
1065fn parse_periodic_axes(
1066 options: &BTreeMap<String, String>,
1067 dim: usize,
1068) -> Result<Vec<bool>, String> {
1069 let mut axes = vec![false; dim];
1070 if let Some(raw) = options.get("periodic").or_else(|| options.get("cyclic")) {
1071 let lowered = raw.trim().to_ascii_lowercase();
1072 match lowered.as_str() {
1073 "true" | "yes" | "y" => {
1074 axes.fill(true);
1075 return Ok(axes);
1076 }
1077 "false" | "no" | "n" => return Ok(axes),
1078 _ => {}
1079 }
1080 for axis_raw in parse_option_list(raw) {
1081 let axis = axis_raw
1082 .parse::<usize>()
1083 .map_err(|err| format!("invalid periodic axis '{axis_raw}': {err}"))?;
1084 if axis >= dim {
1085 return Err(format!(
1086 "periodic axis {axis} out of range for {dim}D smooth"
1087 ));
1088 }
1089 axes[axis] = true;
1090 }
1091 }
1092 if let Some(raw) = options.get("boundary").or_else(|| options.get("bc")) {
1093 let boundary = parse_option_list(raw);
1094 if boundary.len() == dim {
1095 for (axis, value) in boundary.iter().enumerate() {
1096 if matches!(value.as_str(), "periodic" | "cyclic" | "cc") {
1097 axes[axis] = true;
1098 }
1099 }
1100 } else if dim == 1
1101 && matches!(
1102 boundary.first().map(String::as_str),
1103 Some("periodic" | "cyclic" | "cc")
1104 )
1105 {
1106 axes[0] = true;
1107 }
1108 }
1109 Ok(axes)
1110}
1111
1112fn parse_optional_numeric_list(
1113 options: &BTreeMap<String, String>,
1114 keys: &[&str],
1115 dim: usize,
1116) -> Result<Vec<Option<f64>>, String> {
1117 let Some(raw) = keys.iter().find_map(|key| options.get(*key)) else {
1118 return Ok(vec![None; dim]);
1119 };
1120 let values = split_list_option(raw);
1121 let mut out = vec![None; dim];
1122 if values.len() == 1 && dim == 1 {
1123 if !values[0].eq_ignore_ascii_case("none") {
1124 out[0] = Some(parse_numeric_expr(&values[0])?);
1125 }
1126 return Ok(out);
1127 }
1128 if values.len() != dim {
1129 return Err(format!(
1130 "numeric option list length {} must match smooth dimension {}",
1131 values.len(),
1132 dim
1133 ));
1134 }
1135 for (i, value) in values.iter().enumerate() {
1136 if !value.eq_ignore_ascii_case("none") {
1137 out[i] = Some(parse_numeric_expr(value)?);
1138 }
1139 }
1140 Ok(out)
1141}
1142
1143fn parse_periods(
1144 options: &BTreeMap<String, String>,
1145 periodic_axes: &[bool],
1146) -> Result<Vec<Option<f64>>, String> {
1147 let dim = periodic_axes.len();
1148 // Broadcast a single-element `period=[v]` onto the lone periodic axis
1149 // of a multi-axis smooth (e.g. `te(th, h, bc=['periodic','natural'],
1150 // period=[2*pi])`): with only one periodic margin, the value can only
1151 // belong there.
1152 let lone_periodic_broadcast = options
1153 .get("period")
1154 .or_else(|| options.get("periods"))
1155 .and_then(|raw| {
1156 let values = split_list_option(raw);
1157 if values.len() != 1 || dim <= 1 {
1158 return None;
1159 }
1160 let mut iter = periodic_axes.iter().enumerate().filter(|(_, p)| **p);
1161 let first = iter.next()?;
1162 if iter.next().is_some() {
1163 return None;
1164 }
1165 Some((first.0, values.into_iter().next().unwrap()))
1166 });
1167 let periods = if let Some((axis, value)) = lone_periodic_broadcast {
1168 let mut out = vec![None; dim];
1169 if !value.eq_ignore_ascii_case("none") {
1170 out[axis] = Some(parse_numeric_expr(&value)?);
1171 }
1172 out
1173 } else {
1174 parse_optional_numeric_list(options, &["period", "periods"], dim)?
1175 };
1176 for (axis, (periodic, period)) in periodic_axes.iter().zip(periods.iter()).enumerate() {
1177 if *periodic
1178 && let Some(value) = period
1179 && (!value.is_finite() || *value <= 0.0)
1180 {
1181 return Err(format!(
1182 "period for periodic axis {axis} must be finite and positive, got {value}"
1183 ));
1184 }
1185 }
1186 Ok(periods)
1187}
1188
1189fn parse_period_origins(
1190 options: &BTreeMap<String, String>,
1191 periodic_axes: &[bool],
1192) -> Result<Vec<Option<f64>>, String> {
1193 parse_optional_numeric_list(
1194 options,
1195 &[
1196 "origin",
1197 "origins",
1198 "period_origin",
1199 "period-origin",
1200 "domain_origin",
1201 ],
1202 periodic_axes.len(),
1203 )
1204}
1205
1206/// Parse a per-axis periodic flag list for tensor smooths. Accepts three forms:
1207/// - `periodic=true` / `periodic=false` (scalar applied to every axis),
1208/// - `periodic=[true, false, ...]` (one flag per axis, length `dim`),
1209/// - `periodic=c(1, 1)` / `c(0, 0)` (a length-`dim` 0/1 mask, mgcv's
1210/// per-margin spelling — distinguished from an axis-index list by the
1211/// repeated 0/1 value), and
1212/// - `periodic=[0, 2, ...]` (axis indices that are periodic; others are not).
1213///
1214/// `boundary=[..., "periodic"/"cyclic"/"cc", ...]` may also flip individual
1215/// axes on; non-matching tokens leave the existing flag unchanged.
1216fn parse_tensor_periodic_axes(
1217 options: &BTreeMap<String, String>,
1218 dim: usize,
1219) -> Result<Vec<bool>, String> {
1220 let mut axes = vec![false; dim];
1221 if let Some(raw) = options.get("periodic").or_else(|| options.get("cyclic")) {
1222 let lowered = raw.trim().to_ascii_lowercase();
1223 match lowered.as_str() {
1224 "true" | "yes" | "y" => {
1225 axes.fill(true);
1226 }
1227 "false" | "no" | "n" => {
1228 // Already false; allow `boundary=` below to flip axes if set.
1229 }
1230 _ => {
1231 let entries = parse_option_list(raw);
1232 let all_bool = !entries.is_empty()
1233 && entries.iter().all(|v| {
1234 matches!(
1235 v.as_str(),
1236 "true" | "yes" | "y" | "false" | "no" | "n" | "none"
1237 )
1238 });
1239 // mgcv writes per-margin flag vectors as `periodic=c(1,1)` /
1240 // `periodic=c(0,0)` — a length-`dim` mask where each entry is a
1241 // 0/1 flag for THAT margin, not an axis index. A bare axis-index
1242 // list (`periodic=[0,1]`, `periodic=[0]`) lists DISTINCT margin
1243 // indices to turn on. The two collide only when the list is all
1244 // 0/1 of length `dim`; disambiguate by the repeated-value
1245 // signature `c(1,1)`/`c(0,0)` (a valid axis-index set never
1246 // repeats an index), which is the canonical mask spelling. This
1247 // is what makes the leading tensor margin honor its periodic flag
1248 // (#1751: `periodic=c(1,1)` previously parsed `1,1` as axis
1249 // indices, marking only axis 1 and dropping axis 0).
1250 let all_zero_one = !entries.is_empty()
1251 && entries.iter().all(|v| v == "0" || v == "1");
1252 let has_repeat = {
1253 let mut seen = std::collections::BTreeSet::new();
1254 !entries.iter().all(|v| seen.insert(v.clone()))
1255 };
1256 let numeric_mask = all_zero_one && entries.len() == dim && has_repeat;
1257 if all_bool || numeric_mask {
1258 if entries.len() != dim {
1259 return Err(format!(
1260 "periodic list length {} must match smooth dimension {}",
1261 entries.len(),
1262 dim
1263 ));
1264 }
1265 for (i, v) in entries.iter().enumerate() {
1266 axes[i] = matches!(v.as_str(), "true" | "yes" | "y" | "1");
1267 }
1268 } else {
1269 for axis_raw in entries {
1270 let axis = axis_raw
1271 .parse::<usize>()
1272 .map_err(|err| format!("invalid periodic axis '{axis_raw}': {err}"))?;
1273 if axis >= dim {
1274 return Err(format!(
1275 "periodic axis {axis} out of range for {dim}D smooth"
1276 ));
1277 }
1278 axes[axis] = true;
1279 }
1280 }
1281 }
1282 }
1283 }
1284 if let Some(raw) = options.get("boundary").or_else(|| options.get("bc")) {
1285 let boundary = parse_option_list(raw);
1286 if boundary.len() == dim {
1287 for (axis, value) in boundary.iter().enumerate() {
1288 if matches!(value.as_str(), "periodic" | "cyclic" | "cc") {
1289 axes[axis] = true;
1290 }
1291 }
1292 }
1293 }
1294 // A per-margin basis vector (`bs=c('cc','ps')` / `type=[...]`) declares each
1295 // margin's basis family, and a cyclic family (`cc`/`cp`/`cyclic`) makes THAT
1296 // margin periodic — exactly as the 1-D `s(x, bs='cc')` smooth wraps its lone
1297 // axis. Without this, the per-margin `cc` token was validated but discarded:
1298 // every `bs=c(...)` spelling collapsed to the same open B-spline tensor
1299 // (#1752). Only honor the vector form here; a scalar `bs='cc'` on a tensor is
1300 // ambiguous about which margins wrap, so it does not flip any axis on.
1301 if let Some(raw) = options.get("bs").or_else(|| options.get("type"))
1302 && bs_selector_is_vector(raw)
1303 {
1304 let per_margin = parse_option_list(raw);
1305 if per_margin.len() == dim {
1306 for (axis, margin_bs) in per_margin.iter().enumerate() {
1307 if matches!(
1308 canonicalize_smooth_type(margin_bs),
1309 "cc" | "cp" | "cyclic"
1310 ) {
1311 axes[axis] = true;
1312 }
1313 }
1314 }
1315 }
1316 Ok(axes)
1317}
1318
1319/// Validate the per-margin `boundary=`/`bc=` tokens on a tensor-product smooth.
1320///
1321/// The tensor `boundary`/`bc` list selects, per margin, whether the margin
1322/// *wraps* (a `periodic`/`cyclic`/`cc` token, consumed by
1323/// [`parse_tensor_periodic_axes`]) or is an ordinary non-periodic margin. In the
1324/// tensor DSL a *non-periodic* margin is spelled `clamped` — in the B-spline
1325/// sense of a **clamped knot vector**, i.e. the standard open spline that is
1326/// free at its two ends and does not wrap (exactly how the callers document it:
1327/// "non-periodic / clamped … free at the two ends, no wrap"). It is therefore an
1328/// inert marker here, not a zero-derivative endpoint reparameterization: a
1329/// cylinder `te(theta, z, boundary=['periodic','clamped'], …)` is a cyclic θ
1330/// margin tensor-producted with an ordinary open z margin, the direct analog of
1331/// mgcv `te(bs=c("cc","ps"))` / `te(bs=c("cc","cr"))`.
1332///
1333/// The periodic selectors and the inert non-periodic markers
1334/// (`clamped`/`open`/`natural`/`free`/`none`/empty) are accepted; anything else
1335/// (e.g. a genuine `anchored` zero-value endpoint constraint, which has no
1336/// ordinary-margin meaning in a tensor) is surfaced as a clean
1337/// unsupported-feature error rather than silently dropped. Previously `clamped`
1338/// itself was rejected, so the cylinder/torus mixed-boundary tensors — the exact
1339/// construction the manifold quality suite builds — could not be fit at all.
1340fn validate_tensor_boundary_tokens(
1341 options: &BTreeMap<String, String>,
1342 dim: usize,
1343) -> Result<(), String> {
1344 let Some(raw) = options.get("boundary").or_else(|| options.get("bc")) else {
1345 return Ok(());
1346 };
1347 let entries = parse_option_list(raw);
1348 for (axis, value) in entries.iter().enumerate() {
1349 let inert = matches!(
1350 value.trim().to_ascii_lowercase().as_str(),
1351 "clamped" | "open" | "natural" | "free" | "none" | "" | "periodic" | "cyclic" | "cc"
1352 );
1353 if !inert {
1354 return Err(TermBuilderError::unsupported_feature(format!(
1355 "tensor smooth margin {axis} boundary token '{value}' is not supported \
1356 (got bc/boundary={raw:?} on a {dim}-D tensor); tensor margins accept the periodic \
1357 selectors (periodic/cyclic/cc) or the non-periodic markers (clamped/open/natural/free). \
1358 Apply anchored/zero-value endpoint constraints with a 1-D s(x, bc=...) term instead."
1359 ))
1360 .to_string());
1361 }
1362 }
1363 Ok(())
1364}
1365
1366fn tensor_k_axis_option_axis(
1367 key: &str,
1368 cols: &[usize],
1369 ds: &Dataset,
1370) -> Result<Option<usize>, String> {
1371 let Some(suffix) = key.strip_prefix("k_") else {
1372 return Ok(None);
1373 };
1374 if suffix.is_empty() {
1375 return Err("tensor k axis option must be named k_<axis> or k_<variable>".to_string());
1376 }
1377 if let Ok(axis) = suffix.parse::<usize>() {
1378 return if axis < cols.len() {
1379 Ok(Some(axis))
1380 } else {
1381 Err(format!(
1382 "tensor k axis option `{key}` references axis {axis}, but the smooth has {} margins",
1383 cols.len()
1384 ))
1385 };
1386 }
1387
1388 let mut matches = cols
1389 .iter()
1390 .enumerate()
1391 .filter(|(_, col)| ds.headers.get(**col).is_some_and(|name| name == suffix))
1392 .map(|(axis, _)| axis);
1393 let first = matches.next();
1394 if matches.next().is_some() {
1395 return Err(format!(
1396 "tensor k axis option `{key}` matches more than one margin named `{suffix}`"
1397 ));
1398 }
1399 first.map(Some).ok_or_else(|| {
1400 let margin_names = cols
1401 .iter()
1402 .enumerate()
1403 .map(|(axis, col)| {
1404 let name = ds
1405 .headers
1406 .get(*col)
1407 .map(String::as_str)
1408 .unwrap_or("<unnamed>");
1409 format!("{axis}:{name}")
1410 })
1411 .collect::<Vec<_>>()
1412 .join(", ");
1413 format!(
1414 "tensor k axis option `{key}` does not match a margin index or name; tensor margins are [{margin_names}]"
1415 )
1416 })
1417}
1418
/// Does `key` have the shape of a tensor per-axis `k` alias, i.e. the prefix
/// `k_` followed by at least one character?
///
/// This is a purely syntactic check; it does not verify that the suffix names
/// an existing margin.
fn is_tensor_k_axis_option_key(key: &str) -> bool {
    matches!(key.strip_prefix("k_"), Some(suffix) if !suffix.is_empty())
}
1423
1424/// Parse a per-margin basis dimension list (`k=<scalar>`, `k=[k0, k1, ...]`,
1425/// or axis aliases like `k_x=...` / `k_0=...`). A scalar is broadcast across
1426/// all axes; `None` returns the heuristic from the data column.
1427fn parse_tensor_k_list(
1428 options: &BTreeMap<String, String>,
1429 cols: &[usize],
1430 ds: &Dataset,
1431) -> Result<(Vec<usize>, bool), String> {
1432 let mut axis_values = vec![None; cols.len()];
1433 let mut saw_axis_alias = false;
1434 for (key, value) in options {
1435 let Some(axis) = tensor_k_axis_option_axis(key, cols, ds)? else {
1436 continue;
1437 };
1438 saw_axis_alias = true;
1439 if axis_values[axis].is_some() {
1440 return Err(format!("tensor k axis {axis} is specified more than once"));
1441 }
1442 let k: usize = value
1443 .parse()
1444 .map_err(|err| format!("invalid tensor k option `{key}={value}`: {err}"))?;
1445 axis_values[axis] = Some(k);
1446 }
1447
1448 let raw = options
1449 .get("k")
1450 .or_else(|| options.get("basis_dim"))
1451 .or_else(|| options.get("basis-dim"))
1452 .or_else(|| options.get("basisdim"));
1453 if saw_axis_alias {
1454 if raw.is_some() {
1455 return Err(
1456 "tensor k axis aliases cannot be combined with k= or basis_dim=".to_string(),
1457 );
1458 }
1459 if let Some(missing_axis) = axis_values.iter().position(Option::is_none) {
1460 let margin_name = cols
1461 .get(missing_axis)
1462 .and_then(|col| ds.headers.get(*col))
1463 .map(String::as_str)
1464 .unwrap_or("<unnamed>");
1465 return Err(format!(
1466 "tensor k axis aliases must specify every margin; missing axis {missing_axis} ({margin_name})"
1467 ));
1468 }
1469 return Ok((
1470 axis_values
1471 .into_iter()
1472 .map(|k| k.expect("missing axis values rejected above"))
1473 .collect(),
1474 false,
1475 ));
1476 }
1477 let Some(raw) = raw else {
1478 let inferred = heuristic_tensor_margin_knots(cols, ds);
1479 return Ok((inferred, true));
1480 };
1481 let entries = split_list_option(raw);
1482 if entries.len() == 1 {
1483 let k: usize = entries[0]
1484 .parse()
1485 .map_err(|err| format!("invalid tensor k '{}': {err}", entries[0]))?;
1486 return Ok((vec![k; cols.len()], false));
1487 }
1488 if entries.len() != cols.len() {
1489 return Err(format!(
1490 "tensor k list length {} must match smooth dimension {}",
1491 entries.len(),
1492 cols.len()
1493 ));
1494 }
1495 let mut out = Vec::with_capacity(entries.len());
1496 for entry in entries {
1497 let k: usize = entry
1498 .parse()
1499 .map_err(|err| format!("invalid tensor k '{entry}': {err}"))?;
1500 out.push(k);
1501 }
1502 Ok((out, false))
1503}
1504
1505/// Parse the `identifiability=` option for tensor-product smooths. Mirrors the
1506/// vocabulary of the Matern/Duchon parsers so the formula DSL is consistent.
1507///
1508/// `kind` selects the default identifiability when no explicit
1509/// `identifiability=` option is supplied: `te(...)` ([`SmoothKind::Te`]) keeps
1510/// the full-tensor sum-to-zero default, while `ti(...)` ([`SmoothKind::Ti`])
1511/// defaults to per-margin sum-to-zero so the marginal main effects are excluded
1512/// (the mgcv tensor-interaction semantics). An explicit option always wins.
1513fn parse_tensor_identifiability(
1514 options: &BTreeMap<String, String>,
1515 kind: SmoothKind,
1516) -> Result<TensorBSplineIdentifiability, String> {
1517 let Some(raw) = options.get("identifiability").map(String::as_str) else {
1518 return Ok(match kind {
1519 SmoothKind::Ti => TensorBSplineIdentifiability::MarginalSumToZero,
1520 _ => TensorBSplineIdentifiability::default(),
1521 });
1522 };
1523 match raw.trim().to_ascii_lowercase().as_str() {
1524 "none" => Ok(TensorBSplineIdentifiability::None),
1525 "sum_tozero" | "sum-to-zero" | "center_sum_tozero" | "center-sum-to-zero" | "centered"
1526 | "sumtozero" => Ok(TensorBSplineIdentifiability::SumToZero),
1527 "marginal_sum_tozero" | "marginal-sum-to-zero" | "marginal_sumtozero"
1528 | "marginalsumtozero" | "interaction" => {
1529 Ok(TensorBSplineIdentifiability::MarginalSumToZero)
1530 }
1531 other => Err(TermBuilderError::unsupported_feature(format!(
1532 "invalid tensor identifiability '{other}'; expected one of: none, sum_tozero, marginal_sum_tozero"
1533 ))
1534 .to_string()),
1535 }
1536}
1537
1538fn bspline_boundary_declares_periodic_axis(options: &BTreeMap<String, String>) -> bool {
1539 options
1540 .get("boundary")
1541 .or_else(|| options.get("bc"))
1542 .map(|raw| {
1543 parse_option_list(raw)
1544 .into_iter()
1545 .any(|value| matches!(value.as_str(), "periodic" | "cyclic" | "cc"))
1546 })
1547 .unwrap_or(false)
1548}
1549
/// Canonical-name lookup for the `bs=`/`type=` smooth selector.
///
/// User-facing names, including mgcv-compatible spellings whose semantics
/// match an existing gamfit smooth exactly, collapse to the engine-internal
/// canonical names used by the dispatch in [`build_smooth_basis`]. Adding a new
/// exactly-equivalent alias is a one-line entry in the table below.
///
/// Aliases MUST be true semantic equivalents of their target, not
/// approximations. mgcv names whose semantics differ from every gamfit smooth
/// (e.g. `bs="ts"` shrinkage thin-plate, `bs="ad"` adaptive) are intentionally
/// NOT mapped, so they reach the unsupported-type path and produce a real
/// diagnostic instead of a silent substitution. mgcv's `bs="cr"`/`"cs"` are
/// handled directly in the [`build_smooth_basis`] dispatch, because the
/// `cr`/`cs` distinction controls a default (`double_penalty`) that this layer
/// cannot see.
///
/// The comparison is exact and case-sensitive. Unrecognised inputs are
/// returned unchanged so the dispatch can emit its usual "unsupported smooth
/// type" error for genuine typos.
pub(crate) fn canonicalize_smooth_type(raw: &str) -> &str {
    const ALIASES: [(&str, &str); 8] = [
        // Thin-plate spline: mgcv `bs="tp"` is the default thin-plate
        // regression spline, the exact equivalent of gamfit's `"tps"`.
        ("tp", "tps"),
        // Gaussian process / Matérn: mgcv `bs="gp"` defaults to a Matérn
        // covariance kernel with REML smoothing-parameter selection, matching
        // gamfit's `"matern"`.
        ("gp", "matern"),
        // Constant-curvature (M_κ) geodesic-kernel smooth (#944): all aliases
        // collapse to one type so `bs="curv"`/`bs="mkappa"` cannot diverge
        // from `curv(...)`.
        ("curv", "curvature"),
        ("constant_curvature", "curvature"),
        ("mkappa", "curvature"),
        // Measure-jet spline: no mgcv equivalent, so no mgcv alias is mapped.
        ("mjs", "measurejet"),
        ("measure_jet", "measurejet"),
        ("web", "measurejet"),
    ];

    for (alias, canonical) in ALIASES.iter() {
        if *alias == raw {
            return *canonical;
        }
    }
    raw
}
1592
1593/// Is `margin_bs` a per-margin basis name that the tensor builder realizes as a
1594/// penalized 1-D B-spline margin?
1595///
1596/// gam's tensor product is built from penalized B-spline marginals. mgcv's
1597/// thin-plate (`tp`/`tps`), P-spline (`ps`), B-spline (`bs`), cubic-regression
1598/// (`cr`/`cs`), and cyclic (`cc`/`cp`/`cyclic`) marginals are all penalized
1599/// splines spanning the same per-axis smoothing space, so a B-spline margin
1600/// reproduces the same tensor smoothing class. Margin kinds with fundamentally
1601/// different structure (adaptive, random-effect, sphere) are NOT accepted as
1602/// tensor margins.
1603pub(crate) fn tensor_margin_bs_is_supported(margin_bs: &str) -> bool {
1604 matches!(
1605 canonicalize_smooth_type(margin_bs),
1606 "tps" | "ps" | "bs" | "bspline" | "cr" | "cs" | "cc" | "cp" | "cyclic"
1607 )
1608}
1609
/// Does the smooth request a periodic/cyclic axis via its options?
///
/// Factored out so the type resolver and `build_smooth_basis` agree on a
/// single notion of "periodic requested". A request is recognised from either
/// of two sources:
///
/// - the `periodic=` flag (or, if absent, `cyclic=`) — present with any value
///   other than an explicit scalar negative (`false`/`no`/`n`,
///   case-insensitive). An explicit `periodic=false` is therefore NOT a
///   periodic request. List-valued flags still count as a request and are left
///   to the per-axis parsers to interpret.
/// - the `boundary=` option (or, if absent, `bc=`) — its value mentions
///   `periodic` or `cyclic`, or contains the standalone token `cc`, the cyclic
///   selector that the other boundary readers in this file accept.
pub(crate) fn smooth_options_declare_periodic(options: &BTreeMap<String, String>) -> bool {
    // The axis parsers treat a scalar `false`/`no`/`n` as "no axis is
    // periodic"; mirror that so an explicit negative does not reroute the
    // smooth onto the periodic dispatch path.
    let flag_requests_wrap = options
        .get("periodic")
        .or_else(|| options.get("cyclic"))
        .map(|raw| {
            !matches!(
                raw.trim().to_ascii_lowercase().as_str(),
                "false" | "no" | "n"
            )
        })
        .unwrap_or(false);
    if flag_requests_wrap {
        return true;
    }

    options
        .get("boundary")
        .or_else(|| options.get("bc"))
        .map(|boundary| {
            let lowered = boundary.to_ascii_lowercase();
            lowered.contains("periodic")
                || lowered.contains("cyclic")
                // `cc` must match as a whole token: a substring test would
                // also fire inside unrelated words.
                || lowered
                    .split(|c: char| !c.is_ascii_alphanumeric())
                    .any(|token| token == "cc")
        })
        .unwrap_or(false)
}
1627
1628/// Resolve the canonical engine-internal smooth-type name for a term.
1629///
1630/// Reads the user-facing `type=`/`bs=` selector and collapses mgcv-compatible
1631/// aliases (`tp`→`tps`, `gp`→`matern`) via [`canonicalize_smooth_type`], or
1632/// derives the default from the smooth kind/arity when no selector is given.
1633/// This is the single source of truth for the dispatch in
1634/// [`build_smooth_basis`]; other call sites (e.g. predictor-specific basis
1635/// policy) use it so the classification never drifts from the dispatch.
1636/// Is the raw `bs=`/`type=` selector a vector literal (`c('tp','tp')`,
1637/// `['tp','tp']`, `(tp, tp)`) rather than a scalar smooth-type name?
1638///
1639/// mgcv's tensor smooths take a *per-margin* basis vector
1640/// (`te(x1, x2, bs=c('tp','tp'))`). Such a value is not a scalar canonical
1641/// type and must not be fed through [`canonicalize_smooth_type`] — it has to be
1642/// recognized as a tensor request and split into per-margin types. A scalar
1643/// selector (`bs="tp"`) is left untouched.
1644pub(crate) fn bs_selector_is_vector(raw: &str) -> bool {
1645 let trimmed = raw.trim();
1646 let bracketed = (trimmed.starts_with('[') && trimmed.ends_with(']'))
1647 || (trimmed.starts_with("c(") || trimmed.starts_with("C(")) && trimmed.ends_with(')')
1648 || (trimmed.starts_with('(') && trimmed.ends_with(')'));
1649 bracketed && !parse_option_list(trimmed).is_empty()
1650}
1651
1652pub fn resolve_smooth_type_name(
1653 kind: SmoothKind,
1654 n_cols: usize,
1655 options: &BTreeMap<String, String>,
1656) -> String {
1657 let selector = options.get("type").or_else(|| options.get("bs"));
1658 // A per-margin basis vector is a tensor request, never a scalar type. Route
1659 // it to the tensor builder, which reads the per-margin types out of the
1660 // same `bs=` option. (A vector on a non-tensor smooth is ill-formed and
1661 // falls through to the scalar path below so the existing diagnostic fires.)
1662 if let Some(raw) = selector
1663 && bs_selector_is_vector(raw)
1664 && matches!(kind, SmoothKind::Te | SmoothKind::Ti | SmoothKind::T2)
1665 {
1666 return "tensor".to_string();
1667 }
1668 selector
1669 .map(|s| canonicalize_smooth_type(&s.to_ascii_lowercase()).to_string())
1670 .unwrap_or_else(|| match kind {
1671 SmoothKind::Te | SmoothKind::Ti | SmoothKind::T2 => "tensor".to_string(),
1672 SmoothKind::S if n_cols == 1 => "bspline".to_string(),
1673 // Mixed periodic Euclidean radial kernels are not separable on the
1674 // cylinder. Use a tensor product with a cyclic margin so s(theta,h)
1675 // honors seam continuity while preserving the formula-level s(...).
1676 SmoothKind::S if smooth_options_declare_periodic(options) => "tensor".to_string(),
1677 SmoothKind::S => "tps".to_string(),
1678 })
1679}
1680
/// Does this canonical smooth type size its basis through the generous spatial
/// center heuristic ([`crate::basis::default_num_centers`])?
///
/// Only the radial spatial bases (thin-plate `tps`, Matérn/GP `matern`, and
/// `duchon`) route their default basis dimension through
/// `plan_spatial_basis(.., Default, ..)`. The B-spline, cyclic, tensor, and
/// factor-smooth bases use their own modest knot-based defaults, so they are
/// unaffected by, and must not be perturbed by, secondary-predictor
/// basis-parsimony adjustments (#501).
///
/// `canonical_type` is compared exactly; pass the output of
/// [`resolve_smooth_type_name`], not a raw user alias.
pub fn smooth_type_uses_spatial_center_heuristic(canonical_type: &str) -> bool {
    const RADIAL_SPATIAL_TYPES: [&str; 3] = ["tps", "matern", "duchon"];
    RADIAL_SPATIAL_TYPES.contains(&canonical_type)
}
1692
1693pub fn build_smooth_basis(
1694 kind: SmoothKind,
1695 vars: &[String],
1696 cols: &[usize],
1697 options: &BTreeMap<String, String>,
1698 ds: &Dataset,
1699 inference_notes: &mut Vec<String>,
1700 policy: &ResourcePolicy,
1701 smooth_coordinate_count: usize,
1702) -> Result<SmoothBasisSpec, String> {
1703 // Fail fast on degenerate input: a smooth whose (non-categorical) coordinate
1704 // columns collapse to a SINGLE distinct point can only ever fit the response
1705 // mean — its design matrix is rank-1. For a UNIVARIATE smooth this is exactly
1706 // "the one column is constant": `smooth(x)`/`matern(x)` on constant `x` would
1707 // otherwise silently fit the mean of `y` with no visible cue (Duchon already
1708 // errors loudly via the basis layer; this makes the diagnosis explicit and
1709 // uniform). For a MULTIVARIATE smooth (tensor, sphere, tps, ...) a single
1710 // constant coordinate is NOT degenerate — the basis still varies along the
1711 // other coordinate(s) and the penalty absorbs the rank-deficient direction
1712 // (e.g. a constant-longitude meridian arc on the sphere is a well-posed 1-D
1713 // slice of S²). Such a term is degenerate only when EVERY coordinate is
1714 // constant at once, i.e. the joint input is a single point. Test the JOINT
1715 // cardinality, not each column independently, so the loud diagnosis still
1716 // fires for the genuinely rank-1 case without rejecting well-posed
1717 // lower-dimensional slices.
1718 let coord_cols: Vec<(&String, usize)> = vars
1719 .iter()
1720 .zip(cols.iter().copied())
1721 .filter(|(_, col)| !matches!(ds.column_kinds.get(*col), Some(ColumnKindTag::Categorical)))
1722 .collect();
1723 if !coord_cols.is_empty() {
1724 let views: Vec<ArrayView1<'_, f64>> = coord_cols
1725 .iter()
1726 .map(|(_, col)| ds.values.column(*col))
1727 .collect();
1728 let n_rows = views[0].len();
1729 let mut distinct_points = std::collections::HashSet::<Vec<u64>>::new();
1730 for r in 0..n_rows {
1731 let key: Vec<u64> = views
1732 .iter()
1733 .map(|v| {
1734 let x = v[r];
1735 let norm = if x == 0.0 { 0.0 } else { x };
1736 norm.to_bits()
1737 })
1738 .collect();
1739 distinct_points.insert(key);
1740 if distinct_points.len() > 1 {
1741 break;
1742 }
1743 }
1744 if distinct_points.len() <= 1 {
1745 return Err(TermBuilderError::degenerate_data(if coord_cols.len() == 1 {
1746 let var = coord_cols[0].0;
1747 format!(
1748 "smooth term over '{var}' has only one unique value in the training data \
1749 — a smooth on a constant column is degenerate and would only fit the response mean. \
1750 Remove `{var}` from the smooth, drop the term, or check the data."
1751 )
1752 } else {
1753 let names = coord_cols
1754 .iter()
1755 .map(|(v, _)| v.as_str())
1756 .collect::<Vec<_>>()
1757 .join(", ");
1758 format!(
1759 "smooth term over ({names}) has only one unique joint coordinate in the training \
1760 data — every coordinate is constant, so the smooth is degenerate and would only \
1761 fit the response mean. Drop the term or check the data."
1762 )
1763 })
1764 .to_string());
1765 }
1766 }
1767 if let Some(by_name) = options.get("by").cloned() {
1768 let by_col = options
1769 .get("__by_col")
1770 .and_then(|raw| raw.parse::<usize>().ok())
1771 .or_else(|| vars.iter().position(|v| v == &by_name).map(|idx| cols[idx]))
1772 .ok_or_else(|| format!("unknown by= column '{by_name}'"))?;
1773 let mut inner_options = options.clone();
1774 inner_options.remove("by");
1775 inner_options.remove("__by_col");
1776 inner_options.remove("id");
1777 let inner = build_smooth_basis(
1778 kind,
1779 vars,
1780 cols,
1781 &inner_options,
1782 ds,
1783 inference_notes,
1784 policy,
1785 smooth_coordinate_count,
1786 )?;
1787 let by_kind = match ds.column_kinds.get(by_col).copied() {
1788 Some(ColumnKindTag::Categorical) => ByVarKind::Factor {
1789 feature_col: by_col,
1790 ordered: option_bool(options, "ordered").unwrap_or(false),
1791 frozen_levels: None,
1792 },
1793 Some(ColumnKindTag::Continuous | ColumnKindTag::Binary) => ByVarKind::Numeric {
1794 feature_col: by_col,
1795 },
1796 None => {
1797 return Err(format!(
1798 "internal column-kind lookup failed for by='{by_name}'"
1799 ));
1800 }
1801 };
1802 return Ok(SmoothBasisSpec::BySmooth {
1803 smooth: Box::new(inner),
1804 by_kind,
1805 });
1806 }
1807
1808 let smooth_double_penalty = option_bool(options, "double_penalty").unwrap_or(true);
1809 let type_opt = resolve_smooth_type_name(kind, cols.len(), options);
1810
1811 if matches!(type_opt.as_str(), "fs" | "sz" | "re") {
1812 validate_known_options(
1813 type_opt.as_str(),
1814 options,
1815 &[
1816 "type",
1817 "bs",
1818 "k",
1819 "basis_dim",
1820 "basis-dim",
1821 "basisdim",
1822 "knots",
1823 "knot_placement",
1824 "knot-placement",
1825 "knotplacement",
1826 "degree",
1827 "penalty_order",
1828 "m",
1829 "double_penalty",
1830 "ordered",
1831 ],
1832 )?;
1833 if cols.len() != 2 {
1834 return Err(format!(
1835 "{} factor-smooth currently expects exactly two variables (one numeric, one categorical)",
1836 type_opt
1837 ));
1838 }
1839 let kinds = cols
1840 .iter()
1841 .map(|&c| ds.column_kinds.get(c).copied())
1842 .collect::<Vec<_>>();
1843 let (cont_idx, group_idx) = if type_opt == "re" {
1844 // mgcv random-slope examples are often s(g, x, bs="re").
1845 match (kinds[0], kinds[1]) {
1846 (Some(ColumnKindTag::Categorical), _) => (1usize, 0usize),
1847 (_, Some(ColumnKindTag::Categorical)) => (0usize, 1usize),
1848 _ => (1usize, 0usize),
1849 }
1850 } else {
1851 match (kinds[0], kinds[1]) {
1852 (_, Some(ColumnKindTag::Categorical)) => (0usize, 1usize),
1853 (Some(ColumnKindTag::Categorical), _) => (1usize, 0usize),
1854 _ => {
1855 return Err(format!(
1856 "{} factor-smooth requires one categorical factor variable",
1857 type_opt
1858 ));
1859 }
1860 }
1861 };
1862 let c = cols[cont_idx];
1863 let (minv, maxv) = col_minmax(ds.values.column(c))?;
1864 let degree = if type_opt == "re" {
1865 1
1866 } else {
1867 option_usize(options, "degree").unwrap_or(DEFAULT_BSPLINE_DEGREE)
1868 };
1869 // For a factor smooth every group's curve is fit from THAT group's rows
1870 // alone, so the marginal's flexibility must respect the least-resolved
1871 // group, not the pooled column. The pooled heuristic can hand the marginal
1872 // a basis that saturates (or exceeds) a small group's sample — e.g. the
1873 // sleepstudy panel has 8 training days per subject, and a default cubic
1874 // basis of 8 functions interpolates each subject's 8 points, leaving no
1875 // room for the wiggliness penalty to collapse the curve toward the
1876 // per-subject line. The factor smooth then fits within-group noise and
1877 // extrapolates badly (held-out forecast worse than the population mean).
1878 //
1879 // Cap the marginal basis below the minimum per-group covariate resolution
1880 // so the penalty always retains residual degrees of freedom to shrink each
1881 // group's curvature toward its linear null space (the random-slope
1882 // estimand). This small-group cap composes with a separate upper bound at
1883 // mgcv's factor-smooth default k=10 (FACTOR_SMOOTH_DEFAULT_BASIS_DIM,
1884 // applied below), so even ample-data groups get the modest SHARED marginal
1885 // a factor smooth wants rather than the full pooled basis. The explicit
1886 // `re` random-effect form takes neither cap: it is a raw linear `[1, x]`
1887 // random effect (0 internal knots), handled in the branch above.
1888 let pooled_internal = heuristic_knots_for_column(ds.values.column(c));
1889 let default_internal = if type_opt == "re" {
1890 // `bs="re"` is a PARAMETRIC random effect, not a smooth of the
1891 // covariate: `s(x, g, bs="re")` is the mgcv random intercept+slope
1892 // `(1 + x | g)`, i.e. a per-group line `[1, x]`, penalized by an iid
1893 // ridge. A degree-1 marginal with ZERO internal knots spans exactly
1894 // that linear space (2 coefficients per group). Using the pooled
1895 // knot heuristic here instead turned the marginal into a
1896 // piecewise-linear B-spline (e.g. 6 functions/group on sleepstudy),
1897 // i.e. a *smooth* with kinks rather than a random slope — many extra
1898 // collinear-across-levels coefficients that ill-condition the joint
1899 // Newton/REML solve (minutes-long fits, and a singular block when
1900 // combined with a separate random intercept `s(g, bs="re")`). The
1901 // raw linear basis is both the correct `re` semantics and fast.
1902 0
1903 } else {
1904 let min_group_resolution =
1905 min_per_group_unique_count(ds.values.column(c), ds.values.column(cols[group_idx]));
1906 // Per-group basis dim = degree + 1 + internal. Hold it well below the
1907 // smallest group's resolution (leave at least two residual points per
1908 // group) so the smooth cannot interpolate that group and the
1909 // wiggliness penalty retains the room to collapse each curve toward
1910 // its linear null space. Never drop below `degree + 2`, which keeps
1911 // exactly the linear span plus a single curvature direction — the
1912 // minimal smoother that can still bend if the data demand it.
1913 let basis_cap = min_group_resolution.saturating_sub(2).max(degree + 2);
1914 let internal_cap = basis_cap.saturating_sub(degree + 1);
1915 let capped = pooled_internal.min(internal_cap.max(1));
1916 // A factor smooth (`fs` AND `sz`) shares ONE marginal across ALL
1917 // levels, each level's curve fit from that group's rows alone. The
1918 // pooled knot heuristic (driven by the full column's sample) hands it
1919 // a much richer basis than the shared signal needs — ~24
1920 // functions/group on the gam#903 factor-smooth-recovery fixtures — so
1921 // REML has the capacity to fit within-group noise and over-fits the
1922 // shared shape (fs: edf 58 vs mgcv's k=10/edf 39; sz: gam 0.068 vs
1923 // mgcv 0.046 truth RMSE), losing the truth-recovery head-to-head with
1924 // the mature tool. mgcv's factor-smooth default `k=10` embodies the
1925 // right convention: a modest shared marginal. Cap the marginal there
1926 // (basis ≈ degree+1+internal ≈ 10) for both flavours when the
1927 // small-group cap above is not already tighter, so REML is not handed
1928 // noise-fitting capacity it does not need. An explicit `k`/`basis_dim`
1929 // overrides this (parse_ps_internal_knots); `re` is the raw linear
1930 // effect handled above.
1931 let fs_default_internal = FACTOR_SMOOTH_DEFAULT_BASIS_DIM
1932 .saturating_sub(degree + 1)
1933 .max(1);
1934 capped.min(fs_default_internal)
1935 };
1936 let (n_knots, _, effective_degree) =
1937 parse_ps_internal_knots(options, degree, default_internal)?;
1938 let penalty_order = option_usize(options, "penalty_order")
1939 .unwrap_or(if effective_degree > 1 { 2 } else { 1 })
1940 .min(effective_degree);
1941 // All factor-smooth flavours (`fs`, `sz`, `re`) place their per-level
1942 // marginal on the SAME penalized B-spline (P-spline) basis. The flavours
1943 // differ ONLY in their penalty/constraint structure (handled below) —
1944 // sz: zero-sum deviation blocks with the per-level null space left
1945 // unpenalized; fs: random-effect double penalty; re: identity ridge.
1946 //
1947 // `sz` USED to route its default-degree marginal to a NATURAL cubic
1948 // regression spline (`cr`), on the belief that mgcv's `bs="sz"` does the
1949 // same and that cr recovers smooth signals more efficiently than the
1950 // (then uncapped) B-spline margin (#1074). That introduced a consistency
1951 // failure (#1605): the `cr` basis enforces the natural boundary
1952 // conditions f''(x_1)=f''(x_k)=0 and extrapolates linearly past the end
1953 // knots, so it CANNOT represent a per-group deviation curve with non-zero
1954 // curvature at the data boundary. Phase-shifted deviation shapes
1955 // (f''(0) = -(2π)² sin(φ) ≠ 0) are then biased toward "free linear +
1956 // anchored wiggle", under-shooting the amplitude — a bias that does NOT
1957 // vanish as n→∞ (n-independent: a genuine consistency failure, not
1958 // finite-sample shrinkage). The earlier #700/#1074 sz fixtures used
1959 // d_g ∝ sin(2πx), whose f'' happens to vanish at x=0 and x=1, so they
1960 // accidentally satisfied the natural BC and never exposed the gap; the
1961 // `fs` sibling, on this very B-spline marginal, recovers the SAME
1962 // phase-shifted data to the noise floor.
1963 //
1964 // The penalized B-spline marginal makes no boundary assumption, so it
1965 // represents arbitrary deviation shapes, and — with the
1966 // FACTOR_SMOOTH_DEFAULT_BASIS_DIM cap above already removing the
1967 // noise-fitting capacity that originally motivated leaving B-splines —
1968 // it recovers the BC-satisfying #700/#1074 signals just as well. Sharing
1969 // one marginal basis across all flavours also lets the B-spline degree/
1970 // knot degradation handle low-cardinality covariates uniformly (what
1971 // `fs` already does), so the `sz`-only cr data-support cap (#1541/#1542)
1972 // — and the asymmetry where only the cr-marginal `sz` spelling hard-
1973 // failed a 3-level ordinal — is no longer needed.
1974 let marginal_knotspec = resolve_nonperiodic_bspline_knotspec(
1975 options,
1976 ds.values.column(c),
1977 (minv, maxv),
1978 effective_degree,
1979 n_knots,
1980 )?;
1981 let marginal = BSplineBasisSpec {
1982 degree: effective_degree,
1983 penalty_order,
1984 knotspec: marginal_knotspec,
1985 // mgcv's `bs="fs"` is a random-effect-style smooth: EVERY per-level
1986 // coefficient, including the marginal null space, is penalized so
1987 // unobserved groups can be predicted — so `fs` keeps the null-space
1988 // (double) penalty. mgcv's `bs="sz"` is a pure across-level
1989 // *deviation* smooth that, under the default `select=FALSE`, leaves
1990 // the per-level null space UNPENALIZED; carrying the double penalty
1991 // there shrinks the genuine deviation signal and over-smooths the
1992 // recovered curves relative to mgcv (gam#700). `re` carries its own
1993 // identity ridge below and ignores this flag. Honour an explicit
1994 // user `double_penalty=` either way.
1995 double_penalty: option_bool(options, "double_penalty")
1996 .unwrap_or(type_opt.as_str() != "sz"),
1997 identifiability: BSplineIdentifiability::None,
1998 boundary_conditions: Default::default(),
1999 boundary: OneDimensionalBoundary::Open,
2000 };
2001 let flavour = match type_opt.as_str() {
2002 "fs" => FactorSmoothFlavour::Fs {
2003 m_null_penalty_orders: vec![
2004 option_usize(options, "m").unwrap_or(DEFAULT_PENALTY_ORDER),
2005 ],
2006 },
2007 "sz" => FactorSmoothFlavour::Sz,
2008 "re" => FactorSmoothFlavour::Re,
2009 // Outer `matches!` already restricts to fs/sz/re.
2010 other => {
2011 return Err(format!(
2012 "internal: factor-smooth flavour dispatch reached unexpected type `{}`",
2013 other
2014 ));
2015 }
2016 };
2017 return Ok(SmoothBasisSpec::FactorSmooth {
2018 spec: FactorSmoothSpec {
2019 continuous_cols: vec![c],
2020 group_col: cols[group_idx],
2021 marginal,
2022 flavour,
2023 group_frozen_levels: None,
2024 frozen_global_orthogonality: None,
2025 },
2026 });
2027 }
2028
2029 match type_opt.as_str() {
2030 "cyclic" | "cc" | "cp" | "cyclic-ps" => {
2031 validate_known_options(
2032 "cyclic",
2033 options,
2034 &[
2035 "type",
2036 "bs",
2037 "by",
2038 "k",
2039 "basis_dim",
2040 "basis-dim",
2041 "basisdim",
2042 "degree",
2043 "penalty_order",
2044 "period",
2045 "periods",
2046 "period_start",
2047 "period_end",
2048 "start",
2049 "end",
2050 "origin",
2051 "origins",
2052 "period_origin",
2053 "period-origin",
2054 "domain_origin",
2055 "double_penalty",
2056 "id",
2057 "__by_col",
2058 "identifiability",
2059 ],
2060 )?;
2061 if cols.len() != 1 {
2062 return Err(format!(
2063 "periodic smooth expects one variable, got {}",
2064 cols.len()
2065 ));
2066 }
2067 let c = cols[0];
2068 let (minv, maxv) = col_minmax(ds.values.column(c))?;
2069 let degree = option_usize(options, "degree").unwrap_or(DEFAULT_BSPLINE_DEGREE);
2070 let mut default_internal = heuristic_knots_for_column(ds.values.column(c));
2071 if ds.values.nrows() <= 32 && smooth_coordinate_count >= 5 {
2072 default_internal = default_internal.min(1);
2073 }
2074 // A periodic cubic spline has no free endpoint behaviour to spend
2075 // degrees of freedom on: the wrap constraint removes the ordinary
2076 // boundary wiggle, and the cyclic second-difference penalty leaves
2077 // only the constant direction (handled by the smooth
2078 // identifiability constraint). An over-rich default would give
2079 // small binomial/continuation-ratio fits a large penalized nuisance
2080 // space whose REML/LAML optimum is driven by finite-sample Bernoulli
2081 // noise rather than the low-frequency periodic signal. Cap the
2082 // cyclic default in the mgcv `bs="cc"` spirit: a modest basis unless
2083 // the caller explicitly requests `k=...`; high-frequency periodic
2084 // structure remains available through that explicit contract. Since
2085 // gam#1680 lowered the open-spline univariate default to ≈12
2086 // functions this cap and the open-spline default coincide, so it now
2087 // acts as an explicit floor/guard that keeps the cyclic default lean
2088 // even if the open-spline heuristic is later widened.
2089 let cyclic_default_basis_cap = CYCLIC_DEFAULT_BASIS_DIM.max(degree + 1);
2090 let default_basis = (default_internal + degree + 1).min(cyclic_default_basis_cap);
2091 let num_basis = option_usize_any(options, &["k", "basis_dim", "basis-dim", "basisdim"])
2092 .unwrap_or(default_basis);
2093 if num_basis < degree + 1 {
2094 return Err(format!(
2095 "periodic smooth: k={} too small for degree {}; expected k >= {}",
2096 num_basis,
2097 degree,
2098 degree + 1
2099 ));
2100 }
2101 // The cyclic arm is periodic on its single axis by construction, so
2102 // resolve the period exactly the way the `s()`/`ps` arm does: honour
2103 // `period=`/`periods=` first (with `origin=` setting the domain
2104 // start), and fall back to the `period_start`/`period_end` endpoint
2105 // form only when `period=` is absent. Previously this arm jumped
2106 // straight to `parse_periodic_domain_1d`, so a `period=<v>`
2107 // declaration was silently dropped and the smooth wrapped at the
2108 // data range (#816). All three helpers route through
2109 // `parse_numeric_expr`, so `period=2*pi` and `period_end=2*pi` parse
2110 // identically (#815).
2111 let periodic_axes = [true];
2112 let periods = parse_periods(options, &periodic_axes)?;
2113 let origins = parse_period_origins(options, &periodic_axes)?;
2114 // Distinguish a *cyclic basis selector* (`bs='cc'`/`cp'`/`cyclic`,
2115 // this whole arm) from a generic B-spline forced periodic by a
2116 // `periodic=`/`boundary=` flag (the `ps`/`bspline` arm). Only the
2117 // latter carries the sample-dependent off-by-ε seam that #1771's
2118 // guard in `parse_periodic_domain_1d` requires an explicit period
2119 // to avoid. A bare `s(x, bs='cc')` opts INTO mgcv's `bs="cc"`
2120 // semantics — the wrap IS the observed data range — exactly like
2121 // the tensor cc-margin fallback (`te(x, z, bs=c('cc','cc'))`). The
2122 // cyclic arm was left routing through the now-strict helper when
2123 // #1771 tightened it, so a bare cyclic smooth hard-errored with
2124 // "periodic B-spline smooth requires an explicit period" even
2125 // though its period is well-defined. Honor `period=`/`periods=`
2126 // first, then the half-open `period_start`/`period_end` endpoint
2127 // form, and only otherwise wrap at the observed `[min, max]` span.
2128 let has_endpoint_decl = ["period_start", "start", "period_end", "end"]
2129 .iter()
2130 .any(|key| options.contains_key(*key));
2131 let (domain_start, period) = if let Some(p) = periods[0] {
2132 (origins[0].unwrap_or(minv), p)
2133 } else if has_endpoint_decl {
2134 parse_periodic_domain_1d(options, minv, maxv)?
2135 } else {
2136 let span = maxv - minv;
2137 if !(span.is_finite() && span > 0.0) {
2138 return Err(format!(
2139 "cyclic smooth requires a positive observed data range to derive \
2140 its period, got [{minv}, {maxv}]"
2141 ));
2142 }
2143 (origins[0].unwrap_or(minv), span)
2144 };
2145 Ok(SmoothBasisSpec::BSpline1D {
2146 feature_col: c,
2147 spec: BSplineBasisSpec {
2148 degree,
2149 penalty_order: option_usize(options, "penalty_order")
2150 .unwrap_or(DEFAULT_PENALTY_ORDER),
2151 knotspec: BSplineKnotSpec::PeriodicUniform {
2152 data_range: (domain_start, domain_start + period),
2153 num_basis,
2154 },
2155 double_penalty: smooth_double_penalty,
2156 identifiability: BSplineIdentifiability::default(),
2157 boundary_conditions: Default::default(),
2158 boundary: OneDimensionalBoundary::Cyclic {
2159 start: domain_start,
2160 end: domain_start + period,
2161 },
2162 },
2163 })
2164 }
2165 "bspline" | "ps" | "p-spline" | "cr" | "cs" => {
2166 // mgcv's `bs="cr"` (cubic regression spline) and `bs="cs"` (its
2167 // shrinkage twin) are penalized cubic-regression smooths that span
2168 // the same per-axis function space as gamfit's `bspline` (cubic
2169 // B-spline, second-derivative penalty). Route both through the
2170 // 1-D B-spline arm; the only semantic difference is whether the
2171 // null space is shrunk: `cr` is the no-shrinkage form (mgcv's
2172 // default) and `cs` is the shrinkage form (mgcv's `cs`/gamfit's
2173 // double_penalty). Without this route, a stand-alone
2174 // `s(x, bs='cr')` (which is otherwise a routine 1-D smooth in
2175 // mgcv-compatible formulae) reached the dispatch's default arm
2176 // and aborted the whole fit with `unsupported smooth type 'cr'`,
2177 // even though the same name was already recognized as a tensor
2178 // margin (`tensor_margin_bs_is_supported`).
2179 let validation_name = match type_opt.as_str() {
2180 "cr" => "cr",
2181 "cs" => "cs",
2182 _ => "bspline",
2183 };
2184 validate_known_options(
2185 validation_name,
2186 options,
2187 &[
2188 "type",
2189 "bs",
2190 "by",
2191 "k",
2192 "basis_dim",
2193 "basis-dim",
2194 "basisdim",
2195 "knots",
2196 "knot_placement",
2197 "knot-placement",
2198 "knotplacement",
2199 "degree",
2200 "penalty_order",
2201 "boundary",
2202 "bc",
2203 "boundary_conditions",
2204 "bc_left",
2205 "bc_right",
2206 "left_bc",
2207 "right_bc",
2208 "start_bc",
2209 "end_bc",
2210 "side",
2211 "anchor",
2212 "anchor_value",
2213 "value",
2214 "anchor_left",
2215 "left_anchor",
2216 "anchor_right",
2217 "right_anchor",
2218 "periodic",
2219 "period",
2220 "periods",
2221 "period_start",
2222 "period_end",
2223 "origin",
2224 "double_penalty",
2225 "by",
2226 "id",
2227 "__by_col",
2228 "identifiability",
2229 "by",
2230 ],
2231 )?;
2232 if cols.len() != 1 {
2233 return Err(TermBuilderError::incompatible_config(format!(
2234 "bspline smooth expects one variable, got {}",
2235 cols.len()
2236 ))
2237 .to_string());
2238 }
2239 let c = cols[0];
2240 let (minv, maxv) = col_minmax(ds.values.column(c))?;
2241 let degree = option_usize(options, "degree").unwrap_or(DEFAULT_BSPLINE_DEGREE);
2242 let default_internal = heuristic_knots_for_column(ds.values.column(c));
2243 let (mut n_knots, inferred, effective_degree) =
2244 parse_ps_internal_knots(options, degree, default_internal)?;
2245 let periodic_axes = parse_periodic_axes(options, 1).map_err(|e| e.to_string())?;
2246 // Periodic margins still need enough basis functions to wrap, so
2247 // surface the per-axis degree reduction as a config error when the
2248 // user explicitly asked for a periodic-but-too-small basis. The
2249 // non-periodic path silently degrades degree to match mgcv.
2250 if periodic_axes[0] && effective_degree != degree {
2251 return Err(TermBuilderError::invalid_option(format!(
2252 "periodic smooth: k={} too small for degree {}; expected k >= {}",
2253 effective_degree + 1,
2254 degree,
2255 degree + 1
2256 ))
2257 .to_string());
2258 }
2259 if inferred && ds.values.nrows() <= 32 && smooth_coordinate_count >= 5 {
2260 n_knots = n_knots.min(1);
2261 }
2262 if inferred {
2263 let unique = unique_count_column(ds.values.column(c));
2264 let ceiling = ((unique as f64).cbrt() as usize).max(20);
2265 inference_notes.push(format!(
2266 "Automatically set {} internal knots for smooth '{}' from {} unique values (rule: clamp(unique/4, 4..max(20, cbrt(unique))) = clamp(unique/4, 4..{})). Override with knots=... or k=....",
2267 n_knots,
2268 vars.join(","),
2269 unique,
2270 ceiling,
2271 ));
2272 }
2273 let boundary_conditions =
2274 if periodic_axes[0] && bspline_boundary_declares_periodic_axis(options) {
2275 BSplineBoundaryConditions::default()
2276 } else {
2277 parse_bspline_boundary_conditions(options).map_err(|e| e.to_string())?
2278 };
2279 let periods = parse_periods(options, &periodic_axes).map_err(|e| e.to_string())?;
2280 let origins =
2281 parse_period_origins(options, &periodic_axes).map_err(|e| e.to_string())?;
2282 let (knotspec, boundary) = if periodic_axes[0] {
2283 if !boundary_conditions.is_free() {
2284 return Err(TermBuilderError::incompatible_config(
2285 "periodic B-splines cannot also declare endpoint boundary conditions",
2286 )
2287 .to_string());
2288 }
2289 {
2290 let (domain_start, p_value) = if periods[0].is_some() {
2291 (origins[0].unwrap_or(minv), periods[0].unwrap())
2292 } else {
2293 parse_periodic_domain_1d(options, minv, maxv).map_err(|e| e.to_string())?
2294 };
2295 let domain_end = domain_start + p_value;
2296 (
2297 BSplineKnotSpec::PeriodicUniform {
2298 data_range: (domain_start, domain_end),
2299 num_basis: n_knots + effective_degree + 1,
2300 },
2301 OneDimensionalBoundary::Cyclic {
2302 start: domain_start,
2303 end: domain_end,
2304 },
2305 )
2306 }
2307 } else if type_opt == "cr" || type_opt == "cs" {
2308 // mgcv `bs="cr"`/`"cs"`: a natural cubic regression spline whose
2309 // basis is indexed by `k` values at quantile-placed knots (#1074),
2310 // NOT a B-spline knot vector. Match gam's `k=` convention by
2311 // requesting the same total basis size the B-spline arm would
2312 // produce (`n_knots` internal + degree + 1), floored at the cr
2313 // minimum of 3 knots. `cr` vs `cs` (shrinkage) is carried by the
2314 // `double_penalty` flag resolved below, which the cr builder reads.
2315 //
2316 // Cap that request to the covariate's data support (#1541): a cr
2317 // basis cannot place more value-knots than there are distinct
2318 // covariate values, so an unclamped `k` on a low-cardinality
2319 // predictor (binary indicator, 3-level ordinal, small count) used
2320 // to hard-fail in `select_cr_knots` instead of reducing like mgcv
2321 // and gam's tensor path. Below the cr minimum (a binary covariate)
2322 // degrade to the B-spline marginal the default `s(x, k=..)` basis
2323 // already fits on the same data — never a hard error.
2324 let k_cr = (n_knots + effective_degree + 1).max(CR_MIN_KNOTS);
2325 let knotspec = match capped_cr_marginal_knotspec(
2326 ds.values.column(c),
2327 k_cr,
2328 &vars.join(","),
2329 inference_notes,
2330 )? {
2331 Some(cr_knotspec) => cr_knotspec,
2332 None => resolve_nonperiodic_bspline_knotspec(
2333 options,
2334 ds.values.column(c),
2335 (minv, maxv),
2336 effective_degree,
2337 n_knots,
2338 )?,
2339 };
2340 (knotspec, parse_cyclic_boundary(options, minv, maxv)?)
2341 } else {
2342 (
2343 resolve_nonperiodic_bspline_knotspec(
2344 options,
2345 ds.values.column(c),
2346 (minv, maxv),
2347 effective_degree,
2348 n_knots,
2349 )?,
2350 parse_cyclic_boundary(options, minv, maxv)?,
2351 )
2352 };
2353 // mgcv `bs="cr"` does not shrink the linear null space; only `cs`
2354 // (and the gamfit-flavoured `bspline`/`ps`) do. Honour an explicit
2355 // `double_penalty=` either way.
2356 let double_penalty = if type_opt == "cr" {
2357 option_bool(options, "double_penalty").unwrap_or(false)
2358 } else {
2359 smooth_double_penalty
2360 };
2361 // Clamp the marginal difference penalty to `<= effective_degree`
2362 // so it stays well-defined when the per-axis degree was reduced
2363 // (mirrors the tensor margin path: `create_difference_penalty_matrix`
2364 // requires order < num_basis_functions).
2365 let penalty_order = option_usize(options, "penalty_order")
2366 .unwrap_or(DEFAULT_PENALTY_ORDER)
2367 .min(effective_degree);
2368 Ok(SmoothBasisSpec::BSpline1D {
2369 feature_col: c,
2370 spec: BSplineBasisSpec {
2371 degree: effective_degree,
2372 penalty_order,
2373 knotspec,
2374 double_penalty,
2375 identifiability: BSplineIdentifiability::default(),
2376 boundary,
2377 boundary_conditions,
2378 },
2379 })
2380 }
2381 "tps" | "thinplate" | "thin-plate" => {
2382 validate_known_options(
2383 "thinplate",
2384 options,
2385 &[
2386 SECONDARY_CENTER_CAP_OPTION,
2387 "type",
2388 "bs",
2389 "by",
2390 "length_scale",
2391 "centers",
2392 "k",
2393 "basis_dim",
2394 "basis-dim",
2395 "basisdim",
2396 "knots",
2397 "include_intercept",
2398 "double_penalty",
2399 "by",
2400 "id",
2401 "__by_col",
2402 "identifiability",
2403 "by",
2404 "periodic",
2405 "cyclic",
2406 "period",
2407 "period_start",
2408 "period_end",
2409 "scale_dims",
2410 ],
2411 )?;
2412 let plan = plan_spatial_basis(
2413 ds.values.nrows(),
2414 cols.len(),
2415 CenterCountRequest::Default,
2416 DuchonNullspaceOrder::Linear,
2417 option_bool(options, "scale_dims").unwrap_or(false),
2418 policy,
2419 )
2420 .map_err(|e| e.to_string())?;
2421 // #1074: the mgcv-sized basis cap (`k = 10·3^(d-1)`) that used to live
2422 // here was DELETED. It masked the real defect — the n-scaling default
2423 // over-sizes a thin-plate field, producing a weakly-identified
2424 // two-penalty ρ-surface the outer optimizer stalls on (row-order
2425 // dependent, #1378), and surplus columns REML can't penalize away on
2426 // weak-signal fits. Capping the basis hid that stall instead of fixing
2427 // it. The default now uses the generic spatial center heuristic; the
2428 // root fix (a well-identified ρ-surface / optimizer that doesn't stall)
2429 // is tracked separately. Explicit `k`/`centers` still take full effect.
2430 let default_centers = plan.centers;
2431 let centers = parse_countwith_basis_alias(
2432 options,
2433 "centers",
2434 cap_default_spatial_centers(options, default_centers),
2435 )?;
2436 let center_strategy = if has_explicit_countwith_basis_alias(options, "centers") {
2437 spatial_center_strategy_for_dimension(centers, cols.len())
2438 } else {
2439 auto_spatial_center_strategy(centers, cols.len())
2440 };
2441 Ok(SmoothBasisSpec::ThinPlate {
2442 feature_cols: cols.to_vec(),
2443 spec: ThinPlateBasisSpec {
2444 center_strategy,
2445 periodic: parse_periodic_axes_option(options, cols.len())?,
2446 // Sentinel: leave at 0.0 when the user didn't pass an
2447 // explicit length_scale so `auto_init_length_scale_in_place`
2448 // can replace it with a data-derived initialization. The
2449 // old hard-coded 1.0 was the documented basin (see
2450 // smooth.rs `auto_init_length_scale_in_place`) that the
2451 // spatial optimizer could not escape, leaving TPS terms
2452 // initialized off the data scale.
2453 length_scale: option_f64(options, "length_scale").unwrap_or(0.0),
2454 double_penalty: smooth_double_penalty,
2455 identifiability: parse_spatial_identifiability(options)
2456 .map_err(|e| e.to_string())?,
2457 radial_reparam: None,
2458 },
2459 input_scales: None,
2460 })
2461 }
2462 "sphere" | "s2" | "sos" => {
2463 validate_known_options(
2464 "sphere",
2465 options,
2466 &[
2467 "type",
2468 "bs",
2469 "by",
2470 "centers",
2471 "k",
2472 "basis_dim",
2473 "basis-dim",
2474 "basisdim",
2475 "knots",
2476 "penalty_order",
2477 "m",
2478 "double_penalty",
2479 "id",
2480 "__by_col",
2481 "kernel",
2482 "method",
2483 "radians",
2484 "units",
2485 "degree",
2486 "l",
2487 "max_degree",
2488 "max-degree",
2489 ],
2490 )?;
2491 if cols.len() != 2 {
2492 return Err(format!(
2493 "sphere smooth expects exactly two variables (lat, lon), got {}",
2494 cols.len()
2495 ));
2496 }
2497 let radians = option_bool(options, "radians").unwrap_or_else(|| {
2498 options
2499 .get("units")
2500 .map(|u| u.eq_ignore_ascii_case("radian") || u.eq_ignore_ascii_case("radians"))
2501 .unwrap_or(false)
2502 });
2503 // An explicit `degree`/`l`/`max_degree` names a spherical-harmonic
2504 // truncation, so with no explicit kernel/method it selects the
2505 // Harmonic construction (the Wahba kernel ignores `degree` and would
2506 // silently emit a 1-column kernel design). An explicit kernel/method
2507 // still wins.
2508 let degree_requested = options.contains_key("degree")
2509 || options.contains_key("l")
2510 || options.contains_key("max_degree")
2511 || options.contains_key("max-degree");
2512 let kernel = options
2513 .get("kernel")
2514 .or_else(|| options.get("method"))
2515 .map(|raw| strip_quotes(raw).trim().to_ascii_lowercase())
2516 .unwrap_or_else(|| {
2517 if degree_requested {
2518 "harmonic".to_string()
2519 } else {
2520 "sobolev".to_string()
2521 }
2522 });
2523 let (method, wahba_kernel) = match kernel.as_str() {
2524 "sobolev" | "wahba" | "wahba_sobolev" | "wahba-sobolev" => {
2525 (SphereMethod::Wahba, SphereWahbaKernel::Sobolev)
2526 }
2527 "pseudo" | "mgcv" | "sos" | "wahba_pseudo" | "wahba-pseudo" => {
2528 (SphereMethod::Wahba, SphereWahbaKernel::Pseudo)
2529 }
2530 "harmonic" | "spherical_harmonic" | "spherical-harmonic" => {
2531 (SphereMethod::Harmonic, SphereWahbaKernel::Sobolev)
2532 }
2533 other => {
2534 return Err(format!(
2535 "unsupported sphere kernel '{other}'; expected sobolev, pseudo, or harmonic"
2536 ));
2537 }
2538 };
2539 let max_degree = if matches!(method, SphereMethod::Harmonic) {
2540 let degree =
2541 option_usize_any(options, &["degree", "l", "max_degree", "max-degree"])
2542 .or_else(|| option_usize(options, "centers"))
2543 .or_else(|| {
2544 option_usize_any(options, &["k", "basis_dim", "basis-dim", "basisdim"])
2545 .and_then(|k| (1..=128).find(|&l| l * (l + 2) >= k))
2546 })
2547 .unwrap_or_else(|| default_spherical_harmonic_degree(ds.values.nrows()));
2548 if degree == 0 {
2549 return Err("sphere smooth requires degree/max_degree >= 1".to_string());
2550 }
2551 if degree > 32 {
2552 return Err(format!(
2553 "sphere smooth max_degree={} is too large for the dense harmonic engine (limit 32)",
2554 degree
2555 ));
2556 }
2557 Some(degree)
2558 } else {
2559 None
2560 };
2561 let penalty_order = option_usize(options, "penalty_order")
2562 .or_else(|| option_usize(options, "m"))
2563 .unwrap_or(DEFAULT_PENALTY_ORDER);
2564 let center_strategy = if matches!(method, SphereMethod::Wahba) {
2565 let mut centers = parse_countwith_basis_alias(
2566 options,
2567 "centers",
2568 default_num_centers(ds.values.nrows(), cols.len()),
2569 )?;
2570 if penalty_order >= 4 {
2571 centers = centers.max(30);
2572 }
2573 CenterStrategy::FarthestPoint {
2574 num_centers: centers,
2575 }
2576 } else {
2577 CenterStrategy::FarthestPoint { num_centers: 0 }
2578 };
2579 Ok(SmoothBasisSpec::Sphere {
2580 feature_cols: cols.to_vec(),
2581 spec: SphericalSplineBasisSpec {
2582 center_strategy,
2583 penalty_order,
2584 double_penalty: smooth_double_penalty,
2585 radians,
2586 method,
2587 max_degree,
2588 wahba_kernel,
2589 identifiability: SphericalSplineIdentifiability::CenterSumToZero,
2590 },
2591 })
2592 }
2593 "curvature" => {
2594 // Constant-curvature (M_κ) geodesic-kernel smooth (#944): the
2595 // κ-generic sibling of the intrinsic S² smooth above. The feature
2596 // columns are κ-stereographic chart coordinates; `kappa=` is the
2597 // fixed sectional curvature (default 0 = flat), and the geometry
2598 // comes from `geometry::constant_curvature::ConstantCurvature`.
2599 validate_known_options(
2600 "curvature",
2601 options,
2602 &[
2603 "type",
2604 "bs",
2605 "by",
2606 "centers",
2607 "k",
2608 "basis_dim",
2609 "basis-dim",
2610 "basisdim",
2611 "knots",
2612 "kappa",
2613 "length_scale",
2614 "double_penalty",
2615 "id",
2616 "__by_col",
2617 ],
2618 )?;
2619 let kappa = option_f64(options, "kappa").unwrap_or(0.0);
2620 if !kappa.is_finite() {
2621 return Err("curvature smooth requires a finite kappa".to_string());
2622 }
2623 let length_scale = option_f64(options, "length_scale").unwrap_or(0.0);
2624 if !length_scale.is_finite() || length_scale < 0.0 {
2625 return Err(format!(
2626 "curvature smooth length_scale must be positive (or omitted for auto); got {length_scale}"
2627 ));
2628 }
2629 let centers = parse_countwith_basis_alias(
2630 options,
2631 "centers",
2632 default_num_centers(ds.values.nrows(), cols.len()),
2633 )?;
2634 if centers < 2 {
2635 return Err("curvature smooth requires at least 2 centers".to_string());
2636 }
2637 Ok(SmoothBasisSpec::ConstantCurvature {
2638 feature_cols: cols.to_vec(),
2639 spec: ConstantCurvatureBasisSpec {
2640 center_strategy: CenterStrategy::FarthestPoint {
2641 num_centers: centers,
2642 },
2643 kappa,
2644 // 0.0 sentinel = κ-independent auto initialization in the
2645 // basis builder (median chart center spacing, doubled).
2646 length_scale,
2647 // Curvature smooth defaults to NO double-penalty ridge
2648 // (#1464): the curvature-blind ridge `I` absorbs the data fit
2649 // independently of κ and rails the fitted curvature to the
2650 // +chart bound (hyperbolic truth recovered as spherical). The
2651 // RKHS Gram penalty is already full-rank PD, so the ridge adds
2652 // no stability. Honour an EXPLICIT `double_penalty=` only.
2653 double_penalty: option_bool(options, "double_penalty").unwrap_or(false),
2654 identifiability: ConstantCurvatureIdentifiability::CenterSumToZero,
2655 },
2656 })
2657 }
2658 "measurejet" => {
2659 // Measure-jet spline: multiscale local-jet-residual energy of the
2660 // empirical measure. The feature columns are ambient coordinates
2661 // of data concentrated near an unknown low-dimensional set; the
2662 // geometry (centers, masses, scale band) is read off the measure
2663 // at build time — magic by default, every option optional.
2664 validate_known_options(
2665 "measurejet",
2666 options,
2667 &[
2668 "type",
2669 "bs",
2670 "by",
2671 "centers",
2672 "k",
2673 "basis_dim",
2674 "basis-dim",
2675 "basisdim",
2676 "knots",
2677 "s",
2678 "alpha",
2679 "tau",
2680 "scales",
2681 "length_scale",
2682 "double_penalty",
2683 "multiscale",
2684 "learn_length_scale",
2685 "id",
2686 "__by_col",
2687 ],
2688 )?;
2689 let order_s = option_f64(options, "s").unwrap_or(0.0);
2690 // 0.0 = auto sentinel; explicit values must sit inside the
2691 // admissible order interval of the affine-jet (r = 2) energy.
2692 if !(order_s.is_finite() && (order_s == 0.0 || (order_s > 0.0 && order_s < 2.0))) {
2693 return Err(format!(
2694 "measurejet smooth s must lie in (0, 2) (or be omitted for auto); got {order_s}"
2695 ));
2696 }
2697 // Default to the spec Default (α = 1, density-WEIGHTED Hessian
2698 // energy — the module-header default). The density-free α = 3/2
2699 // (q^{−2}) over-smooths low-intrinsic-dimension manifolds where the
2700 // local mass q is tiny and varies along the stratum (#1116:
2701 // 13×-worse-than-matérn on a 1-D curve in 3-D); α = 1's q^{−1} is
2702 // gentler and robust across intrinsic dimensions. An explicit
2703 // `alpha=` still overrides for full-dimensional density-free use.
2704 let alpha =
2705 option_f64(options, "alpha").unwrap_or(MeasureJetBasisSpec::default().alpha);
2706 if !alpha.is_finite() {
2707 return Err("measurejet smooth requires a finite alpha".to_string());
2708 }
2709 let tau0 = option_f64(options, "tau").unwrap_or(1e-3);
2710 if !(tau0.is_finite() && tau0 >= 0.0) {
2711 return Err(format!(
2712 "measurejet smooth tau must be finite and nonnegative; got {tau0}"
2713 ));
2714 }
2715 let num_scales = option_usize(options, "scales").unwrap_or(0);
2716 let length_scale = option_f64(options, "length_scale").unwrap_or(0.0);
2717 if !length_scale.is_finite() || length_scale < 0.0 {
2718 return Err(format!(
2719 "measurejet smooth length_scale must be positive (or omitted for auto); got {length_scale}"
2720 ));
2721 }
2722 let centers = parse_countwith_basis_alias(
2723 options,
2724 "centers",
2725 default_num_centers(ds.values.nrows(), cols.len()),
2726 )?;
2727 if centers < 3 {
2728 return Err("measurejet smooth requires at least 3 centers".to_string());
2729 }
2730 // Multiscale (per-scale spectral split + (α, lnτ) ψ dials + the
2731 // affine-preserving ridge) is an explicit opt-in (#1116): default
2732 // single-scale at any center count, the Duchon/Matérn footprint.
2733 let multiscale = option_bool(options, "multiscale").unwrap_or(false);
2734 // REML-learning the representer range ℓ is an explicit opt-in.
2735 // The stable default freezes ℓ at the auto/user value; the
2736 // design-moving coordinate is expensive and can overfit low-signal
2737 // surfaces when enabled implicitly.
2738 let learn_length_scale = option_bool(options, "learn_length_scale").unwrap_or(false);
2739 Ok(SmoothBasisSpec::MeasureJet {
2740 feature_cols: cols.to_vec(),
2741 spec: MeasureJetBasisSpec {
2742 center_strategy: CenterStrategy::FarthestPoint {
2743 num_centers: centers,
2744 },
2745 order_s,
2746 alpha,
2747 tau0,
2748 num_scales,
2749 // 0.0 sentinel = auto initialization in the basis builder
2750 // (median nearest-center spacing).
2751 length_scale,
2752 double_penalty: smooth_double_penalty,
2753 learn_length_scale,
2754 multiscale,
2755 identifiability: MeasureJetIdentifiability::CenterSumToZero,
2756 frozen_quadrature: None,
2757 },
2758 input_scales: None,
2759 })
2760 }
2761 "matern" => {
2762 // Catch typos like `lengt_scale=` / `nyu=` / `centerz=` before
2763 // they get silently ignored and the user wonders why their
2764 // option had no effect. The matern() term accepts exactly
2765 // these options.
2766 validate_known_options(
2767 "matern",
2768 options,
2769 &[
2770 SECONDARY_CENTER_CAP_OPTION,
2771 "type",
2772 "bs",
2773 "by",
2774 "nu",
2775 "length_scale",
2776 "centers",
2777 "k",
2778 "basis_dim",
2779 "basis-dim",
2780 "basisdim",
2781 "knots",
2782 "include_intercept",
2783 "double_penalty",
2784 "by",
2785 "id",
2786 "__by_col",
2787 "identifiability",
2788 "by",
2789 "periodic",
2790 "cyclic",
2791 "period",
2792 "period_start",
2793 "period_end",
2794 "scale_dims",
2795 ],
2796 )?;
2797 let plan = plan_spatial_basis(
2798 ds.values.nrows(),
2799 cols.len(),
2800 CenterCountRequest::Default,
2801 DuchonNullspaceOrder::Zero,
2802 option_bool(options, "scale_dims").unwrap_or(false),
2803 policy,
2804 )
2805 .map_err(|e| e.to_string())?;
2806 let centers = parse_countwith_basis_alias(
2807 options,
2808 "centers",
2809 cap_default_spatial_centers(
2810 options,
2811 default_matern_center_count(ds.values.nrows(), cols.len(), plan.centers),
2812 ),
2813 )?;
2814 let center_strategy = if has_explicit_countwith_basis_alias(options, "centers") {
2815 spatial_center_strategy_for_dimension(centers, cols.len())
2816 } else {
2817 auto_spatial_center_strategy(centers, cols.len())
2818 };
2819 let nu = parse_matern_nu(options.get("nu").map(String::as_str).unwrap_or("5/2"))?;
2820 // The exponential (ν = 1/2) Matérn kernel has a singular Laplacian
2821 // at zero in d ≥ 2, so the operator-collocation penalty machinery
2822 // hits a non-invertible matrix during fit. Surface the cause
2823 // up-front instead of letting the user see the generic
2824 // "Matrix conditioning issue detected" wrapper from PIRLS.
2825 if matches!(nu, MaternNu::Half) && cols.len() >= 2 {
2826 return Err(TermBuilderError::unsupported_feature(format!(
2827 "matern() with nu=1/2 is not supported for d>=2 (got {} covariates): \
2828 the exponential kernel's Laplacian is singular at center collisions, \
2829 which makes the operator-collocation penalty non-invertible. \
2830 Choose nu>=3/2 (e.g. nu=3/2 or the default nu=5/2) for multi-dimensional smooths.",
2831 cols.len()
2832 ))
2833 .to_string());
2834 }
2835 let aniso_log_scales = if option_bool(options, "scale_dims").unwrap_or(false) {
2836 Some(vec![0.0; cols.len()])
2837 } else {
2838 None
2839 };
2840 Ok(SmoothBasisSpec::Matern {
2841 feature_cols: cols.to_vec(),
2842 spec: MaternBasisSpec {
2843 center_strategy,
2844 periodic: parse_periodic_axes_option(options, cols.len())?,
2845 // Sentinel: leave at 0.0 when the user didn't pass an
2846 // explicit length_scale so the planner's
2847 // `auto_init_length_scale_in_place` can replace it with the
2848 // SAME data-derived wiggly-side initialization the thin-plate
2849 // path uses (`max_range / sqrt(n)`), then let the κ-optimizer
2850 // refine from there.
2851 //
2852 // gam#1629: the previous `default_matern_length_scale` seeded
2853 // the FULL data diameter — the maximally over-smoothed corner.
2854 // Because that value is non-zero, the `0.0`-gated auto-init was
2855 // a no-op for Matérn, so the κ-optimizer started in the flat
2856 // over-smoothed basin and parked there, leaving high-frequency
2857 // 2-D surfaces unresolved (truth-RMSE ~6× worse than
2858 // thin-plate/tensor on identical data, and insensitive to `k`).
2859 // Routing Matérn through the same `0.0` sentinel as thin-plate
2860 // (see the ThinPlate branch above) starts REML in the resolving
2861 // regime it can actually escape from.
2862 length_scale: option_f64(options, "length_scale").unwrap_or(0.0),
2863 nu,
2864 include_intercept: option_bool(options, "include_intercept").unwrap_or(false),
2865 double_penalty: smooth_double_penalty,
2866 identifiability: parse_matern_identifiability(options)
2867 .map_err(|e| e.to_string())?,
2868 aniso_log_scales,
2869 // Cold build: let the bootstrap-κ spectral test decide whether
2870 // the double-penalty nullspace shrinkage survives; the freeze
2871 // step then pins that decision into the FrozenTransform so the
2872 // κ-optimizer's rebuilds keep the count invariant (gam#787/#860).
2873 nullspace_shrinkage_survived: None,
2874 },
2875 input_scales: None,
2876 })
2877 }
2878 "duchon" => {
2879 validate_known_options(
2880 "duchon",
2881 options,
2882 &[
2883 SECONDARY_CENTER_CAP_OPTION,
2884 "type",
2885 "bs",
2886 "by",
2887 "length_scale",
2888 "centers",
2889 "k",
2890 "basis_dim",
2891 "basis-dim",
2892 "basisdim",
2893 "knots",
2894 "power",
2895 "p",
2896 "nullspace_order",
2897 "order",
2898 "identifiability",
2899 "by",
2900 "periodic",
2901 "cyclic",
2902 "period",
2903 "period_start",
2904 "period_end",
2905 "scale_dims",
2906 "double_penalty",
2907 "by",
2908 "id",
2909 "__by_col",
2910 ],
2911 )?;
2912 if options.contains_key("double_penalty") {
2913 return Err(TermBuilderError::incompatible_config(format!(
2914 "Duchon smooth '{}' does not support double_penalty; the Duchon smoother already ships its native reproducing-norm penalty plus a null-space shrinkage ridge.",
2915 vars.join(", ")
2916 ))
2917 .to_string());
2918 }
2919 let requested_nullspace_order = parse_duchon_order(options)?;
2920 let length_scale = option_f64_strict(options, "length_scale")?;
2921 // Resolve `(nullspace_order, power)`. The default (magic) path is a
2922 // structural amplitude/slope/curvature smoother: an affine (`Linear`)
2923 // polynomial nullspace and spectral power `s = (d - 1)/2`, giving the
2924 // cubic kernel `r^3` in 1D. There is no nullspace-order escalation —
2925 // the structural cubic smoother is well-defined for every dimension.
2926 //
2927 // Explicit `power=...` honors the user's value verbatim against their
2928 // requested nullspace order; the kernel validator emits a precise
2929 // diagnostic for any inadmissible combination. In the scale-free
2930 // (non-hybrid) regime fractional powers are admitted and threaded as
2931 // `f64`. The hybrid Duchon-Matérn kernel (`length_scale=Some`) is
2932 // restricted to integer powers.
2933 let (nullspace_order, power) = match parse_duchon_power_policy(options)? {
2934 DuchonPowerPolicy::Explicit(req_power) => {
2935 if length_scale.is_some() && req_power.fract() != 0.0 {
2936 return Err(TermBuilderError::incompatible_config(format!(
2937 "hybrid Duchon-Matern smooth '{}' (length_scale=...) requires an integer power, got power={}; \
2938 drop length_scale to use the scale-free structural kernel with a fractional power.",
2939 vars.join(", "),
2940 req_power,
2941 ))
2942 .to_string());
2943 }
2944 (requested_nullspace_order, req_power)
2945 }
2946 DuchonPowerPolicy::CubicStructuralDefault => {
2947 // Magic cubic rule (REQUEST-LAYER default): no explicit power ⇒
2948 // affine null space + fractional spectral power s = (d-1)/2, i.e.
2949 // the Duchon kernel φ(r)=r³ in every dimension. An EXPLICIT
2950 // `power=0` is handled above and is honored as the s=0 Duchon
2951 // kernel (r²·log r ≡ the thin-plate kernel in even d) — the magic
2952 // default lives here, not in the basis builder.
2953 match length_scale {
2954 None => crate::basis::duchon_cubic_default(cols.len()),
2955 Some(_) => {
2956 // The hybrid Matérn-blended kernel (`length_scale=Some`)
2957 // requires an INTEGER spectral power `s` (the partial-
2958 // fraction split `1/(ρ^{2p}(κ²+ρ²)^s)` is only defined for
2959 // integer `s`). The fractional cubic default `s=(d-1)/2` is
2960 // a half-integer for even `d`, and the basis builder's
2961 // `power_as_usize` maps a NON-integer to `0` (not its
2962 // floor) — so for even `d ≥ 4` the realized kernel has
2963 // `2(p+s) = 2p = 4 ≤ d`, which is non-finite at the origin
2964 // and crashes the fit (historically a non-finite
2965 // eigendecomposition; now a fit-time validation error).
2966 //
2967 // Rather than emit the fractional cubic and let it truncate
2968 // into an inadmissible kernel, resolve the SMALLEST
2969 // admissible integer `(nullspace, s)` at the requested
2970 // nullspace order, honoring the collocation order of the
2971 // default operator penalties (mass + tension ⇒ D1). This
2972 // recovers the canonical thin-plate smoothness order
2973 // `m = p + s = ⌊d/2⌋ + 1` for the hybrid kernel and agrees
2974 // with the fractional cubic default for odd `d` (where the
2975 // collocation floor already forces `s = (d-1)/2`).
2976 let max_op = crate::basis::duchon_max_active_operator_derivative_order(
2977 &DuchonOperatorPenaltySpec::default(),
2978 );
2979 let (ns, s) = crate::basis::resolve_duchon_orders(
2980 cols.len(),
2981 requested_nullspace_order,
2982 max_op,
2983 length_scale,
2984 );
2985 (ns, s as f64)
2986 }
2987 }
2988 }
2989 };
2990 let plan = plan_spatial_basis(
2991 ds.values.nrows(),
2992 cols.len(),
2993 CenterCountRequest::Default,
2994 nullspace_order,
2995 option_bool(options, "scale_dims").unwrap_or(false),
2996 policy,
2997 )
2998 .map_err(|e| e.to_string())?;
2999 let centers_explicit = has_explicit_countwith_basis_alias(options, "centers");
3000 let requested_centers = parse_countwith_basis_alias(
3001 options,
3002 "centers",
3003 cap_default_spatial_centers(options, plan.centers),
3004 )?;
3005 let polynomial_cols = match nullspace_order {
3006 DuchonNullspaceOrder::Zero => 1,
3007 DuchonNullspaceOrder::Linear => cols.len() + 1,
3008 DuchonNullspaceOrder::Degree(degree) => {
3009 crate::basis::duchon_nullspace_dimension(cols.len(), degree)
3010 }
3011 };
3012 if requested_centers <= polynomial_cols {
3013 return Err(TermBuilderError::incompatible_config(format!(
3014 "Duchon smooth '{}' requested basis dimension {} but order={:?} in {}D needs {} polynomial null-space columns; choose centers/k > {}",
3015 vars.join(", "),
3016 requested_centers,
3017 nullspace_order,
3018 cols.len(),
3019 polynomial_cols,
3020 polynomial_cols,
3021 ))
3022 .to_string());
3023 }
3024 let mut centers = requested_centers;
3025 if !centers_explicit && ds.values.nrows() <= 32 && smooth_coordinate_count >= 5 {
3026 centers = centers.max(polynomial_cols + 4);
3027 }
3028 let center_strategy = if centers_explicit {
3029 spatial_center_strategy_for_dimension(centers, cols.len())
3030 } else {
3031 auto_spatial_center_strategy(centers, cols.len())
3032 };
3033 let aniso_log_scales = if option_bool(options, "scale_dims").unwrap_or(false) {
3034 Some(vec![0.0; cols.len()])
3035 } else {
3036 None
3037 };
3038 // The default is the full Hilbert scale (curvature `Primary` + trend
3039 // ridge + mass + tension); REML deselects what the data don't support.
3040 let operator_penalties = DuchonOperatorPenaltySpec::default();
3041 // For a 1-D periodic Duchon with no EXPLICIT period, anchor the wrap
3042 // to the covariate DATA range rather than letting the basis builder
3043 // derive it from the (k-subsampled) center span. The center span is a
3044 // strict subset of the data and undershoots the true period, seaming
3045 // the curve (f(0) ≠ f(2π)); the data range is the caller's actual
3046 // domain. Honors any explicit `period=` (parse_periodic_axes_option
3047 // already threaded it) and leaves multi-D / non-periodic untouched.
3048 let mut periodic = parse_periodic_axes_option(options, cols.len())?;
3049 if cols.len() == 1
3050 && let Some(axes) = periodic.as_mut()
3051 && axes.len() == 1
3052 && axes[0].is_none()
3053 {
3054 let (minv, maxv) = col_minmax(ds.values.column(cols[0]))?;
3055 if maxv > minv {
3056 axes[0] = Some(maxv - minv);
3057 }
3058 }
3059 Ok(SmoothBasisSpec::Duchon {
3060 feature_cols: cols.to_vec(),
3061 spec: DuchonBasisSpec {
3062 center_strategy,
3063 periodic,
3064 length_scale,
3065 power,
3066 nullspace_order,
3067 identifiability: parse_spatial_identifiability(options)
3068 .map_err(|e| e.to_string())?,
3069 aniso_log_scales,
3070 operator_penalties,
3071 boundary: if cols.len() == 1 {
3072 let c = cols[0];
3073 let (minv, maxv) = col_minmax(ds.values.column(c))?;
3074 parse_cyclic_boundary(options, minv, maxv)?
3075 } else {
3076 OneDimensionalBoundary::Open
3077 },
3078 radial_reparam: None,
3079 },
3080 input_scales: None,
3081 })
3082 }
3083 "tensor" | "te" | "ti" | "t2" => {
3084 validate_known_options(
3085 "tensor",
3086 options,
3087 &[
3088 "type",
3089 "bs",
3090 "by",
3091 "k",
3092 "basis_dim",
3093 "basis-dim",
3094 "basisdim",
3095 "knot_placement",
3096 "knot-placement",
3097 "knotplacement",
3098 "degree",
3099 "penalty_order",
3100 "double_penalty",
3101 "periodic",
3102 "cyclic",
3103 "period",
3104 "periods",
3105 "period_start",
3106 "period_end",
3107 "origin",
3108 "origins",
3109 "period_origin",
3110 "period-origin",
3111 "domain_origin",
3112 "boundary",
3113 "bc",
3114 "identifiability",
3115 "id",
3116 "__by_col",
3117 ],
3118 )?;
3119 if cols.len() < 2 {
3120 return Err(TermBuilderError::incompatible_config(format!(
3121 "tensor smooth expects at least 2 variables, got {}",
3122 cols.len()
3123 ))
3124 .to_string());
3125 }
3126 let dim = cols.len();
3127
3128 // Tensor-product contract (#1082). `te(x1, x2, ...)` ALWAYS builds a
3129 // genuine anisotropic tensor product of per-margin bases (the arm
3130 // below), exactly as mgcv's `te()` does — one smoothing parameter per
3131 // margin, a marginal-Kronecker-sum penalty, and the bilinear null
3132 // space left unpenalized under the default `select = FALSE`. A margin
3133 // vector `bs=c('tp','tp')` requests a thin-plate FUNCTION SPACE per
3134 // axis; the tensor realizes each axis as a 1-D penalized B-spline
3135 // margin spanning that same per-axis space (tp/ps/cr/bs/cc all share
3136 // it). We deliberately do NOT silently swap the requested tensor for a
3137 // single multi-D ISOTROPIC thin-plate radial smooth (`s(x,y,bs='tp')`):
3138 // that is a different model — one isotropic smoothing parameter, no
3139 // per-margin anisotropy — and substituting it while the user wrote a
3140 // tensor formula is dishonest. A user who genuinely wants the isotropic
3141 // radial smooth asks for it directly with `s(x1, x2, bs='tp')`.
3142 // Per-margin basis vector (`bs=c('tp','tp')` / `bs=['ps','cr']`):
3143 // validate each requested margin is a penalized-spline basis that
3144 // the tensor product realizes as a 1-D B-spline margin. mgcv's
3145 // `tp`/`ps`/`cr`/`bs`/`cc` margins are all penalized splines over
3146 // the same per-axis function space, so a B-spline margin recovers
3147 // the same tensor smoothing space; genuinely different margin kinds
3148 // (e.g. adaptive `ad`, random `re`) are rejected loudly rather than
3149 // silently substituted.
3150 if let Some(raw) = options.get("bs").or_else(|| options.get("type"))
3151 && bs_selector_is_vector(raw)
3152 {
3153 let per_margin = parse_option_list(raw);
3154 if per_margin.len() != dim {
3155 return Err(TermBuilderError::invalid_option(format!(
3156 "tensor smooth per-margin bs vector has {} entries but the smooth has {} margins",
3157 per_margin.len(),
3158 dim
3159 ))
3160 .to_string());
3161 }
3162 for (axis, margin_bs) in per_margin.iter().enumerate() {
3163 if !tensor_margin_bs_is_supported(margin_bs) {
3164 return Err(TermBuilderError::unsupported_feature(format!(
3165 "tensor smooth margin {axis} basis '{margin_bs}' is not a supported penalized-spline margin; \
3166 tensor margins accept tp/tps/ps/bs/cr/cc"
3167 ))
3168 .to_string());
3169 }
3170 }
3171 }
3172 let periodic_axes = parse_tensor_periodic_axes(options, dim)?;
3173 validate_tensor_boundary_tokens(options, dim)?;
3174 let periods_opt = parse_periods(options, &periodic_axes)?;
3175 let origins_opt = parse_period_origins(options, &periodic_axes)?;
3176 let degree = option_usize(options, "degree").unwrap_or(DEFAULT_BSPLINE_DEGREE);
3177 let penalty_order =
3178 option_usize(options, "penalty_order").unwrap_or(if degree > 1 { 2 } else { 1 });
3179 let (mut k_list, k_inferred) = parse_tensor_k_list(options, cols, ds)?;
3180 if ds.values.nrows() <= 32 && smooth_coordinate_count >= 5 {
3181 for k in &mut k_list {
3182 *k = (*k).min(degree + 2);
3183 }
3184 }
3185 if k_inferred {
3186 inference_notes.push(format!(
3187 "Automatically set per-margin basis sizes {:?} for tensor smooth '{}' \
3188 (dimension-aware tensor budget: total ∏k kept near the mgcv-te default \
3189 and within the data support, distributed geometrically across margins and \
3190 capped per margin by each column's resolution). \
3191 Override with k=<int> or k=[k0,k1,...].",
3192 k_list,
3193 vars.join(",")
3194 ));
3195 }
3196 // Per-axis requested marginal basis family. mgcv's `te()`/`ti()`
3197 // default marginal basis is the cubic regression spline (`cr`), and
3198 // the te_3d quality gap (#1074) is precisely the marginal-basis
3199 // resolution at small `k`: a `cr` margin places k value-knots at
3200 // data quantiles (finer interior resolution under natural boundary
3201 // constraints) where the cubic B-spline margin has only
3202 // `k-degree-1` interior knots. Resolve each axis to either an
3203 // explicit per-margin `bs` (vector `bs=c('cr','ps')`), a single
3204 // scalar `bs`, or the unset default — and route
3205 // `cr`/`cs`/unset/`tp`/`tps` margins through the natural cubic
3206 // regression builder (`NaturalCubicRegression` knotspec), keeping
3207 // explicit `ps`/`bs`/`bspline` on the B-spline margin.
3208 let per_axis_bs: Vec<Option<String>> =
3209 match options.get("bs").or_else(|| options.get("type")) {
3210 Some(raw) if bs_selector_is_vector(raw) => {
3211 let list = parse_option_list(raw);
3212 (0..dim).map(|a| list.get(a).cloned()).collect()
3213 }
3214 Some(raw) => {
3215 let scalar = raw
3216 .trim()
3217 .trim_matches('"')
3218 .trim_matches('\'')
3219 .to_ascii_lowercase();
3220 vec![Some(scalar); dim]
3221 }
3222 None => vec![None; dim],
3223 };
3224 // A margin is realized as a natural cubic regression spline when it
3225 // is the (unset) mgcv default, an explicit `cr`/`cs`, or a
3226 // `tp`/`tps` (same per-axis penalized-spline space). Explicit
3227 // B-spline-family margins (`ps`/`bs`/`bspline`/`p-spline`) keep the
3228 // open B-spline margin.
3229 let margin_wants_cr = |bs: &Option<String>| -> bool {
3230 matches!(
3231 bs.as_deref(),
3232 None | Some("cr") | Some("cs") | Some("tp") | Some("tps")
3233 )
3234 };
3235 let mut margins: Vec<BSplineBasisSpec> = Vec::with_capacity(dim);
3236 let mut emitted_periods: Vec<Option<f64>> = Vec::with_capacity(dim);
3237 for axis in 0..dim {
3238 let c = cols[axis];
3239 let (data_min, data_max) = col_minmax(ds.values.column(c))?;
3240 // mgcv reduces a tensor margin's basis dimension to what its data
3241 // can support: a cr or B-spline margin cannot place more value
3242 // knots / basis functions than there are DISTINCT covariate
3243 // values on that axis. Without this cap an explicit `k` on a
3244 // low-cardinality margin — e.g. the binary `badh ∈ {0,1}` in
3245 // `te(age, badh, k=5)` — hard-failed in `select_cr_knots` ("cubic
3246 // regression spline with k=5 requires at least 5 distinct values,
3247 // got 2") instead of degrading to the 2-function (linear) margin
3248 // mgcv builds there. The auto-`k` path already caps per margin via
3249 // `heuristic_tensor_margin_knots`; mirror that for explicit `k`.
3250 // The cap propagates correctly: every per-axis quantity below
3251 // (effective degree, knot set, penalty order) is derived from
3252 // `k_axis`, and the marginal basis size is read from the resulting
3253 // knot spec — never from `k_list`. Floor at 2 so a margin still
3254 // carries at least a linear basis (tensor margins require k >= 2).
3255 let k_requested = k_list[axis];
3256 let n_distinct_axis = unique_count_column(ds.values.column(c));
3257 let k_axis = k_requested.min(n_distinct_axis).max(2);
3258 if k_axis < k_requested {
3259 log::info!(
3260 "tensor smooth: margin axis {axis} requested k={k_requested}, but the \
3261 covariate has only {n_distinct_axis} distinct value(s); reducing this \
3262 margin to k={k_axis} (mgcv-style data-support cap on the per-axis basis)."
3263 );
3264 }
3265 // Per-axis effective spline degree. The B-spline basis with `k`
3266 // functions is well-defined for any `degree <= k - 1`; mgcv's
3267 // `te(...)` exploits this so a binary tensor margin
3268 // (`k=2` → linear basis) or a ternary margin (`k=3` → quadratic)
3269 // can coexist with a smoother continuous margin under one
3270 // shared `degree=` request. We mirror that: if the caller
3271 // explicitly asks for `k < degree + 1`, drop the degree on
3272 // THAT axis only to the largest feasible spline, and track the
3273 // penalty order so the marginal difference penalty stays
3274 // well-defined (`order < num_basis_functions` is required by
3275 // `create_difference_penalty_matrix`). Periodic axes still
3276 // need enough basis functions to wrap; reject k there.
3277 if k_axis < 2 {
3278 return Err(TermBuilderError::invalid_option(format!(
3279 "tensor smooth: k[{axis}]={k_axis} too small; tensor margins require k >= 2"
3280 ))
3281 .to_string());
3282 }
3283 if periodic_axes[axis] && k_axis < degree + 1 {
3284 return Err(TermBuilderError::invalid_option(format!(
3285 "tensor smooth: periodic axis {axis} requires k >= {} for degree {degree}, got k={k_axis}",
3286 degree + 1
3287 ))
3288 .to_string());
3289 }
3290 let effective_degree = degree.min(k_axis - 1).max(1);
3291 let effective_penalty_order = penalty_order.min(effective_degree);
3292 // A `cc`/`cp`/`cyclic` per-margin basis declares periodicity
3293 // without necessarily supplying a `period=`: mgcv's `bs="cc"`
3294 // wraps at the covariate's observed data range. Mirror the 1-D
3295 // cyclic fallback (`parse_periodic_domain_1d`) here so a bare
3296 // `te(x, z, bs=c('cc','cc'))` wraps each margin on its own
3297 // [min, max] span instead of hard-erroring (#1752).
3298 let margin_is_cc = matches!(
3299 canonicalize_smooth_type(per_axis_bs[axis].as_deref().unwrap_or("")),
3300 "cc" | "cp" | "cyclic"
3301 );
3302 let (knotspec, boundary, axis_period) = if periodic_axes[axis] {
3303 // A `cc`/`cp`/`cyclic` per-margin basis declares periodicity
3304 // without necessarily supplying a `period=`; in that case wrap
3305 // at the covariate's observed [min, max] span, mirroring the
3306 // 1-D cyclic fallback (`parse_periodic_domain_1d`) so a bare
3307 // `te(x, z, bs=c('cc','cc'))` wraps each margin on its own
3308 // range instead of hard-erroring (#1752). An axis made
3309 // periodic by an explicit `periodic=`/`boundary=` selector
3310 // (not a cyclic margin basis) still requires an explicit
3311 // `period=`: a data-derived period there is a sample-dependent
3312 // off-by-ε seam and is not inferred.
3313 let (domain_start, period_value) = match periods_opt[axis] {
3314 Some(period_value) => {
3315 if !period_value.is_finite() || period_value <= 0.0 {
3316 return Err(format!(
3317 "tensor smooth axis {axis}: period must be a positive finite value, got {period_value}"
3318 ));
3319 }
3320 (origins_opt[axis].unwrap_or(data_min), period_value)
3321 }
3322 None if margin_is_cc => {
3323 let span = data_max - data_min;
3324 if !span.is_finite() || span <= 0.0 {
3325 return Err(format!(
3326 "tensor smooth axis {axis}: cyclic margin requires a positive \
3327 observed data range to derive its period, got [{data_min}, {data_max}]"
3328 ));
3329 }
3330 (origins_opt[axis].unwrap_or(data_min), span)
3331 }
3332 None => {
3333 return Err(format!(
3334 "tensor smooth axis {axis} is periodic but requires an explicit \
3335 period: pass period=<value> (scalar) or period=[..., <value>, ...]. \
3336 Deriving the period from the observed data range is sample-dependent \
3337 (off-by-ε seam), so it is not inferred."
3338 ));
3339 }
3340 };
3341 let domain_end = domain_start + period_value;
3342 (
3343 BSplineKnotSpec::PeriodicUniform {
3344 data_range: (domain_start, domain_end),
3345 num_basis: k_axis,
3346 },
3347 OneDimensionalBoundary::Cyclic {
3348 start: domain_start,
3349 end: domain_end,
3350 },
3351 Some(period_value),
3352 )
3353 } else if margin_wants_cr(&per_axis_bs[axis]) && k_axis >= 3 {
3354 // mgcv `te()`/`ti()` default cr margin: place exactly
3355 // `k_axis` Lancaster–Salkauskas value-knots at data
3356 // quantiles. The cr basis dimension equals the knot count,
3357 // so this reproduces the requested per-margin `k` directly.
3358 // A natural cubic regression spline needs at least 3 knots
3359 // (one interior); a `k_axis < 3` margin (e.g. a binary
3360 // tensor axis requesting a linear margin) falls through to
3361 // the B-spline branch below, exactly as before #1074 — mgcv
3362 // likewise does not build a `cr` margin below k=3.
3363 let cr_knots =
3364 crate::basis::select_cr_knots(ds.values.column(c), k_axis)
3365 .map_err(|e| e.to_string())?;
3366 (
3367 BSplineKnotSpec::NaturalCubicRegression { knots: cr_knots },
3368 OneDimensionalBoundary::Open,
3369 None,
3370 )
3371 } else {
3372 // `num_internal_knots = k - degree - 1` reproduces the
3373 // requested basis size exactly when degree was reduced for
3374 // a low-cardinality margin; keep the legacy `.max(1)`
3375 // floor on the un-reduced path so the existing knot
3376 // geometry is unchanged whenever the user already passed
3377 // k >= degree + 1.
3378 let num_internal_knots = if effective_degree < degree {
3379 k_axis.saturating_sub(effective_degree + 1)
3380 } else {
3381 k_axis.saturating_sub(degree + 1).max(1)
3382 };
3383 let knotspec = match parse_knot_placement(options)? {
3384 crate::basis::BSplineKnotPlacement::Uniform => BSplineKnotSpec::Generate {
3385 data_range: (data_min, data_max),
3386 num_internal_knots,
3387 },
3388 crate::basis::BSplineKnotPlacement::Quantile => {
3389 crate::basis::auto_knot_vector_1d_quantile(
3390 ds.values.column(c),
3391 num_internal_knots,
3392 effective_degree,
3393 )
3394 .map_err(|e| e.to_string())?;
3395 BSplineKnotSpec::Automatic {
3396 num_internal_knots: Some(num_internal_knots),
3397 placement: crate::basis::BSplineKnotPlacement::Quantile,
3398 }
3399 }
3400 };
3401 (knotspec, OneDimensionalBoundary::Open, None)
3402 };
3403 // A `cr` margin fixes cubic regression geometry; the cr builder
3404 // reads only the knot set + `double_penalty`. Enable null-space
3405 // shrinkage for an explicit `cs` margin. B-spline margins keep
3406 // the resolved effective degree / penalty order with no extra
3407 // null-space penalty (mgcv `select = FALSE` tensor default).
3408 let is_cr_margin =
3409 matches!(knotspec, BSplineKnotSpec::NaturalCubicRegression { .. });
3410 let margin_double_penalty =
3411 is_cr_margin && matches!(per_axis_bs[axis].as_deref(), Some("cs"));
3412 margins.push(BSplineBasisSpec {
3413 degree: effective_degree,
3414 penalty_order: effective_penalty_order,
3415 knotspec,
3416 double_penalty: margin_double_penalty,
3417 identifiability: BSplineIdentifiability::None,
3418 boundary,
3419 boundary_conditions: BSplineBoundaryConditions::default(),
3420 });
3421 emitted_periods.push(axis_period);
3422 }
3423 // #1593: canonicalize the margin order so a tensor smooth is invariant
3424 // to the typed order of its covariates. `te(x, z)` and `te(z, x)` span
3425 // the IDENTICAL tensor-product space under the identical per-margin
3426 // penalty family, but the design is the Khatri–Rao product
3427 // `B_first ⊙ B_second`, so the typed order permutes the design columns
3428 // (and the per-margin penalty blocks `S_first⊗I`, `I⊗S_second`). That
3429 // permutation is a pure relabelling in exact arithmetic — REML is
3430 // invariant to it — yet it reorders the penalized normal-equation / REML
3431 // eigen/Cholesky linear algebra, and the resulting sub-ULP differences
3432 // route the outer λ optimizer to a different terminal point in te's flat
3433 // REML valley (the over-smoothed margin rails to the ρ bound while the
3434 // other lands on a materially different λ̂). So the shipped surface
3435 // drifted ~2–6 % of range with a cosmetic swap of the covariate order
3436 // (the #1378 row-permutation / #1456 rotation flat-valley gauge family).
3437 // Sorting the margins by their source feature-column index makes the same
3438 // physical model build the identical problem regardless of typed order,
3439 // so the fit — and every prediction rebuilt from the resolved spec — is
3440 // genuinely order-invariant. `ti`/`t2` share this arm and become exactly
3441 // invariant too (they were already ~1e-5 by centring each margin
3442 // separately; canonicalization makes the swap bit-identical).
3443 let canon_cols: Vec<usize> = {
3444 let mut perm: Vec<usize> = (0..dim).collect();
3445 perm.sort_by_key(|&a| cols[a]);
3446 if perm.iter().enumerate().any(|(i, &a)| i != a) {
3447 margins = perm.iter().map(|&a| margins[a].clone()).collect();
3448 emitted_periods = perm.iter().map(|&a| emitted_periods[a]).collect();
3449 }
3450 perm.iter().map(|&a| cols[a]).collect()
3451 };
3452 let any_periodic = emitted_periods.iter().any(|p| p.is_some());
3453 let periods_vec = if any_periodic {
3454 emitted_periods
3455 } else {
3456 Vec::new()
3457 };
3458 // Tensor smooths (`te`/`ti`/`t2`) must match mgcv's DEFAULT
3459 // `select = FALSE`: the joint null space of the per-margin
3460 // penalties — the bilinear, low-order interaction directions that
3461 // no marginal roughness operator can see — is left UNPENALIZED.
3462 // mgcv only adds a null-space shrinkage penalty there under the
3463 // opt-in `select = TRUE` (which gam exposes as `double_penalty`).
3464 //
3465 // The general smooth default (`smooth_double_penalty`, true) is
3466 // calibrated for 1-D `s()` terms; carrying it into tensors silently
3467 // shrinks the genuinely-present bilinear interaction signal, so
3468 // REML places positive weight on the extra ridge and systematically
3469 // OVER-SMOOTHS the recovered surface relative to mgcv's plain
3470 // `te`/`ti` (gam#700/#701/#702/#703). Default tensors to no extra
3471 // null-space penalty; an explicit user `double_penalty=`/`select=`
3472 // still wins.
3473 let tensor_double_penalty = option_bool(options, "double_penalty").unwrap_or(false);
3474 Ok(SmoothBasisSpec::TensorBSpline {
3475 feature_cols: canon_cols,
3476 spec: TensorBSplineSpec {
3477 marginalspecs: margins,
3478 periods: periods_vec,
3479 double_penalty: tensor_double_penalty,
3480 identifiability: parse_tensor_identifiability(options, kind)?,
3481 // `t2` selects mgcv's separable (Wood, Scheipl & Faraway
3482 // 2013) decomposition. It can arrive either as the `t2(...)`
3483 // function form (`SmoothKind::T2`) or as a `type="t2"` /
3484 // `bs="t2"` option on an `s(...)`/`te(...)` term, in which
3485 // case `kind` is *not* `T2` but the resolved type string is
3486 // "t2". Keying only off `kind` silently aliased the option
3487 // form to `te`'s Kronecker-sum penalty (gam#1185); key off
3488 // the resolved type string as well so both routes build the
3489 // separable penalty.
3490 penalty_decomposition: if matches!(kind, SmoothKind::T2)
3491 || type_opt.as_str() == "t2"
3492 {
3493 TensorBSplinePenaltyDecomposition::Separable
3494 } else {
3495 TensorBSplinePenaltyDecomposition::MarginalKroneckerSum
3496 },
3497 },
3498 })
3499 }
3500 "pca" => {
3501 validate_known_options(
3502 "pca",
3503 options,
3504 &[
3505 "type",
3506 "bs",
3507 "by",
3508 "k",
3509 "basis_dim",
3510 "basis-dim",
3511 "basisdim",
3512 "lazy_path",
3513 "path",
3514 "pca_basis_path",
3515 "chunk_size",
3516 "smooth_penalty",
3517 "centered",
3518 "double_penalty",
3519 "id",
3520 "__by_col",
3521 ],
3522 )?;
3523 let path = options
3524 .get("lazy_path")
3525 .or_else(|| options.get("pca_basis_path"))
3526 .or_else(|| options.get("path"))
3527 .map(|raw| PathBuf::from(strip_quotes(raw)));
3528 let Some(path) = path else {
3529 return Err(TermBuilderError::incompatible_config(
3530 "pca smooth requires lazy_path=... on the formula path",
3531 )
3532 .to_string());
3533 };
3534 let k = option_usize_any(options, &["k", "basis_dim", "basis-dim", "basisdim"])
3535 .unwrap_or(0);
3536 let chunk_size = option_usize(options, "chunk_size").unwrap_or(DEFAULT_PCA_CHUNK_SIZE);
3537 Ok(SmoothBasisSpec::Pca {
3538 feature_cols: cols.to_vec(),
3539 basis_matrix: Array2::<f64>::zeros((cols.len(), k)),
3540 centered: option_bool(options, "centered").unwrap_or(true),
3541 smooth_penalty: option_f64(options, "smooth_penalty").unwrap_or(1.0),
3542 center_mean: None,
3543 pca_basis_path: Some(path),
3544 chunk_size,
3545 })
3546 }
3547 other => Err(TermBuilderError::unsupported_feature(format!(
3548 "unsupported smooth type '{other}'"
3549 ))
3550 .to_string()),
3551 }
3552}
3553
3554/// Initialise per-axis anisotropic log-scales on eligible spatial smooth specs.
3555pub fn enable_scale_dimensions(spec: &mut TermCollectionSpec) {
3556 for smooth in spec.smooth_terms.iter_mut() {
3557 // A multi-axis thin-plate term cannot carry per-axis anisotropy on its
3558 // single curvature penalty, so `scale_dimensions` was historically a
3559 // silent no-op for `bs="tp"` (gam#1676). Rewrite it to the
3560 // mathematically-equivalent anisotropic s=0 Duchon spline first; the
3561 // Duchon arm below then sees an already-seeded `aniso_log_scales` and
3562 // leaves it untouched.
3563 promote_thin_plate_for_scale_dimensions(&mut smooth.basis);
3564 match &mut smooth.basis {
3565 SmoothBasisSpec::Matern {
3566 feature_cols,
3567 spec: matern,
3568 ..
3569 } => {
3570 if matern.aniso_log_scales.is_none() {
3571 let d = feature_cols.len();
3572 matern.aniso_log_scales = Some(vec![0.0; d]);
3573 }
3574 }
3575 SmoothBasisSpec::Duchon {
3576 feature_cols,
3577 spec: duchon,
3578 ..
3579 } => {
3580 if duchon.aniso_log_scales.is_none() {
3581 let d = feature_cols.len();
3582 duchon.aniso_log_scales = Some(vec![0.0; d]);
3583 }
3584 }
3585 _ => {}
3586 }
3587 }
3588}
3589
3590/// Rewrite a multi-axis thin-plate term into the mathematically-equivalent
3591/// anisotropic s=0 Duchon spline so that `scale_dimensions` genuinely engages
3592/// (gam#1676).
3593///
3594/// ## Why a rewrite rather than a new field on the TPS builder
3595///
3596/// A canonical thin-plate regression spline carries a *single* curvature
3597/// penalty — the exact `∫|Dᵐ f|²` reproducing-kernel Gram. That penalty has no
3598/// per-axis structure to make one direction more or less relevant than another,
3599/// so per-axis anisotropy (`scale_dimensions`) cannot be expressed on it. The
3600/// flag was therefore a silent no-op for `bs="tp"` while it engaged for
3601/// `duchon()`/`matern()`.
3602///
3603/// The thin-plate kernel `r^{2m−d}` (the `r²·log r` log-case in even `d`) is
3604/// *exactly* the s=0 Duchon kernel (`DuchonBasisSpec::power = 0`,
3605/// `length_scale = None`) at the matching polynomial null-space order
3606/// `m = thin_plate_penalty_order(d)`. The Duchon polyharmonic family already
3607/// carries the per-axis tension ARD that `scale_dimensions` requests: its
3608/// isotropic first-order roughness penalty `Σ‖∇f‖²` splits into `d` directional
3609/// penalties `Σ(∂f/∂x_a)²`, each with its own REML `λ_a`
3610/// (`duchon_operator_penalty_candidates`). So the well-posed *anisotropic
3611/// thin-plate spline is the anisotropic s=0 Duchon spline*. Rewriting to that
3612/// representation reuses the battle-tested Duchon anisotropy / ψ-derivative /
3613/// freeze / predict machinery instead of duplicating it onto the TPS metadata
3614/// path, and keeps the polyharmonic family internally consistent. The codebase
3615/// already promotes infeasible-`k` TPS to Duchon for the same reason (the
3616/// canonical TPS single curvature penalty cannot deliver a requested
3617/// capability); per-axis anisotropy is another such capability.
3618///
3619/// This fires *only* when the user opts into `scale_dimensions`; the default
3620/// thin-plate path (`scale_dimensions` off) is left bit-for-bit unchanged.
3621/// A 1-D thin-plate term is left untouched — anisotropy is meaningless on a
3622/// single axis (its `Σ η = 0` contrast vector is empty), exactly as for a 1-D
3623/// Matérn/Duchon term.
3624fn promote_thin_plate_for_scale_dimensions(basis: &mut SmoothBasisSpec) {
3625 let SmoothBasisSpec::ThinPlate {
3626 feature_cols,
3627 spec,
3628 input_scales,
3629 } = &*basis
3630 else {
3631 return;
3632 };
3633 let d = feature_cols.len();
3634 if d <= 1 {
3635 return;
3636 }
3637 // m = thin_plate_penalty_order(d) is the TPS penalty order; the Duchon
3638 // null-space order naming is `Zero → m=1`, `Linear → m=2`,
3639 // `Degree(g) → m=g+1`, so the s=0 Duchon kernel exponent
3640 // `2(p+s) − d = 2m − d` reproduces the TPS kernel exactly.
3641 let m = thin_plate_penalty_order(d);
3642 let nullspace_order = match m {
3643 0 | 1 => DuchonNullspaceOrder::Zero,
3644 2 => DuchonNullspaceOrder::Linear,
3645 _ => DuchonNullspaceOrder::Degree(m - 1),
3646 };
3647 let duchon_spec = DuchonBasisSpec {
3648 center_strategy: spec.center_strategy.clone(),
3649 periodic: spec.periodic.clone(),
3650 // Pure, scale-free Duchon — the thin-plate kernel has no length scale
3651 // (a global TPS kernel scale is non-identifiable once REML learns the
3652 // smoothing penalty: gam#718/#721/#731/#732). The per-axis relevance
3653 // the user asked for is carried by the tension-ARD `λ_a`, not a κ axis.
3654 length_scale: None,
3655 // s = 0 ⇒ thin-plate kernel `r^{2m−d}`.
3656 power: 0.0,
3657 nullspace_order,
3658 identifiability: spec.identifiability.clone(),
3659 // All-zero geometry seed sentinel: `auto_seed_aniso_contrasts` resolves
3660 // it from the (standardized) knot cloud, and the per-axis tension split
3661 // engages on `aniso.is_some()`.
3662 aniso_log_scales: Some(vec![0.0; d]),
3663 operator_penalties: DuchonOperatorPenaltySpec::default(),
3664 boundary: OneDimensionalBoundary::Open,
3665 radial_reparam: None,
3666 };
3667 let feature_cols = feature_cols.clone();
3668 let input_scales = input_scales.clone();
3669 // All borrows of `*basis` (the `&*basis` destructure above) end with the
3670 // clones on the two preceding lines, so the reassignment is sound.
3671 *basis = SmoothBasisSpec::Duchon {
3672 feature_cols,
3673 spec: duchon_spec,
3674 input_scales,
3675 };
3676}
3677
3678// ---------------------------------------------------------------------------
3679// Data-aware helpers
3680// ---------------------------------------------------------------------------
3681
3682pub fn spatial_center_strategy_for_dimension(num_centers: usize, d: usize) -> CenterStrategy {
3683 if d <= 3 {
3684 // In low-dimensional spatial smooths, an explicit `k` is a resolution
3685 // request rather than a request for marginal quantile-midpoint centers.
3686 // Use deterministic maximin geometry so Matérn/GP and Duchon REML see a
3687 // well-resolved native kernel block with small fill distance instead of
3688 // compensating for holes or endpoint under-resolution by over-smoothing
3689 // low-noise signals (#504).
3690 CenterStrategy::FarthestPoint { num_centers }
3691 } else {
3692 default_spatial_center_strategy(num_centers, d)
3693 }
3694}
3695
3696pub fn col_minmax(col: ArrayView1<'_, f64>) -> Result<(f64, f64), String> {
3697 let min = col.iter().fold(f64::INFINITY, |a, &b| a.min(b));
3698 let max = col.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
3699 if !min.is_finite() || !max.is_finite() {
3700 return Err(TermBuilderError::degenerate_data(
3701 "non-finite data encountered while inferring knot range",
3702 )
3703 .to_string());
3704 }
3705 if (max - min).abs() < 1e-12 {
3706 Ok((min, min + 1e-6))
3707 } else {
3708 Ok((min, max))
3709 }
3710}
3711
3712pub fn unique_count_column(col: ArrayView1<'_, f64>) -> usize {
3713 use std::collections::HashSet;
3714 let mut set = HashSet::<u64>::with_capacity(col.len());
3715 for &v in col {
3716 let norm = if v == 0.0 { 0.0 } else { v };
3717 set.insert(norm.to_bits());
3718 }
3719 set.len().max(1)
3720}
3721
/// Minimum knot count for a natural cubic regression (`cr`) spline.
///
/// `select_cr_knots` places one value-knot per basis function and needs at
/// least an interior knot, so the sparsest representable cr basis is
/// `{const, linear, curvature}` at three knots. Below this a cr spline is not
/// constructible and the caller must degrade to the linear B-spline marginal.
pub(crate) const CR_MIN_KNOTS: usize = 3;
3728
3729/// Build a cubic-regression marginal knot spec capped to the covariate's data
3730/// support, mgcv-style.
3731///
3732/// A `cr`/`cs`/`sz` marginal places exactly one basis function per value-knot,
3733/// so `select_cr_knots` cannot place more knots than the covariate has DISTINCT
3734/// values — it `bail`s with "cubic regression spline with k=N requires at least
3735/// N distinct values" otherwise. An unclamped `k` on an ordinary low-cardinality
3736/// covariate (a binary indicator, a 3-level ordinal/Likert score, a small count)
3737/// therefore hard-failed the whole fit instead of reducing the basis the way
3738/// mgcv — and gam's own tensor-margin path (996f829d7, `term_builder.rs:2986` /
3739/// the `k_axis >= 3` cr gate at `:3047`) — do. This is the univariate / factor-
3740/// smooth sibling of that tensor cap (#1541, #1542).
3741///
3742/// Returns:
3743/// - `Some(NaturalCubicRegression { .. })` with `k = min(k_requested, n_distinct)`
3744/// value-knots when the data supports a cr spline (`n_distinct >= CR_MIN_KNOTS`).
3745/// A cr basis of exactly `n_distinct` knots is full-rank for the data — it can
3746/// represent any per-distinct-value structure (e.g. 3 arbitrary group means on
3747/// a ternary covariate) — so the cap never costs recoverable signal.
3748/// - `None` when `n_distinct < CR_MIN_KNOTS` (a binary covariate): too few
3749/// distinct values for ANY cr spline, so the caller degrades to the linear
3750/// B-spline marginal — exactly what the default `s(x, k=..)` basis already
3751/// builds on the same data, and what the tensor path's `< 3` branch builds.
3752///
3753/// `inference_notes` records any reduction so the user sees that `k` was capped
3754/// (mgcv emits a warning in the same situation).
3755fn capped_cr_marginal_knotspec(
3756 col: ArrayView1<'_, f64>,
3757 k_cr_requested: usize,
3758 label: &str,
3759 inference_notes: &mut Vec<String>,
3760) -> Result<Option<BSplineKnotSpec>, String> {
3761 let n_distinct = unique_count_column(col);
3762 let k_cr = k_cr_requested.min(n_distinct);
3763 if k_cr < CR_MIN_KNOTS {
3764 inference_notes.push(format!(
3765 "Smooth '{label}': cubic-regression ('cr'/'cs'/'sz') basis requested k={k_cr_requested}, \
3766 but the covariate has only {n_distinct} distinct value(s) — too few to support a cubic \
3767 regression spline (needs >= {CR_MIN_KNOTS} distinct values). Degraded to the linear \
3768 B-spline marginal the default basis builds on the same data."
3769 ));
3770 return Ok(None);
3771 }
3772 if k_cr < k_cr_requested {
3773 inference_notes.push(format!(
3774 "Smooth '{label}': cubic-regression ('cr'/'cs'/'sz') basis reduced from k={k_cr_requested} \
3775 to k={k_cr} to match the covariate's {n_distinct} distinct value(s) (mgcv-style \
3776 data-support cap; a cr basis cannot place more value-knots than the data has)."
3777 ));
3778 }
3779 let cr_knots = crate::basis::select_cr_knots(col, k_cr).map_err(|e| e.to_string())?;
3780 Ok(Some(BSplineKnotSpec::NaturalCubicRegression {
3781 knots: cr_knots,
3782 }))
3783}
3784
3785/// Smallest number of distinct covariate values seen within any single group
3786/// of `group_col`. For a factor smooth this is the resolution that bounds the
3787/// marginal basis: a group with `m` distinct covariate values can only inform
3788/// `m` basis coefficients, so a marginal richer than that interpolates the
3789/// group instead of estimating a penalized trend. Bits are compared exactly so
3790/// integer-valued covariates (days, dose levels) collapse to their true count.
3791fn min_per_group_unique_count(
3792 feature_col: ArrayView1<'_, f64>,
3793 group_col: ArrayView1<'_, f64>,
3794) -> usize {
3795 use std::collections::{HashMap, HashSet};
3796 let mut per_group: HashMap<u64, HashSet<u64>> = HashMap::new();
3797 for (xi, gi) in feature_col.iter().zip(group_col.iter()) {
3798 let xnorm = if *xi == 0.0 { 0.0 } else { *xi };
3799 let gnorm = if *gi == 0.0 { 0.0 } else { *gi };
3800 per_group
3801 .entry(gnorm.to_bits())
3802 .or_default()
3803 .insert(xnorm.to_bits());
3804 }
3805 per_group
3806 .values()
3807 .map(|s| s.len())
3808 .min()
3809 .unwrap_or(1)
3810 .max(1)
3811}
3812
3813/// Default internal-knot count for an *additive* univariate smooth, derived
3814/// from the column's unique-value count.
3815///
3816/// The basis dimension is `internal_knots + degree + 1`, so the cap below maps
3817/// to a default cubic basis of ~12 functions — deliberately close to mgcv's
3818/// univariate default (`k = 10`). A penalized smooth controls its wiggliness
3819/// through the *penalty*, not the basis size: REML/LAML shrinks a too-rich
3820/// basis toward the null, but it cannot do so cleanly when the basis is so
3821/// over-sized that the design becomes weakly identified. Growing the basis with
3822/// `n` (the old `n^(1/3)`-ceilinged `unique/4` rule, which pinned to 20 internal
3823/// knots ⇒ a 24-function basis for any column with ≥80 unique values) therefore
3824/// *hurts* recovery on finite, weak-signal fits: a 4-smooth additive model on
3825/// n=120 asks for ~92 coefficients, the outer optimizer stalls on the resulting
3826/// flat two-penalty (range + null-space) REML surface, and the truth leaks into
3827/// surplus columns the penalty can't shrink away (gam#1680; the same defect was
3828/// documented for thin-plate fields in gam#1074). A k-sweep on the #1680 design
3829/// confirms a basis of ~10–15 recovers truth at RMSE ≈ 0.12 while the old
3830/// 24-function default lands at ≈ 0.39 (~3× worse) — *whether or not* the
3831/// covariates are collinear, so this is basis over-richness, not collinearity.
3832///
3833/// The cap is flat in `n`: a user who genuinely needs a wigglier fit raises `k`
3834/// explicitly (mgcv's contract — opt *in* to more flexibility), and the SPEC
3835/// requires the default to allow recovering the null rather than forcing the
3836/// user to opt out of overfitting. The 4-knot floor stays put because we still
3837/// need enough basis functions to fit a non-trivial smooth at all, and the
3838/// `unique/4` growth below the cap keeps small/sparse columns (n ≤ 32, where
3839/// `unique/4 ≤ 8`) on exactly their previous knot count.
3840pub fn heuristic_knots_for_column(col: ArrayView1<'_, f64>) -> usize {
3841 /// Default cubic basis ≈ `MAX_DEFAULT_INTERNAL_KNOTS + degree + 1` = 12
3842 /// functions, matching mgcv's lean univariate default.
3843 const MAX_DEFAULT_INTERNAL_KNOTS: usize = 8;
3844 let unique = unique_count_column(col);
3845 (unique / 4).clamp(4, MAX_DEFAULT_INTERNAL_KNOTS)
3846}
3847
3848/// Per-margin basis sizes for a tensor-product smooth (`te`/`ti`/`t2`).
3849///
3850/// The 1-D heuristic [`heuristic_knots_for_column`] is calibrated for an
3851/// *additive* margin: a well-resolved column asks for the lean univariate
3852/// default (≈12 basis functions, the mgcv-like cap of 8 internal knots; see
3853/// gam#1680), which is sensible for a single `s(x)` term.
3854/// A tensor product, however, multiplies the per-margin sizes:
3855/// `p = ∏_d k_d`. Reusing the 1-D rule per margin makes `p` explode with the
3856/// tensor dimension — a 3-D `te(x,y,z)` at the 1-D ceiling of 12/margin is
3857/// `12³ ≈ 1728` columns, and every REML evaluation pays an O(p³) dense
3858/// penalty reparameterization (the full-tensor sum-to-zero constraint is not
3859/// Kronecker-factorable), turning model selection over tensor candidates into
3860/// a multi-minute single-threaded stall (gam#813). It also requests far more
3861/// coefficients than the data can identify whenever `p ≫ n`.
3862///
3863/// mgcv's `te(...)` uses a small per-margin default (`k = 5`, i.e. `5^d`).
3864/// We match that spirit while staying data-adaptive: budget the *total* tensor
3865/// column count `p_target` and distribute it geometrically across the margins
3866/// so `∏ k_d ≈ p_target`, never asking a margin for more functions than its
3867/// own unique values (and the data set) can support.
3868fn heuristic_tensor_margin_knots(cols: &[usize], ds: &Dataset) -> Vec<usize> {
3869 let d = cols.len().max(1);
3870 let degree = DEFAULT_BSPLINE_DEGREE;
3871 let min_k = degree + 2; // smallest margin that carries a difference penalty
3872 let n = ds.values.nrows();
3873
3874 // Per-margin 1-D ceiling: never request more basis functions than the
3875 // margin's own resolution (unique values) supports. This caps each axis
3876 // independently before the joint budget is applied.
3877 let per_margin_cap: Vec<usize> = cols
3878 .iter()
3879 .map(|&c| heuristic_knots_for_column(ds.values.column(c)).max(min_k))
3880 .collect();
3881
3882 // Total-basis budget. A tensor with ∏k ≫ n coefficients is rank-deficient
3883 // and pure REML cost; cap the product at a generous fraction of n while
3884 // honoring mgcv's small default for the common small-d case. The budget
3885 // grows with n but the geometric split below keeps each margin modest.
3886 // d=2 → up to ~7²=49 (mgcv-`te`-like), d=3 → ~5³=125, larger d shrinks
3887 // per-margin further so the product never blows past the data support.
3888 let mgcv_like_per_margin = match d {
3889 2 => 7usize,
3890 3 => 5usize,
3891 _ => 4usize,
3892 };
3893 let mgcv_like_total = (mgcv_like_per_margin as f64).powi(d as i32);
3894 let data_budget = (n as f64) * 0.8;
3895 let p_target = mgcv_like_total
3896 .max(min_k.pow(d as u32) as f64)
3897 .min(data_budget);
3898
3899 // Geometric per-margin target so ∏k ≈ p_target, then clamp each margin to
3900 // its own 1-D resolution cap and the difference-penalty floor.
3901 let geo_per_margin = p_target.powf(1.0 / d as f64).round() as usize;
3902 let unclamped: Vec<usize> = per_margin_cap
3903 .iter()
3904 .map(|&cap| geo_per_margin.clamp(min_k, cap))
3905 .collect();
3906
3907 // The per-margin clamps can pull some axes below `geo_per_margin` (a
3908 // low-resolution column), leaving headroom in the joint budget. Redistribute
3909 // that headroom to the margins that can still grow, so the realized ∏k stays
3910 // close to p_target instead of systematically under-shooting it.
3911 let mut k_list = unclamped;
3912 loop {
3913 let product: f64 = k_list.iter().map(|&k| k as f64).product();
3914 if product >= p_target {
3915 break;
3916 }
3917 // Grow the axis with the most remaining headroom (cap − current),
3918 // breaking ties toward the largest cap. Stop when none can grow.
3919 let Some(idx) = k_list
3920 .iter()
3921 .zip(per_margin_cap.iter())
3922 .enumerate()
3923 .filter(|&(_, (k, cap))| k < cap)
3924 .max_by_key(|&(_, (k, cap))| (cap - k, *cap))
3925 .map(|(i, _)| i)
3926 else {
3927 break;
3928 };
3929 k_list[idx] += 1;
3930 }
3931 k_list
3932}
3933
/// Default number of spatial basis centers.
///
/// Thin forwarding wrapper: returns `default_num_centers(n, d)` unchanged.
// NOTE(review): `n` and `d` are presumably the observation count and the
// covariate dimension — confirm against `default_num_centers`.
pub fn heuristic_centers(n: usize, d: usize) -> usize {
    default_num_centers(n, d)
}
3937
3938// ---------------------------------------------------------------------------
3939// Smooth option parsers
3940// ---------------------------------------------------------------------------
3941
3942fn parse_endpoint_side(
3943 value: &str,
3944 context: &str,
3945) -> Result<BSplineEndpointBoundaryCondition, String> {
3946 match value.trim().to_ascii_lowercase().as_str() {
3947 "" | "none" | "open" | "unconstrained" | "free" => {
3948 Ok(BSplineEndpointBoundaryCondition::Free)
3949 }
3950 "clamped" | "clamp" | "zero_derivative" | "zero-derivative" => {
3951 Ok(BSplineEndpointBoundaryCondition::Clamped)
3952 }
3953 "anchored" | "anchor" | "zero" | "zero_value" | "zero-value" => {
3954 Ok(BSplineEndpointBoundaryCondition::Anchored { value: 0.0 })
3955 }
3956 other => Err(format!(
3957 "unsupported {context} boundary condition '{other}'; expected free, clamped, or anchored"
3958 )),
3959 }
3960}
3961
3962fn boundary_anchor_value(
3963 options: &BTreeMap<String, String>,
3964 side: &str,
3965 fallback: Option<f64>,
3966) -> Option<f64> {
3967 [
3968 format!("anchor_{side}"),
3969 format!("{side}_anchor"),
3970 format!("anchor-value-{side}"),
3971 ]
3972 .iter()
3973 .find_map(|key| option_f64(options, key))
3974 .or(fallback)
3975}
3976
3977fn apply_anchor_value(
3978 cond: BSplineEndpointBoundaryCondition,
3979 value: Option<f64>,
3980) -> BSplineEndpointBoundaryCondition {
3981 match cond {
3982 BSplineEndpointBoundaryCondition::Anchored { .. } => {
3983 BSplineEndpointBoundaryCondition::Anchored {
3984 value: value.unwrap_or(0.0),
3985 }
3986 }
3987 other => other,
3988 }
3989}
3990
3991fn parse_bspline_boundary_conditions(
3992 options: &BTreeMap<String, String>,
3993) -> Result<BSplineBoundaryConditions, String> {
3994 let fallback_anchor = option_f64(options, "anchor")
3995 .or_else(|| option_f64(options, "anchor_value"))
3996 .or_else(|| option_f64(options, "value"));
3997 let global_boundary_conditions = options
3998 .get("boundary_conditions")
3999 .or_else(|| options.get("bc"));
4000 let mut boundary_conditions = BSplineBoundaryConditions::default();
4001
4002 if let Some(raw_boundary_conditions) = global_boundary_conditions {
4003 let cond = parse_endpoint_side(raw_boundary_conditions, "boundary_conditions")?;
4004 let side = options
4005 .get("side")
4006 .map(|s| s.trim().to_ascii_lowercase())
4007 .unwrap_or_else(|| "both".to_string());
4008 match side.as_str() {
4009 "both" | "all" | "endpoints" => {
4010 boundary_conditions.left = cond;
4011 boundary_conditions.right = cond;
4012 }
4013 "left" | "start" | "lower" => boundary_conditions.left = cond,
4014 "right" | "end" | "upper" => boundary_conditions.right = cond,
4015 other => {
4016 return Err(format!(
4017 "unsupported B-spline boundary side '{other}'; expected left, right, or both"
4018 ));
4019 }
4020 }
4021 }
4022
4023 if let Some(raw) = options
4024 .get("bc_left")
4025 .or_else(|| options.get("left_bc"))
4026 .or_else(|| options.get("bc_start"))
4027 .or_else(|| options.get("start_bc"))
4028 {
4029 boundary_conditions.left = parse_endpoint_side(raw, "left endpoint")?;
4030 }
4031 if let Some(raw) = options
4032 .get("bc_right")
4033 .or_else(|| options.get("right_bc"))
4034 .or_else(|| options.get("bc_end"))
4035 .or_else(|| options.get("end_bc"))
4036 {
4037 boundary_conditions.right = parse_endpoint_side(raw, "right endpoint")?;
4038 }
4039
4040 boundary_conditions.left = apply_anchor_value(
4041 boundary_conditions.left,
4042 boundary_anchor_value(options, "left", fallback_anchor),
4043 );
4044 boundary_conditions.right = apply_anchor_value(
4045 boundary_conditions.right,
4046 boundary_anchor_value(options, "right", fallback_anchor),
4047 );
4048
4049 // Non-zero anchors require an affine offset term that the current basis
4050 // builder does not synthesize (see `build_bspline_basis_1d` in
4051 // src/terms/basis.rs). Surface the rejection at parse time with the side
4052 // and value in the diagnostic, instead of letting the value-only error
4053 // emerge deep inside the basis builder where the user has no context
4054 // about which anchor key (`anchor`, `left_anchor`, `right_anchor`, …)
4055 // routed into which endpoint.
4056 reject_nonzero_anchor("left", boundary_conditions.left)?;
4057 reject_nonzero_anchor("right", boundary_conditions.right)?;
4058
4059 Ok(boundary_conditions)
4060}
4061
4062fn reject_nonzero_anchor(side: &str, cond: BSplineEndpointBoundaryCondition) -> Result<(), String> {
4063 if let BSplineEndpointBoundaryCondition::Anchored { value } = cond {
4064 if value.abs() > 1e-12 {
4065 return Err(format!(
4066 "non-zero {side} anchor {value} requires an affine offset term that is not yet supported; only anchored value 0 is accepted at parse time"
4067 ));
4068 }
4069 }
4070 Ok(())
4071}
4072
4073/// Resolve the requested internal-knot count and effective spline degree for
4074/// a 1-D penalized B-spline smooth. This mirrors the tensor-margin per-axis
4075/// degree-reduction policy: a 1-D B-spline basis with `k` functions
4076/// is well-defined for any `degree <= k - 1`, so an explicit
4077/// `s(x, bs="ps", k=3)` with default `degree=3` is interpreted as the
4078/// largest representable spline (`effective_degree = k - 1 = 2`, quadratic)
4079/// rather than rejected. The `penalty_order` carried by the caller must be
4080/// clamped to `<= effective_degree` so the marginal difference penalty
4081/// stays well-defined; the returned `effective_degree` makes that explicit.
4082///
4083/// Mirrors the tensor margin treatment in the `te(...)` builder so a
4084/// standalone smooth, a factor smooth, and a tensor margin all interpret
4085/// "small k" the same way.
4086fn parse_ps_internal_knots(
4087 options: &BTreeMap<String, String>,
4088 degree: usize,
4089 default_internal_knots: usize,
4090) -> Result<(usize, bool, usize), String> {
4091 const MIN_EXPRESSIVE_INTERNAL_KNOTS: usize = 2;
4092 // Strict variants: reject `k=-1`, `k=1.5`, `knots=-2` etc. with a
4093 // focused error instead of silently dropping the value and using the
4094 // default. Lenient `option_usize` / `option_usize_any` silently swallow
4095 // unparseable values, which leaves the user thinking they configured
4096 // something when they did not.
4097 // A list-valued `knots=[...]` carries explicit internal positions, not a
4098 // count; it is consumed by `parse_explicit_internal_knots`. Treat it as
4099 // "count not specified" here so the strict integer parse does not reject
4100 // the bracketed value (the Provided path ignores the returned count).
4101 let knots_internal = if knots_option_is_list(options) {
4102 None
4103 } else {
4104 option_usize_strict(options, "knots")?
4105 };
4106 let basis_dim = option_usize_any_strict(options, &["k", "basis_dim", "basis-dim", "basisdim"])?;
4107 if knots_internal.is_some() && basis_dim.is_some() {
4108 return Err(TermBuilderError::incompatible_config(
4109 "ps/bspline smooth: specify either knots=<internal_knots> or k=<basis_dim> (not both)",
4110 )
4111 .to_string());
4112 }
4113 if let Some(k) = basis_dim {
4114 if k < 2 {
4115 return Err(TermBuilderError::invalid_option(format!(
4116 "ps/bspline smooth: k={} too small; B-spline basis requires k >= 2",
4117 k
4118 ))
4119 .to_string());
4120 }
4121 // `degree <= k - 1` is required for the B-spline basis to be
4122 // well-defined; reduce on this axis only when the user asked for
4123 // a smaller k than the cubic default supports. This matches mgcv's
4124 // behaviour (e.g. `s(x, bs="ps", k=3)` becomes a quadratic basis)
4125 // and the per-axis reduction the tensor builder already does.
4126 let effective_degree = degree.min(k - 1).max(1);
4127 let num_internal_knots = if effective_degree < degree {
4128 // Reproduce the requested basis size exactly when degree was
4129 // reduced for a low-cardinality axis: num_basis = k.
4130 k.saturating_sub(effective_degree + 1)
4131 } else {
4132 (k - degree - 1).max(MIN_EXPRESSIVE_INTERNAL_KNOTS)
4133 };
4134 Ok((num_internal_knots, false, effective_degree))
4135 } else {
4136 Ok((
4137 knots_internal.unwrap_or(default_internal_knots),
4138 knots_internal.is_none(),
4139 degree,
4140 ))
4141 }
4142}
4143
/// Report whether the `knots` option holds a *list* literal rather than a
/// scalar count.
///
/// After trimming surrounding whitespace, a value counts as a list when it
/// starts with `[`, `c(`, `C(`, or `(`. Anything else — including a bare
/// `knots=5` or a missing `knots` key — yields `false`, so a scalar keeps its
/// meaning as an internal-knot count.
fn knots_option_is_list(options: &BTreeMap<String, String>) -> bool {
    const LIST_OPENERS: [&str; 4] = ["[", "c(", "C(", "("];
    match options.get("knots") {
        Some(raw) => {
            let literal = raw.trim();
            LIST_OPENERS
                .iter()
                .any(|&opener| literal.starts_with(opener))
        }
        None => false,
    }
}
4158
4159/// Parse `knots=[k0, k1, ...]` (or `c(...)` / `(...)`) into explicit internal
4160/// knot positions. Returns `Ok(None)` when `knots` is absent or a scalar count
4161/// (handled by [`parse_ps_internal_knots`]); `Ok(Some(positions))` when it is a
4162/// non-empty numeric list; and an error for an empty or unparseable list.
4163fn parse_explicit_internal_knots(
4164 options: &BTreeMap<String, String>,
4165) -> Result<Option<Vec<f64>>, String> {
4166 if !knots_option_is_list(options) {
4167 return Ok(None);
4168 }
4169 let raw = options
4170 .get("knots")
4171 .expect("knots_option_is_list implies the key is present");
4172 let tokens = split_list_option(raw);
4173 if tokens.is_empty() {
4174 return Err(TermBuilderError::invalid_option(format!(
4175 "knots={raw} is an empty list; supply at least one internal knot position \
4176 (e.g. knots=[0.2, 0.5, 0.8]) or a scalar count (e.g. knots=8)"
4177 ))
4178 .to_string());
4179 }
4180 let mut positions = Vec::with_capacity(tokens.len());
4181 for tok in &tokens {
4182 let value = parse_numeric_expr(tok).map_err(|err| {
4183 TermBuilderError::invalid_option(format!(
4184 "knots list entry '{tok}' is not a numeric position: {err}"
4185 ))
4186 .to_string()
4187 })?;
4188 positions.push(value);
4189 }
4190 Ok(Some(positions))
4191}
4192
4193/// Resolve the `knot_placement=` option for an automatically generated knot
4194/// vector. Accepts `"uniform"` (the default, equal spacing on the data range)
4195/// and `"quantile"` (interior knots at empirical data quantiles, better for
4196/// skewed covariates). Unknown values are rejected so typos do not silently
4197/// fall back to uniform.
4198fn parse_knot_placement(
4199 options: &BTreeMap<String, String>,
4200) -> Result<crate::basis::BSplineKnotPlacement, String> {
4201 use crate::basis::BSplineKnotPlacement;
4202 match options
4203 .get("knot_placement")
4204 .or_else(|| options.get("knot-placement"))
4205 .or_else(|| options.get("knotplacement"))
4206 {
4207 None => Ok(BSplineKnotPlacement::Uniform),
4208 Some(raw) => match raw
4209 .trim()
4210 .trim_matches('"')
4211 .trim_matches('\'')
4212 .to_ascii_lowercase()
4213 .as_str()
4214 {
4215 "uniform" | "even" | "equal" => Ok(BSplineKnotPlacement::Uniform),
4216 "quantile" | "quantiles" | "data" | "empirical" => Ok(BSplineKnotPlacement::Quantile),
4217 other => Err(TermBuilderError::invalid_option(format!(
4218 "knot_placement={other} is not recognised; expected \"uniform\" or \"quantile\""
4219 ))
4220 .to_string()),
4221 },
4222 }
4223}
4224
4225/// Build the non-periodic 1D B-spline knot spec for the `ps`/`bspline` and
4226/// factor-smooth marginal paths, honoring (in priority order):
4227/// 1. `knots=[...]` explicit internal positions → [`BSplineKnotSpec::Provided`]
4228/// 2. `knot_placement="quantile"` → [`BSplineKnotSpec::Automatic`]
4229/// 3. uniform generation → [`BSplineKnotSpec::Generate`]
4230///
4231/// `data` is the covariate column (used to clamp explicit positions to the
4232/// observed range and to drive quantile placement); `n_knots` is the resolved
4233/// internal-knot count from [`parse_ps_internal_knots`] used for the automatic
4234/// strategies.
4235fn resolve_nonperiodic_bspline_knotspec(
4236 options: &BTreeMap<String, String>,
4237 data: ArrayView1<'_, f64>,
4238 data_range: (f64, f64),
4239 degree: usize,
4240 n_knots: usize,
4241) -> Result<BSplineKnotSpec, String> {
4242 use crate::basis::{BSplineKnotPlacement, clamped_knot_vector_from_internal_positions};
4243 if let Some(positions) = parse_explicit_internal_knots(options)? {
4244 if option_usize_any_strict(options, &["k", "basis_dim", "basis-dim", "basisdim"])?.is_some()
4245 {
4246 return Err(TermBuilderError::incompatible_config(
4247 "ps/bspline smooth: specify either explicit knots=[...] positions or \
4248 k=<basis_dim> (not both); the basis size is fixed by the knot vector",
4249 )
4250 .to_string());
4251 }
4252 let knots = clamped_knot_vector_from_internal_positions(data_range, &positions, degree)
4253 .map_err(|e| e.to_string())?;
4254 return Ok(BSplineKnotSpec::Provided(knots));
4255 }
4256 match parse_knot_placement(options)? {
4257 BSplineKnotPlacement::Uniform => Ok(BSplineKnotSpec::Generate {
4258 data_range,
4259 num_internal_knots: n_knots,
4260 }),
4261 BSplineKnotPlacement::Quantile => {
4262 // Validate the column up-front so an unfittable request surfaces a
4263 // user-correctable error at parse time rather than deep in basis
4264 // construction. The same data drives the eventual quantile knots.
4265 crate::basis::auto_knot_vector_1d_quantile(data, n_knots, degree)
4266 .map_err(|e| e.to_string())?;
4267 Ok(BSplineKnotSpec::Automatic {
4268 num_internal_knots: Some(n_knots),
4269 placement: BSplineKnotPlacement::Quantile,
4270 })
4271 }
4272 }
4273}
4274
4275/// Reject unknown option keys with a focused error that names the term and
4276/// the offending key, plus suggests near-matches from the known-key list.
4277/// Without this, typos like `lengt_scale=0.1` or `nyu=5/2` are silently
4278/// dropped, the term uses the default, and the user has no idea why their
4279/// option had no effect.
4280pub fn validate_known_options(
4281 term_name: &str,
4282 options: &BTreeMap<String, String>,
4283 known: &[&str],
4284) -> Result<(), String> {
4285 let known_set: std::collections::BTreeSet<&&str> = known.iter().collect();
4286 for key in options.keys() {
4287 if !known_set.contains(&key.as_str()) {
4288 if term_name == "tensor" && is_tensor_k_axis_option_key(key) {
4289 continue;
4290 }
4291 // Suggest near-matches (substring or shared prefix ≥ 3).
4292 let key_l = key.to_ascii_lowercase();
4293 let mut suggestions: Vec<&str> = known
4294 .iter()
4295 .filter(|k| {
4296 let kl = k.to_ascii_lowercase();
4297 kl.contains(&key_l) || key_l.contains(&kl) || {
4298 let n = kl
4299 .chars()
4300 .zip(key_l.chars())
4301 .take_while(|(a, b)| a == b)
4302 .count();
4303 n >= 3
4304 }
4305 })
4306 .copied()
4307 .collect();
4308 suggestions.sort_unstable();
4309 suggestions.dedup();
4310 let hint = if suggestions.is_empty() {
4311 String::new()
4312 } else {
4313 format!(" — did you mean one of [{}]?", suggestions.join(", "))
4314 };
4315 return Err(TermBuilderError::invalid_option(format!(
4316 "{term_name}() does not accept option `{key}`{hint}. Valid options: [{}]",
4317 {
4318 let mut sorted = known.to_vec();
4319 sorted.sort_unstable();
4320 sorted.join(", ")
4321 }
4322 ))
4323 .to_string());
4324 }
4325 }
4326 Ok(())
4327}
4328
/// Key of the private (engine-injected) option that caps the *default*
/// spatial center count for a secondary (distributional) predictor's smooth —
/// see `solver::fit_orchestration::apply_secondary_predictor_basis_parsimony`
/// and #501. In this file it is read by [`cap_default_spatial_centers`].
///
/// It is deliberately NOT one of the user-facing count aliases recognised by
/// [`has_explicit_countwith_basis_alias`], so its presence never marks the
/// count as explicit. Per the original design note, this keeps the spatial
/// basis off the explicit (hard) center-placement strategy: the cap lowers
/// the *default* count while the `Auto` strategy is retained, so the count
/// can still be softly reduced when the data cannot support it.
pub const SECONDARY_CENTER_CAP_OPTION: &str = "__secondary_center_cap";
4339
4340/// Apply the secondary-predictor center cap to a *default* spatial center
4341/// count. A no-op when the cap option is absent (the common case) or when the
4342/// user supplied an explicit count (then `default_count` is ignored downstream
4343/// by [`parse_countwith_basis_alias`] anyway).
4344pub(crate) fn cap_default_spatial_centers(
4345 options: &BTreeMap<String, String>,
4346 default_count: usize,
4347) -> usize {
4348 match option_usize(options, SECONDARY_CENTER_CAP_OPTION) {
4349 Some(cap) => default_count.min(cap),
4350 None => default_count,
4351 }
4352}
4353
/// Default Matérn center count for `n` rows in `d` dimensions.
///
/// The result is `planned_count`, raised to at least `min(d + 4, n)` and
/// never below 1.
fn default_matern_center_count(n: usize, d: usize, planned_count: usize) -> usize {
    // #1074: the mgcv-sized basis cap (`k = 10·3^(d-1)`) was removed here; the
    // original note explains it hid an over-sizing/under-penalization defect
    // by shrinking the basis instead of fixing the optimizer, so the default
    // now follows the generic n-scaled plan. Only the small-n floor remains,
    // as a guard against a numerically fragile tiny kernel block. Explicit
    // `k`/`centers` are handled upstream and are unaffected.
    let small_sample_floor = std::cmp::min(d + 4, n);
    std::cmp::max(std::cmp::max(planned_count, small_sample_floor), 1)
}
4364
4365pub fn parse_countwith_basis_alias(
4366 options: &BTreeMap<String, String>,
4367 primarykey: &str,
4368 default_count: usize,
4369) -> Result<usize, String> {
4370 // Strict: reject unparseable values (e.g. `centers=many`, `centers=-1`,
4371 // `centers=1.5`) instead of silently dropping them and falling through
4372 // to the default. Without this the user gets the auto-inferred count
4373 // silently and never realizes their explicit option was ignored.
4374 let primary = option_usize_strict(options, primarykey)?;
4375 let basis_dim = option_usize_any_strict(
4376 options,
4377 &["k", "basis_dim", "basis-dim", "basisdim", "knots"],
4378 )?;
4379 if primary.is_some() && basis_dim.is_some() {
4380 return Err(TermBuilderError::incompatible_config(format!(
4381 "specify either {}=<count> or k=<basis_dim> (not both)",
4382 primarykey
4383 ))
4384 .to_string());
4385 }
4386 Ok(primary.or(basis_dim).unwrap_or(default_count))
4387}
4388
/// Report whether the user supplied a count explicitly, either under
/// `primarykey` or under one of the shared aliases `k`, `basis_dim`,
/// `basis-dim`, `basisdim`, `knots`.
///
/// Only key presence is checked; the value is not parsed.
pub fn has_explicit_countwith_basis_alias(
    options: &BTreeMap<String, String>,
    primarykey: &str,
) -> bool {
    let count_keys: [&str; 6] = [
        primarykey,
        "k",
        "basis_dim",
        "basis-dim",
        "basisdim",
        "knots",
    ];
    count_keys.iter().any(|&key| options.contains_key(key))
}
4398
4399pub fn parse_cyclic_boundary(
4400 options: &BTreeMap<String, String>,
4401 minv: f64,
4402 maxv: f64,
4403) -> Result<OneDimensionalBoundary, String> {
4404 let cyclic = option_bool(options, "cyclic")
4405 .or_else(|| option_bool(options, "periodic"))
4406 .unwrap_or(false);
4407 if !cyclic {
4408 return Ok(OneDimensionalBoundary::Open);
4409 }
4410 let start = match option_numeric_expr(options, "period_start")? {
4411 Some(v) => v,
4412 None => option_numeric_expr(options, "start")?.unwrap_or(minv),
4413 };
4414 let end = match option_numeric_expr(options, "period_end")? {
4415 Some(v) => v,
4416 None => option_numeric_expr(options, "end")?.unwrap_or(maxv),
4417 };
4418 if end <= start {
4419 return Err(format!(
4420 "cyclic smooth requires period_end/end ({end}) > period_start/start ({start})"
4421 ));
4422 }
4423 Ok(OneDimensionalBoundary::Cyclic { start, end })
4424}
4425
4426/// Parse the periodic-uniform domain for a one-dimensional cyclic smooth.
4427///
4428/// Returns the `(domain_start, period)` pair derived from
4429/// `period_start` / `start`, `period_end` / `end`, falling back to the
4430/// data range `[minv, maxv)` when neither bound is provided. The period
4431/// must be strictly positive.
4432pub fn parse_periodic_domain_1d(
4433 options: &BTreeMap<String, String>,
4434 minv: f64,
4435 maxv: f64,
4436) -> Result<(f64, f64), String> {
4437 let start_opt = match option_numeric_expr(options, "period_start")? {
4438 Some(v) => Some(v),
4439 None => option_numeric_expr(options, "start")?,
4440 };
4441 let end_opt = match option_numeric_expr(options, "period_end")? {
4442 Some(v) => Some(v),
4443 None => option_numeric_expr(options, "end")?,
4444 };
4445 // Reject the pure data-range fallback. A B-spline periodic smooth that takes
4446 // its wrap from the observed [min, max] is sample-dependent and silently
4447 // wrong: uniform draws on a true period of 2π land on [ε, 2π−ε], so using
4448 // (max−min) as the period seams the curve with an off-by-ε discontinuity and
4449 // the fit drifts with the sample. (Unlike the radial closed-lattice Duchon
4450 // path, whose centers DO tile a full period, so its span-derive is exact —
4451 // see `parse_periodic_axes_option`.) Require the caller to name the period
4452 // explicitly via `period=`/`period_end`. The end is only defaulted to `maxv`
4453 // when a `period_start`/`start` was given (a half-open declaration); a bare
4454 // periodic smooth with neither bound is an error.
4455 if end_opt.is_none() && start_opt.is_none() {
4456 return Err(
4457 "periodic B-spline smooth requires an explicit period: pass period=<value> \
4458 (e.g. period=2*pi) or period_start=/period_end=. Deriving the period from the \
4459 observed data range is sample-dependent and produces an off-by-ε seam, so it is \
4460 not inferred."
4461 .to_string(),
4462 );
4463 }
4464 let start = start_opt.unwrap_or(minv);
4465 let end = end_opt.unwrap_or(maxv);
4466 if !(start.is_finite() && end.is_finite()) {
4467 return Err(format!(
4468 "periodic smooth domain requires finite endpoints, got ({start}, {end})"
4469 ));
4470 }
4471 if end <= start {
4472 return Err(format!(
4473 "periodic smooth requires period_end/end ({end}) > period_start/start ({start})"
4474 ));
4475 }
4476 Ok((start, end - start))
4477}
4478
4479fn parse_matern_nu(raw: &str) -> Result<MaternNu, String> {
4480 let trimmed = raw.trim();
4481 let lowered = trimmed.to_ascii_lowercase();
4482 match lowered.as_str() {
4483 "1/2" | "0.5" | "half" => return Ok(MaternNu::Half),
4484 "3/2" | "1.5" => return Ok(MaternNu::ThreeHalves),
4485 "5/2" | "2.5" => return Ok(MaternNu::FiveHalves),
4486 "7/2" | "3.5" => return Ok(MaternNu::SevenHalves),
4487 "9/2" | "4.5" => return Ok(MaternNu::NineHalves),
4488 _ => {}
4489 }
4490
4491 let value = if let Some((num, den)) = trimmed.split_once('/') {
4492 let num = num
4493 .trim()
4494 .parse::<f64>()
4495 .map_err(|err| format!("{}: {err}", unsupported_matern_nu_message(raw)))?;
4496 let den = den
4497 .trim()
4498 .parse::<f64>()
4499 .map_err(|err| format!("{}: {err}", unsupported_matern_nu_message(raw)))?;
4500 if den == 0.0 || !num.is_finite() || !den.is_finite() {
4501 return Err(unsupported_matern_nu_message(raw));
4502 }
4503 num / den
4504 } else {
4505 trimmed
4506 .parse::<f64>()
4507 .map_err(|err| format!("{}: {err}", unsupported_matern_nu_message(raw)))?
4508 };
4509
4510 const TOL: f64 = 1e-12;
4511 if (value - 0.5).abs() <= TOL {
4512 Ok(MaternNu::Half)
4513 } else if (value - 1.5).abs() <= TOL {
4514 Ok(MaternNu::ThreeHalves)
4515 } else if (value - 2.5).abs() <= TOL {
4516 Ok(MaternNu::FiveHalves)
4517 } else if (value - 3.5).abs() <= TOL {
4518 Ok(MaternNu::SevenHalves)
4519 } else if (value - 4.5).abs() <= TOL {
4520 Ok(MaternNu::NineHalves)
4521 } else {
4522 Err(unsupported_matern_nu_message(raw))
4523 }
4524}
4525
4526fn unsupported_matern_nu_message(raw: &str) -> String {
4527 TermBuilderError::unsupported_feature(format!(
4528 "unsupported Matern nu '{raw}'; supported half-integer values are 1/2, 3/2, 5/2, 7/2, and 9/2"
4529 ))
4530 .to_string()
4531}
4532
/// How the Duchon spectral power is chosen for a smooth, as produced by
/// `parse_duchon_power_policy` from the term's `power=` option.
#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
pub enum DuchonPowerPolicy {
    /// The user passed `power=<number>`; `parse_duchon_power_policy` only
    /// builds this variant from a finite, non-negative value.
    Explicit(f64),
    /// No explicit `power=` given: defer to the cubic structural default, which
    /// the builder resolves dimension-aware as `s = (d − 1)/2` (so `φ(r) = r³`
    /// in every dimension). There is no triple-operator minimum any more.
    CubicStructuralDefault,
}
4541
4542pub fn parse_duchon_power_policy(
4543 options: &BTreeMap<String, String>,
4544) -> Result<DuchonPowerPolicy, String> {
4545 if let Some(raw_nu) = options.get("nu") {
4546 return Err(TermBuilderError::incompatible_config(format!(
4547 "Duchon smooths use power=<number>, not nu='{}'. Use power=1.5, power=2, etc.",
4548 raw_nu
4549 ))
4550 .to_string());
4551 }
4552 match options.get("power") {
4553 Some(raw) => {
4554 let value = raw.parse::<f64>().map_err(|err| {
4555 TermBuilderError::invalid_option(format!(
4556 "invalid Duchon power '{}'; expected a non-negative number such as power=1.5 or power=2: {}",
4557 raw, err
4558 ))
4559 .to_string()
4560 })?;
4561 if !value.is_finite() || value < 0.0 {
4562 return Err(TermBuilderError::invalid_option(format!(
4563 "invalid Duchon power '{}'; expected a finite non-negative number such as power=1.5 or power=2",
4564 raw
4565 ))
4566 .to_string());
4567 }
4568 Ok(DuchonPowerPolicy::Explicit(value))
4569 }
4570 None => Ok(DuchonPowerPolicy::CubicStructuralDefault),
4571 }
4572}
4573
4574pub fn parse_duchon_power(options: &BTreeMap<String, String>) -> Result<f64, String> {
4575 match parse_duchon_power_policy(options)? {
4576 DuchonPowerPolicy::Explicit(power) => Ok(power),
4577 // Context-free placeholder: the bare option parser has no column count,
4578 // so it cannot compute the dimension-aware cubic power `s = (d − 1)/2`.
4579 // The dimension-aware resolution happens later in `build_smooth_basis`;
4580 // this 1.5 is only a stand-in for callers that need a concrete number
4581 // without data context (e.g. round-trip parser tests).
4582 DuchonPowerPolicy::CubicStructuralDefault => Ok(1.5),
4583 }
4584}
4585
4586pub fn parse_duchon_order(
4587 options: &BTreeMap<String, String>,
4588) -> Result<DuchonNullspaceOrder, String> {
4589 match options.get("order") {
4590 // Structural cubic Duchon is affine-by-default: an unspecified order is
4591 // the `Linear` (constant + linear) null space, matching the magic
4592 // default. An explicit `order=0` still selects the constant-only space.
4593 None => Ok(DuchonNullspaceOrder::Linear),
4594 Some(raw) => match raw.parse::<usize>() {
4595 Ok(0) => Ok(DuchonNullspaceOrder::Zero),
4596 Ok(1) => Ok(DuchonNullspaceOrder::Linear),
4597 Ok(other) => Ok(DuchonNullspaceOrder::Degree(other)),
4598 Err(_) => Err(TermBuilderError::invalid_option(format!(
4599 "invalid Duchon order '{}'; expected a non-negative integer such as order=0, order=1, or order=2",
4600 raw
4601 ))
4602 .to_string()),
4603 },
4604 }
4605}
4606
4607fn parse_matern_identifiability(
4608 options: &BTreeMap<String, String>,
4609) -> Result<MaternIdentifiability, TermBuilderError> {
4610 let Some(raw) = options.get("identifiability").map(String::as_str) else {
4611 return Ok(MaternIdentifiability::default());
4612 };
4613 match raw.trim().to_ascii_lowercase().as_str() {
4614 "none" => Ok(MaternIdentifiability::None),
4615 "sum_tozero" | "sum-to-zero" | "center_sum_tozero" | "center-sum-to-zero" | "centered" => {
4616 Ok(MaternIdentifiability::CenterSumToZero)
4617 }
4618 "linear" | "center_linear_orthogonal" | "center-linear-orthogonal" => {
4619 Ok(MaternIdentifiability::CenterLinearOrthogonal)
4620 }
4621 other => Err(TermBuilderError::unsupported_feature(format!(
4622 "invalid Matérn identifiability '{other}'; expected one of: none, sum_tozero, linear"
4623 ))),
4624 }
4625}
4626
4627fn parse_spatial_identifiability(
4628 options: &BTreeMap<String, String>,
4629) -> Result<SpatialIdentifiability, TermBuilderError> {
4630 let Some(raw) = options.get("identifiability").map(String::as_str) else {
4631 return Ok(SpatialIdentifiability::default());
4632 };
4633 match raw.trim().to_ascii_lowercase().as_str() {
4634 "none" => Ok(SpatialIdentifiability::None),
4635 "orthogonal"
4636 | "orthogonal_to_parametric"
4637 | "orthogonal-to-parametric"
4638 | "parametric_orthogonal" => Ok(SpatialIdentifiability::OrthogonalToParametric),
4639 "frozen" => Err(TermBuilderError::unsupported_feature(
4640 "spatial identifiability 'frozen' is internal-only; use none or orthogonal_to_parametric",
4641 )),
4642 other => Err(TermBuilderError::unsupported_feature(format!(
4643 "invalid spatial identifiability '{other}'; expected one of: none, orthogonal_to_parametric"
4644 ))),
4645 }
4646}
4647
4648#[cfg(test)]
4649mod tests {
4650 use super::*;
4651 use crate::inference::formula_dsl::parse_formula;
4652 use gam_data::{DataSchema, SchemaColumn};
4653 use ndarray::Array2;
4654 use std::collections::BTreeMap;
4655
4656 fn continuous_dataset(headers: &[&str], rows: Vec<Vec<f64>>) -> Dataset {
4657 let nrows = rows.len();
4658 let ncols = headers.len();
4659 let values = Array2::from_shape_vec(
4660 (nrows, ncols),
4661 rows.into_iter().flat_map(|row| row.into_iter()).collect(),
4662 )
4663 .expect("rectangular test data");
4664 Dataset {
4665 headers: headers.iter().map(|name| name.to_string()).collect(),
4666 values,
4667 schema: DataSchema {
4668 columns: headers
4669 .iter()
4670 .map(|name| SchemaColumn {
4671 name: name.to_string(),
4672 kind: ColumnKindTag::Continuous,
4673 levels: vec![],
4674 })
4675 .collect(),
4676 },
4677 column_kinds: vec![ColumnKindTag::Continuous; ncols],
4678 }
4679 }
4680
4681 fn factor_dataset() -> Dataset {
4682 let rows = (0..24)
4683 .map(|i| {
4684 let x = i as f64 / 23.0;
4685 let g = (i % 2) as f64;
4686 vec![x + g, x, g]
4687 })
4688 .collect::<Vec<_>>();
4689 Dataset {
4690 headers: vec!["y".into(), "x".into(), "g".into()],
4691 values: Array2::from_shape_vec(
4692 (rows.len(), 3),
4693 rows.into_iter().flat_map(|row| row.into_iter()).collect(),
4694 )
4695 .expect("rectangular factor test data"),
4696 schema: DataSchema {
4697 columns: vec![
4698 SchemaColumn {
4699 name: "y".into(),
4700 kind: ColumnKindTag::Continuous,
4701 levels: vec![],
4702 },
4703 SchemaColumn {
4704 name: "x".into(),
4705 kind: ColumnKindTag::Continuous,
4706 levels: vec![],
4707 },
4708 SchemaColumn {
4709 name: "g".into(),
4710 kind: ColumnKindTag::Categorical,
4711 levels: vec!["a".into(), "b".into()],
4712 },
4713 ],
4714 },
4715 column_kinds: vec![
4716 ColumnKindTag::Continuous,
4717 ColumnKindTag::Continuous,
4718 ColumnKindTag::Categorical,
4719 ],
4720 }
4721 }
4722
/// Smoke test: the DEFAULT univariate `s(x, bs="tp")` must build a thin-plate
/// basis whose center strategy reports at least one center.
///
/// History: this test was introduced for #1378 to pin a *modest* mgcv-sized
/// default basis, because an oversized default left the two-penalty REML
/// ρ-surface with a flat valley whose optimizer landing point depended on row
/// order, breaking row-permutation invariance. #1074 later removed that cap
/// together with the ceiling assertion (see the comment above the final
/// `assert!`), so despite its name this test no longer bounds the center
/// count from above. It still runs without a fit/optimizer in the loop.
#[test]
fn default_univariate_thinplate_basis_dim_is_modest() {
    // n = 300 is the #1378 scenario. The default tp basis is n-scaled since
    // #1074, so no particular center count is expected here.
    let n = 300usize;
    let rows: Vec<Vec<f64>> = (0..n)
        .map(|i| {
            let x = -3.0 + 6.0 * (i as f64) / ((n - 1) as f64);
            vec![x.sin(), x]
        })
        .collect();
    let ds = continuous_dataset(&["y", "x"], rows);

    // Only `bs="tp"` is set: no explicit `k`/`centers`, so the default
    // center count is what gets exercised.
    let mut options = BTreeMap::new();
    options.insert("bs".to_string(), "tp".to_string());

    let mut notes = Vec::new();
    let basis = build_smooth_basis(
        SmoothKind::S,
        &["x".to_string()],
        &[1],
        &options,
        &ds,
        &mut notes,
        &ResourcePolicy::default_library(),
        1,
    )
    .expect("build default univariate tp smooth");

    // Extract the requested center count, looking through an `Auto` wrapper
    // when present; any other basis kind or strategy fails the test.
    let centers = match &basis {
        SmoothBasisSpec::ThinPlate { spec, .. } => match &spec.center_strategy {
            CenterStrategy::Auto(inner) => match inner.as_ref() {
                CenterStrategy::FarthestPoint { num_centers }
                | CenterStrategy::EqualMass { num_centers }
                | CenterStrategy::EqualMassCovarRepresentative { num_centers }
                | CenterStrategy::KMeans { num_centers, .. } => *num_centers,
                other => panic!("unexpected auto inner center strategy: {other:?}"),
            },
            CenterStrategy::FarthestPoint { num_centers }
            | CenterStrategy::EqualMass { num_centers }
            | CenterStrategy::EqualMassCovarRepresentative { num_centers }
            | CenterStrategy::KMeans { num_centers, .. } => *num_centers,
            other => panic!("unexpected center strategy: {other:?}"),
        },
        other => panic!("expected ThinPlate basis, got {other:?}"),
    };

    // #1074: the mgcv-sized basis-dim ceiling assertion was removed with the
    // cap it tested. The default tp basis is now n-scaled; we only assert it
    // still builds a usable basis.
    assert!(
        centers >= 1,
        "default univariate tp must still build a usable basis (centers={centers})",
    );
}
4785
/// gam#1629: a default 2-D `matern(x1, x2)` (no explicit `length_scale`)
/// must leave the length-scale at the `0.0` auto sentinel — NOT the full
/// data diameter — so the planner's `auto_init_length_scale_in_place` seeds
/// it on the wiggly/resolving side (`max_range / sqrt(n)`), the same regime
/// thin-plate uses. The previous `default_matern_length_scale` returned the
/// full diameter, which is non-zero, so the `0.0`-gated auto-init was a
/// no-op and the κ-optimizer started in the over-smoothed corner and parked
/// there (truth-RMSE ~6× worse than thin-plate/tensor on identical
/// high-frequency 2-D surfaces, insensitive to `k`). This pins the corrected
/// seed geometry without a fit/optimizer in the loop.
#[test]
fn default_matern_2d_seeds_resolving_length_scale_not_overscaled_diameter() {
    // A fine multi-frequency 2-D grid (the #1629 reproduction shape): each
    // axis spans [0, 1], so the data diameter is ≈ √2 ≈ 1.4; the resolving
    // seed must be far smaller than that so high-frequency structure stays
    // reachable.
    let side = 24usize; // n = 576
    let mut rows: Vec<Vec<f64>> = Vec::with_capacity(side * side);
    for i in 0..side {
        for j in 0..side {
            let x1 = i as f64 / (side - 1) as f64; // [0, 1]
            let x2 = j as f64 / (side - 1) as f64; // [0, 1]
            let y = (6.0 * x1).sin() * (6.0 * x2).cos();
            rows.push(vec![y, x1, x2]);
        }
    }
    let n = rows.len();
    let ds = continuous_dataset(&["y", "x1", "x2"], rows);

    // No `length_scale` option is supplied, so the default path is exercised.
    let mut options = BTreeMap::new();
    options.insert("bs".to_string(), "gp".to_string()); // gp ⇒ Matérn
    let mut notes = Vec::new();
    let mut basis = build_smooth_basis(
        SmoothKind::S,
        &["x1".to_string(), "x2".to_string()],
        &[1, 2],
        &options,
        &ds,
        &mut notes,
        &ResourcePolicy::default_library(),
        1,
    )
    .expect("build default 2-D matern smooth");

    // (1) The builder must emit the auto sentinel, not a baked-in diameter.
    let (feature_cols, seeded_length_scale) = match &basis {
        SmoothBasisSpec::Matern {
            feature_cols, spec, ..
        } => (feature_cols.clone(), spec.length_scale),
        other => panic!("expected Matern basis, got {other:?}"),
    };
    assert_eq!(
        seeded_length_scale, 0.0,
        "default matern() must leave length_scale at the 0.0 auto sentinel \
         (got {seeded_length_scale}); a non-zero diameter default re-enters the \
         over-smoothed basin and disables the planner's wiggly-side auto-init",
    );

    // (2) After the shared auto-init runs, the realized length-scale must
    // land in the resolving regime: `max_range / sqrt(n)`, far below the
    // data diameter. This is the seed the κ-optimizer starts REML from.
    crate::smooth::auto_init_length_scale_in_basis(ds.values.view(), &mut basis);
    let realized = match &basis {
        SmoothBasisSpec::Matern { spec, .. } => spec.length_scale,
        other => panic!("expected Matern basis after auto-init, got {other:?}"),
    };
    // The expected seed is computed by the same helper on the same feature
    // columns, so this checks that the in-basis init agrees with it.
    let expected =
        crate::smooth::auto_initial_length_scale(ds.values.view(), &feature_cols);
    assert!(
        (realized - expected).abs() <= 1e-12,
        "auto-init must seed the wiggly-side length scale max_range/sqrt(n) \
         (expected {expected}, got {realized})",
    );

    // Sanity: the resolving seed is well below the per-axis range (≈1.0).
    // Before the fix the seed was the full diameter (≈√2 ≈ 1.414); the
    // resolving seed here is ≈ 1.0 / sqrt(576) ≈ 0.042, ~30× smaller.
    let max_range = 1.0_f64; // each axis spans [0, 1]
    assert!(
        realized < max_range / 4.0,
        "matern seed length_scale {realized} must be in the resolving regime, \
         not the over-smoothed diameter corner (n={n}, max_range≈{max_range})",
    );
}
4869
4870 /// gam#1778: `matern(..., periodic=true)` and `thinplate(..., periodic=true)`
4871 /// must be ACCEPTED. The squash-merge that wired periodic support into the
4872 /// matern/thinplate basis specs forgot to add the periodic option keys to
4873 /// those two builders' `validate_known_options` whitelists (only `duchon`
4874 /// got both), so `periodic=`/`period=`/`cyclic=`/`period_start=`/`period_end=`
4875 /// were rejected as unknown options even though the spec/builder consume them.
4876 /// Before the whitelist fix this returned an "unknown option" error.
4877 #[test]
4878 fn matern_and_thinplate_accept_periodic_option() {
4879 let n = 200usize;
4880 let rows: Vec<Vec<f64>> = (0..n)
4881 .map(|i| {
4882 let x = -3.0 + 6.0 * (i as f64) / ((n - 1) as f64);
4883 vec![x.sin(), x]
4884 })
4885 .collect();
4886 let ds = continuous_dataset(&["y", "x"], rows);
4887
4888 // matern() with periodic=true must build without an unknown-option error.
4889 let mut matern_opts = BTreeMap::new();
4890 matern_opts.insert("bs".to_string(), "gp".to_string()); // gp ⇒ Matérn
4891 matern_opts.insert("periodic".to_string(), "true".to_string());
4892 let mut notes = Vec::new();
4893 let matern_basis = build_smooth_basis(
4894 SmoothKind::S,
4895 &["x".to_string()],
4896 &[1],
4897 &matern_opts,
4898 &ds,
4899 &mut notes,
4900 &ResourcePolicy::default_library(),
4901 1,
4902 )
4903 .expect("matern(x, periodic=true) must be accepted");
4904 match &matern_basis {
4905 SmoothBasisSpec::Matern { spec, .. } => assert!(
4906 spec.periodic.is_some(),
4907 "periodic=true must thread a Some(periodic) into the matern spec",
4908 ),
4909 other => panic!("expected Matern basis, got {other:?}"),
4910 }
4911
4912 // thinplate()/tps() with periodic=true must likewise be accepted.
4913 let mut tps_opts = BTreeMap::new();
4914 tps_opts.insert("bs".to_string(), "tp".to_string());
4915 tps_opts.insert("periodic".to_string(), "true".to_string());
4916 let mut notes = Vec::new();
4917 let tps_basis = build_smooth_basis(
4918 SmoothKind::S,
4919 &["x".to_string()],
4920 &[1],
4921 &tps_opts,
4922 &ds,
4923 &mut notes,
4924 &ResourcePolicy::default_library(),
4925 1,
4926 )
4927 .expect("thinplate(x, periodic=true) must be accepted");
4928 match &tps_basis {
4929 SmoothBasisSpec::ThinPlate { spec, .. } => assert!(
4930 spec.periodic.is_some(),
4931 "periodic=true must thread a Some(periodic) into the thinplate spec",
4932 ),
4933 other => panic!("expected ThinPlate basis, got {other:?}"),
4934 }
4935 }
4936
4937 /// Regression: an explicit scalar `periodic=false` on a radial spatial smooth
4938 /// must build a NON-periodic basis. The scalar-boolean shortcut used to emit
4939 /// `Some(vec![None; dim])`, which the 1-D radial builders route on via
4940 /// `spec.periodic.is_some()` (and the Duchon arm even back-fills the data
4941 /// range into a lone `None`), so `periodic=false` silently produced a
4942 /// *periodic* smooth — the opposite of what was asked. The spec's `periodic`
4943 /// field must be `None` for every radial base (matern / thinplate / duchon),
4944 /// matching the bracketed `[false]` form.
4945 #[test]
4946 fn scalar_periodic_false_builds_non_periodic_radial_smooth() {
4947 let n = 200usize;
4948 let rows: Vec<Vec<f64>> = (0..n)
4949 .map(|i| {
4950 let x = -3.0 + 6.0 * (i as f64) / ((n - 1) as f64);
4951 vec![x.sin(), x]
4952 })
4953 .collect();
4954 let ds = continuous_dataset(&["y", "x"], rows);
4955
4956 let build = |bs: &str| -> SmoothBasisSpec {
4957 let mut opts = BTreeMap::new();
4958 opts.insert("bs".to_string(), bs.to_string());
4959 opts.insert("periodic".to_string(), "false".to_string());
4960 let mut notes = Vec::new();
4961 build_smooth_basis(
4962 SmoothKind::S,
4963 &["x".to_string()],
4964 &[1],
4965 &opts,
4966 &ds,
4967 &mut notes,
4968 &ResourcePolicy::default_library(),
4969 1,
4970 )
4971 .unwrap_or_else(|e| panic!("s(x, bs={bs}, periodic=false) must be accepted: {e}"))
4972 };
4973
4974 match &build("gp") {
4975 SmoothBasisSpec::Matern { spec, .. } => assert!(
4976 spec.periodic.is_none(),
4977 "periodic=false must leave the matern spec non-periodic, got {:?}",
4978 spec.periodic
4979 ),
4980 other => panic!("expected Matern basis, got {other:?}"),
4981 }
4982 match &build("tp") {
4983 SmoothBasisSpec::ThinPlate { spec, .. } => assert!(
4984 spec.periodic.is_none(),
4985 "periodic=false must leave the thinplate spec non-periodic, got {:?}",
4986 spec.periodic
4987 ),
4988 other => panic!("expected ThinPlate basis, got {other:?}"),
4989 }
4990 match &build("duchon") {
4991 SmoothBasisSpec::Duchon { spec, .. } => assert!(
4992 spec.periodic.is_none(),
4993 "periodic=false must leave the duchon spec non-periodic (no data-range \
4994 back-fill), got {:?}",
4995 spec.periodic
4996 ),
4997 other => panic!("expected Duchon basis, got {other:?}"),
4998 }
4999 }
5000
5001 fn inferred_tensor_basis_product(ds: &Dataset) -> usize {
5002 let parsed = parse_formula("y ~ te(theta, h)").expect("parse tensor formula");
5003 let col_map = ds.column_map();
5004 let mut notes = Vec::new();
5005 let terms = build_termspec(
5006 &parsed.terms,
5007 ds,
5008 &col_map,
5009 &mut notes,
5010 &ResourcePolicy::default_library(),
5011 )
5012 .expect("build tensor termspec");
5013 let SmoothBasisSpec::TensorBSpline { spec, .. } = &terms.smooth_terms[0].basis else {
5014 panic!("expected tensor smooth");
5015 };
5016 spec.marginalspecs
5017 .iter()
5018 .map(|marginal| match marginal.knotspec {
5019 BSplineKnotSpec::Generate {
5020 num_internal_knots, ..
5021 } => num_internal_knots + marginal.degree + 1,
5022 BSplineKnotSpec::PeriodicUniform { num_basis, .. } => num_basis,
5023 BSplineKnotSpec::Automatic {
5024 num_internal_knots: Some(num_internal_knots),
5025 ..
5026 } => num_internal_knots + marginal.degree + 1,
5027 BSplineKnotSpec::Automatic {
5028 num_internal_knots: None,
5029 ..
5030 } => panic!("test helper cannot infer automatic knot count"),
5031 BSplineKnotSpec::Provided(ref knots) => {
5032 knots.len().saturating_sub(marginal.degree + 1)
5033 }
5034 // cr basis dimension equals the knot count (no degree offset).
5035 BSplineKnotSpec::NaturalCubicRegression { ref knots } => knots.len(),
5036 })
5037 .product()
5038 }
5039
5040 fn tensor_margin_basis_sizes(ds: &Dataset, formula: &str) -> Vec<usize> {
5041 let parsed = parse_formula(formula).expect("parse tensor formula");
5042 let col_map = ds.column_map();
5043 let mut notes = Vec::new();
5044 let terms = build_termspec(
5045 &parsed.terms,
5046 ds,
5047 &col_map,
5048 &mut notes,
5049 &ResourcePolicy::default_library(),
5050 )
5051 .expect("build tensor termspec");
5052 let SmoothBasisSpec::TensorBSpline { spec, .. } = &terms.smooth_terms[0].basis else {
5053 panic!("expected tensor smooth");
5054 };
5055 spec.marginalspecs
5056 .iter()
5057 .map(|marginal| match marginal.knotspec {
5058 BSplineKnotSpec::Generate {
5059 num_internal_knots, ..
5060 } => num_internal_knots + marginal.degree + 1,
5061 BSplineKnotSpec::PeriodicUniform { num_basis, .. } => num_basis,
5062 BSplineKnotSpec::Automatic {
5063 num_internal_knots: Some(num_internal_knots),
5064 ..
5065 } => num_internal_knots + marginal.degree + 1,
5066 BSplineKnotSpec::Automatic {
5067 num_internal_knots: None,
5068 ..
5069 } => panic!("test helper cannot infer automatic knot count"),
5070 BSplineKnotSpec::Provided(ref knots) => {
5071 knots.len().saturating_sub(marginal.degree + 1)
5072 }
5073 // cr basis dimension equals the knot count (no degree offset).
5074 BSplineKnotSpec::NaturalCubicRegression { ref knots } => knots.len(),
5075 })
5076 .collect()
5077 }
5078
5079 #[test]
5080 fn validate_known_options_lists_valid_option_names_for_unknown_parameter() {
5081 let mut options = BTreeMap::new();
5082 options.insert("lengt_scale".to_string(), "0.25".to_string());
5083 let err = validate_known_options(
5084 "matern",
5085 &options,
5086 &["type", "bs", "length_scale", "centers", "k", "nu"],
5087 )
5088 .expect_err("unknown smooth option should be rejected");
5089 assert!(
5090 err.contains("matern() does not accept option `lengt_scale`"),
5091 "error should name the invalid option, got: {err}"
5092 );
5093 assert!(
5094 err.contains("did you mean one of [length_scale]"),
5095 "error should suggest the closest valid option, got: {err}"
5096 );
5097 assert!(
5098 err.contains("Valid options: ["),
5099 "error should list valid option names, got: {err}"
5100 );
5101 }
5102
5103 #[test]
5104 fn tensor_k_accepts_square_bracket_per_margin_list() {
5105 let ds = continuous_dataset(
5106 &["y", "x", "z"],
5107 (0..40)
5108 .map(|i| {
5109 let x = i as f64 / 39.0;
5110 let z = ((i * 7) % 40) as f64 / 39.0;
5111 vec![x.sin() + z.cos(), x, z]
5112 })
5113 .collect(),
5114 );
5115
5116 assert_eq!(
5117 tensor_margin_basis_sizes(&ds, "y ~ te(x, z, k=[5, 6])"),
5118 vec![5, 6],
5119 "square-bracket k lists should materialize the requested per-margin values"
5120 );
5121 }
5122
5123 /// #1776 / #1752: a bare doubly-cyclic tensor `te(x, z, bs=c('cc','cc'))`
5124 /// with NO explicit `period=` must build — each cyclic margin wraps on its
5125 /// own observed `[min, max]` data span (mirroring mgcv's `bs="cc"` and the
5126 /// 1-D cyclic fallback), instead of hard-erroring "periodic but requires an
5127 /// explicit period". The periodic-radial refactor (c8c3192fa) replaced that
5128 /// fallback with an unconditional `period=`-required error and orphaned the
5129 /// `margin_is_cc` binding that drives it (the #1776 dead-binding `-D
5130 /// warnings` build break). This pins the restored data-range derivation so a
5131 /// regression that drops the `None if margin_is_cc` branch trips here, fast,
5132 /// with no fit/optimizer in the loop.
5133 #[test]
5134 fn bare_doubly_cyclic_tensor_derives_period_from_data_range_1776() {
5135 let ds = continuous_dataset(
5136 &["y", "x", "z"],
5137 (0..40)
5138 .map(|i| {
5139 let x = i as f64 / 39.0;
5140 let z = ((i * 7) % 40) as f64 / 39.0;
5141 vec![x.sin() + z.cos(), x, z]
5142 })
5143 .collect(),
5144 );
5145
5146 let parsed = parse_formula("y ~ te(x, z, bs=c('cc','cc'))")
5147 .expect("parse doubly-cyclic tensor formula");
5148 let col_map = ds.column_map();
5149 let mut notes = Vec::new();
5150 // Must NOT hard-error: the bare cyclic margins derive their period from
5151 // the observed data range (the restored #1752 fallback).
5152 let terms = build_termspec(
5153 &parsed.terms,
5154 &ds,
5155 &col_map,
5156 &mut notes,
5157 &ResourcePolicy::default_library(),
5158 )
5159 .expect(
5160 "bare cc-cc tensor must build via the data-range period fallback (#1776/#1752), \
5161 not hard-error on a missing explicit period",
5162 );
5163 let SmoothBasisSpec::TensorBSpline { spec, .. } = &terms.smooth_terms[0].basis else {
5164 panic!("expected tensor smooth");
5165 };
5166 assert_eq!(
5167 spec.marginalspecs.len(),
5168 2,
5169 "te(x, z) builds exactly two tensor margins"
5170 );
5171 for (axis, marginal) in spec.marginalspecs.iter().enumerate() {
5172 assert!(
5173 matches!(marginal.knotspec, BSplineKnotSpec::PeriodicUniform { .. }),
5174 "cyclic margin {axis} must build a periodic (wrapped) knotspec from the \
5175 data range, got {:?}",
5176 marginal.knotspec
5177 );
5178 }
5179 }
5180
5181 #[test]
5182 fn parse_cylinder_periodic_options_match_requested_forms() {
5183 let mut opts = BTreeMap::new();
5184 opts.insert("periodic".to_string(), "[0]".to_string());
5185 opts.insert("period".to_string(), "[2*pi, None]".to_string());
5186 let axes = parse_periodic_axes(&opts, 2).expect("axes");
5187 let periods = parse_periods(&opts, &axes).expect("periods");
5188 assert_eq!(axes, vec![true, false]);
5189 assert!((periods[0].unwrap() - 2.0 * std::f64::consts::PI).abs() < 1e-12);
5190 assert_eq!(periods[1], None);
5191
5192 let mut boundary_opts = BTreeMap::new();
5193 boundary_opts.insert(
5194 "boundary".to_string(),
5195 "['periodic', 'natural']".to_string(),
5196 );
5197 boundary_opts.insert("period".to_string(), "[2*pi, None]".to_string());
5198 let boundary_axes = parse_periodic_axes(&boundary_opts, 2).expect("boundary axes");
5199 let boundary_periods =
5200 parse_periods(&boundary_opts, &boundary_axes).expect("boundary periods");
5201 assert_eq!(boundary_axes, vec![true, false]);
5202 assert!((boundary_periods[0].unwrap() - 2.0 * std::f64::consts::PI).abs() < 1e-12);
5203 assert_eq!(boundary_periods[1], None);
5204
5205 let mut unicode_opts = BTreeMap::new();
5206 unicode_opts.insert("periodic".to_string(), "[0,1]".to_string());
5207 unicode_opts.insert("period".to_string(), "[2π, τ]".to_string());
5208 let unicode_axes = parse_periodic_axes(&unicode_opts, 2).expect("unicode axes");
5209 let unicode_periods = parse_periods(&unicode_opts, &unicode_axes).expect("unicode periods");
5210 assert_eq!(unicode_axes, vec![true, true]);
5211 assert!((unicode_periods[0].unwrap() - 2.0 * std::f64::consts::PI).abs() < 1e-12);
5212 assert!((unicode_periods[1].unwrap() - std::f64::consts::TAU).abs() < 1e-12);
5213 }
5214
5215 /// The tensor boundary-token guard must ACCEPT `clamped`/`open` (the
5216 /// B-spline-clamped, non-periodic margin spelling) alongside the periodic
5217 /// selectors and the other inert non-periodic markers, and still REJECT a
5218 /// genuine endpoint constraint like `anchored`. This locks the #415 /
5219 /// cylinder fix (`te(theta, z, boundary=['periodic','clamped'])`, mgcv
5220 /// `te(bs=c("cc","ps"))`) in the fast unit lane — the end-to-end cylinder
5221 /// recovery test is R-gated (`run_r` + mgcv), so without this the guard
5222 /// regressing back to rejecting `clamped` would slip through CPU CI.
5223 #[test]
5224 fn tensor_boundary_tokens_accept_clamped_open_reject_anchored() {
5225 fn boundary(raw: &str, dim: usize) -> Result<(), String> {
5226 let mut opts = BTreeMap::new();
5227 opts.insert("boundary".to_string(), raw.to_string());
5228 validate_tensor_boundary_tokens(&opts, dim)
5229 }
5230
5231 // Mixed periodic + clamped (the cylinder) and its bare/case/quote
5232 // variants are all accepted.
5233 for raw in [
5234 "['periodic', 'clamped']",
5235 "['periodic', 'open']",
5236 "['cc', 'clamped']",
5237 "['clamped', 'natural']",
5238 "[Periodic, CLAMPED]",
5239 "c('cc', 'clamped')", // mgcv-style c(...) vector form round-trips
5240 ] {
5241 assert!(
5242 boundary(raw, 2).is_ok(),
5243 "boundary={raw:?} must be accepted (clamped/open/inert non-periodic markers)"
5244 );
5245 }
5246
5247 // `bc=` is an accepted alias for `boundary=`.
5248 let mut bc_opts = BTreeMap::new();
5249 bc_opts.insert("bc".to_string(), "['periodic', 'clamped']".to_string());
5250 assert!(validate_tensor_boundary_tokens(&bc_opts, 2).is_ok());
5251
5252 // A genuine endpoint constraint has no ordinary-margin meaning on a
5253 // tensor and must still be surfaced as a clean unsupported-feature error
5254 // rather than silently dropped.
5255 let err = boundary("['periodic', 'anchored']", 2)
5256 .expect_err("anchored endpoint constraint must be rejected on a tensor margin");
5257 assert!(
5258 err.contains("anchored") && err.contains("not supported"),
5259 "rejection must name the offending token and be an unsupported-feature error: {err}"
5260 );
5261
5262 // Absent boundary/bc is a no-op success.
5263 assert!(validate_tensor_boundary_tokens(&BTreeMap::new(), 2).is_ok());
5264 }
5265
5266 #[test]
5267 fn parse_single_axis_periodic_zero_as_axis_not_false() {
5268 let mut opts = BTreeMap::new();
5269 opts.insert("periodic".to_string(), "[0]".to_string());
5270 opts.insert("period".to_string(), "2*pi".to_string());
5271 opts.insert("origin".to_string(), "0".to_string());
5272 let axes = parse_periodic_axes(&opts, 1).expect("axes");
5273 let periods = parse_periods(&opts, &axes).expect("periods");
5274 let origins = parse_period_origins(&opts, &axes).expect("origins");
5275 assert_eq!(axes, vec![true]);
5276 assert!((periods[0].unwrap() - 2.0 * std::f64::consts::PI).abs() < 1e-12);
5277 assert_eq!(origins[0], Some(0.0));
5278 }
5279
5280 #[test]
5281 fn one_dimensional_bspline_accepts_boundary_periodic() {
5282 let ds = continuous_dataset(
5283 &["y", "theta"],
5284 (0..16)
5285 .map(|i| {
5286 let theta = std::f64::consts::TAU * i as f64 / 16.0;
5287 vec![theta.sin(), theta]
5288 })
5289 .collect(),
5290 );
5291 let parsed = parse_formula("y ~ s(theta, boundary=periodic, period=2*pi, origin=0, k=8)")
5292 .expect("parse");
5293 let col_map = ds.column_map();
5294 let mut notes = Vec::new();
5295 let terms = build_termspec(
5296 &parsed.terms,
5297 &ds,
5298 &col_map,
5299 &mut notes,
5300 &gam_runtime::resource::ResourcePolicy::default_library(),
5301 )
5302 .expect("periodic boundary should build");
5303 let SmoothBasisSpec::BSpline1D { spec, .. } = &terms.smooth_terms[0].basis else {
5304 panic!("expected 1D B-spline");
5305 };
5306 assert!(matches!(
5307 &spec.knotspec,
5308 BSplineKnotSpec::PeriodicUniform {
5309 data_range,
5310 num_basis: 8
5311 } if *data_range == (0.0, std::f64::consts::TAU)
5312 ));
5313 }
5314
5315 #[test]
5316 fn univariate_smooth_accepts_mgcv_cubic_regression_aliases() {
5317 let ds = continuous_dataset(
5318 &["y", "x"],
5319 (0..32)
5320 .map(|i| {
5321 let x = i as f64 / 31.0;
5322 vec![x * x, x]
5323 })
5324 .collect(),
5325 );
5326 let col_map = ds.column_map();
5327
5328 for (selector, expect_double_penalty) in [("cr", false), ("cs", true)] {
5329 let formula = format!("y ~ s(x, bs='{selector}')");
5330 let parsed = parse_formula(&formula).expect("parse cr/cs smooth");
5331 let mut notes = Vec::new();
5332 let terms = build_termspec(
5333 &parsed.terms,
5334 &ds,
5335 &col_map,
5336 &mut notes,
5337 &gam_runtime::resource::ResourcePolicy::default_library(),
5338 )
5339 .unwrap_or_else(|err| panic!("bs='{selector}' must build a 1-D smooth, got: {err:?}"));
5340 let SmoothBasisSpec::BSpline1D { spec, .. } = &terms.smooth_terms[0].basis else {
5341 panic!(
5342 "bs='{selector}' must lower to a BSpline1D; got {:?}",
5343 terms.smooth_terms[0].basis
5344 );
5345 };
5346 assert_eq!(
5347 spec.double_penalty, expect_double_penalty,
5348 "bs='{selector}' must default double_penalty to mgcv's convention \
5349 (cr=no-shrinkage, cs=shrinkage); got double_penalty={}",
5350 spec.double_penalty
5351 );
5352 }
5353 }
5354
5355 #[test]
5356 fn univariate_ps_small_k_degree_reduces_through_build(/* gam#1130 */) {
5357 // mgcv accepts `s(x, bs="ps", k=3)` (and the default cubic-regression
5358 // `s(x, k=3)`) by silently reducing the cubic basis to a quadratic.
5359 // The univariate ps/bspline build path used to reject this with
5360 // "k too small for degree 3"; it must now lower to a degree-2 basis
5361 // with zero internal knots (num_basis = k = 3), matching the te(...)
5362 // margin behaviour fixed in b75f55a91. Verified across the ps alias
5363 // and the default (cr) selector that both route through
5364 // parse_ps_internal_knots.
5365 let ds = continuous_dataset(
5366 &["y", "x"],
5367 (0..32)
5368 .map(|i| {
5369 let x = i as f64 / 31.0;
5370 vec![x * x, x]
5371 })
5372 .collect(),
5373 );
5374 let col_map = ds.column_map();
5375
5376 for formula in ["y ~ s(x, bs='ps', k=3)", "y ~ s(x, k=3)"] {
5377 let parsed = parse_formula(formula).expect("parse small-k ps/cr smooth");
5378 let mut notes = Vec::new();
5379 let terms = build_termspec(
5380 &parsed.terms,
5381 &ds,
5382 &col_map,
5383 &mut notes,
5384 &gam_runtime::resource::ResourcePolicy::default_library(),
5385 )
5386 .unwrap_or_else(|err| {
5387 panic!("`{formula}` must degree-reduce, not error; got: {err:?}")
5388 });
5389 let SmoothBasisSpec::BSpline1D { spec, .. } = &terms.smooth_terms[0].basis else {
5390 panic!(
5391 "`{formula}` must lower to a BSpline1D; got {:?}",
5392 terms.smooth_terms[0].basis
5393 );
5394 };
5395 assert_eq!(
5396 spec.degree, 2,
5397 "`{formula}` must drop the cubic default to a quadratic basis"
5398 );
5399 let num_internal = match &spec.knotspec {
5400 BSplineKnotSpec::Generate {
5401 num_internal_knots, ..
5402 } => *num_internal_knots,
5403 BSplineKnotSpec::Automatic {
5404 num_internal_knots: Some(n),
5405 ..
5406 } => *n,
5407 other => panic!("`{formula}` unexpected knotspec: {other:?}"),
5408 };
5409 assert_eq!(
5410 num_internal, 0,
5411 "`{formula}` must have zero internal knots (num_basis = k = 3)"
5412 );
5413 // Resulting basis dimension is num_internal + degree + 1 = 3 = k.
5414 assert!(
5415 spec.penalty_order >= 1 && spec.penalty_order <= spec.degree,
5416 "`{formula}` penalty_order {} must satisfy 1 <= order <= degree={}",
5417 spec.penalty_order,
5418 spec.degree
5419 );
5420 }
5421 }
5422
5423 #[test]
5424 fn formula_shape_constraint_round_trips_and_rejects_bogus() {
5425 let ds = continuous_dataset(
5426 &["y", "x"],
5427 (0..32)
5428 .map(|i| {
5429 let x = i as f64 / 31.0;
5430 vec![x * x, x]
5431 })
5432 .collect(),
5433 );
5434 let col_map = ds.column_map();
5435
5436 let parsed =
5437 parse_formula("y ~ s(x, shape=monotone_increasing)").expect("parse monotone smooth");
5438 let mut notes = Vec::new();
5439 let terms = build_termspec(
5440 &parsed.terms,
5441 &ds,
5442 &col_map,
5443 &mut notes,
5444 &gam_runtime::resource::ResourcePolicy::default_library(),
5445 )
5446 .expect("monotone smooth should build");
5447 assert_eq!(
5448 terms.smooth_terms[0].shape,
5449 ShapeConstraint::MonotoneIncreasing
5450 );
5451
5452 let parsed_bad = parse_formula("y ~ s(x, shape=bogus)").expect("parse bogus shape");
5453 let mut notes_bad = Vec::new();
5454 let err = build_termspec(
5455 &parsed_bad.terms,
5456 &ds,
5457 &col_map,
5458 &mut notes_bad,
5459 &gam_runtime::resource::ResourcePolicy::default_library(),
5460 )
5461 .expect_err("bogus shape must error");
5462 assert!(
5463 format!("{err:?}").contains("unknown shape constraint"),
5464 "got: {err:?}"
5465 );
5466 }
5467
5468 #[test]
5469 fn default_sphere_smooth_uses_spherical_farthest_point_centers() {
5470 let ds = continuous_dataset(
5471 &["y", "lat", "lon"],
5472 (0..24)
5473 .map(|i| {
5474 let t = i as f64 / 24.0;
5475 let lat = -60.0 + 120.0 * t;
5476 let lon = -180.0 + 360.0 * ((7 * i) % 24) as f64 / 24.0;
5477 vec![lat.to_radians().sin(), lat, lon]
5478 })
5479 .collect(),
5480 );
5481 let parsed = parse_formula("y ~ sphere(lat, lon)").expect("parse");
5482 let col_map = ds.column_map();
5483 let mut notes = Vec::new();
5484 let terms = build_termspec(
5485 &parsed.terms,
5486 &ds,
5487 &col_map,
5488 &mut notes,
5489 &gam_runtime::resource::ResourcePolicy::default_library(),
5490 )
5491 .expect("build sphere termspec");
5492 let SmoothBasisSpec::Sphere { spec, .. } = &terms.smooth_terms[0].basis else {
5493 panic!("expected sphere term");
5494 };
5495 assert!(matches!(
5496 spec.center_strategy,
5497 CenterStrategy::FarthestPoint { .. }
5498 ));
5499 }
5500
5501 #[test]
5502 fn one_dimensional_duchon_defaults_to_scale_free_length_scale() {
5503 let ds = continuous_dataset(
5504 &["y", "x"],
5505 (0..32)
5506 .map(|i| {
5507 let x = i as f64 / 31.0;
5508 vec![(std::f64::consts::TAU * x).sin(), x]
5509 })
5510 .collect(),
5511 );
5512 let parsed = parse_formula("y ~ duchon(x)").expect("parse");
5513 let col_map = ds.column_map();
5514 let mut notes = Vec::new();
5515 let terms = build_termspec(
5516 &parsed.terms,
5517 &ds,
5518 &col_map,
5519 &mut notes,
5520 &gam_runtime::resource::ResourcePolicy::default_library(),
5521 )
5522 .expect("build default duchon termspec");
5523 let SmoothBasisSpec::Duchon { spec, .. } = &terms.smooth_terms[0].basis else {
5524 panic!("expected Duchon term");
5525 };
5526 assert_eq!(spec.length_scale, None);
5527 }
5528
5529 #[test]
5530 fn one_dimensional_duchon_length_scale_opts_into_hybrid_mode() {
5531 let ds = continuous_dataset(
5532 &["y", "x"],
5533 (0..32)
5534 .map(|i| {
5535 let x = i as f64 / 31.0;
5536 vec![(std::f64::consts::TAU * x).sin(), x]
5537 })
5538 .collect(),
5539 );
5540 let parsed = parse_formula("y ~ duchon(x, length_scale=0.25)").expect("parse");
5541 let col_map = ds.column_map();
5542 let mut notes = Vec::new();
5543 let terms = build_termspec(
5544 &parsed.terms,
5545 &ds,
5546 &col_map,
5547 &mut notes,
5548 &gam_runtime::resource::ResourcePolicy::default_library(),
5549 )
5550 .expect("build hybrid duchon termspec");
5551 let SmoothBasisSpec::Duchon { spec, .. } = &terms.smooth_terms[0].basis else {
5552 panic!("expected Duchon term");
5553 };
5554 assert_eq!(spec.length_scale, Some(0.25));
5555 }
5556
5557 #[test]
5558 fn parse_matern_nu_accepts_equivalent_half_integer_forms() {
5559 let cases = [
5560 ("1/2", MaternNu::Half),
5561 (" 1 / 2 ", MaternNu::Half),
5562 (".5", MaternNu::Half),
5563 ("0.50", MaternNu::Half),
5564 ("half", MaternNu::Half),
5565 ("3 / 2", MaternNu::ThreeHalves),
5566 ("1.50", MaternNu::ThreeHalves),
5567 ("5 / 2", MaternNu::FiveHalves),
5568 ("2.500000000000", MaternNu::FiveHalves),
5569 ("7 / 2", MaternNu::SevenHalves),
5570 ("3.50", MaternNu::SevenHalves),
5571 ("9 / 2", MaternNu::NineHalves),
5572 ("4.50", MaternNu::NineHalves),
5573 ];
5574 for (raw, expected) in cases {
5575 let parsed = parse_matern_nu(raw).expect(raw);
5576 assert!(
5577 matches!(
5578 (parsed, expected),
5579 (MaternNu::Half, MaternNu::Half)
5580 | (MaternNu::ThreeHalves, MaternNu::ThreeHalves)
5581 | (MaternNu::FiveHalves, MaternNu::FiveHalves)
5582 | (MaternNu::SevenHalves, MaternNu::SevenHalves)
5583 | (MaternNu::NineHalves, MaternNu::NineHalves)
5584 ),
5585 "parsed {raw:?} as {parsed:?}, expected {expected:?}"
5586 );
5587 }
5588 }
5589
5590 #[test]
5591 fn parse_matern_nu_rejects_unsupported_or_invalid_values() {
5592 for raw in ["1", "2", "11/2", "1/0", "nan", "fast"] {
5593 let err = parse_matern_nu(raw).expect_err(raw);
5594 assert!(
5595 err.contains("supported half-integer values"),
5596 "unexpected error for {raw:?}: {err}"
5597 );
5598 }
5599 }
5600
5601 #[test]
5602 fn parse_ps_k_promotes_underexpressive_cubic_basis() {
5603 let mut opts = BTreeMap::new();
5604 opts.insert("k".to_string(), "4".to_string());
5605 let (internal, inferred, eff_degree) = parse_ps_internal_knots(&opts, 3, 20).expect("k=4");
5606 assert_eq!(internal, 2);
5607 assert_eq!(eff_degree, 3);
5608 assert!(!inferred);
5609
5610 opts.insert("k".to_string(), "6".to_string());
5611 let (internal, inferred, eff_degree) = parse_ps_internal_knots(&opts, 3, 20).expect("k=6");
5612 assert_eq!(internal, 2);
5613 assert_eq!(eff_degree, 3);
5614 assert!(!inferred);
5615
5616 opts.insert("k".to_string(), "10".to_string());
5617 let (internal, inferred, eff_degree) = parse_ps_internal_knots(&opts, 3, 20).expect("k=10");
5618 assert_eq!(internal, 6);
5619 assert_eq!(eff_degree, 3);
5620 assert!(!inferred);
5621 }
5622
5623 #[test]
5624 fn parse_ps_internal_knots_drops_degree_for_small_k() {
5625 // mgcv's `s(x, bs="ps", k=3)` with the default cubic basis silently
5626 // reduces to a quadratic (`degree=2`) marginal. `k=3, degree=3`
5627 // should yield a quadratic basis with zero internal knots
5628 // (`num_basis = k = 3`).
5629 let mut opts = BTreeMap::new();
5630 opts.insert("k".to_string(), "3".to_string());
5631 let (internal, inferred, eff_degree) = parse_ps_internal_knots(&opts, 3, 20).expect("k=3");
5632 assert_eq!(eff_degree, 2);
5633 assert_eq!(internal, 0);
5634 assert!(!inferred);
5635
5636 // `k=2` reduces to a linear (`degree=1`) marginal — the smallest
5637 // non-trivial spline basis.
5638 opts.insert("k".to_string(), "2".to_string());
5639 let (internal, inferred, eff_degree) = parse_ps_internal_knots(&opts, 3, 20).expect("k=2");
5640 assert_eq!(eff_degree, 1);
5641 assert_eq!(internal, 0);
5642 assert!(!inferred);
5643
5644 // The under-2 case is structurally under-specified and rejected even
5645 // by the degree-reducing variant: no B-spline basis has fewer than
5646 // two functions.
5647 opts.insert("k".to_string(), "1".to_string());
5648 let err = parse_ps_internal_knots(&opts, 3, 20)
5649 .expect_err("k=1 is below the irreducible spline floor");
5650 assert!(err.contains("requires k >= 2"), "unexpected error: {err}");
5651
5652 // When the user already passed `k >= degree+1`, the helper must
5653 // preserve the existing knot geometry exactly.
5654 opts.insert("k".to_string(), "4".to_string());
5655 let (internal, inferred, eff_degree) = parse_ps_internal_knots(&opts, 3, 20).expect("k=4");
5656 assert_eq!(eff_degree, 3);
5657 assert_eq!(internal, 2);
5658 assert!(!inferred);
5659 }
5660
5661 #[test]
5662 fn factor_smooth_marginal_degree_reduces_for_small_k() {
5663 let ds = factor_dataset();
5664 let col_map = ds.column_map();
5665
5666 for (k, expected_degree) in [(3usize, 2usize), (2usize, 1usize)] {
5667 let parsed =
5668 parse_formula(&format!("y ~ s(x, g, bs=fs, k={k})")).expect("parse factor smooth");
5669 let mut notes = Vec::new();
5670 let terms = build_termspec(
5671 &parsed.terms,
5672 &ds,
5673 &col_map,
5674 &mut notes,
5675 &gam_runtime::resource::ResourcePolicy::default_library(),
5676 )
5677 .unwrap_or_else(|err| panic!("fs k={k} should degree-reduce, got: {err:?}"));
5678 let SmoothBasisSpec::FactorSmooth { spec } = &terms.smooth_terms[0].basis else {
5679 panic!(
5680 "expected factor smooth, got {:?}",
5681 terms.smooth_terms[0].basis
5682 );
5683 };
5684 assert_eq!(spec.marginal.degree, expected_degree);
5685 assert!(
5686 spec.marginal.penalty_order <= spec.marginal.degree,
5687 "penalty_order {} must be clamped to degree {}",
5688 spec.marginal.penalty_order,
5689 spec.marginal.degree
5690 );
5691 let basis_size = match spec.marginal.knotspec {
5692 BSplineKnotSpec::Generate {
5693 num_internal_knots, ..
5694 } => num_internal_knots + spec.marginal.degree + 1,
5695 BSplineKnotSpec::Automatic {
5696 num_internal_knots: Some(num_internal_knots),
5697 ..
5698 } => num_internal_knots + spec.marginal.degree + 1,
5699 ref other => panic!("unexpected factor-smooth knotspec: {other:?}"),
5700 };
5701 assert_eq!(basis_size, k);
5702 }
5703 }
5704
5705 /// Build a dataset with a ternary continuous covariate `x ∈ {0,1,2}` and a
5706 /// 2-level categorical group `g`, for the low-cardinality cr-cap tests.
5707 fn ternary_factor_dataset() -> Dataset {
5708 let rows = (0..120)
5709 .map(|i| {
5710 let x = (i % 3) as f64;
5711 let g = (i % 2) as f64;
5712 vec![x + g, x, g]
5713 })
5714 .collect::<Vec<_>>();
5715 Dataset {
5716 headers: vec!["y".into(), "x".into(), "g".into()],
5717 values: Array2::from_shape_vec(
5718 (rows.len(), 3),
5719 rows.into_iter().flat_map(|row| row.into_iter()).collect(),
5720 )
5721 .expect("rectangular ternary factor test data"),
5722 schema: DataSchema {
5723 columns: vec![
5724 SchemaColumn {
5725 name: "y".into(),
5726 kind: ColumnKindTag::Continuous,
5727 levels: vec![],
5728 },
5729 SchemaColumn {
5730 name: "x".into(),
5731 kind: ColumnKindTag::Continuous,
5732 levels: vec![],
5733 },
5734 SchemaColumn {
5735 name: "g".into(),
5736 kind: ColumnKindTag::Categorical,
5737 levels: vec!["a".into(), "b".into()],
5738 },
5739 ],
5740 },
5741 column_kinds: vec![
5742 ColumnKindTag::Continuous,
5743 ColumnKindTag::Continuous,
5744 ColumnKindTag::Categorical,
5745 ],
5746 }
5747 }
5748
#[test]
fn univariate_cr_smooth_caps_knots_to_data_support() {
    // Regression for #1541. `s(x, bs=cr, k=10)` on a covariate with only three
    // distinct values must not hard-fail in cr-knot selection ("cubic
    // regression spline with k=10 requires at least 10 distinct values, got
    // 3"). Instead the cr basis is capped to the data support — one value-knot
    // at each of {0, 1, 2} — which is full-rank for the data and can still
    // represent any three group means.
    let rows = (0..90)
        .map(|i| {
            let level = (i % 3) as f64;
            vec![level, level]
        })
        .collect();
    let ds = continuous_dataset(&["y", "x"], rows);
    let col_map = ds.column_map();
    let parsed = parse_formula("y ~ s(x, bs=cr, k=10)").expect("parse cr smooth");
    let policy = gam_runtime::resource::ResourcePolicy::default_library();
    let mut notes = Vec::new();
    let terms = build_termspec(&parsed.terms, &ds, &col_map, &mut notes, &policy)
        .expect("cr k=10 must cap to data support instead of erroring");

    let spec = match &terms.smooth_terms[0].basis {
        SmoothBasisSpec::BSpline1D { spec, .. } => spec,
        _ => panic!("expected BSpline1D for s(x, bs=cr)"),
    };
    let knots = match &spec.knotspec {
        BSplineKnotSpec::NaturalCubicRegression { knots } => knots,
        other => panic!("expected cr knotspec, got {other:?}"),
    };

    // The knot set is exactly the three distinct covariate values.
    assert_eq!(knots.len(), 3, "cr basis not capped to 3 distinct values");
    assert_eq!(knots.as_slice().unwrap(), &[0.0, 1.0, 2.0]);

    // The reduction must be visible to the user (mgcv warns in the same case).
    let cap_reported = notes.iter().any(|note| note.contains("data-support cap"));
    assert!(
        cap_reported,
        "cap not reported in inference notes: {notes:?}"
    );
}
5788
#[test]
fn univariate_cr_smooth_binary_covariate_degrades_to_bspline() {
    // Regression for #1541. A BINARY covariate offers only 2 distinct values,
    // too few for ANY cr spline (which needs >= 3). `s(x, bs=cr)` must then
    // degrade to a B-spline marginal — the default basis the same data already
    // fits — rather than hard-fail.
    let rows = (0..80)
        .map(|i| {
            let bit = (i % 2) as f64;
            vec![bit, bit]
        })
        .collect();
    let ds = continuous_dataset(&["y", "x"], rows);
    let col_map = ds.column_map();
    let parsed = parse_formula("y ~ s(x, bs=cr, k=10)").expect("parse cr smooth");
    let policy = gam_runtime::resource::ResourcePolicy::default_library();
    let mut notes = Vec::new();
    let terms = build_termspec(&parsed.terms, &ds, &col_map, &mut notes, &policy)
        .expect("binary cr must degrade to B-spline instead of erroring");

    let spec = match &terms.smooth_terms[0].basis {
        SmoothBasisSpec::BSpline1D { spec, .. } => spec,
        _ => panic!("expected BSpline1D for s(x, bs=cr)"),
    };

    // The realized knot spec must be anything but the natural cubic one.
    let built_cr = matches!(
        spec.knotspec,
        BSplineKnotSpec::NaturalCubicRegression { .. }
    );
    assert!(
        !built_cr,
        "binary covariate must NOT build a cr basis, got {:?}",
        spec.knotspec
    );

    // The degradation must be surfaced through the inference notes.
    let degradation_reported = notes
        .iter()
        .any(|note| note.contains("Degraded to the linear B-spline"));
    assert!(
        degradation_reported,
        "degradation not reported in inference notes: {notes:?}"
    );
}
5829
#[test]
fn sz_factor_smooth_low_cardinality_uses_bspline_marginal() {
    // #1605: the `sz` factor-smooth marginal is the SAME penalized B-spline
    // its `fs` sibling uses, NOT a natural cubic regression (`cr`) marginal:
    // the hard natural boundary conditions f''=0 of `cr` bias curved
    // deviations (a consistency failure).
    //
    // #1542 — the original reason for this test — is subsumed. With a B-spline
    // marginal a low-cardinality covariate needs no special cr data-support
    // cap and cannot hard-fail the way the old cr-marginal `sz` spelling did;
    // the build simply succeeds, exactly as `fs` already does on this data.
    let ds = ternary_factor_dataset();
    let col_map = ds.column_map();
    let parsed = parse_formula("y ~ s(x, g, bs=sz, k=10)").expect("parse sz factor smooth");
    let policy = gam_runtime::resource::ResourcePolicy::default_library();
    let mut notes = Vec::new();
    let terms = build_termspec(&parsed.terms, &ds, &col_map, &mut notes, &policy)
        .expect("sz on a ternary covariate must build (B-spline marginal), not hard-fail");

    let spec = match &terms.smooth_terms[0].basis {
        SmoothBasisSpec::FactorSmooth { spec } => spec,
        _ => panic!("expected FactorSmooth for s(x, g, bs=sz)"),
    };

    let marginal_is_cr = matches!(
        spec.marginal.knotspec,
        BSplineKnotSpec::NaturalCubicRegression { .. }
    );
    assert!(
        !marginal_is_cr,
        "sz marginal must be a B-spline (curvature-capable), not the \
         natural-BC cr basis; got {:?}",
        spec.marginal.knotspec
    );
}
5865
5866 /// A dataset with a genuinely continuous covariate `x` (many distinct
5867 /// values) and a `L`-level grouping factor `g`, suitable for building a
5868 /// real factor-smooth marginal with a non-trivial {const, linear} null
5869 /// space. `y` is unused by the structural penalty checks below.
5870 fn continuous_x_factor_dataset(n: usize, n_groups: usize) -> Dataset {
5871 let rows = (0..n)
5872 .map(|i| {
5873 let x = i as f64 / (n as f64 - 1.0);
5874 let g = (i % n_groups) as f64;
5875 vec![x + g, x, g]
5876 })
5877 .collect::<Vec<_>>();
5878 let levels: Vec<String> = (0..n_groups).map(|k| format!("g{k}")).collect();
5879 Dataset {
5880 headers: vec!["y".into(), "x".into(), "g".into()],
5881 values: Array2::from_shape_vec(
5882 (rows.len(), 3),
5883 rows.into_iter().flat_map(|row| row.into_iter()).collect(),
5884 )
5885 .expect("rectangular continuous-x factor data"),
5886 schema: DataSchema {
5887 columns: vec![
5888 SchemaColumn {
5889 name: "y".into(),
5890 kind: ColumnKindTag::Continuous,
5891 levels: vec![],
5892 },
5893 SchemaColumn {
5894 name: "x".into(),
5895 kind: ColumnKindTag::Continuous,
5896 levels: vec![],
5897 },
5898 SchemaColumn {
5899 name: "g".into(),
5900 kind: ColumnKindTag::Categorical,
5901 levels,
5902 },
5903 ],
5904 },
5905 column_kinds: vec![
5906 ColumnKindTag::Continuous,
5907 ColumnKindTag::Continuous,
5908 ColumnKindTag::Categorical,
5909 ],
5910 }
5911 }
5912
5913 fn factor_smooth_spec_for(formula: &str, ds: &Dataset) -> FactorSmoothSpec {
5914 let col_map = ds.column_map();
5915 let parsed = parse_formula(formula).expect("parse factor smooth formula");
5916 let mut notes = Vec::new();
5917 let terms = build_termspec(
5918 &parsed.terms,
5919 ds,
5920 &col_map,
5921 &mut notes,
5922 &gam_runtime::resource::ResourcePolicy::default_library(),
5923 )
5924 .expect("build factor smooth term");
5925 let SmoothBasisSpec::FactorSmooth { spec } = &terms.smooth_terms[0].basis else {
5926 panic!("expected FactorSmooth basis for `{formula}`");
5927 };
5928 spec.clone()
5929 }
5930
/// #1605: the sum-to-zero factor smooth `s(x, g, bs="sz")` under-fit data
/// drawn from its own model class. Its deviation blocks carried ONLY the
/// marginal wiggliness penalty, leaving the {const, linear} null space of
/// every deviation curve completely unpenalized; the single combined
/// wiggliness λ could therefore not separate per-group intercept/slope
/// variance from curvature variance, and REML parked it over-smoothed (same
/// defect class as the closed #700, more severe). mgcv's `bs="fs"` sibling
/// avoids the gap with a SEPARATE per-null-dimension ridge (one λ each) — the
/// double-penalty `I_L ⊗ S_j` structure. The fix gives `sz` the same
/// null-space-ridge structure, mapped into the zero-sum CONTRAST space so the
/// constraint (and `sz`'s distinctness from `fs`) is preserved.
///
/// This test pins the structural defect. After the fix the `sz` deviation
/// build must carry MORE than just its wiggliness penalty(s) — exactly one
/// extra null-space-ridge penalty per marginal null direction, matching the
/// count `fs` carries — while keeping the narrower `(L-1)·p` zero-sum design
/// (NOT the `L·p` full-rank `fs` design). Before the fix `sz` carried only the
/// wiggliness penalties and this test fails.
#[test]
fn sz_factor_smooth_carries_null_space_ridge_like_fs() {
    // Level count of the fixture; also the fallback when the spec carries no
    // frozen level list.
    const N_GROUPS: usize = 4;
    let ds = continuous_x_factor_dataset(180, N_GROUPS);
    let mut workspace = crate::basis::BasisWorkspace::new();

    // Build both flavours on identical data, sharing one workspace.
    let sz_spec = factor_smooth_spec_for("y ~ s(x, g, bs=sz, k=8)", &ds);
    let sz_built = crate::smooth::build_factor_smooth(
        ds.values.view(),
        &sz_spec,
        "sz_term",
        &mut workspace,
    )
    .expect("build sz factor smooth");

    let fs_spec = factor_smooth_spec_for("y ~ s(x, g, bs=fs, k=8)", &ds);
    let fs_built = crate::smooth::build_factor_smooth(
        ds.values.view(),
        &fs_spec,
        "fs_term",
        &mut workspace,
    )
    .expect("build fs factor smooth");

    let sz_count = sz_built.penalties.len();
    let fs_count = fs_built.penalties.len();

    // Penalty structure (#1074 + #1605).
    //
    // `fs` is the exchangeable random-effect smooth: all `L` level blocks
    // share ONE wiggliness λ per marginal penalty, plus one rank-1 null-space
    // ridge per marginal null direction (the #1605 double penalty).
    //
    // `sz` is the sum-to-zero factor smooth, and mgcv's `smooth.construct.sz`
    // emits ONE penalty matrix PER LEVEL — `L` independent curvature smoothing
    // parameters — so REML can shrink a low-amplitude group's deviation hard
    // while leaving a busy group nearly unpenalized. We mirror that: the
    // single marginal wiggliness penalty is split into its `L` independent
    // zero-sum-contrast summands (`L-1` free per-group blocks `(e_k e_kᵀ)⊗S`
    // plus the reference coupling block `(11ᵀ)⊗S`), each with its own λ, while
    // the null-space ridges stay POOLED (the per-group intercept/slope
    // shrinkage mgcv pools under one variance even for `sz`).
    //
    // Hence, with `nw` marginal wiggliness penalties and `nn` marginal null
    // directions, fs has `nw + nn` penalties and sz has `L·nw + nn`: sz must
    // carry strictly MORE penalties than fs (the per-group split), and the
    // surplus must be exactly `(L-1)·nw`.
    let n_levels = sz_spec
        .group_frozen_levels
        .as_ref()
        .map_or(N_GROUPS, |levels| levels.len());
    assert!(n_levels >= 3, "test needs >=3 groups, got {n_levels}");

    // fs = nw + nn ⇒ nn = fs_penalties - nw. The B-spline marginal has a
    // single wiggliness penalty (one difference/curvature operator), so the
    // per-group split adds exactly (L-1)·nw = (L-1) penalties on top of fs.
    let nw = 1usize;
    let per_level_surplus = (n_levels - 1) * nw;
    let expected_sz = fs_count + per_level_surplus;
    assert_eq!(
        sz_count, expected_sz,
        "sz must split its wiggliness penalty per level (#1074): expected \
         fs_count {fs_count} + (L-1)·nw {per_level_surplus} = {expected_sz}, but sz had {sz_count}",
    );
    assert!(
        sz_count > fs_count,
        "sz must carry strictly more penalties than fs after the per-group \
         split (sz={sz_count}, fs={fs_count})",
    );

    // The null-space ridges must still be present (the #1605 property that
    // keeps the deviation curvature un-over-smoothed). Once the `L` per-group
    // wiggliness blocks are set aside, what remains are the pooled null
    // ridges, and there must be at least one: a B-spline marginal has a
    // non-empty {const, linear} null space.
    let n_wiggliness = n_levels * nw;
    assert!(
        sz_count > n_wiggliness,
        "sz deviation block carries no null-space ridge (penalties={sz_count}, \
         wiggliness blocks={n_wiggliness}); the null space is unpenalized and REML \
         over-smooths the deviations",
    );

    // The zero-sum constraint must survive: the sz design stays the NARROWER
    // `(L-1)·p` contrast design, strictly narrower than the full-rank `L·p`
    // fs design. This guards against "fixing" sz by making it identical to fs
    // (which would break identifiability / sum-to-zero).
    assert!(
        sz_built.dim < fs_built.dim,
        "sz design width {} must be strictly less than fs width {} \
         (zero-sum contrast drops one level block)",
        sz_built.dim,
        fs_built.dim,
    );

    // Every penalty/metadata vector must stay parallel to the penalty list
    // (the length invariant the downstream REML assembly relies on).
    for parallel_len in [
        sz_built.nullspaces.len(),
        sz_built.penaltyinfo.len(),
        sz_built.null_eigenvectors.len(),
    ] {
        assert_eq!(sz_count, parallel_len);
    }
}
6054
6055 /// #1457: `y ~ s(x, by=g) + g` with a BARE categorical `g` must NOT lower to
6056 /// two `g` design blocks. The bare `+ g` is auto-promoted to a single
6057 /// penalized random-effect block owning the factor's full level offsets; the
6058 /// `by=` branch must then recognize that owner and skip adding its own
6059 /// unpenalized treatment-coded main effect. Before the fix the dedup guard
6060 /// recognized only explicit `group(g)` (a `ParsedTerm::RandomEffect`), so the
6061 /// auto-promoted bare-`+ g` block slipped past and a spurious second `g`
6062 /// block (plus an extra smoothing parameter) was added. Assert exactly ONE
6063 /// `g` random/categorical block, and that adding the bare `+ g` introduces no
6064 /// extra `g` blocks beyond `y ~ s(x, by=g)` alone.
6065 fn factor_dataset_l3() -> Dataset {
6066 // `g` is categorical with THREE levels (encoded 0.0/1.0/2.0).
6067 let rows = (0..30)
6068 .map(|i| {
6069 let x = i as f64 / 29.0;
6070 let g = (i % 3) as f64;
6071 vec![x + g, x, g]
6072 })
6073 .collect::<Vec<_>>();
6074 Dataset {
6075 headers: vec!["y".into(), "x".into(), "g".into()],
6076 values: Array2::from_shape_vec(
6077 (rows.len(), 3),
6078 rows.into_iter().flat_map(|row| row.into_iter()).collect(),
6079 )
6080 .expect("rectangular L=3 factor test data"),
6081 schema: DataSchema {
6082 columns: vec![
6083 SchemaColumn {
6084 name: "y".into(),
6085 kind: ColumnKindTag::Continuous,
6086 levels: vec![],
6087 },
6088 SchemaColumn {
6089 name: "x".into(),
6090 kind: ColumnKindTag::Continuous,
6091 levels: vec![],
6092 },
6093 SchemaColumn {
6094 name: "g".into(),
6095 kind: ColumnKindTag::Categorical,
6096 levels: vec!["a".into(), "b".into(), "c".into()],
6097 },
6098 ],
6099 },
6100 column_kinds: vec![
6101 ColumnKindTag::Continuous,
6102 ColumnKindTag::Continuous,
6103 ColumnKindTag::Categorical,
6104 ],
6105 }
6106 }
6107
/// #1457: `y ~ s(x, by=g) + g` must lower to the same number of `g` blocks as
/// `y ~ s(x, by=g)` alone — exactly one. Blocks are counted as entries of
/// `random_effect_terms` whose name is `g`.
#[test]
fn factor_by_smooth_plus_bare_categorical_does_not_duplicate_factor_block() {
    let ds = factor_dataset_l3();
    let col_map = ds.column_map();

    // Lowers `formula` and counts the random-effect terms named `g`.
    let count_g_blocks = |formula: &str| -> usize {
        let parsed = parse_formula(formula).expect("parse by-smooth formula");
        let policy = ResourcePolicy::default_library();
        let mut notes = Vec::new();
        match build_termspec(&parsed.terms, &ds, &col_map, &mut notes, &policy) {
            Ok(terms) => terms
                .random_effect_terms
                .iter()
                .filter(|term| term.name == "g")
                .count(),
            Err(err) => panic!("`{formula}` must build, got: {err:?}"),
        }
    };

    // Baseline: the standalone factor-by smooth carries exactly ONE `g` block
    // (the unpenalized treatment-coded factor main effect added by the `by=`
    // branch).
    let by_only = count_g_blocks("y ~ s(x, by=g, k=10)");
    assert_eq!(
        by_only, 1,
        "`y ~ s(x, by=g)` must produce exactly one `g` design block"
    );

    // The bug: a bare `+ g` (auto-promoted to a penalized random block owning
    // the same level offsets) must NOT introduce a second `g` block. Before
    // the fix this count was 2.
    let by_plus_bare = count_g_blocks("y ~ s(x, by=g, k=10) + g");
    assert_eq!(
        by_plus_bare, 1,
        "`y ~ s(x, by=g) + g` must collapse to ONE `g` block (#1457): the bare \
         `+ g` already owns the factor's level offsets, so the `by=` branch \
         must not add a second, treatment-coded main effect"
    );

    // Relative form of the same check: zero extra blocks versus the baseline.
    assert_eq!(
        by_plus_bare, by_only,
        "the bare `+ g` collision must add zero extra `g` blocks (#1457)"
    );
}
6157
#[test]
fn parse_tensor_periods_and_origins_aliases() {
    // Two periodic axes configured through the `boundary`, `periods`, and
    // `origins` option keys.
    let opts: BTreeMap<String, String> = [
        ("boundary", "['periodic', 'periodic']"),
        ("periods", "[7, 24]"),
        ("origins", "[0, -12]"),
    ]
    .iter()
    .map(|&(key, value)| (key.to_string(), value.to_string()))
    .collect();

    // Parse everything first, then check, so a parse failure is reported
    // before any value mismatch.
    let axes = parse_periodic_axes(&opts, 2).expect("axes");
    let periods = parse_periods(&opts, &axes).expect("periods");
    let origins = parse_period_origins(&opts, &axes).expect("origins");

    assert_eq!(axes, vec![true, true]);
    assert_eq!(periods, vec![Some(7.0), Some(24.0)]);
    assert_eq!(origins, vec![Some(0.0), Some(-12.0)]);
}
6174
#[test]
fn tensor_smooth_honors_per_margin_k_list() {
    // A list-valued `k=[9,5]` must give the first margin 9 basis functions and
    // the second 5. The first axis is declared periodic with period 2π.
    let rows = (0..20)
        .map(|i| {
            let theta = std::f64::consts::TAU * i as f64 / 20.0;
            let h = -1.0 + 2.0 * (i % 5) as f64 / 4.0;
            vec![theta.cos() + h, theta, h]
        })
        .collect();
    let ds = continuous_dataset(&["y", "theta", "h"], rows);
    let parsed = parse_formula(
        "y ~ te(theta, h, periodic=[0], period=[2*pi, None], origin=[0, None], k=[9,5])",
    )
    .expect("parse tensor formula");
    let col_map = ds.column_map();
    let policy = gam_runtime::resource::ResourcePolicy::default_library();
    let mut notes = Vec::new();
    let terms = build_termspec(&parsed.terms, &ds, &col_map, &mut notes, &policy)
        .expect("build tensor terms");

    let spec = match &terms.smooth_terms[0].basis {
        SmoothBasisSpec::TensorBSpline { spec, .. } => spec,
        _ => panic!("expected tensor B-spline"),
    };

    // Decode each margin's knot spec into its basis dimension.
    let dims: Vec<_> = spec
        .marginalspecs
        .iter()
        .map(|margin| match &margin.knotspec {
            BSplineKnotSpec::PeriodicUniform { num_basis, .. } => *num_basis,
            BSplineKnotSpec::Generate {
                num_internal_knots, ..
            } => *num_internal_knots + margin.degree + 1,
            // The mgcv-default `cr` margin (#1074) reports its basis size as
            // the number of value-knots placed.
            BSplineKnotSpec::NaturalCubicRegression { knots } => knots.len(),
            _ => panic!("unexpected tensor marginal knotspec"),
        })
        .collect();
    assert_eq!(dims, vec![9, 5]);
}
6220
#[test]
fn tensor_smooth_honors_per_margin_k_axis_aliases() {
    // 12-point grid with margins `x = t` and `y = 1 - t`; the response column
    // is `t` as well.
    let rows = (0..12)
        .map(|i| {
            let t = i as f64 / 11.0;
            vec![t, t, 1.0 - t]
        })
        .collect();
    let ds = continuous_dataset(&["resp", "x", "y"], rows);

    // `k_x` / `k_y` name the per-margin basis sizes by margin variable.
    let sizes = tensor_margin_basis_sizes(&ds, "resp ~ te(x, y, k_x=9, k_y=5)");
    assert_eq!(
        sizes,
        vec![9, 5],
        "k_<margin> aliases should materialize requested per-margin values"
    );
}
6238
#[test]
fn tensor_smooth_low_cardinality_axis_falls_back_to_lower_degree_basis() {
    // mgcv-style: `te(x, b, k=c(5, 2))` with a BINARY second margin (values
    // {0, 1} only) is a legitimate request — the binary axis can hold at most
    // a 2-function linear basis. k=2 must NOT be rejected with a "k too small
    // for degree 3" config error; the spline degree on the binary axis drops
    // to k_axis - 1 (here 1, linear) while the continuous margin keeps the
    // requested degree=3, k=5.
    let rows = (0..40)
        .map(|i| {
            let x = i as f64 / 39.0;
            let b = (i % 2) as f64;
            vec![x.sin() + 0.5 * b, x, b]
        })
        .collect();
    let ds = continuous_dataset(&["y", "x", "b"], rows);
    let parsed = parse_formula("y ~ te(x, b, k=[5, 2])").expect("parse tensor with k=[5,2]");
    let col_map = ds.column_map();
    let policy = gam_runtime::resource::ResourcePolicy::default_library();
    let mut notes = Vec::new();
    let terms = build_termspec(&parsed.terms, &ds, &col_map, &mut notes, &policy)
        .expect("build tensor with binary margin");

    let spec = match &terms.smooth_terms[0].basis {
        SmoothBasisSpec::TensorBSpline { spec, .. } => spec,
        _ => panic!("expected tensor B-spline for te(x, b)"),
    };

    // Continuous margin keeps degree=3 and k=5; the binary margin drops to
    // degree=1 (linear) so the requested k=2 yields exactly two basis
    // functions before tensor-product identifiability is applied.
    let continuous = &spec.marginalspecs[0];
    let binary = &spec.marginalspecs[1];
    assert_eq!(continuous.degree, 3);
    assert_eq!(binary.degree, 1);
    assert!(
        (1..=binary.degree).contains(&binary.penalty_order),
        "binary margin penalty_order {} must satisfy 1 <= order <= degree={}",
        binary.penalty_order,
        binary.degree
    );

    // Decode a margin's knot spec into its basis dimension.
    let basis_size = |margin: &BSplineBasisSpec| match &margin.knotspec {
        BSplineKnotSpec::PeriodicUniform { num_basis, .. } => *num_basis,
        BSplineKnotSpec::Generate {
            num_internal_knots, ..
        } => *num_internal_knots + margin.degree + 1,
        BSplineKnotSpec::Automatic {
            num_internal_knots: Some(count),
            ..
        } => *count + margin.degree + 1,
        // The mgcv-default `cr` margin (#1074) reports its basis size as the
        // number of value-knots placed.
        BSplineKnotSpec::NaturalCubicRegression { knots } => knots.len(),
        _ => panic!("unexpected tensor marginal knotspec"),
    };
    assert_eq!(basis_size(continuous), 5);
    assert_eq!(basis_size(binary), 2);
}
6301
#[test]
fn tensor_smooth_uniform_k_is_capped_to_a_low_cardinality_margins_distinct_values() {
    // Regression: a SINGLE `k=5` applied to every axis of `te(x, b, k=5)` with
    // a BINARY second margin (`b ∈ {0, 1}`) must build a valid tensor, NOT
    // hard-fail in cr-knot selection ("cubic regression spline with k=5
    // requires at least 5 distinct values, got 2"). mgcv caps a margin's basis
    // to its data support: the binary axis becomes the 2-function (linear)
    // margin while the continuous axis keeps the requested k=5. This is the
    // `te(age, badh, k=5)` real-data case that previously errored.

    /// Basis dimension implied by a margin's knot spec.
    fn margin_basis_size(margin: &BSplineBasisSpec) -> usize {
        match &margin.knotspec {
            BSplineKnotSpec::PeriodicUniform { num_basis, .. } => *num_basis,
            BSplineKnotSpec::Generate {
                num_internal_knots, ..
            } => num_internal_knots + margin.degree + 1,
            BSplineKnotSpec::Automatic {
                num_internal_knots: Some(count),
                ..
            } => count + margin.degree + 1,
            BSplineKnotSpec::NaturalCubicRegression { knots } => knots.len(),
            other => panic!("unexpected tensor marginal knotspec: {other:?}"),
        }
    }

    let rows = (0..40)
        .map(|i| {
            let x = i as f64 / 39.0;
            let b = (i % 2) as f64;
            vec![x.sin() + 0.5 * b, x, b]
        })
        .collect();
    let ds = continuous_dataset(&["y", "x", "b"], rows);
    let parsed = parse_formula("y ~ te(x, b, k=5)").expect("parse tensor with uniform k=5");
    let col_map = ds.column_map();
    let policy = gam_runtime::resource::ResourcePolicy::default_library();
    let mut notes = Vec::new();
    let terms = build_termspec(&parsed.terms, &ds, &col_map, &mut notes, &policy)
        .expect("uniform k=5 must auto-cap the binary margin instead of erroring");

    let spec = match &terms.smooth_terms[0].basis {
        SmoothBasisSpec::TensorBSpline { spec, .. } => spec,
        _ => panic!("expected tensor B-spline for te(x, b)"),
    };

    // The binary margin is reduced to the 2-function linear basis its data
    // supports (k capped from 5 to 2, degree dropped to 1).
    let binary = &spec.marginalspecs[1];
    assert_eq!(margin_basis_size(binary), 2);
    assert_eq!(binary.degree, 1);
    // The continuous margin is unaffected by the cap (40 distinct values).
    assert_eq!(margin_basis_size(&spec.marginalspecs[0]), 5);
}
6355
#[test]
fn tensor_all_tp_margins_with_per_margin_k_routes_to_bspline_tensor() {
    // `te(x1, x2, bs=c('tp','tp'), k=c(5,5))` is mgcv's per-margin tp tensor
    // with per-margin basis sizes: a tensor product of two 1-D bases of
    // dimension 5 each. The list-valued `k=c(5,5)` is honored by
    // `parse_tensor_k_list`, producing one penalized B-spline margin per axis
    // (each spanning the requested per-axis thin-plate function space). This
    // is the same anisotropic-tensor routing the scalar/no-`k` case takes — a
    // `te()` request is ALWAYS a tensor product, never a silent isotropic
    // thin-plate substitution.
    let rows = (0..32)
        .map(|i| {
            let t = i as f64 / 31.0;
            vec![t.sin(), t, 1.0 - t]
        })
        .collect();
    let ds = continuous_dataset(&["y", "x1", "x2"], rows);
    let parsed =
        parse_formula("y ~ te(x1, x2, bs=c('tp','tp'), k=c(5,5))").expect("parse tensor");
    let col_map = ds.column_map();
    let policy = gam_runtime::resource::ResourcePolicy::default_library();
    let mut notes = Vec::new();
    let terms = build_termspec(&parsed.terms, &ds, &col_map, &mut notes, &policy)
        .expect("build tensor terms with per-margin k");

    let spec = match &terms.smooth_terms[0].basis {
        SmoothBasisSpec::TensorBSpline { spec, .. } => spec,
        other => panic!(
            "expected B-spline tensor when k=c(5,5) is supplied with bs=c('tp','tp'), got {:?}",
            other
        ),
    };

    // Since #1074 a `tp` tensor margin (k >= 3) is realized as a
    // Lancaster–Salkauskas natural cubic-regression margin (cr basis dimension
    // == knot count) rather than an open `Generate` B-spline. It is still a
    // `TensorBSpline` spec with one penalized 1-D margin per axis, so the
    // routing check above holds; only the per-margin knotspec variant changed.
    // An earlier `_ => panic!` arm pinned the pre-#1074 `Generate`-only
    // representation and was stale, so every variant is decoded to its basis
    // dimension here (mirroring the `tensor_margin_basis_sizes` helper). The
    // match is deliberately exhaustive, with no wildcard arm.
    let dims: Vec<_> = spec
        .marginalspecs
        .iter()
        .map(|margin| {
            let degree = margin.degree;
            match &margin.knotspec {
                BSplineKnotSpec::Generate {
                    num_internal_knots, ..
                }
                | BSplineKnotSpec::Automatic {
                    num_internal_knots: Some(num_internal_knots),
                    ..
                } => num_internal_knots + degree + 1,
                BSplineKnotSpec::PeriodicUniform { num_basis, .. } => *num_basis,
                BSplineKnotSpec::Provided(knots) => knots.len().saturating_sub(degree + 1),
                BSplineKnotSpec::NaturalCubicRegression { knots } => knots.len(),
                BSplineKnotSpec::Automatic {
                    num_internal_knots: None,
                    ..
                } => panic!("test cannot infer automatic knot count"),
            }
        })
        .collect();
    assert_eq!(dims, vec![5, 5]);
}
6426
#[test]
fn tensor_all_tp_margins_without_per_margin_k_builds_anisotropic_tensor() {
    // `te(x1, x2, bs=c('tp','tp'))` is a tensor-product request and must build
    // a genuine anisotropic tensor product (one smoothing parameter per
    // margin), NOT a silently-substituted multi-D isotropic thin-plate radial
    // smooth — that would be a different model (`s(x1,x2,bs='tp')`). Routing
    // is consistent whether or not `k` is list-valued: a tp margin vector
    // always realizes each axis as a 1-D penalized B-spline margin spanning
    // the same per-axis thin-plate function space (#1082).
    let rows = (0..32)
        .map(|i| {
            let t = i as f64 / 31.0;
            vec![t.sin(), t, 1.0 - t]
        })
        .collect();
    let ds = continuous_dataset(&["y", "x1", "x2"], rows);
    let parsed = parse_formula("y ~ te(x1, x2, bs=c('tp','tp'))").expect("parse tensor");
    let col_map = ds.column_map();
    let policy = gam_runtime::resource::ResourcePolicy::default_library();
    let mut notes = Vec::new();
    let terms = build_termspec(&parsed.terms, &ds, &col_map, &mut notes, &policy)
        .expect("build tensor terms without per-margin k");

    let spec = match &terms.smooth_terms[0].basis {
        SmoothBasisSpec::TensorBSpline { spec, .. } => spec,
        other => panic!(
            "te(...,bs=c('tp','tp')) must route to an anisotropic tensor product, not a \
             silent isotropic thin-plate substitution; got {:?}",
            other
        ),
    };

    let margin_count = spec.marginalspecs.len();
    assert_eq!(
        margin_count, 2,
        "tp tensor must carry one penalized B-spline margin per axis"
    );
}
6469
#[test]
fn explicit_basis_sizes_are_not_small_n_clamped() {
    // Only 12 rows shared by five smooths. The explicit `k=10` on the first
    // smooth must survive untouched: its generated knot spec is asserted to
    // carry 6 internal knots.
    let rows = (0..12)
        .map(|i| {
            let x = i as f64 / 11.0;
            vec![x.sin(), x, x * x, x + 0.1, 1.0 - x, (2.0 * x).sin()]
        })
        .collect();
    let ds = continuous_dataset(&["y", "x1", "x2", "x3", "x4", "x5"], rows);
    let parsed = parse_formula("y ~ s(x1, k=10) + s(x2) + s(x3) + s(x4) + s(x5)")
        .expect("parse multi-smooth formula");
    let col_map = ds.column_map();
    let policy = gam_runtime::resource::ResourcePolicy::default_library();
    let mut notes = Vec::new();
    let terms = build_termspec(&parsed.terms, &ds, &col_map, &mut notes, &policy)
        .expect("build multi-smooth terms");

    let spec = match &terms.smooth_terms[0].basis {
        SmoothBasisSpec::BSpline1D { spec, .. } => spec,
        _ => panic!("expected first smooth to be B-spline"),
    };
    assert!(matches!(
        &spec.knotspec,
        BSplineKnotSpec::Generate {
            num_internal_knots: 6,
            ..
        }
    ));
}
6504
#[test]
fn explicit_duchon_centers_are_not_small_n_bumped() {
    let rows = (0..12)
        .map(|i| {
            let x = i as f64 / 11.0;
            vec![x.sin(), x, x * x, x + 0.1, 1.0 - x, (2.0 * x).sin()]
        })
        .collect();
    let ds = continuous_dataset(&["y", "x1", "x2", "x3", "x4", "x5"], rows);

    // Pure 1D Duchon at default options resolves the nullspace to Linear
    // (2s < d forces escalation), i.e. 2 polynomial nullspace columns. The
    // well-posedness gate requires num_centers > polynomial_cols, making 3 the
    // smallest valid count. That is still well below the small-N bump target
    // of polynomial_cols + 4 = 6, so this exercises the "explicit value is
    // honored" path the test name advertises.
    let parsed = parse_formula("y ~ duchon(x1, centers=3) + s(x2) + s(x3) + s(x4) + s(x5)")
        .expect("parse multi-smooth formula");
    let col_map = ds.column_map();
    let policy = gam_runtime::resource::ResourcePolicy::default_library();
    let mut notes = Vec::new();
    let terms = build_termspec(&parsed.terms, &ds, &col_map, &mut notes, &policy)
        .expect("build multi-smooth terms");

    let spec = match &terms.smooth_terms[0].basis {
        SmoothBasisSpec::Duchon { spec, .. } => spec,
        _ => panic!("expected first smooth to be Duchon"),
    };
    assert!(matches!(
        spec.center_strategy,
        CenterStrategy::FarthestPoint { num_centers: 3 }
    ));
}
6542
#[test]
fn inferred_tensor_basis_cap_uses_coordinate_support_not_duplicate_rows() {
    // A 50 x 16 grid of distinct `(theta, h)` coordinates, theta-major.
    let unique_rows: Vec<Vec<f64>> = (0..50)
        .flat_map(|i| {
            let theta = i as f64 / 50.0;
            (0..16).map(move |j| {
                let h = -1.0 + 2.0 * (j as f64) / 15.0;
                vec![theta.cos() + h, theta, h]
            })
        })
        .collect();

    // The same grid stacked 12 times: more rows, no new coordinates.
    let mut repeated_rows = Vec::with_capacity(12 * unique_rows.len());
    for _ in 0..12 {
        repeated_rows.extend_from_slice(&unique_rows);
    }

    let unique = continuous_dataset(&["y", "theta", "h"], unique_rows);
    let repeated = continuous_dataset(&["y", "theta", "h"], repeated_rows);

    let unique_basis = inferred_tensor_basis_product(&unique);
    let repeated_basis = inferred_tensor_basis_product(&repeated);

    assert_eq!(
        unique_basis, repeated_basis,
        "duplicating existing tensor coordinates must not inflate inferred basis width"
    );
}
6570
#[test]
fn inferred_three_dim_tensor_basis_stays_bounded_for_reml_selection() {
    // Regression for gam#813: the inferred per-margin k must be
    // dimension-aware so the 3-D tensor width p = ∏ k_d does not explode.
    // With the old 1-D-per-margin rule a 3-D `te` defaulted to 7³=343 at
    // small n and 20³=8000 at larger n, making the (non-Kronecker-factorable)
    // full-tensor sum-to-zero penalty's O(p³) REML reparameterization a
    // multi-minute stall. The dimension-aware budget keeps the product near
    // mgcv's te default (≈5³=125) regardless of n.

    // Builds an n-row dataset with three continuous covariates, resolves
    // `te(x1, x2, x3)` with inferred (not user-supplied) basis sizes, and
    // returns the product of the per-margin basis sizes.
    let tensor_width = |n: usize| -> usize {
        let mut rows = Vec::with_capacity(n);
        for i in 0..n {
            let f = i as f64 / n as f64;
            rows.push(vec![f.sin(), f, (2.0 * f).cos(), (3.0 * f) % 1.0]);
        }
        let ds = continuous_dataset(&["y", "x1", "x2", "x3"], rows);
        let parsed = parse_formula("y ~ te(x1, x2, x3)").expect("parse 3-D tensor");
        let col_map = ds.column_map();
        let mut notes = Vec::new();
        let terms = build_termspec(
            &parsed.terms,
            &ds,
            &col_map,
            &mut notes,
            &ResourcePolicy::default_library(),
        )
        .expect("build 3-D tensor termspec");
        let SmoothBasisSpec::TensorBSpline { spec, .. } = &terms.smooth_terms[0].basis else {
            panic!("expected tensor smooth");
        };
        spec.marginalspecs
            .iter()
            .map(|m| match m.knotspec {
                // A B-spline margin has (internal knots + degree + 1) basis
                // functions.
                BSplineKnotSpec::Generate {
                    num_internal_knots, ..
                } => num_internal_knots + m.degree + 1,
                BSplineKnotSpec::Automatic {
                    num_internal_knots: Some(num_internal_knots),
                    ..
                } => num_internal_knots + m.degree + 1,
                // The mgcv-default `cr` margin (#1074) reports its basis size
                // as the number of value-knots placed.
                BSplineKnotSpec::NaturalCubicRegression { ref knots } => knots.len(),
                _ => panic!("unexpected tensor margin knotspec"),
            })
            .product()
    };

    // Small-n regime, exercised here with n=60 (the old rule defaulted to
    // 7³=343 at small n). The bound 216 = 6³ allows at most ~6 functions per
    // margin. NOTE(review): this comment previously cited n=30 as the issue's
    // data while the call has always-visible argument 60 — confirm which n is
    // intended.
    let small_n_width = tensor_width(60);
    assert!(
        small_n_width <= 216,
        "3-D te at small n must stay near the mgcv te default, got {}",
        small_n_width
    );
    // Larger n must NOT grow the product toward n³ (was 20³=8000).
    let large_n_width = tensor_width(2000);
    assert!(
        large_n_width <= 216,
        "3-D te at large n must not blow ∏k toward the data size, got {}",
        large_n_width
    );
}
6632
#[test]
fn parse_bspline_boundary_conditions_and_side_selector() {
    /// Turns literal `(key, value)` pairs into the owned option map that the
    /// boundary-condition parser takes.
    fn options(pairs: &[(&str, &str)]) -> BTreeMap<String, String> {
        pairs
            .iter()
            .map(|&(key, value)| (key.to_string(), value.to_string()))
            .collect()
    }

    // Case 1: a non-zero anchor is refused at parse time. The diagnostic has
    // to mention both the side and the value, which also shows that
    // `side=left` sends the global `anchor=` value to the left endpoint
    // rather than the right one.
    let left_anchor = options(&[
        ("boundary_conditions", "anchored"),
        ("side", "left"),
        ("anchor", "2.5"),
    ]);
    let err = parse_bspline_boundary_conditions(&left_anchor)
        .expect_err("non-zero left anchor must be rejected")
        .to_string();
    assert!(
        err.contains("left") && err.contains("2.5"),
        "rejection should name the affected side and value: {err}"
    );

    // Case 2: the per-side aliases (`start_bc` / `end_bc`) together with the
    // per-side anchor key (`right_anchor`) must land the value on the right
    // endpoint; again observed through the rejection message.
    let right_anchor = options(&[
        ("start_bc", "clamped"),
        ("end_bc", "zero"),
        ("right_anchor", "-1.0"),
    ]);
    let err = parse_bspline_boundary_conditions(&right_anchor)
        .expect_err("non-zero right anchor must be rejected")
        .to_string();
    assert!(
        err.contains("right") && err.contains("-1"),
        "rejection should name the affected side and value: {err}"
    );

    // Case 3: without a non-zero anchor the same aliases are accepted and
    // produce `Clamped` on the left and an anchor at (numerically) zero on
    // the right.
    let accepted = options(&[("start_bc", "clamped"), ("end_bc", "zero")]);
    let parsed = parse_bspline_boundary_conditions(&accepted).expect("boundary conditions");
    assert!(matches!(
        parsed.left,
        BSplineEndpointBoundaryCondition::Clamped
    ));
    assert!(matches!(
        parsed.right,
        BSplineEndpointBoundaryCondition::Anchored { value } if value.abs() < 1e-12
    ));
}
6682
#[test]
fn categorical_by_numeric_interaction_expands_treatment_coded_cells() {
    // `y ~ x:g` has NO `x` main effect, i.e. it is an interaction-only
    // numeric-by-factor model. Without that marginal parent nothing would
    // identify a dropped reference level, so the marginality-aware expansion
    // (gam#1158) has to dummy-code `g` and keep every level: a common
    // intercept with one separate x-slope column per group. Dropping the
    // reference level here would force that group's slope to zero and give a
    // rank-deficient fit, which is the regression this test protects
    // against. The parent-present, treatment-coded variant is covered by
    // `categorical_by_numeric_interaction_keeps_treatment_coding_with_parent`.
    let ds = factor_dataset();
    let col_map = ds.column_map();
    // `g` is categorical with two levels (encoded 0.0 → "a", 1.0 → "b").
    let parsed = parse_formula("y ~ x:g").expect("parse `y ~ x:g`");
    let policy = ResourcePolicy::default_library();
    let mut notes = Vec::new();
    let terms = build_termspec(&parsed.terms, &ds, &col_map, &mut notes, &policy)
        .expect("factor-aware `x:g` interaction must build, not error");

    assert_eq!(
        terms.linear_terms.len(),
        2,
        "interaction-only `x:g` keeps ALL factor levels (full dummy coding): one slope column per group"
    );

    let x_col = *col_map.get("x").expect("x column");
    let g_col = *col_map.get("g").expect("g column");
    let n = ds.values.nrows();

    // Every cell column gates on exactly one level of `g` and multiplies by
    // `x`; across the two cells each level must be used exactly once.
    let mut seen_bits = std::collections::HashSet::new();
    for term in &terms.linear_terms {
        assert!(
            term.is_interaction(),
            "the categorical-by-numeric cell is a Wilkinson-Rogers interaction"
        );
        assert_eq!(term.feature_cols, vec![x_col]);
        assert_eq!(term.categorical_levels.len(), 1);
        let (gate_col, gate_bits) = term.categorical_levels[0];
        assert_eq!(gate_col, g_col);
        assert!(seen_bits.insert(gate_bits), "each level appears once");

        // The realized column must match `1[g == gate_bits] * x` on every row.
        let column = term
            .realized_design_column(ds.values.view())
            .expect("realize cell column");
        assert_eq!(column.len(), n);
        for row in 0..n {
            let x = ds.values[[row, x_col]];
            let g = ds.values[[row, g_col]];
            let gate_open = g.to_bits() == gate_bits;
            let expected = if gate_open { x } else { 0.0 };
            let actual = column[row];
            assert!(
                (actual - expected).abs() < 1e-12,
                "row {row}: g={g}, x={x}, expected {expected}, got {}",
                actual
            );
        }
    }

    // In the interaction-only form the reference level "a" (0.0) survives
    // alongside the non-reference level "b" (1.0).
    for level in [0.0_f64, 1.0_f64] {
        assert!(seen_bits.contains(&level.to_bits()));
    }
}
6754
#[test]
fn categorical_by_numeric_interaction_keeps_treatment_coding_with_parent() {
    // In `y ~ x + x:g` the `x` main effect IS present, so the marginal
    // parent identifying a dropped reference level exists. `x:g` therefore
    // retains its historical treatment coding: reference level "a" is
    // dropped and only the slope-deviation column for "b" is produced. This
    // pins down that the marginality-aware fix (gam#1158) leaves the
    // parent-present form alone, keeping it column-space-identical to mgcv's
    // `x + x:g`.
    let ds = factor_dataset();
    let col_map = ds.column_map();
    let parsed = parse_formula("y ~ x + x:g").expect("parse `y ~ x + x:g`");
    let policy = ResourcePolicy::default_library();
    let mut notes = Vec::new();
    let terms = build_termspec(&parsed.terms, &ds, &col_map, &mut notes, &policy)
        .expect("`x + x:g` must build");

    // Expect the `x` main-effect column plus a single treatment-coded
    // interaction cell; only the interaction cells are inspected below.
    let x_col = *col_map.get("x").expect("x column");
    let g_col = *col_map.get("g").expect("g column");
    let mut interaction_cells = Vec::new();
    for candidate in &terms.linear_terms {
        if candidate.is_interaction() {
            interaction_cells.push(candidate);
        }
    }
    assert_eq!(
        interaction_cells.len(),
        1,
        "with `x` present, `x:g` is treatment-coded → one cell (reference dropped)"
    );

    let cell = interaction_cells[0];
    assert_eq!(cell.feature_cols, vec![x_col]);
    assert_eq!(cell.categorical_levels.len(), 1);
    let (gate_col, gate_bits) = cell.categorical_levels[0];
    assert_eq!(gate_col, g_col);
    // Reference "a" (0.0) was dropped, so the surviving gate is "b" (1.0).
    assert_eq!(gate_bits, 1.0_f64.to_bits());
}
6798
#[test]
fn categorical_by_categorical_interaction_expands_full_cross_cells() {
    // `y ~ f:g` is an interaction-only factor-by-factor model. Neither `f`
    // nor `g` is present as a main effect, so no marginal parent exists and
    // both factors have to be dummy-coded (gam#1159). The right design is
    // the saturated cell-means model: the full cross of all levels
    // (3 * 2 = 6 cells) with ONE reference cell removed (each factor's
    // lexicographically-first level, here f0:g0) and absorbed by the
    // intercept. That is rank 6-1 = 5 cell columns plus the intercept,
    // column-space-identical to `f*g`. The former behaviour treatment-coded
    // both factors, kept only (3-1)*(2-1) = 2 cells and folded the rest into
    // the intercept (rank-deficient); this test guards against that bug.
    let n = 30usize;
    // Row-major (y, f, g) triples: f cycles through 3 levels, g through 2.
    let flat: Vec<f64> = (0..n)
        .flat_map(|i| [(i as f64).sin(), (i % 3) as f64, (i % 2) as f64])
        .collect();
    let values = Array2::from_shape_vec((n, 3), flat).expect("rectangular cross-factor data");
    let schema = DataSchema {
        columns: vec![
            SchemaColumn {
                name: "y".into(),
                kind: ColumnKindTag::Continuous,
                levels: Vec::new(),
            },
            SchemaColumn {
                name: "f".into(),
                kind: ColumnKindTag::Categorical,
                levels: vec!["f0".into(), "f1".into(), "f2".into()],
            },
            SchemaColumn {
                name: "g".into(),
                kind: ColumnKindTag::Categorical,
                levels: vec!["g0".into(), "g1".into()],
            },
        ],
    };
    let ds = Dataset {
        headers: vec!["y".into(), "f".into(), "g".into()],
        values,
        schema,
        column_kinds: vec![
            ColumnKindTag::Continuous,
            ColumnKindTag::Categorical,
            ColumnKindTag::Categorical,
        ],
    };

    let col_map = ds.column_map();
    let parsed = parse_formula("y ~ f:g").expect("parse `y ~ f:g`");
    let policy = ResourcePolicy::default_library();
    let mut notes = Vec::new();
    let terms = build_termspec(&parsed.terms, &ds, &col_map, &mut notes, &policy)
        .expect("factor-by-factor `f:g` interaction must build, not error");

    assert_eq!(
        terms.linear_terms.len(),
        5,
        "saturated 3*2 = 6 cross cells minus one reference cell (f0:g0) = 5"
    );

    let f_col = *col_map.get("f").expect("f column");
    let g_col = *col_map.get("g").expect("g column");
    // Bit patterns of the encoded factor levels. The reference cell pairs the
    // first level of each factor (f0 = 0.0, g0 = 0.0); it must be absent
    // from the emitted cells while every other cross cell is present.
    let f_levels = [0.0_f64, 1.0, 2.0].map(f64::to_bits);
    let g_levels = [0.0_f64, 1.0].map(f64::to_bits);
    let (f0, g0) = (f_levels[0], g_levels[0]);

    let mut emitted = std::collections::HashSet::new();
    for term in &terms.linear_terms {
        // Pure cell indicator: two categorical gates and no numeric operand.
        assert!(term.feature_cols.is_empty());
        assert_eq!(term.categorical_levels.len(), 2);
        let gates: std::collections::HashMap<_, _> =
            term.categorical_levels.iter().copied().collect();
        let f_bits = *gates.get(&f_col).expect("f gate present");
        let g_bits = *gates.get(&g_col).expect("g gate present");
        // f0:g0 is the dropped reference cell.
        let is_reference_cell = f_bits == f0 && g_bits == g0;
        assert!(
            !is_reference_cell,
            "the reference cell f0:g0 must be absorbed by the intercept, not emitted"
        );
        emitted.insert((f_bits, g_bits));

        // The realized column is the 0/1 indicator of this (f, g) cell.
        let column = term
            .realized_design_column(ds.values.view())
            .expect("realize cross cell");
        for row in 0..n {
            let in_cell = ds.values[[row, f_col]].to_bits() == f_bits
                && ds.values[[row, g_col]].to_bits() == g_bits;
            let expected = if in_cell { 1.0 } else { 0.0 };
            assert!(
                (column[row] - expected).abs() < 1e-12,
                "row {row}: expected {expected}, got {}",
                column[row]
            );
        }
        assert!(
            column.iter().any(|&v| v == 1.0),
            "each cross cell must be observed in the data"
        );
    }

    // All six cross cells other than f0:g0 must have been emitted.
    f_levels
        .iter()
        .flat_map(|&fb| g_levels.iter().map(move |&gb| (fb, gb)))
        .filter(|&(fb, gb)| !(fb == f0 && gb == g0))
        .for_each(|cell| {
            assert!(
                emitted.contains(&cell),
                "saturated cross cell must be present"
            );
        });
}
6934}