gam_terms/term_builder.rs
1//! Term construction: bridge from parsed formula terms to `TermCollectionSpec`.
2//!
3//! This module takes the AST produced by `inference::formula_dsl` and a loaded
4//! dataset, resolves column references, infers knot counts and center strategies,
5//! and produces a `TermCollectionSpec` ready for `build_term_collection_design`.
6
7use std::collections::{BTreeMap, BTreeSet, HashMap};
8use std::path::PathBuf;
9
10use ndarray::{Array2, ArrayView1};
11
12use crate::basis::{
13 BSplineBasisSpec, BSplineBoundaryConditions, BSplineEndpointBoundaryCondition,
14 BSplineIdentifiability, BSplineKnotSpec, CenterCountRequest, CenterStrategy,
15 ConstantCurvatureBasisSpec, ConstantCurvatureIdentifiability, DuchonBasisSpec,
16 DuchonNullspaceOrder, DuchonOperatorPenaltySpec, MaternBasisSpec, MaternIdentifiability,
17 MaternNu, MeasureJetBasisSpec, MeasureJetIdentifiability, OneDimensionalBoundary,
18 SpatialIdentifiability, SphereMethod, SphereWahbaKernel, SphericalSplineBasisSpec,
19 SphericalSplineIdentifiability, ThinPlateBasisSpec, auto_spatial_center_strategy,
20 default_num_centers, default_spatial_center_strategy, default_spherical_harmonic_degree,
21 plan_spatial_basis, thin_plate_penalty_order,
22};
23use crate::inference::formula_dsl::{
24 ParsedTerm, SmoothKind, option_bool, option_f64, option_f64_strict, option_usize,
25 option_usize_any, option_usize_any_strict, option_usize_strict, strip_quotes,
26};
27use crate::smooth::{
28 BySmoothKind, ByVarKind, ByVariableSpec, FactorSmoothFlavour, FactorSmoothSpec,
29 LinearCoefficientGeometry, LinearTermSpec, RandomEffectTermSpec, ShapeConstraint,
30 SmoothBasisSpec, SmoothTermSpec, TensorBSplineIdentifiability,
31 TensorBSplinePenaltyDecomposition, TensorBSplineSpec, TermCollectionSpec,
32};
33use gam_data::{ColumnKindTag, DataError, EncodedDataset as Dataset};
34use gam_problem::types::ColIdx;
35use gam_runtime::resource::ResourcePolicy;
36
/// Default B-spline degree used when a smooth's `degree=` option is absent.
/// Cubic (degree 3) is the standard GAM convention: C² continuity with a low
/// knot count.
const DEFAULT_BSPLINE_DEGREE: usize = 3;

/// Default difference-penalty order used when a smooth's `penalty_order=`
/// (alias `m=`) option is absent. Second-order (curvature) is the standard
/// P-spline convention.
const DEFAULT_PENALTY_ORDER: usize = 2;

/// Default basis dimension for one-dimensional cyclic cubic P-splines.
///
/// Periodic smooths spend no coefficients on free endpoints, so they should not
/// inherit the larger open B-spline knot ceiling by default. This is only a
/// default: callers can request a richer periodic space with `k=`.
const CYCLIC_DEFAULT_BASIS_DIM: usize = 12;

/// Default shared-marginal basis dimension for `bs="fs"`/`bs="sz"` factor
/// smooths, matching mgcv's factor-smooth default `k=10`. A factor smooth
/// shares one marginal across all levels; a modest basis recovers the shared
/// signal without over-fitting each group's within-group noise (gam#903).
/// Overridden by an explicit `k`/`basis_dim`.
const FACTOR_SMOOTH_DEFAULT_BASIS_DIM: usize = 10;

/// Default row-chunk size for the out-of-core PCA-basis smooth when the
/// `chunk_size=` option is absent. Streaming the design in row blocks bounds
/// peak memory independently of the dataset row count.
const DEFAULT_PCA_CHUNK_SIZE: usize = 4096;
65
66// ---------------------------------------------------------------------------
67// Typed errors
68// ---------------------------------------------------------------------------
69
/// Typed errors emitted by term-builder helpers. `Display` reproduces the exact
/// pre-refactor `format!(...)` text byte-for-byte, so callers that string-match
/// on the message (tests, log assertions) keep working unchanged. Public-API
/// functions still return `Result<_, String>` and use `.to_string()` shims at
/// their boundary to stay compatible with callers in protected modules.
///
/// Every variant except `ColumnNotFound` carries a single pre-rendered
/// `reason` string, which is exactly what `Display` writes for it.
#[derive(Clone, Debug)]
pub enum TermBuilderError {
    /// Column-resolution / column-kind lookup failures whose context is purely
    /// internal (column-kind table out-of-sync, alias map missing an entry,
    /// etc.). User-facing "this formula references a column that doesn't
    /// exist" diagnostics use the dedicated `ColumnNotFound` variant so the
    /// FFI boundary can lift the structured payload into a Python
    /// `ColumnNotFoundError` without parsing prose.
    MissingColumn { reason: String },
    /// A formula referenced a column that is not present in the input data.
    /// Mirrors `DataError::ColumnNotFound` field-for-field so the conversion
    /// across module boundaries is a pure data move (no re-derivation, no
    /// string re-parsing). Public callers see byte-identical `Display`
    /// output to the legacy `missing_column_message` text.
    ColumnNotFound {
        /// The column name that failed to resolve.
        name: String,
        /// Formula-slot label supplied by the caller (see `resolve_role_col`);
        /// `None` for a bare lookup via `resolve_col`.
        role: Option<String>,
        /// NOTE(review): presumably the column names present in the data;
        /// filled in by `DataError::column_not_found` — confirm there.
        available: Vec<String>,
        /// NOTE(review): presumably near-miss suggestions for `name`;
        /// filled in by `DataError::column_not_found` — confirm there.
        similar: Vec<String>,
        /// NOTE(review): meaning is defined by `DataError`; looks like a flag
        /// asking the message to include a TSV/delimiter hint — confirm.
        tsv_hint: bool,
    },
    /// User-specified configuration is internally inconsistent (e.g. too few
    /// variables for a smooth type, conflicting size options, requested basis
    /// dimension below the polynomial nullspace).
    IncompatibleConfig { reason: String },
    /// Option parsing failure: malformed numeric expression, unknown option
    /// key, out-of-range integer, list-length mismatch, etc.
    InvalidOption { reason: String },
    /// User requested a feature that is intentionally not supported (unknown
    /// smooth type / method / kernel / identifiability, non-zero anchor,
    /// internal-only token, etc.).
    UnsupportedFeature { reason: String },
    /// Input data is degenerate for the requested term (constant column,
    /// non-finite categorical entries, ...).
    DegenerateData { reason: String },
    /// Term-collection-stage formula error — a node that the caller was
    /// supposed to resolve upstream reached the builder.
    MalformedFormula { reason: String },
}
114
115impl std::fmt::Display for TermBuilderError {
116 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
117 match self {
118 TermBuilderError::MissingColumn { reason }
119 | TermBuilderError::IncompatibleConfig { reason }
120 | TermBuilderError::InvalidOption { reason }
121 | TermBuilderError::UnsupportedFeature { reason }
122 | TermBuilderError::DegenerateData { reason }
123 | TermBuilderError::MalformedFormula { reason } => f.write_str(reason),
124 // Delegate to the canonical `DataError::ColumnNotFound` formatter
125 // so a single source of truth defines the human text. The
126 // intermediate `DataError` constructed here owns its strings only
127 // for the duration of the Display call — no allocation cost
128 // beyond the original payload that this variant already holds.
129 TermBuilderError::ColumnNotFound {
130 name,
131 role,
132 available,
133 similar,
134 tsv_hint,
135 } => {
136 let canonical = DataError::ColumnNotFound {
137 name: name.clone(),
138 role: role.clone(),
139 available: available.clone(),
140 similar: similar.clone(),
141 tsv_hint: *tsv_hint,
142 };
143 std::fmt::Display::fmt(&canonical, f)
144 }
145 }
146 }
147}
148
149impl From<TermBuilderError> for String {
150 fn from(err: TermBuilderError) -> String {
151 err.to_string()
152 }
153}
154
155/// Catchall lift for the term-builder's internal `Result<_, String>` helpers
156/// (numeric expression parsing, option lookup, boundary-condition parsing,
157/// ...) that flow into `build_termspec` via `?`. Maps to
158/// `IncompatibleConfig`, which is the most appropriate generic bucket for
159/// option/config-style failures — leaf sites that emit structured payloads
160/// (`From<DataError>` for column-not-found) bypass this fallback.
161impl From<String> for TermBuilderError {
162 fn from(reason: String) -> Self {
163 Self::IncompatibleConfig { reason }
164 }
165}
166
167/// Typed lift from data-layer errors. `DataError::ColumnNotFound` becomes
168/// `TermBuilderError::ColumnNotFound` field-for-field — no stringification,
169/// no information loss — so the FFI boundary downstream can dispatch on
170/// the typed variant. Other `DataError` variants degrade into
171/// `MissingColumn` since they describe column-resolution-time failures
172/// without a dedicated structured destination.
173impl From<DataError> for TermBuilderError {
174 fn from(err: DataError) -> Self {
175 match err {
176 DataError::ColumnNotFound {
177 name,
178 role,
179 available,
180 similar,
181 tsv_hint,
182 } => Self::ColumnNotFound {
183 name,
184 role,
185 available,
186 similar,
187 tsv_hint,
188 },
189 DataError::SchemaMismatch { reason }
190 | DataError::ParseError { reason }
191 | DataError::EncodingFailure { reason }
192 | DataError::EmptyInput { reason }
193 | DataError::InvalidValue { reason } => Self::MissingColumn { reason },
194 }
195 }
196}
197
198// Constructor helpers — keep error-site code compact and consistent.
199impl TermBuilderError {
200 #[inline]
201 fn missing_column(reason: impl Into<String>) -> Self {
202 TermBuilderError::MissingColumn {
203 reason: reason.into(),
204 }
205 }
206 #[inline]
207 fn incompatible_config(reason: impl Into<String>) -> Self {
208 TermBuilderError::IncompatibleConfig {
209 reason: reason.into(),
210 }
211 }
212 #[inline]
213 fn invalid_option(reason: impl Into<String>) -> Self {
214 TermBuilderError::InvalidOption {
215 reason: reason.into(),
216 }
217 }
218 #[inline]
219 fn unsupported_feature(reason: impl Into<String>) -> Self {
220 TermBuilderError::UnsupportedFeature {
221 reason: reason.into(),
222 }
223 }
224 #[inline]
225 fn degenerate_data(reason: impl Into<String>) -> Self {
226 TermBuilderError::DegenerateData {
227 reason: reason.into(),
228 }
229 }
230 #[inline]
231 fn malformed_formula(reason: impl Into<String>) -> Self {
232 TermBuilderError::MalformedFormula {
233 reason: reason.into(),
234 }
235 }
236}
237
238// ---------------------------------------------------------------------------
239// Column resolution
240// ---------------------------------------------------------------------------
241
242/// Resolve a bare column name to its index, returning a typed
243/// `DataError::ColumnNotFound` on miss so the FFI boundary can surface a
244/// structured `gamfit.ColumnNotFoundError(column=…, available=…)` rather
245/// than rely on string-classification of human prose. Internal callers that
246/// still flow `Result<_, String>` get byte-identical text via
247/// `From<DataError> for String`.
248pub fn resolve_col(col_map: &HashMap<String, usize>, name: &str) -> Result<usize, DataError> {
249 col_map
250 .get(name)
251 .copied()
252 .ok_or_else(|| DataError::column_not_found(col_map, name, None))
253}
254
255/// Like `resolve_col` but tags the missing-column payload with a role label
256/// (`"response"`, `"entry"`, `"exit"`, `"event"`, `"z"`, `"id"`, …) so the
257/// boundary-side Python exception can disambiguate which formula slot held
258/// the bad reference.
259pub fn resolve_role_col(
260 col_map: &HashMap<String, usize>,
261 name: &str,
262 role: &str,
263) -> Result<usize, DataError> {
264 col_map
265 .get(name)
266 .copied()
267 .ok_or_else(|| DataError::column_not_found(col_map, name, Some(role)))
268}
269
270fn encoded_levels_for_column(ds: &Dataset, col: ColIdx) -> Vec<(u64, String)> {
271 let mut seen = BTreeSet::<u64>::new();
272 for value in ds.values.column(col.get()) {
273 if value.is_finite() {
274 seen.insert(value.to_bits());
275 }
276 }
277 let schema_levels = ds
278 .schema
279 .columns
280 .get(col.get())
281 .map(|column| column.levels.as_slice())
282 .unwrap_or(&[]);
283 seen.into_iter()
284 .enumerate()
285 .map(|(idx, bits)| {
286 let fallback = format!("level{}", idx + 1);
287 let label = schema_levels.get(idx).cloned().unwrap_or(fallback);
288 (bits, label)
289 })
290 .collect()
291}
292
/// Return a copy of `col_map` in which `alias` additionally resolves to the
/// index of `target_column`.
///
/// The alias is added only when `target_column` exists in `col_map` and
/// `alias` is not already a key; an existing entry under `alias` is never
/// overwritten. The input map is left untouched.
pub fn column_map_with_alias(
    col_map: &HashMap<String, usize>,
    alias: &str,
    target_column: &str,
) -> HashMap<String, usize> {
    let mut result = col_map.clone();
    match col_map.get(target_column) {
        Some(&target_index) if !result.contains_key(alias) => {
            result.insert(alias.to_owned(), target_index);
        }
        _ => {}
    }
    result
}
304
305// ---------------------------------------------------------------------------
306// ParsedTerm[] + Dataset → TermCollectionSpec
307// ---------------------------------------------------------------------------
308
309pub fn build_termspec(
310 terms: &[ParsedTerm],
311 ds: &Dataset,
312 col_map: &HashMap<String, usize>,
313 inference_notes: &mut Vec<String>,
314 policy: &ResourcePolicy,
315) -> Result<TermCollectionSpec, TermBuilderError> {
316 let mut linear_terms = Vec::<LinearTermSpec>::new();
317 let mut random_terms = Vec::<RandomEffectTermSpec>::new();
318 let mut smooth_terms = Vec::<SmoothTermSpec>::new();
319 let smooth_coordinate_count = terms
320 .iter()
321 .map(|term| match term {
322 ParsedTerm::Smooth { vars, .. } => vars.len(),
323 _ => 0,
324 })
325 .sum::<usize>();
326
327 for t in terms {
328 match t {
329 ParsedTerm::Linear {
330 name,
331 explicit,
332 coefficient_min,
333 coefficient_max,
334 } => {
335 let col = resolve_col(col_map, name)?;
336 let auto_kind = ds.column_kinds.get(col).copied().ok_or_else(|| {
337 TermBuilderError::missing_column(format!(
338 "internal column-kind lookup failed for '{name}'"
339 ))
340 .to_string()
341 })?;
342 if *explicit {
343 linear_terms.push(LinearTermSpec {
344 name: name.clone(),
345 feature_col: col,
346 feature_cols: vec![col],
347 categorical_levels: vec![],
348 // Parametric linear terms are unpenalized by default
349 // (MLE, matching mgcv/glm); see #749.
350 double_penalty: false,
351 coefficient_geometry: LinearCoefficientGeometry::Unconstrained,
352 coefficient_min: *coefficient_min,
353 coefficient_max: *coefficient_max,
354 });
355 } else {
356 match auto_kind {
357 ColumnKindTag::Continuous | ColumnKindTag::Binary => {
358 linear_terms.push(LinearTermSpec {
359 name: name.clone(),
360 feature_col: col,
361 feature_cols: vec![col],
362 categorical_levels: vec![],
363 // Unpenalized parametric effect by default (#749).
364 double_penalty: false,
365 coefficient_geometry: LinearCoefficientGeometry::Unconstrained,
366 coefficient_min: *coefficient_min,
367 coefficient_max: *coefficient_max,
368 });
369 }
370 ColumnKindTag::Categorical => {
371 if coefficient_min.is_some() || coefficient_max.is_some() {
372 return Err(TermBuilderError::incompatible_config(format!(
373 "coefficient constraints are not supported for categorical auto-random-effect term '{name}'; use group({name}) or an unconstrained numeric term"
374 )));
375 }
376 random_terms.push(RandomEffectTermSpec {
377 name: name.clone(),
378 feature_col: col,
379 drop_first_level: false,
380 penalized: true,
381 frozen_levels: None,
382 });
383 }
384 }
385 }
386 }
387 ParsedTerm::BoundedLinear {
388 name,
389 min,
390 max,
391 prior,
392 } => {
393 let col = resolve_col(col_map, name)?;
394 let auto_kind = ds.column_kinds.get(col).copied().ok_or_else(|| {
395 TermBuilderError::missing_column(format!(
396 "internal column-kind lookup failed for '{name}'"
397 ))
398 .to_string()
399 })?;
400 if !matches!(auto_kind, ColumnKindTag::Continuous | ColumnKindTag::Binary) {
401 return Err(TermBuilderError::incompatible_config(format!(
402 "bounded() currently supports only numeric columns, got categorical '{name}'"
403 )));
404 }
405 linear_terms.push(LinearTermSpec {
406 name: name.clone(),
407 feature_col: col,
408 feature_cols: vec![col],
409 categorical_levels: vec![],
410 double_penalty: false,
411 coefficient_geometry: LinearCoefficientGeometry::Bounded {
412 min: *min,
413 max: *max,
414 prior: prior.clone(),
415 },
416 coefficient_min: None,
417 coefficient_max: None,
418 });
419 }
420 ParsedTerm::RandomEffect { name } => {
421 let col = resolve_col(col_map, name)?;
422 random_terms.push(RandomEffectTermSpec {
423 name: name.clone(),
424 feature_col: col,
425 drop_first_level: false,
426 penalized: true,
427 frozen_levels: None,
428 });
429 }
430 ParsedTerm::Smooth {
431 label,
432 vars,
433 kind,
434 options,
435 } => {
436 let smooth_vars = vars.clone();
437 let by_name = options.get("by").cloned();
438 // `bs="sz"` (sum-to-zero), like `bs="fs"`/`bs="re"`, is a
439 // factor-smooth family handled natively by `build_smooth_basis`'s
440 // fs/sz/re path: it detects the categorical factor among the
441 // variables and emits a `SmoothBasisSpec::FactorSmooth { Sz }`
442 // with the correct single-penalty marginal and modest default
443 // basis. Route sz straight through `build_smooth_basis` rather
444 // than intercepting it into a legacy `FactorSumToZero` envelope
445 // here (which left `sz(fac, x)` mis-typed as `FactorSumToZero`
446 // instead of the expected `FactorSmooth { Sz }`).
447 let cols = smooth_vars
448 .iter()
449 .map(|v| resolve_col(col_map, v))
450 .collect::<Result<Vec<_>, _>>()?;
451 let mut inner_options = options.clone();
452 inner_options.remove("by");
453 // `ordered=` is consumed here (ByVarKind::Factor routing) and
454 // must not propagate to the inner basis builder, which has no
455 // allow-list entry for it and would reject it as an unknown option.
456 inner_options.remove("ordered");
457 // Pop the shape constraint before `build_smooth_basis` runs so
458 // it never reaches the per-kind `validate_known_options`
459 // allow-lists (the constraint is a property of the smooth term,
460 // not of any one basis kind). Basis-incompatible requests still
461 // fail loudly downstream via `shape_supports_basis`.
462 let shape = match inner_options.remove("shape") {
463 None => ShapeConstraint::None,
464 Some(raw) => crate::smooth::parse_shape_constraint(&raw)
465 .map_err(TermBuilderError::invalid_option)?,
466 };
467 let inner_basis = build_smooth_basis(
468 *kind,
469 &smooth_vars,
470 &cols,
471 &inner_options,
472 ds,
473 inference_notes,
474 policy,
475 smooth_coordinate_count,
476 )?;
477 let inner_basis = match inner_basis {
478 SmoothBasisSpec::FactorSmooth {
479 spec:
480 FactorSmoothSpec {
481 continuous_cols,
482 group_col,
483 marginal,
484 flavour: FactorSmoothFlavour::Sz,
485 frozen_global_orthogonality,
486 ..
487 },
488 } => {
489 if continuous_cols.len() != 1 {
490 return Err(TermBuilderError::incompatible_config(format!(
491 "sz factor-smooth currently expects exactly one continuous covariate, found {}",
492 continuous_cols.len()
493 )));
494 }
495 SmoothBasisSpec::FactorSumToZero {
496 inner: Box::new(SmoothBasisSpec::BSpline1D {
497 feature_col: continuous_cols[0],
498 spec: marginal,
499 }),
500 by_col: group_col,
501 levels: encoded_levels_for_column(ds, ColIdx::new(group_col))
502 .into_iter()
503 .map(|(bits, _)| bits)
504 .collect(),
505 frozen_global_orthogonality,
506 }
507 }
508 other => other,
509 };
510 if let Some(by_name) = by_name {
511 let by_col = resolve_col(col_map, &by_name)?;
512 match ds.column_kinds.get(by_col).copied().ok_or_else(|| {
513 format!("internal column-kind lookup failed for by variable '{by_name}'")
514 })? {
515 ColumnKindTag::Categorical => {
516 let levels = encoded_levels_for_column(ds, ColIdx::new(by_col));
517 // A penalized random block for this factor already
518 // owns its full level offsets when EITHER an explicit
519 // `group(factor)` appears, OR a *bare* categorical
520 // `+ factor` does — the latter is auto-promoted to a
521 // penalized random-effect block (see the
522 // `ParsedTerm::Linear` / `ColumnKindTag::Categorical`
523 // arm above, `penalized: true`). Both representations
524 // carry the same per-level offsets, so #1457: the
525 // `by=` branch must NOT additionally add its own
526 // unpenalized treatment-coded main effect, which would
527 // double-represent the factor (two `g` design blocks +
528 // a spurious extra smoothing parameter).
529 let penalized_group_owner_present =
530 terms.iter().any(|other| match other {
531 ParsedTerm::RandomEffect { name } => name == &by_name,
532 ParsedTerm::Linear {
533 name,
534 explicit: false,
535 ..
536 } if name == &by_name => col_map
537 .get(name)
538 .and_then(|c| ds.column_kinds.get(*c).copied())
539 .map(|kind| matches!(kind, ColumnKindTag::Categorical))
540 .unwrap_or(false),
541 _ => false,
542 });
543 // Add an unpenalized treatment-coded fixed main
544 // effect for a standalone factor-by smooth, unless
545 // the same factor already has an explicit
546 // `group(factor)` term OR a bare categorical `+
547 // factor` that was auto-promoted to a penalized
548 // random block (#1457). In those mixed-model forms
549 // the penalized random intercept is the coherent
550 // owner of level offsets; adding a no-pooling fixed
551 // factor effect would bypass random-effect
552 // shrinkage and degrade BLUP-style predictions.
553 if !random_terms.iter().any(|rt| rt.name == by_name)
554 && !penalized_group_owner_present
555 {
556 random_terms.push(RandomEffectTermSpec {
557 name: by_name.clone(),
558 feature_col: by_col,
559 drop_first_level: true,
560 penalized: false,
561 frozen_levels: None,
562 });
563 }
564 // Unordered factor-by smooths are independent
565 // level-specific smooths. Preserve that
566 // term-spec structure explicitly so later
567 // hierarchy/identifiability passes can see the
568 // per-level ownership rather than a generic
569 // BySmooth envelope.
570 for (level_bits, level_label) in levels {
571 smooth_terms.push(SmoothTermSpec {
572 name: format!("{label}:by={by_name}[{level_label}]"),
573 basis: SmoothBasisSpec::ByVariable {
574 inner: Box::new(inner_basis.clone()),
575 by_col,
576 kind: BySmoothKind::Level { level_bits },
577 by: ByVariableSpec::Level {
578 value_bits: level_bits,
579 label: level_label,
580 },
581 },
582 shape: shape.clone(),
583 joint_null_rotation: None,
584 });
585 }
586 }
587 ColumnKindTag::Binary | ColumnKindTag::Continuous => {
588 smooth_terms.push(SmoothTermSpec {
589 name: label.clone(),
590 basis: SmoothBasisSpec::ByVariable {
591 inner: Box::new(inner_basis),
592 by_col,
593 kind: BySmoothKind::Numeric,
594 by: ByVariableSpec::Numeric,
595 },
596 shape,
597 joint_null_rotation: None,
598 });
599 }
600 }
601 } else {
602 smooth_terms.push(SmoothTermSpec {
603 name: label.clone(),
604 basis: inner_basis,
605 shape,
606 joint_null_rotation: None,
607 });
608 }
609 }
610 ParsedTerm::LinkWiggle { .. }
611 | ParsedTerm::TimeWiggle { .. }
612 | ParsedTerm::LinkConfig { .. }
613 | ParsedTerm::SurvivalConfig { .. } => {
614 // Consumed at formula level, not design terms.
615 }
616 ParsedTerm::LogSlopeSurface { .. } => {
617 return Err(TermBuilderError::malformed_formula(
618 "logslope(...) declarations must be resolved by the marginal-slope formula path before building a term spec",
619 ));
620 }
621 ParsedTerm::Interaction { vars } => {
622 // A linear `:` interaction realizes one design column equal to
623 // the elementwise product of its operands. Numeric (continuous/
624 // binary) operands multiply directly; a categorical operand is
625 // a factor, so the product is expanded factor-aware: one design
626 // column per surviving cell of the factor(s), each an indicator
627 // `1[factor == level]` gating the numeric product.
628 //
629 // Coding is MARGINALITY-AWARE (gam#1158, gam#1159). A categorical
630 // operand `g` is treatment-coded (its lexicographically first
631 // reference level dropped) ONLY when the lower-order term obtained
632 // by removing `g` from this interaction is also present in the
633 // model — that lower-order term is what makes the dropped level
634 // identifiable, exactly mgcv's marginality rule. When that parent
635 // is ABSENT (the interaction-only form), dropping the reference
636 // level instead pins a group to the reference fit (a rank-deficient
637 // design), so we keep ALL levels (full dummy coding) and rely on a
638 // single intercept cell-drop below for identifiability:
639 // * `y ~ x:g` with no `x` main effect → "common intercept,
640 // separate slopes": every group keeps its own x-slope.
641 // * `y ~ g:h` with no `g`/`h` main effects → the saturated
642 // cell-means model: full cross of all levels minus one
643 // reference cell absorbed by the intercept.
644 // When the parents ARE present (`x + x:g`, or `g*h` = `g + h +
645 // g:h`), the historical treatment coding is preserved so those
646 // forms stay correct.
647 //
648 // A main effect for var V is a `Linear`/`BoundedLinear`/
649 // `RandomEffect` ParsedTerm whose referenced name is V (an
650 // auto-detected categorical `Linear` becomes a RandomEffect main
651 // effect; either spelling counts). We only treat such standalone
652 // main-effect terms as parents — not V appearing inside another
653 // interaction.
654 let main_effect_present = |target: &str| -> bool {
655 terms.iter().any(|other| match other {
656 ParsedTerm::Linear { name, .. }
657 | ParsedTerm::BoundedLinear { name, .. }
658 | ParsedTerm::RandomEffect { name } => name == target,
659 _ => false,
660 })
661 };
662 // The lower-order parent of dropping operand `drop_var` from this
663 // interaction is present iff EVERY other operand is a main effect.
664 // For the two cases we care about (`x:g`, `g:h`) the interaction
665 // has two operands, so this reduces to "is the single remaining
666 // operand a main effect"; the general form handles any arity.
667 let parent_present = |drop_var: &str| -> bool {
668 vars.iter()
669 .filter(|v| v.as_str() != drop_var)
670 .all(|v| main_effect_present(v))
671 };
672
673 let mut numeric_cols = Vec::<usize>::new();
674 // Per categorical operand: (var name, col, kept levels, was the
675 // reference level dropped / treatment-coded?).
676 let mut categorical_factors =
677 Vec::<(String, usize, Vec<(u64, String)>, bool)>::new();
678 for var in vars {
679 let col = resolve_col(col_map, var)?;
680 let kind = ds.column_kinds.get(col).copied().ok_or_else(|| {
681 TermBuilderError::missing_column(format!(
682 "internal column-kind lookup failed for '{var}'"
683 ))
684 .to_string()
685 })?;
686 match kind {
687 ColumnKindTag::Continuous | ColumnKindTag::Binary => numeric_cols.push(col),
688 ColumnKindTag::Categorical => {
689 let mut levels = encoded_levels_for_column(ds, ColIdx::new(col));
690 // Treatment-code (drop the reference level) only when
691 // the marginal parent that identifies it is present;
692 // otherwise keep every level (full dummy coding).
693 let treatment_coded = parent_present(var);
694 if treatment_coded && levels.len() > 1 {
695 levels.remove(0);
696 }
697 if levels.is_empty() {
698 return Err(TermBuilderError::incompatible_config(format!(
699 "interaction `{}` references categorical column `{var}` with no usable levels",
700 vars.join(":")
701 )));
702 }
703 categorical_factors.push((var.clone(), col, levels, treatment_coded));
704 }
705 }
706 }
707
708 let label = vars.join(":");
709
710 if categorical_factors.is_empty() {
711 // Pure numeric `:` interaction — single product column,
712 // identical to the historical behaviour.
713 linear_terms.push(LinearTermSpec {
714 name: label,
715 feature_col: numeric_cols[0],
716 feature_cols: numeric_cols,
717 categorical_levels: vec![],
718 // Parametric `:` interaction column is unpenalized by
719 // default, same as any other linear term (#749).
720 double_penalty: false,
721 coefficient_geometry: LinearCoefficientGeometry::Unconstrained,
722 coefficient_min: None,
723 coefficient_max: None,
724 });
725 inference_notes.push(format!(
726 "wired linear interaction `{}` as product of numeric columns",
727 vars.join(":")
728 ));
729 } else {
730 // Factor-aware expansion: cartesian product over the kept
731 // levels of every categorical operand. Each cell yields one
732 // column gating the numeric product (or, with no numeric
733 // operand, a pure cell indicator).
734 let mut cells: Vec<Vec<(usize, u64, String)>> = vec![Vec::new()];
735 for (_var, col, levels, _treatment_coded) in &categorical_factors {
736 let mut next = Vec::with_capacity(cells.len() * levels.len());
737 for cell in &cells {
738 for (bits, level_label) in levels {
739 let mut extended = cell.clone();
740 extended.push((*col, *bits, level_label.clone()));
741 next.push(extended);
742 }
743 }
744 cells = next;
745 }
746
747 // Intercept-identifiability cell drop. When the cells are PURE
748 // INDICATORS (no numeric operand) and at least one factor was
749 // dummy-coded (kept all its levels), the full set of cell
750 // columns sums to the all-ones intercept and is rank-deficient
751 // against it. Drop exactly ONE reference cell — the cell where
752 // every factor sits at its reference (lexicographically first)
753 // level — so the remaining saturated cells are identifiable
754 // (rank n_g*n_h - 1 cells + intercept). With a numeric operand
755 // the cells gate `x` and sum to `x`, not the intercept, so no
756 // cell is dropped (the collinearity there is with the absent
757 // `x` main effect, which is exactly why full coding is right).
758 let any_dummy_coded = categorical_factors
759 .iter()
760 .any(|(_, _, _, treatment_coded)| !*treatment_coded);
761 if numeric_cols.is_empty() && any_dummy_coded {
762 // The reference cell pairs each factor's column with the
763 // bits of its lexicographically-first (index 0) level.
764 let reference_cell: Vec<(usize, u64)> = categorical_factors
765 .iter()
766 .map(|(_, col, _, _)| {
767 let levels = encoded_levels_for_column(ds, ColIdx::new(*col));
768 (*col, levels[0].0)
769 })
770 .collect();
771 cells.retain(|cell| {
772 !reference_cell.iter().all(|(rcol, rbits)| {
773 cell.iter()
774 .any(|(col, bits, _)| col == rcol && bits == rbits)
775 })
776 });
777 }
778
779 let n_cells = cells.len();
780 for cell in cells {
781 let cell_suffix = cell
782 .iter()
783 .map(|(_, _, level_label)| level_label.as_str())
784 .collect::<Vec<_>>()
785 .join(":");
786 let categorical_levels =
787 cell.iter().map(|(col, bits, _)| (*col, *bits)).collect();
788 // `feature_col` is required to point at a real column;
789 // use the first numeric operand when present, otherwise
790 // the first categorical column (its raw value is never
791 // multiplied — `realized_design_column` starts from ones
792 // and only gates by the level indicators).
793 let feature_col = numeric_cols
794 .first()
795 .copied()
796 .unwrap_or(categorical_factors[0].1);
797 linear_terms.push(LinearTermSpec {
798 name: format!("{label}:{cell_suffix}"),
799 feature_col,
800 feature_cols: numeric_cols.clone(),
801 categorical_levels,
802 double_penalty: false,
803 coefficient_geometry: LinearCoefficientGeometry::Unconstrained,
804 coefficient_min: None,
805 coefficient_max: None,
806 });
807 }
808 let all_treatment_coded = !any_dummy_coded;
809 let coding = if all_treatment_coded {
810 "treatment-coded"
811 } else {
812 "marginality-aware (full dummy / saturated)"
813 };
814 inference_notes.push(format!(
815 "wired factor-aware linear interaction `{}` as {} {} cell column(s)",
816 vars.join(":"),
817 n_cells,
818 coding
819 ));
820 }
821 }
822 }
823 }
824
825 Ok(TermCollectionSpec {
826 linear_terms,
827 random_effect_terms: random_terms,
828 smooth_terms,
829 })
830}
831
/// Split a list-valued option into its trimmed, non-empty entries.
///
/// One matching pair of wrappers is peeled before splitting on commas: the
/// Python/JSON list form `[a, b]`, or mgcv's R-vector forms `c(a, b)` /
/// `C(a, b)` / `(a, b)`. mgcv-style formulas pass per-margin options as
/// `k=c(5,5)` / `period=c(2*pi, pi)`; without peeling, those would split into
/// `["c(5", "5)"]` and confuse the downstream numeric parser. When no
/// complete wrapper pair is found, the whole trimmed input is split as-is.
fn split_list_option(raw: &str) -> Vec<String> {
    let trimmed = raw.trim();

    let square = trimmed
        .strip_prefix('[')
        .and_then(|rest| rest.strip_suffix(']'));
    let body = match square {
        Some(inside) => inside,
        None => {
            // The first opener that matches decides; the closing paren must
            // then be present or the input is left unpeeled.
            let after_opener = ["c(", "C(", "("]
                .iter()
                .find_map(|opener| trimmed.strip_prefix(*opener));
            match after_opener.and_then(|rest| rest.strip_suffix(')')) {
                Some(inside) => inside,
                None => trimmed,
            }
        }
    };

    let mut entries = Vec::new();
    for piece in body.split(',') {
        let piece = piece.trim();
        if !piece.is_empty() {
            entries.push(piece.to_string());
        }
    }
    entries
}
856
/// Evaluate a small numeric expression: a `*`-separated product of factors.
///
/// Each factor is either a plain `f64` literal or a symbol — `pi` / `π` or
/// `tau` / `τ` — optionally preceded by a numeric coefficient or a bare sign
/// (`2pi`, `0.5tau`, `-pi`, `+tau`). ASCII symbol names are matched
/// case-insensitively both standalone and as a suffix (`PI`, `2PI`). ASCII
/// spaces are ignored.
///
/// # Errors
/// * `"None is not numeric"` when the input is `none` (any ASCII case).
/// * `"invalid numeric expression '<raw>'"` for an empty factor (for example
///   `2**pi`, a leading/trailing `*`, or an empty input).
/// * `"invalid numeric expression '<raw>': <parse error>"` when a literal or
///   coefficient does not parse as `f64`.
fn parse_numeric_expr(raw: &str) -> Result<f64, String> {
    /// Strip `suffix` from the end of `text`, folding ASCII case only.
    fn strip_symbol<'a>(text: &'a str, suffix: &str) -> Option<&'a str> {
        let cut = text.len().checked_sub(suffix.len())?;
        // Guard against cutting through a multi-byte character.
        if !text.is_char_boundary(cut) {
            return None;
        }
        let (head, tail) = text.split_at(cut);
        if tail.eq_ignore_ascii_case(suffix) {
            Some(head)
        } else {
            None
        }
    }

    const SYMBOLS: [(&str, f64); 4] = [
        ("pi", std::f64::consts::PI),
        ("π", std::f64::consts::PI),
        ("tau", std::f64::consts::TAU),
        ("τ", std::f64::consts::TAU),
    ];

    let normalized = raw.replace(' ', "");
    if normalized.eq_ignore_ascii_case("none") {
        return Err("None is not numeric".to_string());
    }
    let invalid =
        |err: std::num::ParseFloatError| format!("invalid numeric expression '{raw}': {err}");

    let mut acc = 1.0f64;
    for factor in normalized.split('*') {
        if factor.is_empty() {
            return Err(format!("invalid numeric expression '{raw}'"));
        }
        let symbolic = SYMBOLS.iter().find_map(|(symbol, constant)| {
            strip_symbol(factor, symbol).map(|prefix| (prefix, *constant))
        });
        let value = match symbolic {
            Some((prefix, constant)) => {
                // `f64::from_str` rejects a lone sign, so `-pi` / `+tau`
                // need explicit handling; anything else is a coefficient.
                let coefficient = match prefix {
                    "" | "+" => 1.0,
                    "-" => -1.0,
                    digits => digits.parse::<f64>().map_err(&invalid)?,
                };
                coefficient * constant
            }
            None => factor.parse::<f64>().map_err(&invalid)?,
        };
        acc *= value;
    }
    Ok(acc)
}
904
905/// Read an endpoint/period option as a numeric *expression* (`2*pi`, `tau`,
906/// `0.5*tau`, `6.283185307179586`, ...) — the same grammar that `period=` and
907/// `origin=` already accept via [`parse_numeric_expr`].
908///
909/// Returns `Ok(None)` when the key is absent, `Ok(Some(v))` when it parses, and
910/// a hard `Err` when the key is *present but unparseable*. The crucial contrast
911/// is with the lenient [`option_f64`], which collapses an unparseable value to
912/// `None` and lets the caller silently substitute the data range — wrapping a
913/// cyclic smooth at the wrong period with no diagnostic (the #815 failure mode).
914fn option_numeric_expr(
915 options: &BTreeMap<String, String>,
916 key: &str,
917) -> Result<Option<f64>, String> {
918 match options.get(key) {
919 None => Ok(None),
920 Some(raw) => parse_numeric_expr(raw)
921 .map(Some)
922 .map_err(|err| format!("option `{key}={raw}` is not a valid numeric value: {err}")),
923 }
924}
925
926fn parse_periods_option(
927 options: &BTreeMap<String, String>,
928 dim: usize,
929) -> Result<Option<Vec<Option<f64>>>, String> {
930 let Some(raw) = options.get("period") else {
931 return Ok(None);
932 };
933 let values = split_list_option(raw);
934 let mut periods = vec![None; dim];
935 if values.len() == 1 && dim == 1 {
936 periods[0] = Some(parse_numeric_expr(&values[0])?);
937 } else {
938 if values.len() != dim {
939 return Err(format!(
940 "period list length {} must match smooth dimension {}",
941 values.len(),
942 dim
943 ));
944 }
945 for (i, v) in values.iter().enumerate() {
946 if v.eq_ignore_ascii_case("none") {
947 continue;
948 }
949 periods[i] = Some(parse_numeric_expr(v)?);
950 }
951 }
952 Ok(Some(periods))
953}
954
955fn parse_periodic_axes_option(
956 options: &BTreeMap<String, String>,
957 dim: usize,
958) -> Result<Option<Vec<Option<f64>>>, String> {
959 let Some(raw_axes) = options.get("periodic") else {
960 return Ok(None);
961 };
962 let mut periods = parse_periods_option(options, dim)?.unwrap_or_else(|| vec![None; dim]);
963 // Scalar boolean form (`periodic=true` / `false`, `yes` / `no`) applies to
964 // every axis — the documented per-axis-flag broadcast (see the doc on
965 // `parse_periodic_axes`, the tensor sibling that already accepts it). A
966 // 1-D `duchon(x, periodic=true)` lands here: the cyclic *domain* is then
967 // resolved from the data range by `parse_cyclic_boundary` (the 1-D builder
968 // consults `boundary` first), so a finite explicit period is NOT required —
969 // we only need to NOT mis-read "true" as an axis index (#1074). `false`
970 // means no axis is periodic.
971 let lowered = raw_axes.trim().to_ascii_lowercase();
972 match lowered.as_str() {
973 "true" | "yes" | "y" => return Ok(Some(periods)),
974 // `false` means NO axis is periodic. Return `None` — NOT
975 // `Some(vec![None; dim])` — because the radial 1-D consumer treats a
976 // `Some([None])` as "periodicity requested, derive the wrap period from
977 // the data range" (see the Duchon builder arm below, which back-fills
978 // `axes[0] = data_span` for a lone `None`) and the 1-D builder routes on
979 // `spec.periodic.is_some()`. Emitting `Some([None])` here therefore
980 // silently produced a *periodic* smooth for an explicit `periodic=false`
981 // — the exact regression this arm now avoids, matching the bracketed
982 // `[false]` form handled by the per-axis boolean block below.
983 "false" | "no" | "n" => return Ok(None),
984 _ => {}
985 }
986 let axes = split_list_option(raw_axes);
987 if axes.is_empty() {
988 return Ok(Some(periods));
989 }
990
991 // Boolean forms `periodic=true` / `periodic=[true, false, ...]`, mirroring
992 // `parse_tensor_periodic_axes`. The radial 1-D builders (`duchon`/`tps`/
993 // `matern`) intentionally DERIVE the wrap period from the closed center
994 // lattice when none is supplied (`prepare_periodic_duchon_centers_1d_with_period`,
995 // gam#580: `None => span`), so a boolean-selected periodic axis legitimately
996 // omits `period`. Without this branch, `duchon(x, periodic=true)`-style
997 // radial formulas failed with the misleading "invalid periodic axis 'true'".
998 let is_bool = |t: &str| {
999 matches!(
1000 t.to_ascii_lowercase().as_str(),
1001 "true" | "yes" | "y" | "false" | "no" | "n"
1002 )
1003 };
1004 let is_truthy = |t: &str| matches!(t.to_ascii_lowercase().as_str(), "true" | "yes" | "y");
1005
1006 // Scalar boolean: `periodic=true` / `periodic=false`.
1007 if axes.len() == 1 && is_bool(&axes[0]) {
1008 if !is_truthy(&axes[0]) {
1009 // Non-periodic: return None so the 1-D builder (which routes on
1010 // `spec.periodic.is_some()`) does NOT take the periodic path.
1011 return Ok(None);
1012 }
1013 // Every axis periodic; honor any explicit per-axis period, else leave
1014 // `None` for the caller (formula arm) / builder to derive the span.
1015 return Ok(Some(periods));
1016 }
1017
1018 // Per-axis boolean list: `periodic=[true, false, ...]` (length must match dim).
1019 if axes.iter().all(|a| is_bool(a)) {
1020 if axes.len() != dim {
1021 return Err(format!(
1022 "periodic flag list length {} must match smooth dimension {dim}",
1023 axes.len()
1024 ));
1025 }
1026 if !axes.iter().any(|a| is_truthy(a)) {
1027 return Ok(None);
1028 }
1029 for (i, a) in axes.iter().enumerate() {
1030 if !is_truthy(a) {
1031 periods[i] = None;
1032 }
1033 }
1034 return Ok(Some(periods));
1035 }
1036
1037 // Index-list form: `periodic=[0, 2]`. Each listed axis must carry an
1038 // explicit finite period — an index gives no per-axis span-derive hint.
1039 for a in &axes {
1040 let axis = a
1041 .parse::<usize>()
1042 .map_err(|err| format!("invalid periodic axis '{a}': {err}"))?;
1043 if axis >= dim {
1044 return Err(format!(
1045 "periodic axis {axis} out of range for {dim}D smooth"
1046 ));
1047 }
1048 if periods[axis].is_none() {
1049 return Err(format!(
1050 "periodic axis {axis} requires period[{axis}] to be finite"
1051 ));
1052 }
1053 }
1054 // Axes not listed are non-periodic even if period list has a finite placeholder.
1055 let listed: std::collections::BTreeSet<usize> = axes
1056 .iter()
1057 .filter_map(|a| a.parse::<usize>().ok())
1058 .collect();
1059 for i in 0..dim {
1060 if !listed.contains(&i) {
1061 periods[i] = None;
1062 }
1063 }
1064 Ok(Some(periods))
1065}
1066
1067// ---------------------------------------------------------------------------
1068// Smooth basis spec construction
1069// ---------------------------------------------------------------------------
1070
/// Split a list-valued *name* option into normalized tokens.
///
/// One enclosing wrapper is peeled first: the Python/JSON form `[a, b]`, or
/// mgcv's R-vector form `c(a, b)` / `C(a, b)` (and a bare `(a, b)`), so that
/// mgcv spellings such as `bs=c('tp','tp')` and `m=c(2,2)` go through the same
/// splitter as the bracketed form. A wrapper is only removed when its closing
/// delimiter is present.
///
/// Each comma-separated entry is trimmed, stripped of surrounding double quotes
/// and then single quotes, and ASCII-lowercased; entries that end up empty are
/// dropped.
fn parse_option_list(raw: &str) -> Vec<String> {
    let text = raw.trim();
    let body = match text
        .strip_prefix('[')
        .and_then(|rest| rest.strip_suffix(']'))
    {
        Some(body) => body,
        None => {
            let opened = text
                .strip_prefix("c(")
                .or_else(|| text.strip_prefix("C("))
                .or_else(|| text.strip_prefix('('));
            opened
                .and_then(|rest| rest.strip_suffix(')'))
                .unwrap_or(text)
        }
    };

    let mut tokens = Vec::new();
    for piece in body.split(',') {
        let token = piece
            .trim()
            .trim_matches('"')
            .trim_matches('\'')
            .to_ascii_lowercase();
        if !token.is_empty() {
            tokens.push(token);
        }
    }
    tokens
}
1100
1101fn parse_periodic_axes(
1102 options: &BTreeMap<String, String>,
1103 dim: usize,
1104) -> Result<Vec<bool>, String> {
1105 let mut axes = vec![false; dim];
1106 if let Some(raw) = options.get("periodic").or_else(|| options.get("cyclic")) {
1107 let lowered = raw.trim().to_ascii_lowercase();
1108 match lowered.as_str() {
1109 "true" | "yes" | "y" => {
1110 axes.fill(true);
1111 return Ok(axes);
1112 }
1113 "false" | "no" | "n" => return Ok(axes),
1114 _ => {}
1115 }
1116 for axis_raw in parse_option_list(raw) {
1117 let axis = axis_raw
1118 .parse::<usize>()
1119 .map_err(|err| format!("invalid periodic axis '{axis_raw}': {err}"))?;
1120 if axis >= dim {
1121 return Err(format!(
1122 "periodic axis {axis} out of range for {dim}D smooth"
1123 ));
1124 }
1125 axes[axis] = true;
1126 }
1127 }
1128 if let Some(raw) = options.get("boundary").or_else(|| options.get("bc")) {
1129 let boundary = parse_option_list(raw);
1130 if boundary.len() == dim {
1131 for (axis, value) in boundary.iter().enumerate() {
1132 if matches!(value.as_str(), "periodic" | "cyclic" | "cc") {
1133 axes[axis] = true;
1134 }
1135 }
1136 } else if dim == 1
1137 && matches!(
1138 boundary.first().map(String::as_str),
1139 Some("periodic" | "cyclic" | "cc")
1140 )
1141 {
1142 axes[0] = true;
1143 }
1144 }
1145 Ok(axes)
1146}
1147
1148fn parse_optional_numeric_list(
1149 options: &BTreeMap<String, String>,
1150 keys: &[&str],
1151 dim: usize,
1152) -> Result<Vec<Option<f64>>, String> {
1153 let Some(raw) = keys.iter().find_map(|key| options.get(*key)) else {
1154 return Ok(vec![None; dim]);
1155 };
1156 let values = split_list_option(raw);
1157 let mut out = vec![None; dim];
1158 if values.len() == 1 && dim == 1 {
1159 if !values[0].eq_ignore_ascii_case("none") {
1160 out[0] = Some(parse_numeric_expr(&values[0])?);
1161 }
1162 return Ok(out);
1163 }
1164 if values.len() != dim {
1165 return Err(format!(
1166 "numeric option list length {} must match smooth dimension {}",
1167 values.len(),
1168 dim
1169 ));
1170 }
1171 for (i, value) in values.iter().enumerate() {
1172 if !value.eq_ignore_ascii_case("none") {
1173 out[i] = Some(parse_numeric_expr(value)?);
1174 }
1175 }
1176 Ok(out)
1177}
1178
1179fn parse_periods(
1180 options: &BTreeMap<String, String>,
1181 periodic_axes: &[bool],
1182) -> Result<Vec<Option<f64>>, String> {
1183 let dim = periodic_axes.len();
1184 // Broadcast a single-element `period=[v]` onto the lone periodic axis
1185 // of a multi-axis smooth (e.g. `te(th, h, bc=['periodic','natural'],
1186 // period=[2*pi])`): with only one periodic margin, the value can only
1187 // belong there.
1188 let lone_periodic_broadcast = options
1189 .get("period")
1190 .or_else(|| options.get("periods"))
1191 .and_then(|raw| {
1192 let values = split_list_option(raw);
1193 if values.len() != 1 || dim <= 1 {
1194 return None;
1195 }
1196 let mut iter = periodic_axes.iter().enumerate().filter(|(_, p)| **p);
1197 let first = iter.next()?;
1198 if iter.next().is_some() {
1199 return None;
1200 }
1201 Some((first.0, values.into_iter().next().unwrap()))
1202 });
1203 let periods = if let Some((axis, value)) = lone_periodic_broadcast {
1204 let mut out = vec![None; dim];
1205 if !value.eq_ignore_ascii_case("none") {
1206 out[axis] = Some(parse_numeric_expr(&value)?);
1207 }
1208 out
1209 } else {
1210 parse_optional_numeric_list(options, &["period", "periods"], dim)?
1211 };
1212 for (axis, (periodic, period)) in periodic_axes.iter().zip(periods.iter()).enumerate() {
1213 if *periodic
1214 && let Some(value) = period
1215 && (!value.is_finite() || *value <= 0.0)
1216 {
1217 return Err(format!(
1218 "period for periodic axis {axis} must be finite and positive, got {value}"
1219 ));
1220 }
1221 }
1222 Ok(periods)
1223}
1224
1225fn parse_period_origins(
1226 options: &BTreeMap<String, String>,
1227 periodic_axes: &[bool],
1228) -> Result<Vec<Option<f64>>, String> {
1229 parse_optional_numeric_list(
1230 options,
1231 &[
1232 "origin",
1233 "origins",
1234 "period_origin",
1235 "period-origin",
1236 "domain_origin",
1237 ],
1238 periodic_axes.len(),
1239 )
1240}
1241
1242/// Parse a per-axis periodic flag list for tensor smooths. Accepts three forms:
1243/// - `periodic=true` / `periodic=false` (scalar applied to every axis),
1244/// - `periodic=[true, false, ...]` (one flag per axis, length `dim`),
1245/// - `periodic=c(1, 1)` / `c(0, 0)` (a length-`dim` 0/1 mask, mgcv's
1246/// per-margin spelling — distinguished from an axis-index list by the
1247/// repeated 0/1 value), and
1248/// - `periodic=[0, 2, ...]` (axis indices that are periodic; others are not).
1249///
1250/// `boundary=[..., "periodic"/"cyclic"/"cc", ...]` may also flip individual
1251/// axes on; non-matching tokens leave the existing flag unchanged.
1252fn parse_tensor_periodic_axes(
1253 options: &BTreeMap<String, String>,
1254 dim: usize,
1255) -> Result<Vec<bool>, String> {
1256 let mut axes = vec![false; dim];
1257 if let Some(raw) = options.get("periodic").or_else(|| options.get("cyclic")) {
1258 let lowered = raw.trim().to_ascii_lowercase();
1259 match lowered.as_str() {
1260 "true" | "yes" | "y" => {
1261 axes.fill(true);
1262 }
1263 "false" | "no" | "n" => {
1264 // Already false; allow `boundary=` below to flip axes if set.
1265 }
1266 _ => {
1267 let entries = parse_option_list(raw);
1268 let all_bool = !entries.is_empty()
1269 && entries.iter().all(|v| {
1270 matches!(
1271 v.as_str(),
1272 "true" | "yes" | "y" | "false" | "no" | "n" | "none"
1273 )
1274 });
1275 // mgcv writes per-margin flag vectors as `periodic=c(1,1)` /
1276 // `periodic=c(0,0)` — a length-`dim` mask where each entry is a
1277 // 0/1 flag for THAT margin, not an axis index. A bare axis-index
1278 // list (`periodic=[0,1]`, `periodic=[0]`) lists DISTINCT margin
1279 // indices to turn on. The two collide only when the list is all
1280 // 0/1 of length `dim`; disambiguate by the repeated-value
1281 // signature `c(1,1)`/`c(0,0)` (a valid axis-index set never
1282 // repeats an index), which is the canonical mask spelling. This
1283 // is what makes the leading tensor margin honor its periodic flag
1284 // (#1751: `periodic=c(1,1)` previously parsed `1,1` as axis
1285 // indices, marking only axis 1 and dropping axis 0).
1286 let all_zero_one =
1287 !entries.is_empty() && entries.iter().all(|v| v == "0" || v == "1");
1288 let has_repeat = {
1289 let mut seen = std::collections::BTreeSet::new();
1290 !entries.iter().all(|v| seen.insert(v.clone()))
1291 };
1292 let numeric_mask = all_zero_one && entries.len() == dim && has_repeat;
1293 if all_bool || numeric_mask {
1294 if entries.len() != dim {
1295 return Err(format!(
1296 "periodic list length {} must match smooth dimension {}",
1297 entries.len(),
1298 dim
1299 ));
1300 }
1301 for (i, v) in entries.iter().enumerate() {
1302 axes[i] = matches!(v.as_str(), "true" | "yes" | "y" | "1");
1303 }
1304 } else {
1305 for axis_raw in entries {
1306 let axis = axis_raw
1307 .parse::<usize>()
1308 .map_err(|err| format!("invalid periodic axis '{axis_raw}': {err}"))?;
1309 if axis >= dim {
1310 return Err(format!(
1311 "periodic axis {axis} out of range for {dim}D smooth"
1312 ));
1313 }
1314 axes[axis] = true;
1315 }
1316 }
1317 }
1318 }
1319 }
1320 if let Some(raw) = options.get("boundary").or_else(|| options.get("bc")) {
1321 let boundary = parse_option_list(raw);
1322 if boundary.len() == dim {
1323 for (axis, value) in boundary.iter().enumerate() {
1324 if matches!(value.as_str(), "periodic" | "cyclic" | "cc") {
1325 axes[axis] = true;
1326 }
1327 }
1328 }
1329 }
1330 // A per-margin basis vector (`bs=c('cc','ps')` / `type=[...]`) declares each
1331 // margin's basis family, and a cyclic family (`cc`/`cp`/`cyclic`) makes THAT
1332 // margin periodic — exactly as the 1-D `s(x, bs='cc')` smooth wraps its lone
1333 // axis. Without this, the per-margin `cc` token was validated but discarded:
1334 // every `bs=c(...)` spelling collapsed to the same open B-spline tensor
1335 // (#1752). Only honor the vector form here; a scalar `bs='cc'` on a tensor is
1336 // ambiguous about which margins wrap, so it does not flip any axis on.
1337 if let Some(raw) = options.get("bs").or_else(|| options.get("type"))
1338 && bs_selector_is_vector(raw)
1339 {
1340 let per_margin = parse_option_list(raw);
1341 if per_margin.len() == dim {
1342 for (axis, margin_bs) in per_margin.iter().enumerate() {
1343 if matches!(canonicalize_smooth_type(margin_bs), "cc" | "cp" | "cyclic") {
1344 axes[axis] = true;
1345 }
1346 }
1347 }
1348 }
1349 Ok(axes)
1350}
1351
1352/// Validate the per-margin `boundary=`/`bc=` tokens on a tensor-product smooth.
1353///
1354/// The tensor `boundary`/`bc` list selects, per margin, whether the margin
1355/// *wraps* (a `periodic`/`cyclic`/`cc` token, consumed by
1356/// [`parse_tensor_periodic_axes`]) or is an ordinary non-periodic margin. In the
1357/// tensor DSL a *non-periodic* margin is spelled `clamped` — in the B-spline
1358/// sense of a **clamped knot vector**, i.e. the standard open spline that is
1359/// free at its two ends and does not wrap (exactly how the callers document it:
1360/// "non-periodic / clamped … free at the two ends, no wrap"). It is therefore an
1361/// inert marker here, not a zero-derivative endpoint reparameterization: a
1362/// cylinder `te(theta, z, boundary=['periodic','clamped'], …)` is a cyclic θ
1363/// margin tensor-producted with an ordinary open z margin, the direct analog of
1364/// mgcv `te(bs=c("cc","ps"))` / `te(bs=c("cc","cr"))`.
1365///
1366/// The periodic selectors and the inert non-periodic markers
1367/// (`clamped`/`open`/`natural`/`free`/`none`/empty) are accepted; anything else
1368/// (e.g. a genuine `anchored` zero-value endpoint constraint, which has no
1369/// ordinary-margin meaning in a tensor) is surfaced as a clean
1370/// unsupported-feature error rather than silently dropped. Previously `clamped`
1371/// itself was rejected, so the cylinder/torus mixed-boundary tensors — the exact
1372/// construction the manifold quality suite builds — could not be fit at all.
1373fn validate_tensor_boundary_tokens(
1374 options: &BTreeMap<String, String>,
1375 dim: usize,
1376) -> Result<(), String> {
1377 let Some(raw) = options.get("boundary").or_else(|| options.get("bc")) else {
1378 return Ok(());
1379 };
1380 let entries = parse_option_list(raw);
1381 for (axis, value) in entries.iter().enumerate() {
1382 let inert = matches!(
1383 value.trim().to_ascii_lowercase().as_str(),
1384 "clamped" | "open" | "natural" | "free" | "none" | "" | "periodic" | "cyclic" | "cc"
1385 );
1386 if !inert {
1387 return Err(TermBuilderError::unsupported_feature(format!(
1388 "tensor smooth margin {axis} boundary token '{value}' is not supported \
1389 (got bc/boundary={raw:?} on a {dim}-D tensor); tensor margins accept the periodic \
1390 selectors (periodic/cyclic/cc) or the non-periodic markers (clamped/open/natural/free). \
1391 Apply anchored/zero-value endpoint constraints with a 1-D s(x, bc=...) term instead."
1392 ))
1393 .to_string());
1394 }
1395 }
1396 Ok(())
1397}
1398
1399fn tensor_k_axis_option_axis(
1400 key: &str,
1401 cols: &[usize],
1402 ds: &Dataset,
1403) -> Result<Option<usize>, String> {
1404 let Some(suffix) = key.strip_prefix("k_") else {
1405 return Ok(None);
1406 };
1407 if suffix.is_empty() {
1408 return Err("tensor k axis option must be named k_<axis> or k_<variable>".to_string());
1409 }
1410 if let Ok(axis) = suffix.parse::<usize>() {
1411 return if axis < cols.len() {
1412 Ok(Some(axis))
1413 } else {
1414 Err(format!(
1415 "tensor k axis option `{key}` references axis {axis}, but the smooth has {} margins",
1416 cols.len()
1417 ))
1418 };
1419 }
1420
1421 let mut matches = cols
1422 .iter()
1423 .enumerate()
1424 .filter(|(_, col)| ds.headers.get(**col).is_some_and(|name| name == suffix))
1425 .map(|(axis, _)| axis);
1426 let first = matches.next();
1427 if matches.next().is_some() {
1428 return Err(format!(
1429 "tensor k axis option `{key}` matches more than one margin named `{suffix}`"
1430 ));
1431 }
1432 first.map(Some).ok_or_else(|| {
1433 let margin_names = cols
1434 .iter()
1435 .enumerate()
1436 .map(|(axis, col)| {
1437 let name = ds
1438 .headers
1439 .get(*col)
1440 .map(String::as_str)
1441 .unwrap_or("<unnamed>");
1442 format!("{axis}:{name}")
1443 })
1444 .collect::<Vec<_>>()
1445 .join(", ");
1446 format!(
1447 "tensor k axis option `{key}` does not match a margin index or name; tensor margins are [{margin_names}]"
1448 )
1449 })
1450}
1451
/// Is `key` shaped like a per-axis tensor basis-dimension alias, i.e. `k_`
/// followed by at least one character?
///
/// This is a purely syntactic check; it does not verify that the suffix names
/// an existing margin.
fn is_tensor_k_axis_option_key(key: &str) -> bool {
    matches!(key.strip_prefix("k_"), Some(suffix) if !suffix.is_empty())
}
1456
1457/// Parse a per-margin basis dimension list (`k=<scalar>`, `k=[k0, k1, ...]`,
1458/// or axis aliases like `k_x=...` / `k_0=...`). A scalar is broadcast across
1459/// all axes; `None` returns the heuristic from the data column.
1460fn parse_tensor_k_list(
1461 options: &BTreeMap<String, String>,
1462 cols: &[usize],
1463 ds: &Dataset,
1464) -> Result<(Vec<usize>, bool), String> {
1465 let mut axis_values = vec![None; cols.len()];
1466 let mut saw_axis_alias = false;
1467 for (key, value) in options {
1468 let Some(axis) = tensor_k_axis_option_axis(key, cols, ds)? else {
1469 continue;
1470 };
1471 saw_axis_alias = true;
1472 if axis_values[axis].is_some() {
1473 return Err(format!("tensor k axis {axis} is specified more than once"));
1474 }
1475 let k: usize = value
1476 .parse()
1477 .map_err(|err| format!("invalid tensor k option `{key}={value}`: {err}"))?;
1478 axis_values[axis] = Some(k);
1479 }
1480
1481 let raw = options
1482 .get("k")
1483 .or_else(|| options.get("basis_dim"))
1484 .or_else(|| options.get("basis-dim"))
1485 .or_else(|| options.get("basisdim"));
1486 if saw_axis_alias {
1487 if raw.is_some() {
1488 return Err(
1489 "tensor k axis aliases cannot be combined with k= or basis_dim=".to_string(),
1490 );
1491 }
1492 if let Some(missing_axis) = axis_values.iter().position(Option::is_none) {
1493 let margin_name = cols
1494 .get(missing_axis)
1495 .and_then(|col| ds.headers.get(*col))
1496 .map(String::as_str)
1497 .unwrap_or("<unnamed>");
1498 return Err(format!(
1499 "tensor k axis aliases must specify every margin; missing axis {missing_axis} ({margin_name})"
1500 ));
1501 }
1502 return Ok((
1503 axis_values
1504 .into_iter()
1505 .map(|k| k.expect("missing axis values rejected above"))
1506 .collect(),
1507 false,
1508 ));
1509 }
1510 let Some(raw) = raw else {
1511 let inferred = heuristic_tensor_margin_knots(cols, ds);
1512 return Ok((inferred, true));
1513 };
1514 let entries = split_list_option(raw);
1515 if entries.len() == 1 {
1516 let k: usize = entries[0]
1517 .parse()
1518 .map_err(|err| format!("invalid tensor k '{}': {err}", entries[0]))?;
1519 return Ok((vec![k; cols.len()], false));
1520 }
1521 if entries.len() != cols.len() {
1522 return Err(format!(
1523 "tensor k list length {} must match smooth dimension {}",
1524 entries.len(),
1525 cols.len()
1526 ));
1527 }
1528 let mut out = Vec::with_capacity(entries.len());
1529 for entry in entries {
1530 let k: usize = entry
1531 .parse()
1532 .map_err(|err| format!("invalid tensor k '{entry}': {err}"))?;
1533 out.push(k);
1534 }
1535 Ok((out, false))
1536}
1537
1538/// Parse the `identifiability=` option for tensor-product smooths. Mirrors the
1539/// vocabulary of the Matern/Duchon parsers so the formula DSL is consistent.
1540///
1541/// `kind` selects the default identifiability when no explicit
1542/// `identifiability=` option is supplied: `te(...)` ([`SmoothKind::Te`]) keeps
1543/// the full-tensor sum-to-zero default, while `ti(...)` ([`SmoothKind::Ti`])
1544/// defaults to per-margin sum-to-zero so the marginal main effects are excluded
1545/// (the mgcv tensor-interaction semantics). An explicit option always wins.
1546fn parse_tensor_identifiability(
1547 options: &BTreeMap<String, String>,
1548 kind: SmoothKind,
1549) -> Result<TensorBSplineIdentifiability, String> {
1550 let Some(raw) = options.get("identifiability").map(String::as_str) else {
1551 return Ok(match kind {
1552 SmoothKind::Ti => TensorBSplineIdentifiability::MarginalSumToZero,
1553 _ => TensorBSplineIdentifiability::default(),
1554 });
1555 };
1556 match raw.trim().to_ascii_lowercase().as_str() {
1557 "none" => Ok(TensorBSplineIdentifiability::None),
1558 "sum_tozero" | "sum-to-zero" | "center_sum_tozero" | "center-sum-to-zero" | "centered"
1559 | "sumtozero" => Ok(TensorBSplineIdentifiability::SumToZero),
1560 "marginal_sum_tozero" | "marginal-sum-to-zero" | "marginal_sumtozero"
1561 | "marginalsumtozero" | "interaction" => {
1562 Ok(TensorBSplineIdentifiability::MarginalSumToZero)
1563 }
1564 other => Err(TermBuilderError::unsupported_feature(format!(
1565 "invalid tensor identifiability '{other}'; expected one of: none, sum_tozero, marginal_sum_tozero"
1566 ))
1567 .to_string()),
1568 }
1569}
1570
1571fn bspline_boundary_declares_periodic_axis(options: &BTreeMap<String, String>) -> bool {
1572 options
1573 .get("boundary")
1574 .or_else(|| options.get("bc"))
1575 .map(|raw| {
1576 parse_option_list(raw)
1577 .into_iter()
1578 .any(|value| matches!(value.as_str(), "periodic" | "cyclic" | "cc"))
1579 })
1580 .unwrap_or(false)
1581}
1582
/// Collapse a user-facing `bs=`/`type=` smooth name to its engine-internal
/// canonical name.
///
/// Aliases, including mgcv-compatible spellings whose semantics match an
/// existing gamfit smooth exactly, map to the names the dispatch in
/// [`build_smooth_basis`] switches on. Adding another exactly-equivalent alias
/// is a one-line table entry; the dispatch stays the single decision point.
///
/// Entries MUST be true semantic equivalents of their target, not
/// approximations. mgcv names that differ from every gamfit smooth (e.g.
/// `bs="ts"` shrinkage thin-plate, `bs="ad"` adaptive) are intentionally left
/// out so they reach the unsupported-type path and produce a real diagnostic
/// instead of a silent substitution. mgcv's `bs="cr"`/`"cs"` are handled
/// directly in the [`build_smooth_basis`] dispatch, because the `cr`/`cs`
/// distinction controls a default (`double_penalty`) this layer cannot see.
///
/// The comparison is exact (case-sensitive). Unrecognized input is returned
/// unchanged so the dispatch can raise its usual "unsupported smooth type"
/// error for genuine typos.
pub(crate) fn canonicalize_smooth_type(raw: &str) -> &str {
    const ALIASES: &[(&str, &str)] = &[
        // mgcv `bs="tp"` is the default thin-plate regression spline, the
        // exact semantic equivalent of gamfit's `"tps"`.
        ("tp", "tps"),
        // mgcv `bs="gp"` defaults to a Matérn covariance kernel with REML
        // smoothing-parameter selection, matching gamfit's `"matern"`.
        ("gp", "matern"),
        // Constant-curvature (M_κ) geodesic-kernel smooth (#944): one canonical
        // type, so `bs="curv"`/`bs="mkappa"` cannot diverge from `curv(...)`.
        ("curv", "curvature"),
        ("constant_curvature", "curvature"),
        ("mkappa", "curvature"),
        // Measure-jet spline; mgcv has no equivalent, so no mgcv alias exists.
        ("mjs", "measurejet"),
        ("measure_jet", "measurejet"),
        ("web", "measurejet"),
    ];
    for &(alias, canonical) in ALIASES {
        if alias == raw {
            return canonical;
        }
    }
    raw
}
1625
1626/// Is `margin_bs` a per-margin basis name that the tensor builder realizes as a
1627/// penalized 1-D B-spline margin?
1628///
1629/// gam's tensor product is built from penalized B-spline marginals. mgcv's
1630/// thin-plate (`tp`/`tps`), P-spline (`ps`), B-spline (`bs`), cubic-regression
1631/// (`cr`/`cs`), and cyclic (`cc`/`cp`/`cyclic`) marginals are all penalized
1632/// splines spanning the same per-axis smoothing space, so a B-spline margin
1633/// reproduces the same tensor smoothing class. Margin kinds with fundamentally
1634/// different structure (adaptive, random-effect, sphere) are NOT accepted as
1635/// tensor margins.
1636pub(crate) fn tensor_margin_bs_is_supported(margin_bs: &str) -> bool {
1637 matches!(
1638 canonicalize_smooth_type(margin_bs),
1639 "tps" | "ps" | "bs" | "bspline" | "cr" | "cs" | "cc" | "cp" | "cyclic"
1640 )
1641}
1642
/// Does the smooth request a periodic/cyclic axis via its options?
///
/// Factored out so the type resolver and `build_smooth_basis` agree on a single
/// notion of "periodic requested". The answer is `true` when:
/// - a `periodic` or `cyclic` option is present (whatever its value), or
/// - the `boundary=` option (falling back to `bc=`) mentions `periodic` or
///   `cyclic` anywhere in its value, or contains `cc` as a whole token.
///
/// The `cc` token is the periodic boundary selector the other boundary readers
/// in this module already honor; it is matched as a complete alphanumeric token
/// so that an unrelated word merely containing the letters `cc` does not count.
pub(crate) fn smooth_options_declare_periodic(options: &BTreeMap<String, String>) -> bool {
    if options.contains_key("periodic") || options.contains_key("cyclic") {
        return true;
    }
    options
        .get("boundary")
        .or_else(|| options.get("bc"))
        .is_some_and(|boundary| {
            let lowered = boundary.to_ascii_lowercase();
            lowered.contains("periodic")
                || lowered.contains("cyclic")
                || lowered
                    .split(|c: char| !c.is_ascii_alphanumeric())
                    .any(|token| token == "cc")
        })
}
1660
1661/// Resolve the canonical engine-internal smooth-type name for a term.
1662///
1663/// Reads the user-facing `type=`/`bs=` selector and collapses mgcv-compatible
1664/// aliases (`tp`→`tps`, `gp`→`matern`) via [`canonicalize_smooth_type`], or
1665/// derives the default from the smooth kind/arity when no selector is given.
1666/// This is the single source of truth for the dispatch in
1667/// [`build_smooth_basis`]; other call sites (e.g. predictor-specific basis
1668/// policy) use it so the classification never drifts from the dispatch.
1669/// Is the raw `bs=`/`type=` selector a vector literal (`c('tp','tp')`,
1670/// `['tp','tp']`, `(tp, tp)`) rather than a scalar smooth-type name?
1671///
1672/// mgcv's tensor smooths take a *per-margin* basis vector
1673/// (`te(x1, x2, bs=c('tp','tp'))`). Such a value is not a scalar canonical
1674/// type and must not be fed through [`canonicalize_smooth_type`] — it has to be
1675/// recognized as a tensor request and split into per-margin types. A scalar
1676/// selector (`bs="tp"`) is left untouched.
1677pub(crate) fn bs_selector_is_vector(raw: &str) -> bool {
1678 let trimmed = raw.trim();
1679 let bracketed = (trimmed.starts_with('[') && trimmed.ends_with(']'))
1680 || (trimmed.starts_with("c(") || trimmed.starts_with("C(")) && trimmed.ends_with(')')
1681 || (trimmed.starts_with('(') && trimmed.ends_with(')'));
1682 bracketed && !parse_option_list(trimmed).is_empty()
1683}
1684
1685pub fn resolve_smooth_type_name(
1686 kind: SmoothKind,
1687 n_cols: usize,
1688 options: &BTreeMap<String, String>,
1689) -> String {
1690 let selector = options.get("type").or_else(|| options.get("bs"));
1691 // A per-margin basis vector is a tensor request, never a scalar type. Route
1692 // it to the tensor builder, which reads the per-margin types out of the
1693 // same `bs=` option. (A vector on a non-tensor smooth is ill-formed and
1694 // falls through to the scalar path below so the existing diagnostic fires.)
1695 if let Some(raw) = selector
1696 && bs_selector_is_vector(raw)
1697 && matches!(kind, SmoothKind::Te | SmoothKind::Ti | SmoothKind::T2)
1698 {
1699 return "tensor".to_string();
1700 }
1701 selector
1702 .map(|s| canonicalize_smooth_type(&s.to_ascii_lowercase()).to_string())
1703 .unwrap_or_else(|| match kind {
1704 SmoothKind::Te | SmoothKind::Ti | SmoothKind::T2 => "tensor".to_string(),
1705 SmoothKind::S if n_cols == 1 => "bspline".to_string(),
1706 // Mixed periodic Euclidean radial kernels are not separable on the
1707 // cylinder. Use a tensor product with a cyclic margin so s(theta,h)
1708 // honors seam continuity while preserving the formula-level s(...).
1709 SmoothKind::S if smooth_options_declare_periodic(options) => "tensor".to_string(),
1710 SmoothKind::S => "tps".to_string(),
1711 })
1712}
1713
/// Reports whether a canonical smooth type takes its default basis size from
/// the generous spatial center heuristic
/// ([`crate::basis::default_num_centers`]).
///
/// Only the radial spatial bases (thin-plate, Matérn/GP, Duchon) route their
/// default basis dimension through `plan_spatial_basis(.., Default, ..)`. The
/// B-spline, cyclic, tensor, and factor-smooth bases use their own modest
/// knot-based defaults, so they are unaffected by — and must not be perturbed
/// by — secondary-predictor basis-parsimony adjustments (#501).
///
/// The comparison is exact and case-sensitive: pass the already-canonicalized
/// name, not a raw user selector.
pub fn smooth_type_uses_spatial_center_heuristic(canonical_type: &str) -> bool {
    ["tps", "matern", "duchon"].contains(&canonical_type)
}
1725
1726pub fn build_smooth_basis(
1727 kind: SmoothKind,
1728 vars: &[String],
1729 cols: &[usize],
1730 options: &BTreeMap<String, String>,
1731 ds: &Dataset,
1732 inference_notes: &mut Vec<String>,
1733 policy: &ResourcePolicy,
1734 smooth_coordinate_count: usize,
1735) -> Result<SmoothBasisSpec, String> {
1736 // Fail fast on degenerate input: a smooth whose (non-categorical) coordinate
1737 // columns collapse to a SINGLE distinct point can only ever fit the response
1738 // mean — its design matrix is rank-1. For a UNIVARIATE smooth this is exactly
1739 // "the one column is constant": `smooth(x)`/`matern(x)` on constant `x` would
1740 // otherwise silently fit the mean of `y` with no visible cue (Duchon already
1741 // errors loudly via the basis layer; this makes the diagnosis explicit and
1742 // uniform). For a MULTIVARIATE smooth (tensor, sphere, tps, ...) a single
1743 // constant coordinate is NOT degenerate — the basis still varies along the
1744 // other coordinate(s) and the penalty absorbs the rank-deficient direction
1745 // (e.g. a constant-longitude meridian arc on the sphere is a well-posed 1-D
1746 // slice of S²). Such a term is degenerate only when EVERY coordinate is
1747 // constant at once, i.e. the joint input is a single point. Test the JOINT
1748 // cardinality, not each column independently, so the loud diagnosis still
1749 // fires for the genuinely rank-1 case without rejecting well-posed
1750 // lower-dimensional slices.
1751 let coord_cols: Vec<(&String, usize)> = vars
1752 .iter()
1753 .zip(cols.iter().copied())
1754 .filter(|(_, col)| !matches!(ds.column_kinds.get(*col), Some(ColumnKindTag::Categorical)))
1755 .collect();
1756 if !coord_cols.is_empty() {
1757 let views: Vec<ArrayView1<'_, f64>> = coord_cols
1758 .iter()
1759 .map(|(_, col)| ds.values.column(*col))
1760 .collect();
1761 let n_rows = views[0].len();
1762 let mut distinct_points = std::collections::HashSet::<Vec<u64>>::new();
1763 for r in 0..n_rows {
1764 let key: Vec<u64> = views
1765 .iter()
1766 .map(|v| {
1767 let x = v[r];
1768 let norm = if x == 0.0 { 0.0 } else { x };
1769 norm.to_bits()
1770 })
1771 .collect();
1772 distinct_points.insert(key);
1773 if distinct_points.len() > 1 {
1774 break;
1775 }
1776 }
1777 if distinct_points.len() <= 1 {
1778 return Err(TermBuilderError::degenerate_data(if coord_cols.len() == 1 {
1779 let var = coord_cols[0].0;
1780 format!(
1781 "smooth term over '{var}' has only one unique value in the training data \
1782 — a smooth on a constant column is degenerate and would only fit the response mean. \
1783 Remove `{var}` from the smooth, drop the term, or check the data."
1784 )
1785 } else {
1786 let names = coord_cols
1787 .iter()
1788 .map(|(v, _)| v.as_str())
1789 .collect::<Vec<_>>()
1790 .join(", ");
1791 format!(
1792 "smooth term over ({names}) has only one unique joint coordinate in the training \
1793 data — every coordinate is constant, so the smooth is degenerate and would only \
1794 fit the response mean. Drop the term or check the data."
1795 )
1796 })
1797 .to_string());
1798 }
1799 }
1800 if let Some(by_name) = options.get("by").cloned() {
1801 let by_col = options
1802 .get("__by_col")
1803 .and_then(|raw| raw.parse::<usize>().ok())
1804 .or_else(|| vars.iter().position(|v| v == &by_name).map(|idx| cols[idx]))
1805 .ok_or_else(|| format!("unknown by= column '{by_name}'"))?;
1806 let mut inner_options = options.clone();
1807 inner_options.remove("by");
1808 inner_options.remove("__by_col");
1809 inner_options.remove("id");
1810 let inner = build_smooth_basis(
1811 kind,
1812 vars,
1813 cols,
1814 &inner_options,
1815 ds,
1816 inference_notes,
1817 policy,
1818 smooth_coordinate_count,
1819 )?;
1820 let by_kind = match ds.column_kinds.get(by_col).copied() {
1821 Some(ColumnKindTag::Categorical) => ByVarKind::Factor {
1822 feature_col: by_col,
1823 ordered: option_bool(options, "ordered").unwrap_or(false),
1824 frozen_levels: None,
1825 },
1826 Some(ColumnKindTag::Continuous | ColumnKindTag::Binary) => ByVarKind::Numeric {
1827 feature_col: by_col,
1828 },
1829 None => {
1830 return Err(format!(
1831 "internal column-kind lookup failed for by='{by_name}'"
1832 ));
1833 }
1834 };
1835 return Ok(SmoothBasisSpec::BySmooth {
1836 smooth: Box::new(inner),
1837 by_kind,
1838 });
1839 }
1840
1841 let smooth_double_penalty = option_bool(options, "double_penalty").unwrap_or(true);
1842 let type_opt = resolve_smooth_type_name(kind, cols.len(), options);
1843
1844 if matches!(type_opt.as_str(), "fs" | "sz" | "re") {
1845 validate_known_options(
1846 type_opt.as_str(),
1847 options,
1848 &[
1849 "type",
1850 "bs",
1851 "k",
1852 "basis_dim",
1853 "basis-dim",
1854 "basisdim",
1855 "knots",
1856 "knot_placement",
1857 "knot-placement",
1858 "knotplacement",
1859 "degree",
1860 "penalty_order",
1861 "m",
1862 "double_penalty",
1863 "ordered",
1864 ],
1865 )?;
1866 if cols.len() != 2 {
1867 return Err(format!(
1868 "{} factor-smooth currently expects exactly two variables (one numeric, one categorical)",
1869 type_opt
1870 ));
1871 }
1872 let kinds = cols
1873 .iter()
1874 .map(|&c| ds.column_kinds.get(c).copied())
1875 .collect::<Vec<_>>();
1876 let (cont_idx, group_idx) = if type_opt == "re" {
1877 // mgcv random-slope examples are often s(g, x, bs="re").
1878 match (kinds[0], kinds[1]) {
1879 (Some(ColumnKindTag::Categorical), _) => (1usize, 0usize),
1880 (_, Some(ColumnKindTag::Categorical)) => (0usize, 1usize),
1881 _ => (1usize, 0usize),
1882 }
1883 } else {
1884 match (kinds[0], kinds[1]) {
1885 (_, Some(ColumnKindTag::Categorical)) => (0usize, 1usize),
1886 (Some(ColumnKindTag::Categorical), _) => (1usize, 0usize),
1887 _ => {
1888 return Err(format!(
1889 "{} factor-smooth requires one categorical factor variable",
1890 type_opt
1891 ));
1892 }
1893 }
1894 };
1895 let c = cols[cont_idx];
1896 let (minv, maxv) = col_minmax(ds.values.column(c))?;
1897 let degree = if type_opt == "re" {
1898 1
1899 } else {
1900 option_usize(options, "degree").unwrap_or(DEFAULT_BSPLINE_DEGREE)
1901 };
1902 // For a factor smooth every group's curve is fit from THAT group's rows
1903 // alone, so the marginal's flexibility must respect the least-resolved
1904 // group, not the pooled column. The pooled heuristic can hand the marginal
1905 // a basis that saturates (or exceeds) a small group's sample — e.g. the
1906 // sleepstudy panel has 8 training days per subject, and a default cubic
1907 // basis of 8 functions interpolates each subject's 8 points, leaving no
1908 // room for the wiggliness penalty to collapse the curve toward the
1909 // per-subject line. The factor smooth then fits within-group noise and
1910 // extrapolates badly (held-out forecast worse than the population mean).
1911 //
1912 // Cap the marginal basis below the minimum per-group covariate resolution
1913 // so the penalty always retains residual degrees of freedom to shrink each
1914 // group's curvature toward its linear null space (the random-slope
1915 // estimand). This small-group cap composes with a separate upper bound at
1916 // mgcv's factor-smooth default k=10 (FACTOR_SMOOTH_DEFAULT_BASIS_DIM,
1917 // applied below), so even ample-data groups get the modest SHARED marginal
1918 // a factor smooth wants rather than the full pooled basis. The explicit
1919 // `re` random-effect form takes neither cap: it is a raw linear `[1, x]`
1920 // random effect (0 internal knots), handled in the branch above.
1921 let pooled_internal = heuristic_knots_for_column(ds.values.column(c));
1922 let default_internal = if type_opt == "re" {
1923 // `bs="re"` is a PARAMETRIC random effect, not a smooth of the
1924 // covariate: `s(x, g, bs="re")` is the mgcv random intercept+slope
1925 // `(1 + x | g)`, i.e. a per-group line `[1, x]`, penalized by an iid
1926 // ridge. A degree-1 marginal with ZERO internal knots spans exactly
1927 // that linear space (2 coefficients per group). Using the pooled
1928 // knot heuristic here instead turned the marginal into a
1929 // piecewise-linear B-spline (e.g. 6 functions/group on sleepstudy),
1930 // i.e. a *smooth* with kinks rather than a random slope — many extra
1931 // collinear-across-levels coefficients that ill-condition the joint
1932 // Newton/REML solve (minutes-long fits, and a singular block when
1933 // combined with a separate random intercept `s(g, bs="re")`). The
1934 // raw linear basis is both the correct `re` semantics and fast.
1935 0
1936 } else {
1937 let min_group_resolution =
1938 min_per_group_unique_count(ds.values.column(c), ds.values.column(cols[group_idx]));
1939 // Per-group basis dim = degree + 1 + internal. Hold it well below the
1940 // smallest group's resolution (leave at least two residual points per
1941 // group) so the smooth cannot interpolate that group and the
1942 // wiggliness penalty retains the room to collapse each curve toward
1943 // its linear null space. Never drop below `degree + 2`, which keeps
1944 // exactly the linear span plus a single curvature direction — the
1945 // minimal smoother that can still bend if the data demand it.
1946 let basis_cap = min_group_resolution.saturating_sub(2).max(degree + 2);
1947 let internal_cap = basis_cap.saturating_sub(degree + 1);
1948 let capped = pooled_internal.min(internal_cap.max(1));
1949 // A factor smooth (`fs` AND `sz`) shares ONE marginal across ALL
1950 // levels, each level's curve fit from that group's rows alone. The
1951 // pooled knot heuristic (driven by the full column's sample) hands it
1952 // a much richer basis than the shared signal needs — ~24
1953 // functions/group on the gam#903 factor-smooth-recovery fixtures — so
1954 // REML has the capacity to fit within-group noise and over-fits the
1955 // shared shape (fs: edf 58 vs mgcv's k=10/edf 39; sz: gam 0.068 vs
1956 // mgcv 0.046 truth RMSE), losing the truth-recovery head-to-head with
1957 // the mature tool. mgcv's factor-smooth default `k=10` embodies the
1958 // right convention: a modest shared marginal. Cap the marginal there
1959 // (basis ≈ degree+1+internal ≈ 10) for both flavours when the
1960 // small-group cap above is not already tighter, so REML is not handed
1961 // noise-fitting capacity it does not need. An explicit `k`/`basis_dim`
1962 // overrides this (parse_ps_internal_knots); `re` is the raw linear
1963 // effect handled above.
1964 let fs_default_internal = FACTOR_SMOOTH_DEFAULT_BASIS_DIM
1965 .saturating_sub(degree + 1)
1966 .max(1);
1967 capped.min(fs_default_internal)
1968 };
1969 let (n_knots, _, effective_degree) =
1970 parse_ps_internal_knots(options, degree, default_internal)?;
1971 let penalty_order = option_usize(options, "penalty_order")
1972 .unwrap_or(if effective_degree > 1 { 2 } else { 1 })
1973 .min(effective_degree);
1974 // All factor-smooth flavours (`fs`, `sz`, `re`) place their per-level
1975 // marginal on the SAME penalized B-spline (P-spline) basis. The flavours
1976 // differ ONLY in their penalty/constraint structure (handled below) —
1977 // sz: zero-sum deviation blocks with the per-level null space left
1978 // unpenalized; fs: random-effect double penalty; re: identity ridge.
1979 //
1980 // `sz` USED to route its default-degree marginal to a NATURAL cubic
1981 // regression spline (`cr`), on the belief that mgcv's `bs="sz"` does the
1982 // same and that cr recovers smooth signals more efficiently than the
1983 // (then uncapped) B-spline margin (#1074). That introduced a consistency
1984 // failure (#1605): the `cr` basis enforces the natural boundary
1985 // conditions f''(x_1)=f''(x_k)=0 and extrapolates linearly past the end
1986 // knots, so it CANNOT represent a per-group deviation curve with non-zero
1987 // curvature at the data boundary. Phase-shifted deviation shapes
1988 // (f''(0) = -(2π)² sin(φ) ≠ 0) are then biased toward "free linear +
1989 // anchored wiggle", under-shooting the amplitude — a bias that does NOT
1990 // vanish as n→∞ (n-independent: a genuine consistency failure, not
1991 // finite-sample shrinkage). The earlier #700/#1074 sz fixtures used
1992 // d_g ∝ sin(2πx), whose f'' happens to vanish at x=0 and x=1, so they
1993 // accidentally satisfied the natural BC and never exposed the gap; the
1994 // `fs` sibling, on this very B-spline marginal, recovers the SAME
1995 // phase-shifted data to the noise floor.
1996 //
1997 // The penalized B-spline marginal makes no boundary assumption, so it
1998 // represents arbitrary deviation shapes, and — with the
1999 // FACTOR_SMOOTH_DEFAULT_BASIS_DIM cap above already removing the
2000 // noise-fitting capacity that originally motivated leaving B-splines —
2001 // it recovers the BC-satisfying #700/#1074 signals just as well. Sharing
2002 // one marginal basis across all flavours also lets the B-spline degree/
2003 // knot degradation handle low-cardinality covariates uniformly (what
2004 // `fs` already does), so the `sz`-only cr data-support cap (#1541/#1542)
2005 // — and the asymmetry where only the cr-marginal `sz` spelling hard-
2006 // failed a 3-level ordinal — is no longer needed.
2007 let marginal_knotspec = resolve_nonperiodic_bspline_knotspec(
2008 options,
2009 ds.values.column(c),
2010 (minv, maxv),
2011 effective_degree,
2012 n_knots,
2013 )?;
2014 let marginal = BSplineBasisSpec {
2015 degree: effective_degree,
2016 penalty_order,
2017 knotspec: marginal_knotspec,
2018 // mgcv's `bs="fs"` is a random-effect-style smooth: EVERY per-level
2019 // coefficient, including the marginal null space, is penalized so
2020 // unobserved groups can be predicted — so `fs` keeps the null-space
2021 // (double) penalty. mgcv's `bs="sz"` is a pure across-level
2022 // *deviation* smooth that, under the default `select=FALSE`, leaves
2023 // the per-level null space UNPENALIZED; carrying the double penalty
2024 // there shrinks the genuine deviation signal and over-smooths the
2025 // recovered curves relative to mgcv (gam#700). `re` carries its own
2026 // identity ridge below and ignores this flag. Honour an explicit
2027 // user `double_penalty=` either way.
2028 double_penalty: option_bool(options, "double_penalty")
2029 .unwrap_or(type_opt.as_str() != "sz"),
2030 identifiability: BSplineIdentifiability::None,
2031 boundary_conditions: Default::default(),
2032 boundary: OneDimensionalBoundary::Open,
2033 };
2034 let flavour = match type_opt.as_str() {
2035 "fs" => FactorSmoothFlavour::Fs {
2036 m_null_penalty_orders: vec![
2037 option_usize(options, "m").unwrap_or(DEFAULT_PENALTY_ORDER),
2038 ],
2039 },
2040 "sz" => FactorSmoothFlavour::Sz,
2041 "re" => FactorSmoothFlavour::Re,
2042 // Outer `matches!` already restricts to fs/sz/re.
2043 other => {
2044 return Err(format!(
2045 "internal: factor-smooth flavour dispatch reached unexpected type `{}`",
2046 other
2047 ));
2048 }
2049 };
2050 return Ok(SmoothBasisSpec::FactorSmooth {
2051 spec: FactorSmoothSpec {
2052 continuous_cols: vec![c],
2053 group_col: cols[group_idx],
2054 marginal,
2055 flavour,
2056 group_frozen_levels: None,
2057 frozen_global_orthogonality: None,
2058 },
2059 });
2060 }
2061
2062 match type_opt.as_str() {
2063 "cyclic" | "cc" | "cp" | "cyclic-ps" => {
2064 validate_known_options(
2065 "cyclic",
2066 options,
2067 &[
2068 "type",
2069 "bs",
2070 "by",
2071 "k",
2072 "basis_dim",
2073 "basis-dim",
2074 "basisdim",
2075 "degree",
2076 "penalty_order",
2077 "period",
2078 "periods",
2079 "period_start",
2080 "period_end",
2081 "start",
2082 "end",
2083 "origin",
2084 "origins",
2085 "period_origin",
2086 "period-origin",
2087 "domain_origin",
2088 "double_penalty",
2089 "id",
2090 "__by_col",
2091 "identifiability",
2092 ],
2093 )?;
2094 if cols.len() != 1 {
2095 return Err(format!(
2096 "periodic smooth expects one variable, got {}",
2097 cols.len()
2098 ));
2099 }
2100 let c = cols[0];
2101 let (minv, maxv) = col_minmax(ds.values.column(c))?;
2102 let degree = option_usize(options, "degree").unwrap_or(DEFAULT_BSPLINE_DEGREE);
2103 let mut default_internal = heuristic_knots_for_column(ds.values.column(c));
2104 if ds.values.nrows() <= 32 && smooth_coordinate_count >= 5 {
2105 default_internal = default_internal.min(1);
2106 }
2107 // A periodic cubic spline has no free endpoint behaviour to spend
2108 // degrees of freedom on: the wrap constraint removes the ordinary
2109 // boundary wiggle, and the cyclic second-difference penalty leaves
2110 // only the constant direction (handled by the smooth
2111 // identifiability constraint). An over-rich default would give
2112 // small binomial/continuation-ratio fits a large penalized nuisance
2113 // space whose REML/LAML optimum is driven by finite-sample Bernoulli
2114 // noise rather than the low-frequency periodic signal. Cap the
2115 // cyclic default in the mgcv `bs="cc"` spirit: a modest basis unless
2116 // the caller explicitly requests `k=...`; high-frequency periodic
2117 // structure remains available through that explicit contract. Since
2118 // gam#1680 lowered the open-spline univariate default to ≈12
2119 // functions this cap and the open-spline default coincide, so it now
2120 // acts as an explicit floor/guard that keeps the cyclic default lean
2121 // even if the open-spline heuristic is later widened.
2122 let cyclic_default_basis_cap = CYCLIC_DEFAULT_BASIS_DIM.max(degree + 1);
2123 let default_basis = (default_internal + degree + 1).min(cyclic_default_basis_cap);
2124 let num_basis = option_usize_any(options, &["k", "basis_dim", "basis-dim", "basisdim"])
2125 .unwrap_or(default_basis);
2126 if num_basis < degree + 1 {
2127 return Err(format!(
2128 "periodic smooth: k={} too small for degree {}; expected k >= {}",
2129 num_basis,
2130 degree,
2131 degree + 1
2132 ));
2133 }
2134 // The cyclic arm is periodic on its single axis by construction, so
2135 // resolve the period exactly the way the `s()`/`ps` arm does: honour
2136 // `period=`/`periods=` first (with `origin=` setting the domain
2137 // start), and fall back to the `period_start`/`period_end` endpoint
2138 // form only when `period=` is absent. Previously this arm jumped
2139 // straight to `parse_periodic_domain_1d`, so a `period=<v>`
2140 // declaration was silently dropped and the smooth wrapped at the
2141 // data range (#816). All three helpers route through
2142 // `parse_numeric_expr`, so `period=2*pi` and `period_end=2*pi` parse
2143 // identically (#815).
2144 let periodic_axes = [true];
2145 let periods = parse_periods(options, &periodic_axes)?;
2146 let origins = parse_period_origins(options, &periodic_axes)?;
2147 // Distinguish a *cyclic basis selector* (`bs='cc'`/`cp'`/`cyclic`,
2148 // this whole arm) from a generic B-spline forced periodic by a
2149 // `periodic=`/`boundary=` flag (the `ps`/`bspline` arm). Only the
2150 // latter carries the sample-dependent off-by-ε seam that #1771's
2151 // guard in `parse_periodic_domain_1d` requires an explicit period
2152 // to avoid. A bare `s(x, bs='cc')` opts INTO mgcv's `bs="cc"`
2153 // semantics — the wrap IS the observed data range — exactly like
2154 // the tensor cc-margin fallback (`te(x, z, bs=c('cc','cc'))`). The
2155 // cyclic arm was left routing through the now-strict helper when
2156 // #1771 tightened it, so a bare cyclic smooth hard-errored with
2157 // "periodic B-spline smooth requires an explicit period" even
2158 // though its period is well-defined. Honor `period=`/`periods=`
2159 // first, then the half-open `period_start`/`period_end` endpoint
2160 // form, and only otherwise wrap at the observed `[min, max]` span.
2161 let has_endpoint_decl = ["period_start", "start", "period_end", "end"]
2162 .iter()
2163 .any(|key| options.contains_key(*key));
2164 let (domain_start, period) = if let Some(p) = periods[0] {
2165 (origins[0].unwrap_or(minv), p)
2166 } else if has_endpoint_decl {
2167 parse_periodic_domain_1d(options, minv, maxv)?
2168 } else {
2169 let span = maxv - minv;
2170 if !(span.is_finite() && span > 0.0) {
2171 return Err(format!(
2172 "cyclic smooth requires a positive observed data range to derive \
2173 its period, got [{minv}, {maxv}]"
2174 ));
2175 }
2176 (origins[0].unwrap_or(minv), span)
2177 };
2178 Ok(SmoothBasisSpec::BSpline1D {
2179 feature_col: c,
2180 spec: BSplineBasisSpec {
2181 degree,
2182 penalty_order: option_usize(options, "penalty_order")
2183 .unwrap_or(DEFAULT_PENALTY_ORDER),
2184 knotspec: BSplineKnotSpec::PeriodicUniform {
2185 data_range: (domain_start, domain_start + period),
2186 num_basis,
2187 },
2188 double_penalty: smooth_double_penalty,
2189 identifiability: BSplineIdentifiability::default(),
2190 boundary_conditions: Default::default(),
2191 boundary: OneDimensionalBoundary::Cyclic {
2192 start: domain_start,
2193 end: domain_start + period,
2194 },
2195 },
2196 })
2197 }
2198 "bspline" | "ps" | "p-spline" | "cr" | "cs" => {
2199 // mgcv's `bs="cr"` (cubic regression spline) and `bs="cs"` (its
2200 // shrinkage twin) are penalized cubic-regression smooths that span
2201 // the same per-axis function space as gamfit's `bspline` (cubic
2202 // B-spline, second-derivative penalty). Route both through the
2203 // 1-D B-spline arm; the only semantic difference is whether the
2204 // null space is shrunk: `cr` is the no-shrinkage form (mgcv's
2205 // default) and `cs` is the shrinkage form (mgcv's `cs`/gamfit's
2206 // double_penalty). Without this route, a stand-alone
2207 // `s(x, bs='cr')` (which is otherwise a routine 1-D smooth in
2208 // mgcv-compatible formulae) reached the dispatch's default arm
2209 // and aborted the whole fit with `unsupported smooth type 'cr'`,
2210 // even though the same name was already recognized as a tensor
2211 // margin (`tensor_margin_bs_is_supported`).
2212 let validation_name = match type_opt.as_str() {
2213 "cr" => "cr",
2214 "cs" => "cs",
2215 _ => "bspline",
2216 };
2217 validate_known_options(
2218 validation_name,
2219 options,
2220 &[
2221 "type",
2222 "bs",
2223 "by",
2224 "k",
2225 "basis_dim",
2226 "basis-dim",
2227 "basisdim",
2228 "knots",
2229 "knot_placement",
2230 "knot-placement",
2231 "knotplacement",
2232 "degree",
2233 "penalty_order",
2234 "boundary",
2235 "bc",
2236 "boundary_conditions",
2237 "bc_left",
2238 "bc_right",
2239 "left_bc",
2240 "right_bc",
2241 "start_bc",
2242 "end_bc",
2243 "side",
2244 "anchor",
2245 "anchor_value",
2246 "value",
2247 "anchor_left",
2248 "left_anchor",
2249 "anchor_right",
2250 "right_anchor",
2251 "periodic",
2252 "period",
2253 "periods",
2254 "period_start",
2255 "period_end",
2256 "origin",
2257 "double_penalty",
2258 "by",
2259 "id",
2260 "__by_col",
2261 "identifiability",
2262 "by",
2263 ],
2264 )?;
2265 if cols.len() != 1 {
2266 return Err(TermBuilderError::incompatible_config(format!(
2267 "bspline smooth expects one variable, got {}",
2268 cols.len()
2269 ))
2270 .to_string());
2271 }
2272 let c = cols[0];
2273 let (minv, maxv) = col_minmax(ds.values.column(c))?;
2274 let degree = option_usize(options, "degree").unwrap_or(DEFAULT_BSPLINE_DEGREE);
2275 let default_internal = heuristic_knots_for_column(ds.values.column(c));
2276 let (mut n_knots, inferred, effective_degree) =
2277 parse_ps_internal_knots(options, degree, default_internal)?;
2278 let periodic_axes = parse_periodic_axes(options, 1).map_err(|e| e.to_string())?;
2279 // Periodic margins still need enough basis functions to wrap, so
2280 // surface the per-axis degree reduction as a config error when the
2281 // user explicitly asked for a periodic-but-too-small basis. The
2282 // non-periodic path silently degrades degree to match mgcv.
2283 if periodic_axes[0] && effective_degree != degree {
2284 return Err(TermBuilderError::invalid_option(format!(
2285 "periodic smooth: k={} too small for degree {}; expected k >= {}",
2286 effective_degree + 1,
2287 degree,
2288 degree + 1
2289 ))
2290 .to_string());
2291 }
2292 if inferred && ds.values.nrows() <= 32 && smooth_coordinate_count >= 5 {
2293 n_knots = n_knots.min(1);
2294 }
2295 if inferred {
2296 let unique = unique_count_column(ds.values.column(c));
2297 let ceiling = ((unique as f64).cbrt() as usize).max(20);
2298 inference_notes.push(format!(
2299 "Automatically set {} internal knots for smooth '{}' from {} unique values (rule: clamp(unique/4, 4..max(20, cbrt(unique))) = clamp(unique/4, 4..{})). Override with knots=... or k=....",
2300 n_knots,
2301 vars.join(","),
2302 unique,
2303 ceiling,
2304 ));
2305 }
2306 let boundary_conditions =
2307 if periodic_axes[0] && bspline_boundary_declares_periodic_axis(options) {
2308 BSplineBoundaryConditions::default()
2309 } else {
2310 parse_bspline_boundary_conditions(options).map_err(|e| e.to_string())?
2311 };
2312 let periods = parse_periods(options, &periodic_axes).map_err(|e| e.to_string())?;
2313 let origins =
2314 parse_period_origins(options, &periodic_axes).map_err(|e| e.to_string())?;
2315 let (knotspec, boundary) = if periodic_axes[0] {
2316 if !boundary_conditions.is_free() {
2317 return Err(TermBuilderError::incompatible_config(
2318 "periodic B-splines cannot also declare endpoint boundary conditions",
2319 )
2320 .to_string());
2321 }
2322 {
2323 let (domain_start, p_value) = if periods[0].is_some() {
2324 (origins[0].unwrap_or(minv), periods[0].unwrap())
2325 } else {
2326 parse_periodic_domain_1d(options, minv, maxv).map_err(|e| e.to_string())?
2327 };
2328 let domain_end = domain_start + p_value;
2329 (
2330 BSplineKnotSpec::PeriodicUniform {
2331 data_range: (domain_start, domain_end),
2332 num_basis: n_knots + effective_degree + 1,
2333 },
2334 OneDimensionalBoundary::Cyclic {
2335 start: domain_start,
2336 end: domain_end,
2337 },
2338 )
2339 }
2340 } else if type_opt == "cr" || type_opt == "cs" {
2341 // mgcv `bs="cr"`/`"cs"`: a natural cubic regression spline whose
2342 // basis is indexed by `k` values at quantile-placed knots (#1074),
2343 // NOT a B-spline knot vector. Match gam's `k=` convention by
2344 // requesting the same total basis size the B-spline arm would
2345 // produce (`n_knots` internal + degree + 1), floored at the cr
2346 // minimum of 3 knots. `cr` vs `cs` (shrinkage) is carried by the
2347 // `double_penalty` flag resolved below, which the cr builder reads.
2348 //
2349 // Cap that request to the covariate's data support (#1541): a cr
2350 // basis cannot place more value-knots than there are distinct
2351 // covariate values, so an unclamped `k` on a low-cardinality
2352 // predictor (binary indicator, 3-level ordinal, small count) used
2353 // to hard-fail in `select_cr_knots` instead of reducing like mgcv
2354 // and gam's tensor path. Below the cr minimum (a binary covariate)
2355 // degrade to the B-spline marginal the default `s(x, k=..)` basis
2356 // already fits on the same data — never a hard error.
2357 let k_cr = (n_knots + effective_degree + 1).max(CR_MIN_KNOTS);
2358 let knotspec = match capped_cr_marginal_knotspec(
2359 ds.values.column(c),
2360 k_cr,
2361 &vars.join(","),
2362 inference_notes,
2363 )? {
2364 Some(cr_knotspec) => cr_knotspec,
2365 None => resolve_nonperiodic_bspline_knotspec(
2366 options,
2367 ds.values.column(c),
2368 (minv, maxv),
2369 effective_degree,
2370 n_knots,
2371 )?,
2372 };
2373 (knotspec, parse_cyclic_boundary(options, minv, maxv)?)
2374 } else {
2375 (
2376 resolve_nonperiodic_bspline_knotspec(
2377 options,
2378 ds.values.column(c),
2379 (minv, maxv),
2380 effective_degree,
2381 n_knots,
2382 )?,
2383 parse_cyclic_boundary(options, minv, maxv)?,
2384 )
2385 };
2386 // mgcv `bs="cr"` does not shrink the linear null space; only `cs`
2387 // (and the gamfit-flavoured `bspline`/`ps`) do. Honour an explicit
2388 // `double_penalty=` either way.
2389 let double_penalty = if type_opt == "cr" {
2390 option_bool(options, "double_penalty").unwrap_or(false)
2391 } else {
2392 smooth_double_penalty
2393 };
2394 // Clamp the marginal difference penalty to `<= effective_degree`
2395 // so it stays well-defined when the per-axis degree was reduced
2396 // (mirrors the tensor margin path: `create_difference_penalty_matrix`
2397 // requires order < num_basis_functions).
2398 let penalty_order = option_usize(options, "penalty_order")
2399 .unwrap_or(DEFAULT_PENALTY_ORDER)
2400 .min(effective_degree);
2401 Ok(SmoothBasisSpec::BSpline1D {
2402 feature_col: c,
2403 spec: BSplineBasisSpec {
2404 degree: effective_degree,
2405 penalty_order,
2406 knotspec,
2407 double_penalty,
2408 identifiability: BSplineIdentifiability::default(),
2409 boundary,
2410 boundary_conditions,
2411 },
2412 })
2413 }
2414 "tps" | "thinplate" | "thin-plate" => {
2415 validate_known_options(
2416 "thinplate",
2417 options,
2418 &[
2419 SECONDARY_CENTER_CAP_OPTION,
2420 "type",
2421 "bs",
2422 "by",
2423 "length_scale",
2424 "centers",
2425 "k",
2426 "basis_dim",
2427 "basis-dim",
2428 "basisdim",
2429 "knots",
2430 "include_intercept",
2431 "double_penalty",
2432 "by",
2433 "id",
2434 "__by_col",
2435 "identifiability",
2436 "by",
2437 "periodic",
2438 "cyclic",
2439 "period",
2440 "period_start",
2441 "period_end",
2442 "scale_dims",
2443 ],
2444 )?;
2445 let plan = plan_spatial_basis(
2446 ds.values.nrows(),
2447 cols.len(),
2448 CenterCountRequest::Default,
2449 DuchonNullspaceOrder::Linear,
2450 option_bool(options, "scale_dims").unwrap_or(false),
2451 policy,
2452 )
2453 .map_err(|e| e.to_string())?;
2454 // #1074: the mgcv-sized basis cap (`k = 10·3^(d-1)`) that used to live
2455 // here was DELETED. It masked the real defect — the n-scaling default
2456 // over-sizes a thin-plate field, producing a weakly-identified
2457 // two-penalty ρ-surface the outer optimizer stalls on (row-order
2458 // dependent, #1378), and surplus columns REML can't penalize away on
2459 // weak-signal fits. Capping the basis hid that stall instead of fixing
2460 // it. The default now uses the generic spatial center heuristic; the
2461 // root fix (a well-identified ρ-surface / optimizer that doesn't stall)
2462 // is tracked separately. Explicit `k`/`centers` still take full effect.
2463 let default_centers = plan.centers;
2464 let centers = parse_countwith_basis_alias(
2465 options,
2466 "centers",
2467 cap_default_spatial_centers(options, default_centers),
2468 )?;
2469 let center_strategy = if has_explicit_countwith_basis_alias(options, "centers") {
2470 spatial_center_strategy_for_dimension(centers, cols.len())
2471 } else {
2472 auto_spatial_center_strategy(centers, cols.len())
2473 };
2474 Ok(SmoothBasisSpec::ThinPlate {
2475 feature_cols: cols.to_vec(),
2476 spec: ThinPlateBasisSpec {
2477 center_strategy,
2478 periodic: parse_periodic_axes_option(options, cols.len())?,
2479 // Sentinel: leave at 0.0 when the user didn't pass an
2480 // explicit length_scale so `auto_init_length_scale_in_place`
2481 // can replace it with a data-derived initialization. The
2482 // old hard-coded 1.0 was the documented basin (see
2483 // smooth.rs `auto_init_length_scale_in_place`) that the
2484 // spatial optimizer could not escape, leaving TPS terms
2485 // initialized off the data scale.
2486 length_scale: option_f64(options, "length_scale").unwrap_or(0.0),
2487 double_penalty: smooth_double_penalty,
2488 identifiability: parse_spatial_identifiability(options)
2489 .map_err(|e| e.to_string())?,
2490 radial_reparam: None,
2491 },
2492 input_scales: None,
2493 })
2494 }
2495 "sphere" | "s2" | "sos" => {
2496 validate_known_options(
2497 "sphere",
2498 options,
2499 &[
2500 "type",
2501 "bs",
2502 "by",
2503 "centers",
2504 "k",
2505 "basis_dim",
2506 "basis-dim",
2507 "basisdim",
2508 "knots",
2509 "penalty_order",
2510 "m",
2511 "double_penalty",
2512 "id",
2513 "__by_col",
2514 "kernel",
2515 "method",
2516 "radians",
2517 "units",
2518 "degree",
2519 "l",
2520 "max_degree",
2521 "max-degree",
2522 ],
2523 )?;
2524 if cols.len() != 2 {
2525 return Err(format!(
2526 "sphere smooth expects exactly two variables (lat, lon), got {}",
2527 cols.len()
2528 ));
2529 }
2530 let radians = option_bool(options, "radians").unwrap_or_else(|| {
2531 options
2532 .get("units")
2533 .map(|u| u.eq_ignore_ascii_case("radian") || u.eq_ignore_ascii_case("radians"))
2534 .unwrap_or(false)
2535 });
2536 // An explicit `degree`/`l`/`max_degree` names a spherical-harmonic
2537 // truncation, so with no explicit kernel/method it selects the
2538 // Harmonic construction (the Wahba kernel ignores `degree` and would
2539 // silently emit a 1-column kernel design). An explicit kernel/method
2540 // still wins.
2541 let degree_requested = options.contains_key("degree")
2542 || options.contains_key("l")
2543 || options.contains_key("max_degree")
2544 || options.contains_key("max-degree");
2545 let kernel = options
2546 .get("kernel")
2547 .or_else(|| options.get("method"))
2548 .map(|raw| strip_quotes(raw).trim().to_ascii_lowercase())
2549 .unwrap_or_else(|| {
2550 if degree_requested {
2551 "harmonic".to_string()
2552 } else {
2553 "sobolev".to_string()
2554 }
2555 });
2556 let (method, wahba_kernel) = match kernel.as_str() {
2557 "sobolev" | "wahba" | "wahba_sobolev" | "wahba-sobolev" => {
2558 (SphereMethod::Wahba, SphereWahbaKernel::Sobolev)
2559 }
2560 "pseudo" | "mgcv" | "sos" | "wahba_pseudo" | "wahba-pseudo" => {
2561 (SphereMethod::Wahba, SphereWahbaKernel::Pseudo)
2562 }
2563 "harmonic" | "spherical_harmonic" | "spherical-harmonic" => {
2564 (SphereMethod::Harmonic, SphereWahbaKernel::Sobolev)
2565 }
2566 other => {
2567 return Err(format!(
2568 "unsupported sphere kernel '{other}'; expected sobolev, pseudo, or harmonic"
2569 ));
2570 }
2571 };
2572 let max_degree = if matches!(method, SphereMethod::Harmonic) {
2573 let degree =
2574 option_usize_any(options, &["degree", "l", "max_degree", "max-degree"])
2575 .or_else(|| option_usize(options, "centers"))
2576 .or_else(|| {
2577 option_usize_any(options, &["k", "basis_dim", "basis-dim", "basisdim"])
2578 .and_then(|k| (1..=128).find(|&l| l * (l + 2) >= k))
2579 })
2580 .unwrap_or_else(|| default_spherical_harmonic_degree(ds.values.nrows()));
2581 if degree == 0 {
2582 return Err("sphere smooth requires degree/max_degree >= 1".to_string());
2583 }
2584 if degree > 32 {
2585 return Err(format!(
2586 "sphere smooth max_degree={} is too large for the dense harmonic engine (limit 32)",
2587 degree
2588 ));
2589 }
2590 Some(degree)
2591 } else {
2592 None
2593 };
2594 let penalty_order = option_usize(options, "penalty_order")
2595 .or_else(|| option_usize(options, "m"))
2596 .unwrap_or(DEFAULT_PENALTY_ORDER);
2597 let center_strategy = if matches!(method, SphereMethod::Wahba) {
2598 let mut centers = parse_countwith_basis_alias(
2599 options,
2600 "centers",
2601 default_num_centers(ds.values.nrows(), cols.len()),
2602 )?;
2603 if penalty_order >= 4 {
2604 centers = centers.max(30);
2605 }
2606 CenterStrategy::FarthestPoint {
2607 num_centers: centers,
2608 }
2609 } else {
2610 CenterStrategy::FarthestPoint { num_centers: 0 }
2611 };
2612 Ok(SmoothBasisSpec::Sphere {
2613 feature_cols: cols.to_vec(),
2614 spec: SphericalSplineBasisSpec {
2615 center_strategy,
2616 penalty_order,
2617 double_penalty: smooth_double_penalty,
2618 radians,
2619 method,
2620 max_degree,
2621 wahba_kernel,
2622 identifiability: SphericalSplineIdentifiability::CenterSumToZero,
2623 },
2624 })
2625 }
2626 "curvature" => {
2627 // Constant-curvature (M_κ) geodesic-kernel smooth (#944): the
2628 // κ-generic sibling of the intrinsic S² smooth above. The feature
2629 // columns are κ-stereographic chart coordinates; `kappa=` is the
2630 // fixed sectional curvature (default 0 = flat), and the geometry
2631 // comes from `geometry::constant_curvature::ConstantCurvature`.
2632 validate_known_options(
2633 "curvature",
2634 options,
2635 &[
2636 "type",
2637 "bs",
2638 "by",
2639 "centers",
2640 "k",
2641 "basis_dim",
2642 "basis-dim",
2643 "basisdim",
2644 "knots",
2645 "kappa",
2646 "length_scale",
2647 "double_penalty",
2648 "id",
2649 "__by_col",
2650 ],
2651 )?;
2652 let kappa = option_f64(options, "kappa").unwrap_or(0.0);
2653 if !kappa.is_finite() {
2654 return Err("curvature smooth requires a finite kappa".to_string());
2655 }
2656 let length_scale = option_f64(options, "length_scale").unwrap_or(0.0);
2657 if !length_scale.is_finite() || length_scale < 0.0 {
2658 return Err(format!(
2659 "curvature smooth length_scale must be positive (or omitted for auto); got {length_scale}"
2660 ));
2661 }
2662 let centers = parse_countwith_basis_alias(
2663 options,
2664 "centers",
2665 default_num_centers(ds.values.nrows(), cols.len()),
2666 )?;
2667 if centers < 2 {
2668 return Err("curvature smooth requires at least 2 centers".to_string());
2669 }
2670 Ok(SmoothBasisSpec::ConstantCurvature {
2671 feature_cols: cols.to_vec(),
2672 spec: ConstantCurvatureBasisSpec {
2673 center_strategy: CenterStrategy::FarthestPoint {
2674 num_centers: centers,
2675 },
2676 kappa,
2677 // 0.0 sentinel = κ-independent auto initialization in the
2678 // basis builder (median chart center spacing, doubled).
2679 length_scale,
2680 // Curvature smooth defaults to NO double-penalty ridge
2681 // (#1464): the curvature-blind ridge `I` absorbs the data fit
2682 // independently of κ and rails the fitted curvature to the
2683 // +chart bound (hyperbolic truth recovered as spherical). The
2684 // RKHS Gram penalty is already full-rank PD, so the ridge adds
2685 // no stability. Honour an EXPLICIT `double_penalty=` only.
2686 double_penalty: option_bool(options, "double_penalty").unwrap_or(false),
2687 identifiability: ConstantCurvatureIdentifiability::CenterSumToZero,
2688 },
2689 })
2690 }
2691 "measurejet" => {
2692 // Measure-jet spline: multiscale local-jet-residual energy of the
2693 // empirical measure. The feature columns are ambient coordinates
2694 // of data concentrated near an unknown low-dimensional set; the
2695 // geometry (centers, masses, scale band) is read off the measure
2696 // at build time — magic by default, every option optional.
2697 validate_known_options(
2698 "measurejet",
2699 options,
2700 &[
2701 "type",
2702 "bs",
2703 "by",
2704 "centers",
2705 "k",
2706 "basis_dim",
2707 "basis-dim",
2708 "basisdim",
2709 "knots",
2710 "s",
2711 "alpha",
2712 "tau",
2713 "scales",
2714 "length_scale",
2715 "double_penalty",
2716 "multiscale",
2717 "learn_length_scale",
2718 "id",
2719 "__by_col",
2720 ],
2721 )?;
2722 let order_s = option_f64(options, "s").unwrap_or(0.0);
2723 // 0.0 = auto sentinel; explicit values must sit inside the
2724 // admissible order interval of the affine-jet (r = 2) energy.
2725 if !(order_s.is_finite() && (order_s == 0.0 || (order_s > 0.0 && order_s < 2.0))) {
2726 return Err(format!(
2727 "measurejet smooth s must lie in (0, 2) (or be omitted for auto); got {order_s}"
2728 ));
2729 }
2730 // Default to the spec Default (α = 1, density-WEIGHTED Hessian
2731 // energy — the module-header default). The density-free α = 3/2
2732 // (q^{−2}) over-smooths low-intrinsic-dimension manifolds where the
2733 // local mass q is tiny and varies along the stratum (#1116:
2734 // 13×-worse-than-matérn on a 1-D curve in 3-D); α = 1's q^{−1} is
2735 // gentler and robust across intrinsic dimensions. An explicit
2736 // `alpha=` still overrides for full-dimensional density-free use.
2737 let alpha =
2738 option_f64(options, "alpha").unwrap_or(MeasureJetBasisSpec::default().alpha);
2739 if !alpha.is_finite() {
2740 return Err("measurejet smooth requires a finite alpha".to_string());
2741 }
2742 let tau0 = option_f64(options, "tau").unwrap_or(1e-3);
2743 if !(tau0.is_finite() && tau0 >= 0.0) {
2744 return Err(format!(
2745 "measurejet smooth tau must be finite and nonnegative; got {tau0}"
2746 ));
2747 }
2748 let num_scales = option_usize(options, "scales").unwrap_or(0);
2749 let length_scale = option_f64(options, "length_scale").unwrap_or(0.0);
2750 if !length_scale.is_finite() || length_scale < 0.0 {
2751 return Err(format!(
2752 "measurejet smooth length_scale must be positive (or omitted for auto); got {length_scale}"
2753 ));
2754 }
2755 let centers = parse_countwith_basis_alias(
2756 options,
2757 "centers",
2758 default_num_centers(ds.values.nrows(), cols.len()),
2759 )?;
2760 if centers < 3 {
2761 return Err("measurejet smooth requires at least 3 centers".to_string());
2762 }
2763 // Multiscale (per-scale spectral split + (α, lnτ) ψ dials + the
2764 // affine-preserving ridge) is an explicit opt-in (#1116): default
2765 // single-scale at any center count, the Duchon/Matérn footprint.
2766 let multiscale = option_bool(options, "multiscale").unwrap_or(false);
2767 // REML-learning the representer range ℓ is an explicit opt-in.
2768 // The stable default freezes ℓ at the auto/user value; the
2769 // design-moving coordinate is expensive and can overfit low-signal
2770 // surfaces when enabled implicitly.
2771 let learn_length_scale = option_bool(options, "learn_length_scale").unwrap_or(false);
2772 Ok(SmoothBasisSpec::MeasureJet {
2773 feature_cols: cols.to_vec(),
2774 spec: MeasureJetBasisSpec {
2775 center_strategy: CenterStrategy::FarthestPoint {
2776 num_centers: centers,
2777 },
2778 order_s,
2779 alpha,
2780 tau0,
2781 num_scales,
2782 // 0.0 sentinel = auto initialization in the basis builder
2783 // (median nearest-center spacing).
2784 length_scale,
2785 double_penalty: smooth_double_penalty,
2786 learn_length_scale,
2787 multiscale,
2788 identifiability: MeasureJetIdentifiability::CenterSumToZero,
2789 frozen_quadrature: None,
2790 },
2791 input_scales: None,
2792 })
2793 }
2794 "matern" => {
2795 // Catch typos like `lengt_scale=` / `nyu=` / `centerz=` before
2796 // they get silently ignored and the user wonders why their
2797 // option had no effect. The matern() term accepts exactly
2798 // these options.
2799 validate_known_options(
2800 "matern",
2801 options,
2802 &[
2803 SECONDARY_CENTER_CAP_OPTION,
2804 "type",
2805 "bs",
2806 "by",
2807 "nu",
2808 "length_scale",
2809 "centers",
2810 "k",
2811 "basis_dim",
2812 "basis-dim",
2813 "basisdim",
2814 "knots",
2815 "include_intercept",
2816 "double_penalty",
2817 "by",
2818 "id",
2819 "__by_col",
2820 "identifiability",
2821 "by",
2822 "periodic",
2823 "cyclic",
2824 "period",
2825 "period_start",
2826 "period_end",
2827 "scale_dims",
2828 ],
2829 )?;
2830 let plan = plan_spatial_basis(
2831 ds.values.nrows(),
2832 cols.len(),
2833 CenterCountRequest::Default,
2834 DuchonNullspaceOrder::Zero,
2835 option_bool(options, "scale_dims").unwrap_or(false),
2836 policy,
2837 )
2838 .map_err(|e| e.to_string())?;
2839 let centers = parse_countwith_basis_alias(
2840 options,
2841 "centers",
2842 cap_default_spatial_centers(
2843 options,
2844 default_matern_center_count(ds.values.nrows(), cols.len(), plan.centers),
2845 ),
2846 )?;
2847 let center_strategy = if has_explicit_countwith_basis_alias(options, "centers") {
2848 spatial_center_strategy_for_dimension(centers, cols.len())
2849 } else {
2850 auto_spatial_center_strategy(centers, cols.len())
2851 };
2852 let nu = parse_matern_nu(options.get("nu").map(String::as_str).unwrap_or("5/2"))?;
2853 // The exponential (ν = 1/2) Matérn kernel has a singular Laplacian
2854 // at zero in d ≥ 2, so the operator-collocation penalty machinery
2855 // hits a non-invertible matrix during fit. Surface the cause
2856 // up-front instead of letting the user see the generic
2857 // "Matrix conditioning issue detected" wrapper from PIRLS.
2858 if matches!(nu, MaternNu::Half) && cols.len() >= 2 {
2859 return Err(TermBuilderError::unsupported_feature(format!(
2860 "matern() with nu=1/2 is not supported for d>=2 (got {} covariates): \
2861 the exponential kernel's Laplacian is singular at center collisions, \
2862 which makes the operator-collocation penalty non-invertible. \
2863 Choose nu>=3/2 (e.g. nu=3/2 or the default nu=5/2) for multi-dimensional smooths.",
2864 cols.len()
2865 ))
2866 .to_string());
2867 }
2868 let aniso_log_scales = if option_bool(options, "scale_dims").unwrap_or(false) {
2869 Some(vec![0.0; cols.len()])
2870 } else {
2871 None
2872 };
2873 Ok(SmoothBasisSpec::Matern {
2874 feature_cols: cols.to_vec(),
2875 spec: MaternBasisSpec {
2876 center_strategy,
2877 periodic: parse_periodic_axes_option(options, cols.len())?,
2878 // Sentinel: leave at 0.0 when the user didn't pass an
2879 // explicit length_scale so the planner's
2880 // `auto_init_length_scale_in_place` can replace it with the
2881 // SAME data-derived wiggly-side initialization the thin-plate
2882 // path uses (`max_range / sqrt(n)`), then let the κ-optimizer
2883 // refine from there.
2884 //
2885 // gam#1629: the previous `default_matern_length_scale` seeded
2886 // the FULL data diameter — the maximally over-smoothed corner.
2887 // Because that value is non-zero, the `0.0`-gated auto-init was
2888 // a no-op for Matérn, so the κ-optimizer started in the flat
2889 // over-smoothed basin and parked there, leaving high-frequency
2890 // 2-D surfaces unresolved (truth-RMSE ~6× worse than
2891 // thin-plate/tensor on identical data, and insensitive to `k`).
2892 // Routing Matérn through the same `0.0` sentinel as thin-plate
2893 // (see the ThinPlate branch above) starts REML in the resolving
2894 // regime it can actually escape from.
2895 length_scale: option_f64(options, "length_scale").unwrap_or(0.0),
2896 nu,
2897 include_intercept: option_bool(options, "include_intercept").unwrap_or(false),
2898 double_penalty: smooth_double_penalty,
2899 identifiability: parse_matern_identifiability(options)
2900 .map_err(|e| e.to_string())?,
2901 aniso_log_scales,
2902 // Cold build: let the bootstrap-κ spectral test decide whether
2903 // the double-penalty nullspace shrinkage survives; the freeze
2904 // step then pins that decision into the FrozenTransform so the
2905 // κ-optimizer's rebuilds keep the count invariant (gam#787/#860).
2906 nullspace_shrinkage_survived: None,
2907 },
2908 input_scales: None,
2909 })
2910 }
2911 "duchon" | "ds" => {
2912 validate_known_options(
2913 "duchon",
2914 options,
2915 &[
2916 SECONDARY_CENTER_CAP_OPTION,
2917 "type",
2918 "bs",
2919 "by",
2920 "length_scale",
2921 "centers",
2922 "k",
2923 "basis_dim",
2924 "basis-dim",
2925 "basisdim",
2926 "knots",
2927 "power",
2928 "p",
2929 "nullspace_order",
2930 "order",
2931 "identifiability",
2932 "by",
2933 "periodic",
2934 "cyclic",
2935 "period",
2936 "period_start",
2937 "period_end",
2938 "scale_dims",
2939 "double_penalty",
2940 "by",
2941 "id",
2942 "__by_col",
2943 ],
2944 )?;
2945 if options.contains_key("double_penalty") {
2946 return Err(TermBuilderError::incompatible_config(format!(
2947 "Duchon smooth '{}' does not support double_penalty; the Duchon smoother already ships its native reproducing-norm penalty plus a null-space shrinkage ridge.",
2948 vars.join(", ")
2949 ))
2950 .to_string());
2951 }
2952 let requested_nullspace_order = parse_duchon_order(options)?;
2953 let length_scale = option_f64_strict(options, "length_scale")?;
2954 // Resolve `(nullspace_order, power)`. The default (magic) path is a
2955 // structural amplitude/slope/curvature smoother: an affine (`Linear`)
2956 // polynomial nullspace and spectral power `s = (d - 1)/2`, giving the
2957 // cubic kernel `r^3` in 1D. There is no nullspace-order escalation —
2958 // the structural cubic smoother is well-defined for every dimension.
2959 //
2960 // Explicit `power=...` honors the user's value verbatim against their
2961 // requested nullspace order; the kernel validator emits a precise
2962 // diagnostic for any inadmissible combination. In the scale-free
2963 // (non-hybrid) regime fractional powers are admitted and threaded as
2964 // `f64`. The hybrid Duchon-Matérn kernel (`length_scale=Some`) is
2965 // restricted to integer powers.
2966 let (nullspace_order, power) = match parse_duchon_power_policy(options)? {
2967 DuchonPowerPolicy::Explicit(req_power) => {
2968 if length_scale.is_some() && req_power.fract() != 0.0 {
2969 return Err(TermBuilderError::incompatible_config(format!(
2970 "hybrid Duchon-Matern smooth '{}' (length_scale=...) requires an integer power, got power={}; \
2971 drop length_scale to use the scale-free structural kernel with a fractional power.",
2972 vars.join(", "),
2973 req_power,
2974 ))
2975 .to_string());
2976 }
2977 (requested_nullspace_order, req_power)
2978 }
2979 DuchonPowerPolicy::CubicStructuralDefault => {
2980 // Magic cubic rule (REQUEST-LAYER default): no explicit power ⇒
2981 // affine null space + fractional spectral power s = (d-1)/2, i.e.
2982 // the Duchon kernel φ(r)=r³ in every dimension. An EXPLICIT
2983 // `power=0` is handled above and is honored as the s=0 Duchon
2984 // kernel (r²·log r ≡ the thin-plate kernel in even d) — the magic
2985 // default lives here, not in the basis builder.
2986 match length_scale {
2987 None => crate::basis::duchon_cubic_default(cols.len()),
2988 Some(_) => {
2989 // The hybrid Matérn-blended kernel (`length_scale=Some`)
2990 // requires an INTEGER spectral power `s` (the partial-
2991 // fraction split `1/(ρ^{2p}(κ²+ρ²)^s)` is only defined for
2992 // integer `s`). The fractional cubic default `s=(d-1)/2` is
2993 // a half-integer for even `d`, and the basis builder's
2994 // `power_as_usize` maps a NON-integer to `0` (not its
2995 // floor) — so for even `d ≥ 4` the realized kernel has
2996 // `2(p+s) = 2p = 4 ≤ d`, which is non-finite at the origin
2997 // and crashes the fit (historically a non-finite
2998 // eigendecomposition; now a fit-time validation error).
2999 //
3000 // Rather than emit the fractional cubic and let it truncate
3001 // into an inadmissible kernel, resolve the SMALLEST
3002 // admissible integer `(nullspace, s)` at the requested
3003 // nullspace order. The formula default is the same
3004 // native-Gram Duchon smoother as the scale-free path, so
3005 // there is no collocation-operator floor to honor here.
3006 // Users that opt into operator penalties get the stricter
3007 // gate at basis-build time from the requested operators.
3008 let max_op = crate::basis::duchon_max_active_operator_derivative_order(
3009 &DuchonOperatorPenaltySpec::all_disabled(),
3010 );
3011 let (ns, s) = crate::basis::resolve_duchon_orders(
3012 cols.len(),
3013 requested_nullspace_order,
3014 max_op,
3015 length_scale,
3016 );
3017 (ns, s as f64)
3018 }
3019 }
3020 }
3021 };
3022 let plan = plan_spatial_basis(
3023 ds.values.nrows(),
3024 cols.len(),
3025 CenterCountRequest::Default,
3026 nullspace_order,
3027 option_bool(options, "scale_dims").unwrap_or(false),
3028 policy,
3029 )
3030 .map_err(|e| e.to_string())?;
3031 let centers_explicit = has_explicit_countwith_basis_alias(options, "centers");
3032 let polynomial_cols = match nullspace_order {
3033 DuchonNullspaceOrder::Zero => 1,
3034 DuchonNullspaceOrder::Linear => cols.len() + 1,
3035 DuchonNullspaceOrder::Degree(degree) => {
3036 crate::basis::duchon_nullspace_dimension(cols.len(), degree)
3037 }
3038 };
3039 let default_centers = default_duchon_center_count(
3040 ds.values.nrows(),
3041 cols.len(),
3042 plan.centers,
3043 polynomial_cols,
3044 );
3045 let requested_centers = parse_countwith_basis_alias(
3046 options,
3047 "centers",
3048 cap_default_spatial_centers(options, default_centers),
3049 )?;
3050 if requested_centers <= polynomial_cols {
3051 return Err(TermBuilderError::incompatible_config(format!(
3052 "Duchon smooth '{}' requested basis dimension {} but order={:?} in {}D needs {} polynomial null-space columns; choose centers/k > {}",
3053 vars.join(", "),
3054 requested_centers,
3055 nullspace_order,
3056 cols.len(),
3057 polynomial_cols,
3058 polynomial_cols,
3059 ))
3060 .to_string());
3061 }
3062 let mut centers = requested_centers;
3063 if !centers_explicit && ds.values.nrows() <= 32 && smooth_coordinate_count >= 5 {
3064 centers = centers.max(polynomial_cols + 4);
3065 }
3066 let center_strategy = if centers_explicit {
3067 spatial_center_strategy_for_dimension(centers, cols.len())
3068 } else {
3069 auto_spatial_center_strategy(centers, cols.len())
3070 };
3071 let aniso_log_scales = if option_bool(options, "scale_dims").unwrap_or(false) {
3072 Some(vec![0.0; cols.len()])
3073 } else {
3074 None
3075 };
3076 // Formula-level `duchon(...)` is the native Duchon reproducing-norm
3077 // smoother: the always-on Primary Gram plus the polynomial trend
3078 // ridge. Do not silently add collocated mass/tension penalties here.
3079 // They add extra REML hyperparameters and an O(k)-support quadrature
3080 // build to the default 2-D path, making `duchon(x, z)` materially
3081 // slower than the equivalent thin-plate fit without a principled
3082 // accuracy gain (gam#1718). Lower-order Hilbert-scale penalties remain
3083 // available to callers that construct an explicit DuchonBasisSpec.
3084 let operator_penalties = DuchonOperatorPenaltySpec::all_disabled();
3085 // For a 1-D periodic Duchon with no EXPLICIT period, anchor the wrap
3086 // to the covariate DATA range rather than letting the basis builder
3087 // derive it from the (k-subsampled) center span. The center span is a
3088 // strict subset of the data and undershoots the true period, seaming
3089 // the curve (f(0) ≠ f(2π)); the data range is the caller's actual
3090 // domain. Honors any explicit `period=` (parse_periodic_axes_option
3091 // already threaded it) and leaves multi-D / non-periodic untouched.
3092 let mut periodic = parse_periodic_axes_option(options, cols.len())?;
3093 if cols.len() == 1
3094 && let Some(axes) = periodic.as_mut()
3095 && axes.len() == 1
3096 && axes[0].is_none()
3097 {
3098 let (minv, maxv) = col_minmax(ds.values.column(cols[0]))?;
3099 if maxv > minv {
3100 axes[0] = Some(maxv - minv);
3101 }
3102 }
3103 Ok(SmoothBasisSpec::Duchon {
3104 feature_cols: cols.to_vec(),
3105 spec: DuchonBasisSpec {
3106 center_strategy,
3107 periodic,
3108 length_scale,
3109 power,
3110 nullspace_order,
3111 identifiability: parse_spatial_identifiability(options)
3112 .map_err(|e| e.to_string())?,
3113 aniso_log_scales,
3114 operator_penalties,
3115 boundary: if cols.len() == 1 {
3116 let c = cols[0];
3117 let (minv, maxv) = col_minmax(ds.values.column(c))?;
3118 parse_cyclic_boundary(options, minv, maxv)?
3119 } else {
3120 OneDimensionalBoundary::Open
3121 },
3122 radial_reparam: None,
3123 },
3124 input_scales: None,
3125 })
3126 }
3127 "tensor" | "te" | "ti" | "t2" => {
3128 validate_known_options(
3129 "tensor",
3130 options,
3131 &[
3132 "type",
3133 "bs",
3134 "by",
3135 "k",
3136 "basis_dim",
3137 "basis-dim",
3138 "basisdim",
3139 "knot_placement",
3140 "knot-placement",
3141 "knotplacement",
3142 "degree",
3143 "penalty_order",
3144 "double_penalty",
3145 "periodic",
3146 "cyclic",
3147 "period",
3148 "periods",
3149 "period_start",
3150 "period_end",
3151 "origin",
3152 "origins",
3153 "period_origin",
3154 "period-origin",
3155 "domain_origin",
3156 "boundary",
3157 "bc",
3158 "identifiability",
3159 "id",
3160 "__by_col",
3161 ],
3162 )?;
3163 if cols.len() < 2 {
3164 return Err(TermBuilderError::incompatible_config(format!(
3165 "tensor smooth expects at least 2 variables, got {}",
3166 cols.len()
3167 ))
3168 .to_string());
3169 }
3170 let dim = cols.len();
3171
3172 // Tensor-product contract (#1082). `te(x1, x2, ...)` ALWAYS builds a
3173 // genuine anisotropic tensor product of per-margin bases (the arm
3174 // below), exactly as mgcv's `te()` does — one smoothing parameter per
3175 // margin, a marginal-Kronecker-sum penalty, and the bilinear null
3176 // space left unpenalized under the default `select = FALSE`. A margin
3177 // vector `bs=c('tp','tp')` requests a thin-plate FUNCTION SPACE per
3178 // axis; the tensor realizes each axis as a 1-D penalized B-spline
3179 // margin spanning that same per-axis space (tp/ps/cr/bs/cc all share
3180 // it). We deliberately do NOT silently swap the requested tensor for a
3181 // single multi-D ISOTROPIC thin-plate radial smooth (`s(x,y,bs='tp')`):
3182 // that is a different model — one isotropic smoothing parameter, no
3183 // per-margin anisotropy — and substituting it while the user wrote a
3184 // tensor formula is dishonest. A user who genuinely wants the isotropic
3185 // radial smooth asks for it directly with `s(x1, x2, bs='tp')`.
3186 // Per-margin basis vector (`bs=c('tp','tp')` / `bs=['ps','cr']`):
3187 // validate each requested margin is a penalized-spline basis that
3188 // the tensor product realizes as a 1-D B-spline margin. mgcv's
3189 // `tp`/`ps`/`cr`/`bs`/`cc` margins are all penalized splines over
3190 // the same per-axis function space, so a B-spline margin recovers
3191 // the same tensor smoothing space; genuinely different margin kinds
3192 // (e.g. adaptive `ad`, random `re`) are rejected loudly rather than
3193 // silently substituted.
3194 if let Some(raw) = options.get("bs").or_else(|| options.get("type"))
3195 && bs_selector_is_vector(raw)
3196 {
3197 let per_margin = parse_option_list(raw);
3198 if per_margin.len() != dim {
3199 return Err(TermBuilderError::invalid_option(format!(
3200 "tensor smooth per-margin bs vector has {} entries but the smooth has {} margins",
3201 per_margin.len(),
3202 dim
3203 ))
3204 .to_string());
3205 }
3206 for (axis, margin_bs) in per_margin.iter().enumerate() {
3207 if !tensor_margin_bs_is_supported(margin_bs) {
3208 return Err(TermBuilderError::unsupported_feature(format!(
3209 "tensor smooth margin {axis} basis '{margin_bs}' is not a supported penalized-spline margin; \
3210 tensor margins accept tp/tps/ps/bs/cr/cc"
3211 ))
3212 .to_string());
3213 }
3214 }
3215 }
3216 let periodic_axes = parse_tensor_periodic_axes(options, dim)?;
3217 validate_tensor_boundary_tokens(options, dim)?;
3218 let periods_opt = parse_periods(options, &periodic_axes)?;
3219 let origins_opt = parse_period_origins(options, &periodic_axes)?;
3220 let degree = option_usize(options, "degree").unwrap_or(DEFAULT_BSPLINE_DEGREE);
3221 let penalty_order =
3222 option_usize(options, "penalty_order").unwrap_or(if degree > 1 { 2 } else { 1 });
3223 let (mut k_list, k_inferred) = parse_tensor_k_list(options, cols, ds)?;
3224 if ds.values.nrows() <= 32 && smooth_coordinate_count >= 5 {
3225 for k in &mut k_list {
3226 *k = (*k).min(degree + 2);
3227 }
3228 }
3229 if k_inferred {
3230 inference_notes.push(format!(
3231 "Automatically set per-margin basis sizes {:?} for tensor smooth '{}' \
3232 (dimension-aware tensor budget: total ∏k kept near the mgcv-te default \
3233 and within the data support, distributed geometrically across margins and \
3234 capped per margin by each column's resolution). \
3235 Override with k=<int> or k=[k0,k1,...].",
3236 k_list,
3237 vars.join(",")
3238 ));
3239 }
3240 // Per-axis requested marginal basis family. mgcv's `te()`/`ti()`
3241 // default marginal basis is the cubic regression spline (`cr`), and
3242 // the te_3d quality gap (#1074) is precisely the marginal-basis
3243 // resolution at small `k`: a `cr` margin places k value-knots at
3244 // data quantiles (finer interior resolution under natural boundary
3245 // constraints) where the cubic B-spline margin has only
3246 // `k-degree-1` interior knots. Resolve each axis to either an
3247 // explicit per-margin `bs` (vector `bs=c('cr','ps')`), a single
3248 // scalar `bs`, or the unset default — and route
3249 // `cr`/`cs`/unset/`tp`/`tps` margins through the natural cubic
3250 // regression builder (`NaturalCubicRegression` knotspec), keeping
3251 // explicit `ps`/`bs`/`bspline` on the B-spline margin.
3252 let per_axis_bs: Vec<Option<String>> =
3253 match options.get("bs").or_else(|| options.get("type")) {
3254 Some(raw) if bs_selector_is_vector(raw) => {
3255 let list = parse_option_list(raw);
3256 (0..dim).map(|a| list.get(a).cloned()).collect()
3257 }
3258 Some(raw) => {
3259 let scalar = raw
3260 .trim()
3261 .trim_matches('"')
3262 .trim_matches('\'')
3263 .to_ascii_lowercase();
3264 vec![Some(scalar); dim]
3265 }
3266 None => vec![None; dim],
3267 };
3268 // A margin is realized as a natural cubic regression spline when it
3269 // is the (unset) mgcv default, an explicit `cr`/`cs`, or a
3270 // `tp`/`tps` (same per-axis penalized-spline space). Explicit
3271 // B-spline-family margins (`ps`/`bs`/`bspline`/`p-spline`) keep the
3272 // open B-spline margin.
3273 let margin_wants_cr = |bs: &Option<String>| -> bool {
3274 matches!(
3275 bs.as_deref(),
3276 None | Some("cr") | Some("cs") | Some("tp") | Some("tps")
3277 )
3278 };
3279 let requested_knot_placement = parse_knot_placement(options)?;
3280 let mut margins: Vec<BSplineBasisSpec> = Vec::with_capacity(dim);
3281 let mut emitted_periods: Vec<Option<f64>> = Vec::with_capacity(dim);
3282 for axis in 0..dim {
3283 let c = cols[axis];
3284 let (data_min, data_max) = col_minmax(ds.values.column(c))?;
3285 // mgcv reduces a tensor margin's basis dimension to what its data
3286 // can support: a cr or B-spline margin cannot place more value
3287 // knots / basis functions than there are DISTINCT covariate
3288 // values on that axis. Without this cap an explicit `k` on a
3289 // low-cardinality margin — e.g. the binary `badh ∈ {0,1}` in
3290 // `te(age, badh, k=5)` — hard-failed in `select_cr_knots` ("cubic
3291 // regression spline with k=5 requires at least 5 distinct values,
3292 // got 2") instead of degrading to the 2-function (linear) margin
3293 // mgcv builds there. The auto-`k` path already caps per margin via
3294 // `heuristic_tensor_margin_knots`; mirror that for explicit `k`.
3295 // The cap propagates correctly: every per-axis quantity below
3296 // (effective degree, knot set, penalty order) is derived from
3297 // `k_axis`, and the marginal basis size is read from the resulting
3298 // knot spec — never from `k_list`. Floor at 2 so a margin still
3299 // carries at least a linear basis (tensor margins require k >= 2).
3300 let k_requested = k_list[axis];
3301 let n_distinct_axis = unique_count_column(ds.values.column(c));
3302 let k_axis = k_requested.min(n_distinct_axis).max(2);
3303 if k_axis < k_requested {
3304 log::info!(
3305 "tensor smooth: margin axis {axis} requested k={k_requested}, but the \
3306 covariate has only {n_distinct_axis} distinct value(s); reducing this \
3307 margin to k={k_axis} (mgcv-style data-support cap on the per-axis basis)."
3308 );
3309 }
3310 // Per-axis effective spline degree. The B-spline basis with `k`
3311 // functions is well-defined for any `degree <= k - 1`; mgcv's
3312 // `te(...)` exploits this so a binary tensor margin
3313 // (`k=2` → linear basis) or a ternary margin (`k=3` → quadratic)
3314 // can coexist with a smoother continuous margin under one
3315 // shared `degree=` request. We mirror that: if the caller
3316 // explicitly asks for `k < degree + 1`, drop the degree on
3317 // THAT axis only to the largest feasible spline, and track the
3318 // penalty order so the marginal difference penalty stays
3319 // well-defined (`order < num_basis_functions` is required by
3320 // `create_difference_penalty_matrix`). Apply the same
3321 // per-margin degree shrinkage to periodic tensor margins too:
3322 // a cyclic marginal basis with k=3 cannot be cubic, but it is
3323 // still a valid lower-degree cyclic margin with dimension k,
3324 // matching mgcv's small-k tensor-margin behavior.
3325 if k_axis < 2 {
3326 return Err(TermBuilderError::invalid_option(format!(
3327 "tensor smooth: k[{axis}]={k_axis} too small; tensor margins require k >= 2"
3328 ))
3329 .to_string());
3330 }
3331 let effective_degree = degree.min(k_axis - 1).max(1);
3332 let effective_penalty_order = penalty_order.min(effective_degree);
3333 // A `cc`/`cp`/`cyclic` per-margin basis declares periodicity
3334 // without necessarily supplying a `period=`: mgcv's `bs="cc"`
3335 // wraps at the covariate's observed data range. Mirror the 1-D
3336 // cyclic fallback (`parse_periodic_domain_1d`) here so a bare
3337 // `te(x, z, bs=c('cc','cc'))` wraps each margin on its own
3338 // [min, max] span instead of hard-erroring (#1752).
3339 let margin_is_cc = matches!(
3340 canonicalize_smooth_type(per_axis_bs[axis].as_deref().unwrap_or("")),
3341 "cc" | "cp" | "cyclic"
3342 );
3343 let (knotspec, boundary, axis_period) = if periodic_axes[axis] {
3344 // A `cc`/`cp`/`cyclic` per-margin basis declares periodicity
3345 // without necessarily supplying a `period=`; in that case wrap
3346 // at the covariate's observed [min, max] span, mirroring the
3347 // 1-D cyclic fallback (`parse_periodic_domain_1d`) so a bare
3348 // `te(x, z, bs=c('cc','cc'))` wraps each margin on its own
3349 // range instead of hard-erroring (#1752). An axis made
3350 // periodic by an explicit `periodic=`/`boundary=` selector
3351 // (not a cyclic margin basis) still requires an explicit
3352 // `period=`: a data-derived period there is a sample-dependent
3353 // off-by-ε seam and is not inferred.
3354 let (domain_start, period_value) = match periods_opt[axis] {
3355 Some(period_value) => {
3356 if !period_value.is_finite() || period_value <= 0.0 {
3357 return Err(format!(
3358 "tensor smooth axis {axis}: period must be a positive finite value, got {period_value}"
3359 ));
3360 }
3361 (origins_opt[axis].unwrap_or(data_min), period_value)
3362 }
3363 None if margin_is_cc => {
3364 let span = data_max - data_min;
3365 if !span.is_finite() || span <= 0.0 {
3366 return Err(format!(
3367 "tensor smooth axis {axis}: cyclic margin requires a positive \
3368 observed data range to derive its period, got [{data_min}, {data_max}]"
3369 ));
3370 }
3371 (origins_opt[axis].unwrap_or(data_min), span)
3372 }
3373 None => {
3374 return Err(format!(
3375 "tensor smooth axis {axis} is periodic but requires an explicit \
3376 period: pass period=<value> (scalar) or period=[..., <value>, ...]. \
3377 Deriving the period from the observed data range is sample-dependent \
3378 (off-by-ε seam), so it is not inferred."
3379 ));
3380 }
3381 };
3382 let domain_end = domain_start + period_value;
3383 (
3384 BSplineKnotSpec::PeriodicUniform {
3385 data_range: (domain_start, domain_end),
3386 num_basis: k_axis,
3387 },
3388 OneDimensionalBoundary::Cyclic {
3389 start: domain_start,
3390 end: domain_end,
3391 },
3392 Some(period_value),
3393 )
3394 } else if margin_wants_cr(&per_axis_bs[axis])
3395 && requested_knot_placement != crate::basis::BSplineKnotPlacement::Quantile
3396 && k_axis >= 3
3397 {
3398 // mgcv `te()`/`ti()` default cr margin: place exactly
3399 // `k_axis` Lancaster–Salkauskas value-knots at data
3400 // quantiles. The cr basis dimension equals the knot count,
3401 // so this reproduces the requested per-margin `k` directly.
3402 // A natural cubic regression spline needs at least 3 knots
3403 // (one interior); a `k_axis < 3` margin (e.g. a binary
3404 // tensor axis requesting a linear margin) falls through to
3405 // the B-spline branch below, exactly as before #1074 — mgcv
3406 // likewise does not build a `cr` margin below k=3. An
3407 // explicit `knot_placement=quantile` also falls through:
3408 // that option selects the generated B-spline knot strategy
3409 // represented by `Automatic { Quantile }`, whereas the cr
3410 // margin has already materialized its quantile value-knots.
3411 let cr_knots = crate::basis::select_cr_knots(ds.values.column(c), k_axis)
3412 .map_err(|e| e.to_string())?;
3413 (
3414 BSplineKnotSpec::NaturalCubicRegression { knots: cr_knots },
3415 OneDimensionalBoundary::Open,
3416 None,
3417 )
3418 } else {
3419 // `num_internal_knots = k - degree - 1` reproduces the
3420 // requested basis size exactly when degree was reduced for
3421 // a low-cardinality margin; keep the legacy `.max(1)`
3422 // floor on the un-reduced path so the existing knot
3423 // geometry is unchanged whenever the user already passed
3424 // k >= degree + 1.
3425 let num_internal_knots = if effective_degree < degree {
3426 k_axis.saturating_sub(effective_degree + 1)
3427 } else {
3428 k_axis.saturating_sub(degree + 1).max(1)
3429 };
3430 let knotspec = match requested_knot_placement {
3431 crate::basis::BSplineKnotPlacement::Uniform => BSplineKnotSpec::Generate {
3432 data_range: (data_min, data_max),
3433 num_internal_knots,
3434 },
3435 crate::basis::BSplineKnotPlacement::Quantile => {
3436 crate::basis::auto_knot_vector_1d_quantile(
3437 ds.values.column(c),
3438 num_internal_knots,
3439 effective_degree,
3440 )
3441 .map_err(|e| e.to_string())?;
3442 BSplineKnotSpec::Automatic {
3443 num_internal_knots: Some(num_internal_knots),
3444 placement: crate::basis::BSplineKnotPlacement::Quantile,
3445 }
3446 }
3447 };
3448 (knotspec, OneDimensionalBoundary::Open, None)
3449 };
3450 // A `cr` margin fixes cubic regression geometry; the cr builder
3451 // reads only the knot set + `double_penalty`. Enable null-space
3452 // shrinkage for an explicit `cs` margin. B-spline margins keep
3453 // the resolved effective degree / penalty order with no extra
3454 // null-space penalty (mgcv `select = FALSE` tensor default).
3455 let is_cr_margin =
3456 matches!(knotspec, BSplineKnotSpec::NaturalCubicRegression { .. });
3457 let margin_double_penalty =
3458 is_cr_margin && matches!(per_axis_bs[axis].as_deref(), Some("cs"));
3459 margins.push(BSplineBasisSpec {
3460 degree: effective_degree,
3461 penalty_order: effective_penalty_order,
3462 knotspec,
3463 double_penalty: margin_double_penalty,
3464 identifiability: BSplineIdentifiability::None,
3465 boundary,
3466 boundary_conditions: BSplineBoundaryConditions::default(),
3467 });
3468 emitted_periods.push(axis_period);
3469 }
3470 // #1593: canonicalize the margin order so a tensor smooth is invariant
3471 // to the typed order of its covariates. `te(x, z)` and `te(z, x)` span
3472 // the IDENTICAL tensor-product space under the identical per-margin
3473 // penalty family, but the design is the Khatri–Rao product
3474 // `B_first ⊙ B_second`, so the typed order permutes the design columns
3475 // (and the per-margin penalty blocks `S_first⊗I`, `I⊗S_second`). That
3476 // permutation is a pure relabelling in exact arithmetic — REML is
3477 // invariant to it — yet it reorders the penalized normal-equation / REML
3478 // eigen/Cholesky linear algebra, and the resulting sub-ULP differences
3479 // route the outer λ optimizer to a different terminal point in te's flat
3480 // REML valley (the over-smoothed margin rails to the ρ bound while the
3481 // other lands on a materially different λ̂). So the shipped surface
3482 // drifted ~2–6 % of range with a cosmetic swap of the covariate order
3483 // (the #1378 row-permutation / #1456 rotation flat-valley gauge family).
3484 // Sorting the margins by their source feature-column index makes the same
3485 // physical model build the identical problem regardless of typed order,
3486 // so the fit — and every prediction rebuilt from the resolved spec — is
3487 // genuinely order-invariant. `ti`/`t2` share this arm and become exactly
3488 // invariant too (they were already ~1e-5 by centring each margin
3489 // separately; canonicalization makes the swap bit-identical).
3490 let canon_cols: Vec<usize> = {
3491 let mut perm: Vec<usize> = (0..dim).collect();
3492 perm.sort_by_key(|&a| cols[a]);
3493 if perm.iter().enumerate().any(|(i, &a)| i != a) {
3494 margins = perm.iter().map(|&a| margins[a].clone()).collect();
3495 emitted_periods = perm.iter().map(|&a| emitted_periods[a]).collect();
3496 }
3497 perm.iter().map(|&a| cols[a]).collect()
3498 };
3499 let any_periodic = emitted_periods.iter().any(|p| p.is_some());
3500 let periods_vec = if any_periodic {
3501 emitted_periods
3502 } else {
3503 Vec::new()
3504 };
3505 // Tensor smooths (`te`/`ti`/`t2`) must match mgcv's DEFAULT
3506 // `select = FALSE`: the joint null space of the per-margin
3507 // penalties — the bilinear, low-order interaction directions that
3508 // no marginal roughness operator can see — is left UNPENALIZED.
3509 // mgcv only adds a null-space shrinkage penalty there under the
3510 // opt-in `select = TRUE` (which gam exposes as `double_penalty`).
3511 //
3512 // The general smooth default (`smooth_double_penalty`, true) is
3513 // calibrated for 1-D `s()` terms; carrying it into tensors silently
3514 // shrinks the genuinely-present bilinear interaction signal, so
3515 // REML places positive weight on the extra ridge and systematically
3516 // OVER-SMOOTHS the recovered surface relative to mgcv's plain
3517 // `te`/`ti` (gam#700/#701/#702/#703). Default tensors to no extra
3518 // null-space penalty; an explicit user `double_penalty=`/`select=`
3519 // still wins.
3520 let tensor_double_penalty = option_bool(options, "double_penalty").unwrap_or(false);
3521 Ok(SmoothBasisSpec::TensorBSpline {
3522 feature_cols: canon_cols,
3523 spec: TensorBSplineSpec {
3524 marginalspecs: margins,
3525 periods: periods_vec,
3526 double_penalty: tensor_double_penalty,
3527 identifiability: parse_tensor_identifiability(options, kind)?,
3528 // `t2` selects mgcv's separable (Wood, Scheipl & Faraway
3529 // 2013) decomposition. It can arrive either as the `t2(...)`
3530 // function form (`SmoothKind::T2`) or as a `type="t2"` /
3531 // `bs="t2"` option on an `s(...)`/`te(...)` term, in which
3532 // case `kind` is *not* `T2` but the resolved type string is
3533 // "t2". Keying only off `kind` silently aliased the option
3534 // form to `te`'s Kronecker-sum penalty (gam#1185); key off
3535 // the resolved type string as well so both routes build the
3536 // separable penalty.
3537 penalty_decomposition: if matches!(kind, SmoothKind::T2)
3538 || type_opt.as_str() == "t2"
3539 {
3540 TensorBSplinePenaltyDecomposition::Separable
3541 } else {
3542 TensorBSplinePenaltyDecomposition::MarginalKroneckerSum
3543 },
3544 },
3545 })
3546 }
3547 "pca" => {
3548 validate_known_options(
3549 "pca",
3550 options,
3551 &[
3552 "type",
3553 "bs",
3554 "by",
3555 "k",
3556 "basis_dim",
3557 "basis-dim",
3558 "basisdim",
3559 "lazy_path",
3560 "path",
3561 "pca_basis_path",
3562 "chunk_size",
3563 "smooth_penalty",
3564 "centered",
3565 "double_penalty",
3566 "id",
3567 "__by_col",
3568 ],
3569 )?;
3570 let path = options
3571 .get("lazy_path")
3572 .or_else(|| options.get("pca_basis_path"))
3573 .or_else(|| options.get("path"))
3574 .map(|raw| PathBuf::from(strip_quotes(raw)));
3575 let Some(path) = path else {
3576 return Err(TermBuilderError::incompatible_config(
3577 "pca smooth requires lazy_path=... on the formula path",
3578 )
3579 .to_string());
3580 };
3581 let k = option_usize_any(options, &["k", "basis_dim", "basis-dim", "basisdim"])
3582 .unwrap_or(0);
3583 let chunk_size = option_usize(options, "chunk_size").unwrap_or(DEFAULT_PCA_CHUNK_SIZE);
3584 Ok(SmoothBasisSpec::Pca {
3585 feature_cols: cols.to_vec(),
3586 basis_matrix: Array2::<f64>::zeros((cols.len(), k)),
3587 centered: option_bool(options, "centered").unwrap_or(true),
3588 smooth_penalty: option_f64(options, "smooth_penalty").unwrap_or(1.0),
3589 center_mean: None,
3590 pca_basis_path: Some(path),
3591 chunk_size,
3592 })
3593 }
3594 other => Err(TermBuilderError::unsupported_feature(format!(
3595 "unsupported smooth type '{other}'"
3596 ))
3597 .to_string()),
3598 }
3599}
3600
3601/// Initialise per-axis anisotropic log-scales on eligible spatial smooth specs.
3602pub fn enable_scale_dimensions(spec: &mut TermCollectionSpec) {
3603 for smooth in spec.smooth_terms.iter_mut() {
3604 // A multi-axis thin-plate term cannot carry per-axis anisotropy on its
3605 // single curvature penalty, so `scale_dimensions` was historically a
3606 // silent no-op for `bs="tp"` (gam#1676). Rewrite it to the
3607 // mathematically-equivalent anisotropic s=0 Duchon spline first; the
3608 // Duchon arm below then sees an already-seeded `aniso_log_scales` and
3609 // leaves it untouched.
3610 promote_thin_plate_for_scale_dimensions(&mut smooth.basis);
3611 match &mut smooth.basis {
3612 SmoothBasisSpec::Matern {
3613 feature_cols,
3614 spec: matern,
3615 ..
3616 } => {
3617 if matern.aniso_log_scales.is_none() {
3618 let d = feature_cols.len();
3619 matern.aniso_log_scales = Some(vec![0.0; d]);
3620 }
3621 }
3622 SmoothBasisSpec::Duchon {
3623 feature_cols,
3624 spec: duchon,
3625 ..
3626 } => {
3627 if duchon.aniso_log_scales.is_none() {
3628 let d = feature_cols.len();
3629 duchon.aniso_log_scales = Some(vec![0.0; d]);
3630 }
3631 }
3632 _ => {}
3633 }
3634 }
3635}
3636
3637/// Rewrite a multi-axis thin-plate term into the mathematically-equivalent
3638/// anisotropic s=0 Duchon spline so that `scale_dimensions` genuinely engages
3639/// (gam#1676).
3640///
3641/// ## Why a rewrite rather than a new field on the TPS builder
3642///
3643/// A canonical thin-plate regression spline carries a *single* curvature
3644/// penalty — the exact `∫|Dᵐ f|²` reproducing-kernel Gram. That penalty has no
3645/// per-axis structure to make one direction more or less relevant than another,
3646/// so per-axis anisotropy (`scale_dimensions`) cannot be expressed on it. The
3647/// flag was therefore a silent no-op for `bs="tp"` while it engaged for
3648/// `duchon()`/`matern()`.
3649///
3650/// The thin-plate kernel `r^{2m−d}` (the `r²·log r` log-case in even `d`) is
3651/// *exactly* the s=0 Duchon kernel (`DuchonBasisSpec::power = 0`,
3652/// `length_scale = None`) at the matching polynomial null-space order
3653/// `m = thin_plate_penalty_order(d)`. The Duchon polyharmonic family already
3654/// carries the per-axis tension ARD that `scale_dimensions` requests: its
3655/// isotropic first-order roughness penalty `Σ‖∇f‖²` splits into `d` directional
3656/// penalties `Σ(∂f/∂x_a)²`, each with its own REML `λ_a`
3657/// (`duchon_operator_penalty_candidates`). So the well-posed *anisotropic
3658/// thin-plate spline is the anisotropic s=0 Duchon spline*. Rewriting to that
3659/// representation reuses the battle-tested Duchon anisotropy / ψ-derivative /
3660/// freeze / predict machinery instead of duplicating it onto the TPS metadata
3661/// path, and keeps the polyharmonic family internally consistent. The codebase
3662/// already promotes infeasible-`k` TPS to Duchon for the same reason (the
3663/// canonical TPS single curvature penalty cannot deliver a requested
3664/// capability); per-axis anisotropy is another such capability.
3665///
3666/// This fires *only* when the user opts into `scale_dimensions`; the default
3667/// thin-plate path (`scale_dimensions` off) is left bit-for-bit unchanged.
3668/// A 1-D thin-plate term is left untouched — anisotropy is meaningless on a
3669/// single axis (its `Σ η = 0` contrast vector is empty), exactly as for a 1-D
3670/// Matérn/Duchon term.
3671fn promote_thin_plate_for_scale_dimensions(basis: &mut SmoothBasisSpec) {
3672 let SmoothBasisSpec::ThinPlate {
3673 feature_cols,
3674 spec,
3675 input_scales,
3676 } = &*basis
3677 else {
3678 return;
3679 };
3680 let d = feature_cols.len();
3681 if d <= 1 {
3682 return;
3683 }
3684 // m = thin_plate_penalty_order(d) is the TPS penalty order; the Duchon
3685 // null-space order naming is `Zero → m=1`, `Linear → m=2`,
3686 // `Degree(g) → m=g+1`, so the s=0 Duchon kernel exponent
3687 // `2(p+s) − d = 2m − d` reproduces the TPS kernel exactly.
3688 let m = thin_plate_penalty_order(d);
3689 let nullspace_order = match m {
3690 0 | 1 => DuchonNullspaceOrder::Zero,
3691 2 => DuchonNullspaceOrder::Linear,
3692 _ => DuchonNullspaceOrder::Degree(m - 1),
3693 };
3694 let duchon_spec = DuchonBasisSpec {
3695 center_strategy: spec.center_strategy.clone(),
3696 periodic: spec.periodic.clone(),
3697 // Pure, scale-free Duchon — the thin-plate kernel has no length scale
3698 // (a global TPS kernel scale is non-identifiable once REML learns the
3699 // smoothing penalty: gam#718/#721/#731/#732). The per-axis relevance
3700 // the user asked for is carried by the tension-ARD `λ_a`, not a κ axis.
3701 length_scale: None,
3702 // s = 0 ⇒ thin-plate kernel `r^{2m−d}`.
3703 power: 0.0,
3704 nullspace_order,
3705 identifiability: spec.identifiability.clone(),
3706 // All-zero geometry seed sentinel: `auto_seed_aniso_contrasts` resolves
3707 // it from the (standardized) knot cloud, and the per-axis tension split
3708 // engages on `aniso.is_some()`.
3709 aniso_log_scales: Some(vec![0.0; d]),
3710 operator_penalties: DuchonOperatorPenaltySpec::default(),
3711 boundary: OneDimensionalBoundary::Open,
3712 radial_reparam: None,
3713 };
3714 let feature_cols = feature_cols.clone();
3715 let input_scales = input_scales.clone();
3716 // All borrows of `*basis` (the `&*basis` destructure above) end with the
3717 // clones on the two preceding lines, so the reassignment is sound.
3718 *basis = SmoothBasisSpec::Duchon {
3719 feature_cols,
3720 spec: duchon_spec,
3721 input_scales,
3722 };
3723}
3724
3725// ---------------------------------------------------------------------------
3726// Data-aware helpers
3727// ---------------------------------------------------------------------------
3728
3729pub fn spatial_center_strategy_for_dimension(num_centers: usize, d: usize) -> CenterStrategy {
3730 if d <= 3 {
3731 // In low-dimensional spatial smooths, an explicit `k` is a resolution
3732 // request rather than a request for marginal quantile-midpoint centers.
3733 // Use deterministic maximin geometry so Matérn/GP and Duchon REML see a
3734 // well-resolved native kernel block with small fill distance instead of
3735 // compensating for holes or endpoint under-resolution by over-smoothing
3736 // low-noise signals (#504).
3737 CenterStrategy::FarthestPoint { num_centers }
3738 } else {
3739 default_spatial_center_strategy(num_centers, d)
3740 }
3741}
3742
3743pub fn col_minmax(col: ArrayView1<'_, f64>) -> Result<(f64, f64), String> {
3744 let min = col.iter().fold(f64::INFINITY, |a, &b| a.min(b));
3745 let max = col.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
3746 if !min.is_finite() || !max.is_finite() {
3747 return Err(TermBuilderError::degenerate_data(
3748 "non-finite data encountered while inferring knot range",
3749 )
3750 .to_string());
3751 }
3752 if (max - min).abs() < 1e-12 {
3753 Ok((min, min + 1e-6))
3754 } else {
3755 Ok((min, max))
3756 }
3757}
3758
3759pub fn unique_count_column(col: ArrayView1<'_, f64>) -> usize {
3760 use std::collections::HashSet;
3761 let mut set = HashSet::<u64>::with_capacity(col.len());
3762 for &v in col {
3763 let norm = if v == 0.0 { 0.0 } else { v };
3764 set.insert(norm.to_bits());
3765 }
3766 set.len().max(1)
3767}
3768
/// Minimum knot count for a natural cubic regression spline: `select_cr_knots`
/// places one value-knot per basis function and needs at least an interior knot,
/// so the sparsest representable cr basis is `{const, linear, curvature}` at
/// three knots. Below this a cr spline is not constructible and the caller must
/// degrade to the linear B-spline marginal.
///
/// `capped_cr_marginal_knotspec` compares its data-capped `k` against this
/// threshold and returns `None` when the capped value falls below it.
pub(crate) const CR_MIN_KNOTS: usize = 3;
3775
3776/// Build a cubic-regression marginal knot spec capped to the covariate's data
3777/// support, mgcv-style.
3778///
3779/// A `cr`/`cs`/`sz` marginal places exactly one basis function per value-knot,
3780/// so `select_cr_knots` cannot place more knots than the covariate has DISTINCT
3781/// values — it `bail`s with "cubic regression spline with k=N requires at least
3782/// N distinct values" otherwise. An unclamped `k` on an ordinary low-cardinality
3783/// covariate (a binary indicator, a 3-level ordinal/Likert score, a small count)
3784/// therefore hard-failed the whole fit instead of reducing the basis the way
3785/// mgcv — and gam's own tensor-margin path (996f829d7, `term_builder.rs:2986` /
3786/// the `k_axis >= 3` cr gate at `:3047`) — do. This is the univariate / factor-
3787/// smooth sibling of that tensor cap (#1541, #1542).
3788///
3789/// Returns:
3790/// - `Some(NaturalCubicRegression { .. })` with `k = min(k_requested, n_distinct)`
3791/// value-knots when the data supports a cr spline (`n_distinct >= CR_MIN_KNOTS`).
3792/// A cr basis of exactly `n_distinct` knots is full-rank for the data — it can
3793/// represent any per-distinct-value structure (e.g. 3 arbitrary group means on
3794/// a ternary covariate) — so the cap never costs recoverable signal.
3795/// - `None` when `n_distinct < CR_MIN_KNOTS` (a binary covariate): too few
3796/// distinct values for ANY cr spline, so the caller degrades to the linear
3797/// B-spline marginal — exactly what the default `s(x, k=..)` basis already
3798/// builds on the same data, and what the tensor path's `< 3` branch builds.
3799///
3800/// `inference_notes` records any reduction so the user sees that `k` was capped
3801/// (mgcv emits a warning in the same situation).
3802fn capped_cr_marginal_knotspec(
3803 col: ArrayView1<'_, f64>,
3804 k_cr_requested: usize,
3805 label: &str,
3806 inference_notes: &mut Vec<String>,
3807) -> Result<Option<BSplineKnotSpec>, String> {
3808 let n_distinct = unique_count_column(col);
3809 let k_cr = k_cr_requested.min(n_distinct);
3810 if k_cr < CR_MIN_KNOTS {
3811 inference_notes.push(format!(
3812 "Smooth '{label}': cubic-regression ('cr'/'cs'/'sz') basis requested k={k_cr_requested}, \
3813 but the covariate has only {n_distinct} distinct value(s) — too few to support a cubic \
3814 regression spline (needs >= {CR_MIN_KNOTS} distinct values). Degraded to the linear \
3815 B-spline marginal the default basis builds on the same data."
3816 ));
3817 return Ok(None);
3818 }
3819 if k_cr < k_cr_requested {
3820 inference_notes.push(format!(
3821 "Smooth '{label}': cubic-regression ('cr'/'cs'/'sz') basis reduced from k={k_cr_requested} \
3822 to k={k_cr} to match the covariate's {n_distinct} distinct value(s) (mgcv-style \
3823 data-support cap; a cr basis cannot place more value-knots than the data has)."
3824 ));
3825 }
3826 let cr_knots = crate::basis::select_cr_knots(col, k_cr).map_err(|e| e.to_string())?;
3827 Ok(Some(BSplineKnotSpec::NaturalCubicRegression {
3828 knots: cr_knots,
3829 }))
3830}
3831
3832/// Smallest number of distinct covariate values seen within any single group
3833/// of `group_col`. For a factor smooth this is the resolution that bounds the
3834/// marginal basis: a group with `m` distinct covariate values can only inform
3835/// `m` basis coefficients, so a marginal richer than that interpolates the
3836/// group instead of estimating a penalized trend. Bits are compared exactly so
3837/// integer-valued covariates (days, dose levels) collapse to their true count.
3838fn min_per_group_unique_count(
3839 feature_col: ArrayView1<'_, f64>,
3840 group_col: ArrayView1<'_, f64>,
3841) -> usize {
3842 use std::collections::{HashMap, HashSet};
3843 let mut per_group: HashMap<u64, HashSet<u64>> = HashMap::new();
3844 for (xi, gi) in feature_col.iter().zip(group_col.iter()) {
3845 let xnorm = if *xi == 0.0 { 0.0 } else { *xi };
3846 let gnorm = if *gi == 0.0 { 0.0 } else { *gi };
3847 per_group
3848 .entry(gnorm.to_bits())
3849 .or_default()
3850 .insert(xnorm.to_bits());
3851 }
3852 per_group
3853 .values()
3854 .map(|s| s.len())
3855 .min()
3856 .unwrap_or(1)
3857 .max(1)
3858}
3859
3860/// Default internal-knot count for an *additive* univariate smooth, derived
3861/// from the column's unique-value count.
3862///
3863/// The basis dimension is `internal_knots + degree + 1`, so the cap below maps
3864/// to a default cubic basis of ~12 functions — deliberately close to mgcv's
3865/// univariate default (`k = 10`). A penalized smooth controls its wiggliness
3866/// through the *penalty*, not the basis size: REML/LAML shrinks a too-rich
3867/// basis toward the null, but it cannot do so cleanly when the basis is so
3868/// over-sized that the design becomes weakly identified. Growing the basis with
3869/// `n` (the old `n^(1/3)`-ceilinged `unique/4` rule, which pinned to 20 internal
3870/// knots ⇒ a 24-function basis for any column with ≥80 unique values) therefore
3871/// *hurts* recovery on finite, weak-signal fits: a 4-smooth additive model on
3872/// n=120 asks for ~92 coefficients, the outer optimizer stalls on the resulting
3873/// flat two-penalty (range + null-space) REML surface, and the truth leaks into
3874/// surplus columns the penalty can't shrink away (gam#1680; the same defect was
3875/// documented for thin-plate fields in gam#1074). A k-sweep on the #1680 design
3876/// confirms a basis of ~10–15 recovers truth at RMSE ≈ 0.12 while the old
3877/// 24-function default lands at ≈ 0.39 (~3× worse) — *whether or not* the
3878/// covariates are collinear, so this is basis over-richness, not collinearity.
3879///
3880/// The cap is flat in `n`: a user who genuinely needs a wigglier fit raises `k`
3881/// explicitly (mgcv's contract — opt *in* to more flexibility), and the SPEC
3882/// requires the default to allow recovering the null rather than forcing the
3883/// user to opt out of overfitting. The 4-knot floor stays put because we still
3884/// need enough basis functions to fit a non-trivial smooth at all, and the
3885/// `unique/4` growth below the cap keeps small/sparse columns (n ≤ 32, where
3886/// `unique/4 ≤ 8`) on exactly their previous knot count.
3887pub fn heuristic_knots_for_column(col: ArrayView1<'_, f64>) -> usize {
3888 /// Default cubic basis ≈ `MAX_DEFAULT_INTERNAL_KNOTS + degree + 1` = 12
3889 /// functions, matching mgcv's lean univariate default.
3890 const MAX_DEFAULT_INTERNAL_KNOTS: usize = 8;
3891 let unique = unique_count_column(col);
3892 (unique / 4).clamp(4, MAX_DEFAULT_INTERNAL_KNOTS)
3893}
3894
3895/// Per-margin basis sizes for a tensor-product smooth (`te`/`ti`/`t2`).
3896///
3897/// The 1-D heuristic [`heuristic_knots_for_column`] is calibrated for an
3898/// *additive* margin: a well-resolved column asks for the lean univariate
3899/// default (≈12 basis functions, the mgcv-like cap of 8 internal knots; see
3900/// gam#1680), which is sensible for a single `s(x)` term.
3901/// A tensor product, however, multiplies the per-margin sizes:
3902/// `p = ∏_d k_d`. Reusing the 1-D rule per margin makes `p` explode with the
3903/// tensor dimension — a 3-D `te(x,y,z)` at the 1-D ceiling of 12/margin is
3904/// `12³ ≈ 1728` columns, and every REML evaluation pays an O(p³) dense
3905/// penalty reparameterization (the full-tensor sum-to-zero constraint is not
3906/// Kronecker-factorable), turning model selection over tensor candidates into
3907/// a multi-minute single-threaded stall (gam#813). It also requests far more
3908/// coefficients than the data can identify whenever `p ≫ n`.
3909///
3910/// mgcv's `te(...)` uses a small per-margin default (`k = 5`, i.e. `5^d`).
3911/// We match that spirit while staying data-adaptive: budget the *total* tensor
3912/// column count `p_target` and distribute it geometrically across the margins
3913/// so `∏ k_d ≈ p_target`, never asking a margin for more functions than its
3914/// own unique values (and the data set) can support.
3915fn heuristic_tensor_margin_knots(cols: &[usize], ds: &Dataset) -> Vec<usize> {
3916 let d = cols.len().max(1);
3917 let degree = DEFAULT_BSPLINE_DEGREE;
3918 let min_k = degree + 2; // smallest margin that carries a difference penalty
3919 let n = ds.values.nrows();
3920
3921 // Per-margin 1-D ceiling: never request more basis functions than the
3922 // margin's own resolution (unique values) supports. This caps each axis
3923 // independently before the joint budget is applied.
3924 let per_margin_cap: Vec<usize> = cols
3925 .iter()
3926 .map(|&c| heuristic_knots_for_column(ds.values.column(c)).max(min_k))
3927 .collect();
3928
3929 // Total-basis budget. A tensor with ∏k ≫ n coefficients is rank-deficient
3930 // and pure REML cost; cap the product at a generous fraction of n while
3931 // honoring mgcv's small default for the common small-d case. The budget
3932 // grows with n but the geometric split below keeps each margin modest.
3933 // d=2 → up to ~7²=49 (mgcv-`te`-like), d=3 → ~5³=125, larger d shrinks
3934 // per-margin further so the product never blows past the data support.
3935 let mgcv_like_per_margin = match d {
3936 2 => 7usize,
3937 3 => 5usize,
3938 _ => 4usize,
3939 };
3940 let mgcv_like_total = (mgcv_like_per_margin as f64).powi(d as i32);
3941 let data_budget = (n as f64) * 0.8;
3942 let p_target = mgcv_like_total
3943 .max(min_k.pow(d as u32) as f64)
3944 .min(data_budget);
3945
3946 // Geometric per-margin target so ∏k ≈ p_target, then clamp each margin to
3947 // its own 1-D resolution cap and the difference-penalty floor.
3948 let geo_per_margin = p_target.powf(1.0 / d as f64).round() as usize;
3949 let unclamped: Vec<usize> = per_margin_cap
3950 .iter()
3951 .map(|&cap| geo_per_margin.clamp(min_k, cap))
3952 .collect();
3953
3954 // The per-margin clamps can pull some axes below `geo_per_margin` (a
3955 // low-resolution column), leaving headroom in the joint budget. Redistribute
3956 // that headroom to the margins that can still grow, so the realized ∏k stays
3957 // close to p_target instead of systematically under-shooting it.
3958 let mut k_list = unclamped;
3959 loop {
3960 let product: f64 = k_list.iter().map(|&k| k as f64).product();
3961 if product >= p_target {
3962 break;
3963 }
3964 // Grow the axis with the most remaining headroom (cap − current),
3965 // breaking ties toward the largest cap. Stop when none can grow.
3966 let Some(idx) = k_list
3967 .iter()
3968 .zip(per_margin_cap.iter())
3969 .enumerate()
3970 .filter(|&(_, (k, cap))| k < cap)
3971 .max_by_key(|&(_, (k, cap))| (cap - k, *cap))
3972 .map(|(i, _)| i)
3973 else {
3974 break;
3975 };
3976 k_list[idx] += 1;
3977 }
3978 k_list
3979}
3980
/// Heuristic center count for a spatial smooth.
///
/// Thin wrapper: forwards `(n, d)` unchanged to `crate::basis::default_num_centers`
/// and returns its result.
// NOTE(review): presumably `n` is the number of observations and `d` the
// covariate dimension; confirm against `default_num_centers`.
pub fn heuristic_centers(n: usize, d: usize) -> usize {
    default_num_centers(n, d)
}
3984
3985// ---------------------------------------------------------------------------
3986// Smooth option parsers
3987// ---------------------------------------------------------------------------
3988
3989fn parse_endpoint_side(
3990 value: &str,
3991 context: &str,
3992) -> Result<BSplineEndpointBoundaryCondition, String> {
3993 match value.trim().to_ascii_lowercase().as_str() {
3994 "" | "none" | "open" | "unconstrained" | "free" => {
3995 Ok(BSplineEndpointBoundaryCondition::Free)
3996 }
3997 "clamped" | "clamp" | "zero_derivative" | "zero-derivative" => {
3998 Ok(BSplineEndpointBoundaryCondition::Clamped)
3999 }
4000 "anchored" | "anchor" | "zero" | "zero_value" | "zero-value" => {
4001 Ok(BSplineEndpointBoundaryCondition::Anchored { value: 0.0 })
4002 }
4003 other => Err(format!(
4004 "unsupported {context} boundary condition '{other}'; expected free, clamped, or anchored"
4005 )),
4006 }
4007}
4008
4009fn boundary_anchor_value(
4010 options: &BTreeMap<String, String>,
4011 side: &str,
4012 fallback: Option<f64>,
4013) -> Option<f64> {
4014 [
4015 format!("anchor_{side}"),
4016 format!("{side}_anchor"),
4017 format!("anchor-value-{side}"),
4018 ]
4019 .iter()
4020 .find_map(|key| option_f64(options, key))
4021 .or(fallback)
4022}
4023
4024fn apply_anchor_value(
4025 cond: BSplineEndpointBoundaryCondition,
4026 value: Option<f64>,
4027) -> BSplineEndpointBoundaryCondition {
4028 match cond {
4029 BSplineEndpointBoundaryCondition::Anchored { .. } => {
4030 BSplineEndpointBoundaryCondition::Anchored {
4031 value: value.unwrap_or(0.0),
4032 }
4033 }
4034 other => other,
4035 }
4036}
4037
4038fn parse_bspline_boundary_conditions(
4039 options: &BTreeMap<String, String>,
4040) -> Result<BSplineBoundaryConditions, String> {
4041 let fallback_anchor = option_f64(options, "anchor")
4042 .or_else(|| option_f64(options, "anchor_value"))
4043 .or_else(|| option_f64(options, "value"));
4044 let global_boundary_conditions = options
4045 .get("boundary_conditions")
4046 .or_else(|| options.get("bc"));
4047 let mut boundary_conditions = BSplineBoundaryConditions::default();
4048
4049 if let Some(raw_boundary_conditions) = global_boundary_conditions {
4050 let cond = parse_endpoint_side(raw_boundary_conditions, "boundary_conditions")?;
4051 let side = options
4052 .get("side")
4053 .map(|s| s.trim().to_ascii_lowercase())
4054 .unwrap_or_else(|| "both".to_string());
4055 match side.as_str() {
4056 "both" | "all" | "endpoints" => {
4057 boundary_conditions.left = cond;
4058 boundary_conditions.right = cond;
4059 }
4060 "left" | "start" | "lower" => boundary_conditions.left = cond,
4061 "right" | "end" | "upper" => boundary_conditions.right = cond,
4062 other => {
4063 return Err(format!(
4064 "unsupported B-spline boundary side '{other}'; expected left, right, or both"
4065 ));
4066 }
4067 }
4068 }
4069
4070 if let Some(raw) = options
4071 .get("bc_left")
4072 .or_else(|| options.get("left_bc"))
4073 .or_else(|| options.get("bc_start"))
4074 .or_else(|| options.get("start_bc"))
4075 {
4076 boundary_conditions.left = parse_endpoint_side(raw, "left endpoint")?;
4077 }
4078 if let Some(raw) = options
4079 .get("bc_right")
4080 .or_else(|| options.get("right_bc"))
4081 .or_else(|| options.get("bc_end"))
4082 .or_else(|| options.get("end_bc"))
4083 {
4084 boundary_conditions.right = parse_endpoint_side(raw, "right endpoint")?;
4085 }
4086
4087 boundary_conditions.left = apply_anchor_value(
4088 boundary_conditions.left,
4089 boundary_anchor_value(options, "left", fallback_anchor),
4090 );
4091 boundary_conditions.right = apply_anchor_value(
4092 boundary_conditions.right,
4093 boundary_anchor_value(options, "right", fallback_anchor),
4094 );
4095
4096 // Non-zero anchors require an affine offset term that the current basis
4097 // builder does not synthesize (see `build_bspline_basis_1d` in
4098 // src/terms/basis.rs). Surface the rejection at parse time with the side
4099 // and value in the diagnostic, instead of letting the value-only error
4100 // emerge deep inside the basis builder where the user has no context
4101 // about which anchor key (`anchor`, `left_anchor`, `right_anchor`, …)
4102 // routed into which endpoint.
4103 reject_nonzero_anchor("left", boundary_conditions.left)?;
4104 reject_nonzero_anchor("right", boundary_conditions.right)?;
4105
4106 Ok(boundary_conditions)
4107}
4108
4109fn reject_nonzero_anchor(side: &str, cond: BSplineEndpointBoundaryCondition) -> Result<(), String> {
4110 if let BSplineEndpointBoundaryCondition::Anchored { value } = cond {
4111 if value.abs() > 1e-12 {
4112 return Err(format!(
4113 "non-zero {side} anchor {value} requires an affine offset term that is not yet supported; only anchored value 0 is accepted at parse time"
4114 ));
4115 }
4116 }
4117 Ok(())
4118}
4119
4120/// Resolve the requested internal-knot count and effective spline degree for
4121/// a 1-D penalized B-spline smooth. This mirrors the tensor-margin per-axis
4122/// degree-reduction policy: a 1-D B-spline basis with `k` functions
4123/// is well-defined for any `degree <= k - 1`, so an explicit
4124/// `s(x, bs="ps", k=3)` with default `degree=3` is interpreted as the
4125/// largest representable spline (`effective_degree = k - 1 = 2`, quadratic)
4126/// rather than rejected. The `penalty_order` carried by the caller must be
4127/// clamped to `<= effective_degree` so the marginal difference penalty
4128/// stays well-defined; the returned `effective_degree` makes that explicit.
4129///
4130/// Mirrors the tensor margin treatment in the `te(...)` builder so a
4131/// standalone smooth, a factor smooth, and a tensor margin all interpret
4132/// "small k" the same way.
4133fn parse_ps_internal_knots(
4134 options: &BTreeMap<String, String>,
4135 degree: usize,
4136 default_internal_knots: usize,
4137) -> Result<(usize, bool, usize), String> {
4138 const MIN_EXPRESSIVE_INTERNAL_KNOTS: usize = 2;
4139 // Strict variants: reject `k=-1`, `k=1.5`, `knots=-2` etc. with a
4140 // focused error instead of silently dropping the value and using the
4141 // default. Lenient `option_usize` / `option_usize_any` silently swallow
4142 // unparseable values, which leaves the user thinking they configured
4143 // something when they did not.
4144 // A list-valued `knots=[...]` carries explicit internal positions, not a
4145 // count; it is consumed by `parse_explicit_internal_knots`. Treat it as
4146 // "count not specified" here so the strict integer parse does not reject
4147 // the bracketed value (the Provided path ignores the returned count).
4148 let knots_internal = if knots_option_is_list(options) {
4149 None
4150 } else {
4151 option_usize_strict(options, "knots")?
4152 };
4153 let basis_dim = option_usize_any_strict(options, &["k", "basis_dim", "basis-dim", "basisdim"])?;
4154 if knots_internal.is_some() && basis_dim.is_some() {
4155 return Err(TermBuilderError::incompatible_config(
4156 "ps/bspline smooth: specify either knots=<internal_knots> or k=<basis_dim> (not both)",
4157 )
4158 .to_string());
4159 }
4160 if let Some(k) = basis_dim {
4161 if k < 2 {
4162 return Err(TermBuilderError::invalid_option(format!(
4163 "ps/bspline smooth: k={} too small; B-spline basis requires k >= 2",
4164 k
4165 ))
4166 .to_string());
4167 }
4168 // `degree <= k - 1` is required for the B-spline basis to be
4169 // well-defined; reduce on this axis only when the user asked for
4170 // a smaller k than the cubic default supports. This matches mgcv's
4171 // behaviour (e.g. `s(x, bs="ps", k=3)` becomes a quadratic basis)
4172 // and the per-axis reduction the tensor builder already does.
4173 let effective_degree = degree.min(k - 1).max(1);
4174 let num_internal_knots = if effective_degree < degree {
4175 // Reproduce the requested basis size exactly when degree was
4176 // reduced for a low-cardinality axis: num_basis = k.
4177 k.saturating_sub(effective_degree + 1)
4178 } else {
4179 (k - degree - 1).max(MIN_EXPRESSIVE_INTERNAL_KNOTS)
4180 };
4181 Ok((num_internal_knots, false, effective_degree))
4182 } else {
4183 Ok((
4184 knots_internal.unwrap_or(default_internal_knots),
4185 knots_internal.is_none(),
4186 degree,
4187 ))
4188 }
4189}
4190
/// Report whether the `knots` option holds a *list* literal rather than a
/// scalar count.
///
/// A value counts as a list when, after trimming whitespace, it starts with
/// `[`, `c(`, `C(` or `(`. Only the wrapper syntax is inspected, so a bare
/// `knots=5` keeps its count meaning. Returns `false` when `knots` is absent.
fn knots_option_is_list(options: &BTreeMap<String, String>) -> bool {
    const LIST_OPENERS: [&str; 4] = ["[", "c(", "C(", "("];

    let Some(raw) = options.get("knots") else {
        return false;
    };
    let trimmed = raw.trim();
    LIST_OPENERS
        .iter()
        .any(|opener| trimmed.starts_with(*opener))
}
4205
4206/// Parse `knots=[k0, k1, ...]` (or `c(...)` / `(...)`) into explicit internal
4207/// knot positions. Returns `Ok(None)` when `knots` is absent or a scalar count
4208/// (handled by [`parse_ps_internal_knots`]); `Ok(Some(positions))` when it is a
4209/// non-empty numeric list; and an error for an empty or unparseable list.
4210fn parse_explicit_internal_knots(
4211 options: &BTreeMap<String, String>,
4212) -> Result<Option<Vec<f64>>, String> {
4213 if !knots_option_is_list(options) {
4214 return Ok(None);
4215 }
4216 let raw = options
4217 .get("knots")
4218 .expect("knots_option_is_list implies the key is present");
4219 let tokens = split_list_option(raw);
4220 if tokens.is_empty() {
4221 return Err(TermBuilderError::invalid_option(format!(
4222 "knots={raw} is an empty list; supply at least one internal knot position \
4223 (e.g. knots=[0.2, 0.5, 0.8]) or a scalar count (e.g. knots=8)"
4224 ))
4225 .to_string());
4226 }
4227 let mut positions = Vec::with_capacity(tokens.len());
4228 for tok in &tokens {
4229 let value = parse_numeric_expr(tok).map_err(|err| {
4230 TermBuilderError::invalid_option(format!(
4231 "knots list entry '{tok}' is not a numeric position: {err}"
4232 ))
4233 .to_string()
4234 })?;
4235 positions.push(value);
4236 }
4237 Ok(Some(positions))
4238}
4239
4240/// Resolve the `knot_placement=` option for an automatically generated knot
4241/// vector. Accepts `"uniform"` (the default, equal spacing on the data range)
4242/// and `"quantile"` (interior knots at empirical data quantiles, better for
4243/// skewed covariates). Unknown values are rejected so typos do not silently
4244/// fall back to uniform.
4245fn parse_knot_placement(
4246 options: &BTreeMap<String, String>,
4247) -> Result<crate::basis::BSplineKnotPlacement, String> {
4248 use crate::basis::BSplineKnotPlacement;
4249 match options
4250 .get("knot_placement")
4251 .or_else(|| options.get("knot-placement"))
4252 .or_else(|| options.get("knotplacement"))
4253 {
4254 None => Ok(BSplineKnotPlacement::Uniform),
4255 Some(raw) => match raw
4256 .trim()
4257 .trim_matches('"')
4258 .trim_matches('\'')
4259 .to_ascii_lowercase()
4260 .as_str()
4261 {
4262 "uniform" | "even" | "equal" => Ok(BSplineKnotPlacement::Uniform),
4263 "quantile" | "quantiles" | "data" | "empirical" => Ok(BSplineKnotPlacement::Quantile),
4264 other => Err(TermBuilderError::invalid_option(format!(
4265 "knot_placement={other} is not recognised; expected \"uniform\" or \"quantile\""
4266 ))
4267 .to_string()),
4268 },
4269 }
4270}
4271
4272/// Build the non-periodic 1D B-spline knot spec for the `ps`/`bspline` and
4273/// factor-smooth marginal paths, honoring (in priority order):
4274/// 1. `knots=[...]` explicit internal positions → [`BSplineKnotSpec::Provided`]
4275/// 2. `knot_placement="quantile"` → [`BSplineKnotSpec::Automatic`]
4276/// 3. uniform generation → [`BSplineKnotSpec::Generate`]
4277///
4278/// `data` is the covariate column (used to clamp explicit positions to the
4279/// observed range and to drive quantile placement); `n_knots` is the resolved
4280/// internal-knot count from [`parse_ps_internal_knots`] used for the automatic
4281/// strategies.
4282fn resolve_nonperiodic_bspline_knotspec(
4283 options: &BTreeMap<String, String>,
4284 data: ArrayView1<'_, f64>,
4285 data_range: (f64, f64),
4286 degree: usize,
4287 n_knots: usize,
4288) -> Result<BSplineKnotSpec, String> {
4289 use crate::basis::{BSplineKnotPlacement, clamped_knot_vector_from_internal_positions};
4290 if let Some(positions) = parse_explicit_internal_knots(options)? {
4291 if option_usize_any_strict(options, &["k", "basis_dim", "basis-dim", "basisdim"])?.is_some()
4292 {
4293 return Err(TermBuilderError::incompatible_config(
4294 "ps/bspline smooth: specify either explicit knots=[...] positions or \
4295 k=<basis_dim> (not both); the basis size is fixed by the knot vector",
4296 )
4297 .to_string());
4298 }
4299 let knots = clamped_knot_vector_from_internal_positions(data_range, &positions, degree)
4300 .map_err(|e| e.to_string())?;
4301 return Ok(BSplineKnotSpec::Provided(knots));
4302 }
4303 match parse_knot_placement(options)? {
4304 BSplineKnotPlacement::Uniform => Ok(BSplineKnotSpec::Generate {
4305 data_range,
4306 num_internal_knots: n_knots,
4307 }),
4308 BSplineKnotPlacement::Quantile => {
4309 // Validate the column up-front so an unfittable request surfaces a
4310 // user-correctable error at parse time rather than deep in basis
4311 // construction. The same data drives the eventual quantile knots.
4312 crate::basis::auto_knot_vector_1d_quantile(data, n_knots, degree)
4313 .map_err(|e| e.to_string())?;
4314 Ok(BSplineKnotSpec::Automatic {
4315 num_internal_knots: Some(n_knots),
4316 placement: BSplineKnotPlacement::Quantile,
4317 })
4318 }
4319 }
4320}
4321
4322/// Reject unknown option keys with a focused error that names the term and
4323/// the offending key, plus suggests near-matches from the known-key list.
4324/// Without this, typos like `lengt_scale=0.1` or `nyu=5/2` are silently
4325/// dropped, the term uses the default, and the user has no idea why their
4326/// option had no effect.
4327pub fn validate_known_options(
4328 term_name: &str,
4329 options: &BTreeMap<String, String>,
4330 known: &[&str],
4331) -> Result<(), String> {
4332 let known_set: std::collections::BTreeSet<&&str> = known.iter().collect();
4333 for key in options.keys() {
4334 if !known_set.contains(&key.as_str()) {
4335 if term_name == "tensor" && is_tensor_k_axis_option_key(key) {
4336 continue;
4337 }
4338 // Suggest near-matches (substring or shared prefix ≥ 3).
4339 let key_l = key.to_ascii_lowercase();
4340 let mut suggestions: Vec<&str> = known
4341 .iter()
4342 .filter(|k| {
4343 let kl = k.to_ascii_lowercase();
4344 kl.contains(&key_l) || key_l.contains(&kl) || {
4345 let n = kl
4346 .chars()
4347 .zip(key_l.chars())
4348 .take_while(|(a, b)| a == b)
4349 .count();
4350 n >= 3
4351 }
4352 })
4353 .copied()
4354 .collect();
4355 suggestions.sort_unstable();
4356 suggestions.dedup();
4357 let hint = if suggestions.is_empty() {
4358 String::new()
4359 } else {
4360 format!(" — did you mean one of [{}]?", suggestions.join(", "))
4361 };
4362 return Err(TermBuilderError::invalid_option(format!(
4363 "{term_name}() does not accept option `{key}`{hint}. Valid options: [{}]",
4364 {
4365 let mut sorted = known.to_vec();
4366 sorted.sort_unstable();
4367 sorted.join(", ")
4368 }
4369 ))
4370 .to_string());
4371 }
4372 }
4373 Ok(())
4374}
4375
/// Key of the private, engine-injected option that caps the *default* spatial
/// center count for a secondary (distributional) predictor's smooth — see
/// `solver::fit_orchestration::apply_secondary_predictor_basis_parsimony` and #501.
///
/// It is deliberately NOT one of the user-facing count aliases recognised by
/// [`has_explicit_countwith_basis_alias`], so its presence never flips the
/// spatial basis onto the explicit (hard) center-placement strategy: the cap
/// only lowers the *default* count while the `Auto` strategy is retained, so
/// the count can still be softly reduced when the data cannot support it.
///
/// The cap itself is applied by [`cap_default_spatial_centers`].
pub const SECONDARY_CENTER_CAP_OPTION: &str = "__secondary_center_cap";
4386
4387/// Apply the secondary-predictor center cap to a *default* spatial center
4388/// count. A no-op when the cap option is absent (the common case) or when the
4389/// user supplied an explicit count (then `default_count` is ignored downstream
4390/// by [`parse_countwith_basis_alias`] anyway).
4391pub(crate) fn cap_default_spatial_centers(
4392 options: &BTreeMap<String, String>,
4393 default_count: usize,
4394) -> usize {
4395 match option_usize(options, SECONDARY_CENTER_CAP_OPTION) {
4396 Some(cap) => default_count.min(cap),
4397 None => default_count,
4398 }
4399}
4400
/// Default center count for a Matérn smooth.
///
/// Returns `planned_count` raised to a small-n floor of `min(d + 4, n)`, and
/// never less than 1.
///
/// #1074: the mgcv-sized basis cap (`k = 10·3^(d-1)`) was removed here
/// because it hid an over-sizing/under-penalization defect by shrinking the
/// basis instead of fixing the optimizer, so the default follows the generic
/// n-scaling plan passed in as `planned_count`. The small-n floor guards
/// against a numerically fragile two-column kernel block and is kept.
/// Explicit `k` / `centers` take effect upstream of this function.
fn default_matern_center_count(n: usize, d: usize, planned_count: usize) -> usize {
    let low_n_floor = n.min(d + 4);
    let floored = if planned_count < low_n_floor {
        low_n_floor
    } else {
        planned_count
    };
    if floored == 0 { 1 } else { floored }
}
4411
/// Default center count for a Duchon smooth.
///
/// `planned_count` is capped at the mgcv-style `10 * 3^(d - 1)` (30 in 2-D)
/// and then raised to `min(polynomial_cols + 1, n)`, never below 1.
///
/// Rationale: Duchon fits have a higher setup cost than Matérn/TPS because
/// the constrained radial block is rotated through its center Gram and
/// several operator-collocation penalties. The generic spatial default gave
/// a 2-D Gaussian Duchon at n≈500 more than a hundred centers, so cold fits
/// were dominated by dense O(k³) eigensolves even though the REML smoother
/// uses a low-rank basis. The cap only affects this implicit default; an
/// explicit `centers=` / `k=` request is honoured upstream. The floor keeps
/// room for the polynomial null space in tiny high-order bases.
fn default_duchon_center_count(
    n: usize,
    d: usize,
    planned_count: usize,
    polynomial_cols: usize,
) -> usize {
    let exponent = d.saturating_sub(1) as u32;
    let mgcv_default = 10usize.saturating_mul(3usize.saturating_pow(exponent));

    let null_space_need = polynomial_cols + 1;
    let low_n_floor = null_space_need.min(n).max(1);

    let capped = planned_count.min(mgcv_default);
    capped.max(low_n_floor)
}
4432
4433pub fn parse_countwith_basis_alias(
4434 options: &BTreeMap<String, String>,
4435 primarykey: &str,
4436 default_count: usize,
4437) -> Result<usize, String> {
4438 // Strict: reject unparseable values (e.g. `centers=many`, `centers=-1`,
4439 // `centers=1.5`) instead of silently dropping them and falling through
4440 // to the default. Without this the user gets the auto-inferred count
4441 // silently and never realizes their explicit option was ignored.
4442 let primary = option_usize_strict(options, primarykey)?;
4443 let basis_dim = option_usize_any_strict(
4444 options,
4445 &["k", "basis_dim", "basis-dim", "basisdim", "knots"],
4446 )?;
4447 if primary.is_some() && basis_dim.is_some() {
4448 return Err(TermBuilderError::incompatible_config(format!(
4449 "specify either {}=<count> or k=<basis_dim> (not both)",
4450 primarykey
4451 ))
4452 .to_string());
4453 }
4454 Ok(primary.or(basis_dim).unwrap_or(default_count))
4455}
4456
/// Report whether the user spelled out a count for this smooth.
///
/// True when `options` contains `primarykey` or any of the aliases `k`,
/// `basis_dim`, `basis-dim`, `basisdim`, `knots`. Only key presence is
/// checked; the value is not parsed.
pub fn has_explicit_countwith_basis_alias(
    options: &BTreeMap<String, String>,
    primarykey: &str,
) -> bool {
    const BASIS_DIM_ALIASES: [&str; 5] = ["k", "basis_dim", "basis-dim", "basisdim", "knots"];

    if options.contains_key(primarykey) {
        return true;
    }
    BASIS_DIM_ALIASES
        .iter()
        .any(|alias| options.contains_key(*alias))
}
4466
4467pub fn parse_cyclic_boundary(
4468 options: &BTreeMap<String, String>,
4469 minv: f64,
4470 maxv: f64,
4471) -> Result<OneDimensionalBoundary, String> {
4472 let cyclic = option_bool(options, "cyclic")
4473 .or_else(|| option_bool(options, "periodic"))
4474 .unwrap_or(false);
4475 if !cyclic {
4476 return Ok(OneDimensionalBoundary::Open);
4477 }
4478 let start = match option_numeric_expr(options, "period_start")? {
4479 Some(v) => v,
4480 None => option_numeric_expr(options, "start")?.unwrap_or(minv),
4481 };
4482 let end = match option_numeric_expr(options, "period_end")? {
4483 Some(v) => v,
4484 None => option_numeric_expr(options, "end")?.unwrap_or(maxv),
4485 };
4486 if end <= start {
4487 return Err(format!(
4488 "cyclic smooth requires period_end/end ({end}) > period_start/start ({start})"
4489 ));
4490 }
4491 Ok(OneDimensionalBoundary::Cyclic { start, end })
4492}
4493
4494/// Parse the periodic-uniform domain for a one-dimensional cyclic smooth.
4495///
4496/// Returns the `(domain_start, period)` pair derived from
4497/// `period_start` / `start`, `period_end` / `end`, falling back to the
4498/// data range `[minv, maxv)` when neither bound is provided. The period
4499/// must be strictly positive.
4500pub fn parse_periodic_domain_1d(
4501 options: &BTreeMap<String, String>,
4502 minv: f64,
4503 maxv: f64,
4504) -> Result<(f64, f64), String> {
4505 let start_opt = match option_numeric_expr(options, "period_start")? {
4506 Some(v) => Some(v),
4507 None => option_numeric_expr(options, "start")?,
4508 };
4509 let end_opt = match option_numeric_expr(options, "period_end")? {
4510 Some(v) => Some(v),
4511 None => option_numeric_expr(options, "end")?,
4512 };
4513 // Reject the pure data-range fallback. A B-spline periodic smooth that takes
4514 // its wrap from the observed [min, max] is sample-dependent and silently
4515 // wrong: uniform draws on a true period of 2π land on [ε, 2π−ε], so using
4516 // (max−min) as the period seams the curve with an off-by-ε discontinuity and
4517 // the fit drifts with the sample. (Unlike the radial closed-lattice Duchon
4518 // path, whose centers DO tile a full period, so its span-derive is exact —
4519 // see `parse_periodic_axes_option`.) Require the caller to name the period
4520 // explicitly via `period=`/`period_end`. The end is only defaulted to `maxv`
4521 // when a `period_start`/`start` was given (a half-open declaration); a bare
4522 // periodic smooth with neither bound is an error.
4523 if end_opt.is_none() && start_opt.is_none() {
4524 return Err(
4525 "periodic B-spline smooth requires an explicit period: pass period=<value> \
4526 (e.g. period=2*pi) or period_start=/period_end=. Deriving the period from the \
4527 observed data range is sample-dependent and produces an off-by-ε seam, so it is \
4528 not inferred."
4529 .to_string(),
4530 );
4531 }
4532 let start = start_opt.unwrap_or(minv);
4533 let end = end_opt.unwrap_or(maxv);
4534 if !(start.is_finite() && end.is_finite()) {
4535 return Err(format!(
4536 "periodic smooth domain requires finite endpoints, got ({start}, {end})"
4537 ));
4538 }
4539 if end <= start {
4540 return Err(format!(
4541 "periodic smooth requires period_end/end ({end}) > period_start/start ({start})"
4542 ));
4543 }
4544 Ok((start, end - start))
4545}
4546
4547fn parse_matern_nu(raw: &str) -> Result<MaternNu, String> {
4548 let trimmed = raw.trim();
4549 let lowered = trimmed.to_ascii_lowercase();
4550 match lowered.as_str() {
4551 "1/2" | "0.5" | "half" => return Ok(MaternNu::Half),
4552 "3/2" | "1.5" => return Ok(MaternNu::ThreeHalves),
4553 "5/2" | "2.5" => return Ok(MaternNu::FiveHalves),
4554 "7/2" | "3.5" => return Ok(MaternNu::SevenHalves),
4555 "9/2" | "4.5" => return Ok(MaternNu::NineHalves),
4556 _ => {}
4557 }
4558
4559 let value = if let Some((num, den)) = trimmed.split_once('/') {
4560 let num = num
4561 .trim()
4562 .parse::<f64>()
4563 .map_err(|err| format!("{}: {err}", unsupported_matern_nu_message(raw)))?;
4564 let den = den
4565 .trim()
4566 .parse::<f64>()
4567 .map_err(|err| format!("{}: {err}", unsupported_matern_nu_message(raw)))?;
4568 if den == 0.0 || !num.is_finite() || !den.is_finite() {
4569 return Err(unsupported_matern_nu_message(raw));
4570 }
4571 num / den
4572 } else {
4573 trimmed
4574 .parse::<f64>()
4575 .map_err(|err| format!("{}: {err}", unsupported_matern_nu_message(raw)))?
4576 };
4577
4578 const TOL: f64 = 1e-12;
4579 if (value - 0.5).abs() <= TOL {
4580 Ok(MaternNu::Half)
4581 } else if (value - 1.5).abs() <= TOL {
4582 Ok(MaternNu::ThreeHalves)
4583 } else if (value - 2.5).abs() <= TOL {
4584 Ok(MaternNu::FiveHalves)
4585 } else if (value - 3.5).abs() <= TOL {
4586 Ok(MaternNu::SevenHalves)
4587 } else if (value - 4.5).abs() <= TOL {
4588 Ok(MaternNu::NineHalves)
4589 } else {
4590 Err(unsupported_matern_nu_message(raw))
4591 }
4592}
4593
4594fn unsupported_matern_nu_message(raw: &str) -> String {
4595 TermBuilderError::unsupported_feature(format!(
4596 "unsupported Matern nu '{raw}'; supported half-integer values are 1/2, 3/2, 5/2, 7/2, and 9/2"
4597 ))
4598 .to_string()
4599}
4600
/// How the Duchon kernel power of a smooth is determined, as resolved from
/// the term's `power=` option by [`parse_duchon_power_policy`].
#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
pub enum DuchonPowerPolicy {
    /// The user supplied `power=<number>`. When produced by
    /// [`parse_duchon_power_policy`] the payload is finite and non-negative.
    Explicit(f64),
    /// No explicit `power=` given: defer to the cubic structural default, which
    /// the builder resolves dimension-aware as `s = (d − 1)/2` (so `φ(r) = r³`
    /// in every dimension). There is no triple-operator minimum any more.
    CubicStructuralDefault,
}
4609
4610pub fn parse_duchon_power_policy(
4611 options: &BTreeMap<String, String>,
4612) -> Result<DuchonPowerPolicy, String> {
4613 if let Some(raw_nu) = options.get("nu") {
4614 return Err(TermBuilderError::incompatible_config(format!(
4615 "Duchon smooths use power=<number>, not nu='{}'. Use power=1.5, power=2, etc.",
4616 raw_nu
4617 ))
4618 .to_string());
4619 }
4620 match options.get("power") {
4621 Some(raw) => {
4622 let value = raw.parse::<f64>().map_err(|err| {
4623 TermBuilderError::invalid_option(format!(
4624 "invalid Duchon power '{}'; expected a non-negative number such as power=1.5 or power=2: {}",
4625 raw, err
4626 ))
4627 .to_string()
4628 })?;
4629 if !value.is_finite() || value < 0.0 {
4630 return Err(TermBuilderError::invalid_option(format!(
4631 "invalid Duchon power '{}'; expected a finite non-negative number such as power=1.5 or power=2",
4632 raw
4633 ))
4634 .to_string());
4635 }
4636 Ok(DuchonPowerPolicy::Explicit(value))
4637 }
4638 None => Ok(DuchonPowerPolicy::CubicStructuralDefault),
4639 }
4640}
4641
4642pub fn parse_duchon_power(options: &BTreeMap<String, String>) -> Result<f64, String> {
4643 match parse_duchon_power_policy(options)? {
4644 DuchonPowerPolicy::Explicit(power) => Ok(power),
4645 // Context-free placeholder: the bare option parser has no column count,
4646 // so it cannot compute the dimension-aware cubic power `s = (d − 1)/2`.
4647 // The dimension-aware resolution happens later in `build_smooth_basis`;
4648 // this 1.5 is only a stand-in for callers that need a concrete number
4649 // without data context (e.g. round-trip parser tests).
4650 DuchonPowerPolicy::CubicStructuralDefault => Ok(1.5),
4651 }
4652}
4653
4654pub fn parse_duchon_order(
4655 options: &BTreeMap<String, String>,
4656) -> Result<DuchonNullspaceOrder, String> {
4657 match options.get("order") {
4658 // Structural cubic Duchon is affine-by-default: an unspecified order is
4659 // the `Linear` (constant + linear) null space, matching the magic
4660 // default. An explicit `order=0` still selects the constant-only space.
4661 None => Ok(DuchonNullspaceOrder::Linear),
4662 Some(raw) => match raw.parse::<usize>() {
4663 Ok(0) => Ok(DuchonNullspaceOrder::Zero),
4664 Ok(1) => Ok(DuchonNullspaceOrder::Linear),
4665 Ok(other) => Ok(DuchonNullspaceOrder::Degree(other)),
4666 Err(_) => Err(TermBuilderError::invalid_option(format!(
4667 "invalid Duchon order '{}'; expected a non-negative integer such as order=0, order=1, or order=2",
4668 raw
4669 ))
4670 .to_string()),
4671 },
4672 }
4673}
4674
4675fn parse_matern_identifiability(
4676 options: &BTreeMap<String, String>,
4677) -> Result<MaternIdentifiability, TermBuilderError> {
4678 let Some(raw) = options.get("identifiability").map(String::as_str) else {
4679 return Ok(MaternIdentifiability::default());
4680 };
4681 match raw.trim().to_ascii_lowercase().as_str() {
4682 "none" => Ok(MaternIdentifiability::None),
4683 "sum_tozero" | "sum-to-zero" | "center_sum_tozero" | "center-sum-to-zero" | "centered" => {
4684 Ok(MaternIdentifiability::CenterSumToZero)
4685 }
4686 "linear" | "center_linear_orthogonal" | "center-linear-orthogonal" => {
4687 Ok(MaternIdentifiability::CenterLinearOrthogonal)
4688 }
4689 other => Err(TermBuilderError::unsupported_feature(format!(
4690 "invalid Matérn identifiability '{other}'; expected one of: none, sum_tozero, linear"
4691 ))),
4692 }
4693}
4694
4695fn parse_spatial_identifiability(
4696 options: &BTreeMap<String, String>,
4697) -> Result<SpatialIdentifiability, TermBuilderError> {
4698 let Some(raw) = options.get("identifiability").map(String::as_str) else {
4699 return Ok(SpatialIdentifiability::default());
4700 };
4701 match raw.trim().to_ascii_lowercase().as_str() {
4702 "none" => Ok(SpatialIdentifiability::None),
4703 "orthogonal"
4704 | "orthogonal_to_parametric"
4705 | "orthogonal-to-parametric"
4706 | "parametric_orthogonal" => Ok(SpatialIdentifiability::OrthogonalToParametric),
4707 "frozen" => Err(TermBuilderError::unsupported_feature(
4708 "spatial identifiability 'frozen' is internal-only; use none or orthogonal_to_parametric",
4709 )),
4710 other => Err(TermBuilderError::unsupported_feature(format!(
4711 "invalid spatial identifiability '{other}'; expected one of: none, orthogonal_to_parametric"
4712 ))),
4713 }
4714}
4715
4716#[cfg(test)]
4717mod tests {
4718 use super::*;
4719 use crate::basis::OperatorPenaltySpec;
4720 use crate::inference::formula_dsl::parse_formula;
4721 use gam_data::{DataSchema, SchemaColumn};
4722 use ndarray::Array2;
4723 use std::collections::BTreeMap;
4724
4725 fn continuous_dataset(headers: &[&str], rows: Vec<Vec<f64>>) -> Dataset {
4726 let nrows = rows.len();
4727 let ncols = headers.len();
4728 let values = Array2::from_shape_vec(
4729 (nrows, ncols),
4730 rows.into_iter().flat_map(|row| row.into_iter()).collect(),
4731 )
4732 .expect("rectangular test data");
4733 Dataset {
4734 headers: headers.iter().map(|name| name.to_string()).collect(),
4735 values,
4736 schema: DataSchema {
4737 columns: headers
4738 .iter()
4739 .map(|name| SchemaColumn {
4740 name: name.to_string(),
4741 kind: ColumnKindTag::Continuous,
4742 levels: vec![],
4743 })
4744 .collect(),
4745 },
4746 column_kinds: vec![ColumnKindTag::Continuous; ncols],
4747 }
4748 }
4749
4750 fn factor_dataset() -> Dataset {
4751 let rows = (0..24)
4752 .map(|i| {
4753 let x = i as f64 / 23.0;
4754 let g = (i % 2) as f64;
4755 vec![x + g, x, g]
4756 })
4757 .collect::<Vec<_>>();
4758 Dataset {
4759 headers: vec!["y".into(), "x".into(), "g".into()],
4760 values: Array2::from_shape_vec(
4761 (rows.len(), 3),
4762 rows.into_iter().flat_map(|row| row.into_iter()).collect(),
4763 )
4764 .expect("rectangular factor test data"),
4765 schema: DataSchema {
4766 columns: vec![
4767 SchemaColumn {
4768 name: "y".into(),
4769 kind: ColumnKindTag::Continuous,
4770 levels: vec![],
4771 },
4772 SchemaColumn {
4773 name: "x".into(),
4774 kind: ColumnKindTag::Continuous,
4775 levels: vec![],
4776 },
4777 SchemaColumn {
4778 name: "g".into(),
4779 kind: ColumnKindTag::Categorical,
4780 levels: vec!["a".into(), "b".into()],
4781 },
4782 ],
4783 },
4784 column_kinds: vec![
4785 ColumnKindTag::Continuous,
4786 ColumnKindTag::Continuous,
4787 ColumnKindTag::Categorical,
4788 ],
4789 }
4790 }
4791
    /// Smoke test for the DEFAULT univariate `s(x, bs="tp")` basis: it must
    /// build a thin-plate basis whose center strategy carries at least one
    /// center, with no fit/optimizer in the loop.
    ///
    /// History: this test was introduced for #1378 to pin a *modest*
    /// mgcv-sized default, because the oversized default basis left the
    /// two-penalty REML ρ-surface with a flat valley whose optimizer landing
    /// point depended on row order, breaking row-permutation invariance.
    /// #1074 later removed the mgcv-sized cap together with the ceiling
    /// assertion, so — despite its name — the test no longer pins an upper
    /// bound on the center count.
    #[test]
    fn default_univariate_thinplate_basis_dim_is_modest() {
        // n = 300 is the #1378 scenario. The comment that originally pinned
        // the count near k = 10 no longer applies; see the #1074 note below.
        let n = 300usize;
        let rows: Vec<Vec<f64>> = (0..n)
            .map(|i| {
                let x = -3.0 + 6.0 * (i as f64) / ((n - 1) as f64);
                vec![x.sin(), x]
            })
            .collect();
        let ds = continuous_dataset(&["y", "x"], rows);

        let mut options = BTreeMap::new();
        options.insert("bs".to_string(), "tp".to_string());

        let mut notes = Vec::new();
        let basis = build_smooth_basis(
            SmoothKind::S,
            &["x".to_string()],
            &[1],
            &options,
            &ds,
            &mut notes,
            &ResourcePolicy::default_library(),
            1,
        )
        .expect("build default univariate tp smooth");

        // Extract the center count from whichever placement strategy was
        // chosen, looking through the `Auto` wrapper when present.
        let centers = match &basis {
            SmoothBasisSpec::ThinPlate { spec, .. } => match &spec.center_strategy {
                CenterStrategy::Auto(inner) => match inner.as_ref() {
                    CenterStrategy::FarthestPoint { num_centers }
                    | CenterStrategy::EqualMass { num_centers }
                    | CenterStrategy::EqualMassCovarRepresentative { num_centers }
                    | CenterStrategy::KMeans { num_centers, .. } => *num_centers,
                    other => panic!("unexpected auto inner center strategy: {other:?}"),
                },
                CenterStrategy::FarthestPoint { num_centers }
                | CenterStrategy::EqualMass { num_centers }
                | CenterStrategy::EqualMassCovarRepresentative { num_centers }
                | CenterStrategy::KMeans { num_centers, .. } => *num_centers,
                other => panic!("unexpected center strategy: {other:?}"),
            },
            other => panic!("expected ThinPlate basis, got {other:?}"),
        };

        // #1074: the mgcv-sized basis-dim ceiling assertion was removed with the
        // cap it tested. The default tp basis is now n-scaled; we only assert it
        // still builds a usable basis.
        assert!(
            centers >= 1,
            "default univariate tp must still build a usable basis (centers={centers})",
        );
    }
4854
4855 /// gam#1629: a default 2-D `matern(x1, x2)` (no explicit `length_scale`)
4856 /// must leave the length-scale at the `0.0` auto sentinel — NOT the full
4857 /// data diameter — so the planner's `auto_init_length_scale_in_place` seeds
4858 /// it on the wiggly/resolving side (`max_range / sqrt(n)`), the same regime
4859 /// thin-plate uses. The previous `default_matern_length_scale` returned the
4860 /// full diameter, which is non-zero, so the `0.0`-gated auto-init was a
4861 /// no-op and the κ-optimizer started in the over-smoothed corner and parked
4862 /// there (truth-RMSE ~6× worse than thin-plate/tensor on identical
4863 /// high-frequency 2-D surfaces, insensitive to `k`). This pins the corrected
4864 /// seed geometry without a fit/optimizer in the loop.
4865 #[test]
4866 fn default_matern_2d_seeds_resolving_length_scale_not_overscaled_diameter() {
4867 // A fine multi-frequency 2-D grid (the #1629 reproduction shape): the
4868 // data diameter is O(1.4) in each axis; the resolving seed must be far
4869 // smaller than the diameter so high-frequency structure stays reachable.
4870 let side = 24usize; // n = 576
4871 let mut rows: Vec<Vec<f64>> = Vec::with_capacity(side * side);
4872 for i in 0..side {
4873 for j in 0..side {
4874 let x1 = i as f64 / (side - 1) as f64; // [0, 1]
4875 let x2 = j as f64 / (side - 1) as f64; // [0, 1]
4876 let y = (6.0 * x1).sin() * (6.0 * x2).cos();
4877 rows.push(vec![y, x1, x2]);
4878 }
4879 }
4880 let n = rows.len();
4881 let ds = continuous_dataset(&["y", "x1", "x2"], rows);
4882
4883 let mut options = BTreeMap::new();
4884 options.insert("bs".to_string(), "gp".to_string()); // gp ⇒ Matérn
4885 let mut notes = Vec::new();
4886 let mut basis = build_smooth_basis(
4887 SmoothKind::S,
4888 &["x1".to_string(), "x2".to_string()],
4889 &[1, 2],
4890 &options,
4891 &ds,
4892 &mut notes,
4893 &ResourcePolicy::default_library(),
4894 1,
4895 )
4896 .expect("build default 2-D matern smooth");
4897
4898 // (1) The builder must emit the auto sentinel, not a baked-in diameter.
4899 let (feature_cols, seeded_length_scale) = match &basis {
4900 SmoothBasisSpec::Matern {
4901 feature_cols, spec, ..
4902 } => (feature_cols.clone(), spec.length_scale),
4903 other => panic!("expected Matern basis, got {other:?}"),
4904 };
4905 assert_eq!(
4906 seeded_length_scale, 0.0,
4907 "default matern() must leave length_scale at the 0.0 auto sentinel \
4908 (got {seeded_length_scale}); a non-zero diameter default re-enters the \
4909 over-smoothed basin and disables the planner's wiggly-side auto-init",
4910 );
4911
4912 // (2) After the shared auto-init runs, the realized length-scale must
4913 // land in the resolving regime: `max_range / sqrt(n)`, far below the
4914 // data diameter. This is the seed the κ-optimizer starts REML from.
4915 crate::smooth::auto_init_length_scale_in_basis(ds.values.view(), &mut basis);
4916 let realized = match &basis {
4917 SmoothBasisSpec::Matern { spec, .. } => spec.length_scale,
4918 other => panic!("expected Matern basis after auto-init, got {other:?}"),
4919 };
4920 let expected = crate::smooth::auto_initial_length_scale(ds.values.view(), &feature_cols);
4921 assert!(
4922 (realized - expected).abs() <= 1e-12,
4923 "auto-init must seed the wiggly-side length scale max_range/sqrt(n) \
4924 (expected {expected}, got {realized})",
4925 );
4926
4927 // Sanity: the resolving seed is well below the per-axis range (≈1.0).
4928 // Before the fix the seed was the full diameter (≈√2 ≈ 1.414); the
4929 // resolving seed here is ≈ 1.0 / sqrt(576) ≈ 0.042, ~30× smaller.
4930 let max_range = 1.0_f64; // each axis spans [0, 1]
4931 assert!(
4932 realized < max_range / 4.0,
4933 "matern seed length_scale {realized} must be in the resolving regime, \
4934 not the over-smoothed diameter corner (n={n}, max_range≈{max_range})",
4935 );
4936 }
4937
4938 /// gam#1778: `matern(..., periodic=true)` and `thinplate(..., periodic=true)`
4939 /// must be ACCEPTED. The squash-merge that wired periodic support into the
4940 /// matern/thinplate basis specs forgot to add the periodic option keys to
4941 /// those two builders' `validate_known_options` whitelists (only `duchon`
4942 /// got both), so `periodic=`/`period=`/`cyclic=`/`period_start=`/`period_end=`
4943 /// were rejected as unknown options even though the spec/builder consume them.
4944 /// Before the whitelist fix this returned an "unknown option" error.
4945 #[test]
4946 fn matern_and_thinplate_accept_periodic_option() {
4947 let n = 200usize;
4948 let rows: Vec<Vec<f64>> = (0..n)
4949 .map(|i| {
4950 let x = -3.0 + 6.0 * (i as f64) / ((n - 1) as f64);
4951 vec![x.sin(), x]
4952 })
4953 .collect();
4954 let ds = continuous_dataset(&["y", "x"], rows);
4955
4956 // matern() with periodic=true must build without an unknown-option error.
4957 let mut matern_opts = BTreeMap::new();
4958 matern_opts.insert("bs".to_string(), "gp".to_string()); // gp ⇒ Matérn
4959 matern_opts.insert("periodic".to_string(), "true".to_string());
4960 let mut notes = Vec::new();
4961 let matern_basis = build_smooth_basis(
4962 SmoothKind::S,
4963 &["x".to_string()],
4964 &[1],
4965 &matern_opts,
4966 &ds,
4967 &mut notes,
4968 &ResourcePolicy::default_library(),
4969 1,
4970 )
4971 .expect("matern(x, periodic=true) must be accepted");
4972 match &matern_basis {
4973 SmoothBasisSpec::Matern { spec, .. } => assert!(
4974 spec.periodic.is_some(),
4975 "periodic=true must thread a Some(periodic) into the matern spec",
4976 ),
4977 other => panic!("expected Matern basis, got {other:?}"),
4978 }
4979
4980 // thinplate()/tps() with periodic=true must likewise be accepted.
4981 let mut tps_opts = BTreeMap::new();
4982 tps_opts.insert("bs".to_string(), "tp".to_string());
4983 tps_opts.insert("periodic".to_string(), "true".to_string());
4984 let mut notes = Vec::new();
4985 let tps_basis = build_smooth_basis(
4986 SmoothKind::S,
4987 &["x".to_string()],
4988 &[1],
4989 &tps_opts,
4990 &ds,
4991 &mut notes,
4992 &ResourcePolicy::default_library(),
4993 1,
4994 )
4995 .expect("thinplate(x, periodic=true) must be accepted");
4996 match &tps_basis {
4997 SmoothBasisSpec::ThinPlate { spec, .. } => assert!(
4998 spec.periodic.is_some(),
4999 "periodic=true must thread a Some(periodic) into the thinplate spec",
5000 ),
5001 other => panic!("expected ThinPlate basis, got {other:?}"),
5002 }
5003 }
5004
5005 /// Regression: an explicit scalar `periodic=false` on a radial spatial smooth
5006 /// must build a NON-periodic basis. The scalar-boolean shortcut used to emit
5007 /// `Some(vec![None; dim])`, which the 1-D radial builders route on via
5008 /// `spec.periodic.is_some()` (and the Duchon arm even back-fills the data
5009 /// range into a lone `None`), so `periodic=false` silently produced a
5010 /// *periodic* smooth — the opposite of what was asked. The spec's `periodic`
5011 /// field must be `None` for every radial base (matern / thinplate / duchon),
5012 /// matching the bracketed `[false]` form.
5013 #[test]
5014 fn scalar_periodic_false_builds_non_periodic_radial_smooth() {
5015 let n = 200usize;
5016 let rows: Vec<Vec<f64>> = (0..n)
5017 .map(|i| {
5018 let x = -3.0 + 6.0 * (i as f64) / ((n - 1) as f64);
5019 vec![x.sin(), x]
5020 })
5021 .collect();
5022 let ds = continuous_dataset(&["y", "x"], rows);
5023
5024 let build = |bs: &str| -> SmoothBasisSpec {
5025 let mut opts = BTreeMap::new();
5026 opts.insert("bs".to_string(), bs.to_string());
5027 opts.insert("periodic".to_string(), "false".to_string());
5028 let mut notes = Vec::new();
5029 build_smooth_basis(
5030 SmoothKind::S,
5031 &["x".to_string()],
5032 &[1],
5033 &opts,
5034 &ds,
5035 &mut notes,
5036 &ResourcePolicy::default_library(),
5037 1,
5038 )
5039 .unwrap_or_else(|e| panic!("s(x, bs={bs}, periodic=false) must be accepted: {e}"))
5040 };
5041
5042 match &build("gp") {
5043 SmoothBasisSpec::Matern { spec, .. } => assert!(
5044 spec.periodic.is_none(),
5045 "periodic=false must leave the matern spec non-periodic, got {:?}",
5046 spec.periodic
5047 ),
5048 other => panic!("expected Matern basis, got {other:?}"),
5049 }
5050 match &build("tp") {
5051 SmoothBasisSpec::ThinPlate { spec, .. } => assert!(
5052 spec.periodic.is_none(),
5053 "periodic=false must leave the thinplate spec non-periodic, got {:?}",
5054 spec.periodic
5055 ),
5056 other => panic!("expected ThinPlate basis, got {other:?}"),
5057 }
5058 match &build("duchon") {
5059 SmoothBasisSpec::Duchon { spec, .. } => assert!(
5060 spec.periodic.is_none(),
5061 "periodic=false must leave the duchon spec non-periodic (no data-range \
5062 back-fill), got {:?}",
5063 spec.periodic
5064 ),
5065 other => panic!("expected Duchon basis, got {other:?}"),
5066 }
5067 }
5068
5069 fn inferred_tensor_basis_product(ds: &Dataset) -> usize {
5070 let parsed = parse_formula("y ~ te(theta, h)").expect("parse tensor formula");
5071 let col_map = ds.column_map();
5072 let mut notes = Vec::new();
5073 let terms = build_termspec(
5074 &parsed.terms,
5075 ds,
5076 &col_map,
5077 &mut notes,
5078 &ResourcePolicy::default_library(),
5079 )
5080 .expect("build tensor termspec");
5081 let SmoothBasisSpec::TensorBSpline { spec, .. } = &terms.smooth_terms[0].basis else {
5082 panic!("expected tensor smooth");
5083 };
5084 spec.marginalspecs
5085 .iter()
5086 .map(|marginal| match marginal.knotspec {
5087 BSplineKnotSpec::Generate {
5088 num_internal_knots, ..
5089 } => num_internal_knots + marginal.degree + 1,
5090 BSplineKnotSpec::PeriodicUniform { num_basis, .. } => num_basis,
5091 BSplineKnotSpec::Automatic {
5092 num_internal_knots: Some(num_internal_knots),
5093 ..
5094 } => num_internal_knots + marginal.degree + 1,
5095 BSplineKnotSpec::Automatic {
5096 num_internal_knots: None,
5097 ..
5098 } => panic!("test helper cannot infer automatic knot count"),
5099 BSplineKnotSpec::Provided(ref knots) => {
5100 knots.len().saturating_sub(marginal.degree + 1)
5101 }
5102 // cr basis dimension equals the knot count (no degree offset).
5103 BSplineKnotSpec::NaturalCubicRegression { ref knots } => knots.len(),
5104 })
5105 .product()
5106 }
5107
5108 fn tensor_margin_basis_sizes(ds: &Dataset, formula: &str) -> Vec<usize> {
5109 let parsed = parse_formula(formula).expect("parse tensor formula");
5110 let col_map = ds.column_map();
5111 let mut notes = Vec::new();
5112 let terms = build_termspec(
5113 &parsed.terms,
5114 ds,
5115 &col_map,
5116 &mut notes,
5117 &ResourcePolicy::default_library(),
5118 )
5119 .expect("build tensor termspec");
5120 let SmoothBasisSpec::TensorBSpline { spec, .. } = &terms.smooth_terms[0].basis else {
5121 panic!("expected tensor smooth");
5122 };
5123 spec.marginalspecs
5124 .iter()
5125 .map(|marginal| match marginal.knotspec {
5126 BSplineKnotSpec::Generate {
5127 num_internal_knots, ..
5128 } => num_internal_knots + marginal.degree + 1,
5129 BSplineKnotSpec::PeriodicUniform { num_basis, .. } => num_basis,
5130 BSplineKnotSpec::Automatic {
5131 num_internal_knots: Some(num_internal_knots),
5132 ..
5133 } => num_internal_knots + marginal.degree + 1,
5134 BSplineKnotSpec::Automatic {
5135 num_internal_knots: None,
5136 ..
5137 } => panic!("test helper cannot infer automatic knot count"),
5138 BSplineKnotSpec::Provided(ref knots) => {
5139 knots.len().saturating_sub(marginal.degree + 1)
5140 }
5141 // cr basis dimension equals the knot count (no degree offset).
5142 BSplineKnotSpec::NaturalCubicRegression { ref knots } => knots.len(),
5143 })
5144 .collect()
5145 }
5146
5147 #[test]
5148 fn validate_known_options_lists_valid_option_names_for_unknown_parameter() {
5149 let mut options = BTreeMap::new();
5150 options.insert("lengt_scale".to_string(), "0.25".to_string());
5151 let err = validate_known_options(
5152 "matern",
5153 &options,
5154 &["type", "bs", "length_scale", "centers", "k", "nu"],
5155 )
5156 .expect_err("unknown smooth option should be rejected");
5157 assert!(
5158 err.contains("matern() does not accept option `lengt_scale`"),
5159 "error should name the invalid option, got: {err}"
5160 );
5161 assert!(
5162 err.contains("did you mean one of [length_scale]"),
5163 "error should suggest the closest valid option, got: {err}"
5164 );
5165 assert!(
5166 err.contains("Valid options: ["),
5167 "error should list valid option names, got: {err}"
5168 );
5169 }
5170
5171 #[test]
5172 fn tensor_k_accepts_square_bracket_per_margin_list() {
5173 let ds = continuous_dataset(
5174 &["y", "x", "z"],
5175 (0..40)
5176 .map(|i| {
5177 let x = i as f64 / 39.0;
5178 let z = ((i * 7) % 40) as f64 / 39.0;
5179 vec![x.sin() + z.cos(), x, z]
5180 })
5181 .collect(),
5182 );
5183
5184 assert_eq!(
5185 tensor_margin_basis_sizes(&ds, "y ~ te(x, z, k=[5, 6])"),
5186 vec![5, 6],
5187 "square-bracket k lists should materialize the requested per-margin values"
5188 );
5189 }
5190
5191 /// #1776 / #1752: a bare doubly-cyclic tensor `te(x, z, bs=c('cc','cc'))`
5192 /// with NO explicit `period=` must build — each cyclic margin wraps on its
5193 /// own observed `[min, max]` data span (mirroring mgcv's `bs="cc"` and the
5194 /// 1-D cyclic fallback), instead of hard-erroring "periodic but requires an
5195 /// explicit period". The periodic-radial refactor (c8c3192fa) replaced that
5196 /// fallback with an unconditional `period=`-required error and orphaned the
5197 /// `margin_is_cc` binding that drives it (the #1776 dead-binding `-D
5198 /// warnings` build break). This pins the restored data-range derivation so a
5199 /// regression that drops the `None if margin_is_cc` branch trips here, fast,
5200 /// with no fit/optimizer in the loop.
5201 #[test]
5202 fn bare_doubly_cyclic_tensor_derives_period_from_data_range_1776() {
5203 let ds = continuous_dataset(
5204 &["y", "x", "z"],
5205 (0..40)
5206 .map(|i| {
5207 let x = i as f64 / 39.0;
5208 let z = ((i * 7) % 40) as f64 / 39.0;
5209 vec![x.sin() + z.cos(), x, z]
5210 })
5211 .collect(),
5212 );
5213
5214 let parsed = parse_formula("y ~ te(x, z, bs=c('cc','cc'))")
5215 .expect("parse doubly-cyclic tensor formula");
5216 let col_map = ds.column_map();
5217 let mut notes = Vec::new();
5218 // Must NOT hard-error: the bare cyclic margins derive their period from
5219 // the observed data range (the restored #1752 fallback).
5220 let terms = build_termspec(
5221 &parsed.terms,
5222 &ds,
5223 &col_map,
5224 &mut notes,
5225 &ResourcePolicy::default_library(),
5226 )
5227 .expect(
5228 "bare cc-cc tensor must build via the data-range period fallback (#1776/#1752), \
5229 not hard-error on a missing explicit period",
5230 );
5231 let SmoothBasisSpec::TensorBSpline { spec, .. } = &terms.smooth_terms[0].basis else {
5232 panic!("expected tensor smooth");
5233 };
5234 assert_eq!(
5235 spec.marginalspecs.len(),
5236 2,
5237 "te(x, z) builds exactly two tensor margins"
5238 );
5239 for (axis, marginal) in spec.marginalspecs.iter().enumerate() {
5240 assert!(
5241 matches!(marginal.knotspec, BSplineKnotSpec::PeriodicUniform { .. }),
5242 "cyclic margin {axis} must build a periodic (wrapped) knotspec from the \
5243 data range, got {:?}",
5244 marginal.knotspec
5245 );
5246 }
5247 }
5248
5249 #[test]
5250 fn parse_cylinder_periodic_options_match_requested_forms() {
5251 let mut opts = BTreeMap::new();
5252 opts.insert("periodic".to_string(), "[0]".to_string());
5253 opts.insert("period".to_string(), "[2*pi, None]".to_string());
5254 let axes = parse_periodic_axes(&opts, 2).expect("axes");
5255 let periods = parse_periods(&opts, &axes).expect("periods");
5256 assert_eq!(axes, vec![true, false]);
5257 assert!((periods[0].unwrap() - 2.0 * std::f64::consts::PI).abs() < 1e-12);
5258 assert_eq!(periods[1], None);
5259
5260 let mut boundary_opts = BTreeMap::new();
5261 boundary_opts.insert(
5262 "boundary".to_string(),
5263 "['periodic', 'natural']".to_string(),
5264 );
5265 boundary_opts.insert("period".to_string(), "[2*pi, None]".to_string());
5266 let boundary_axes = parse_periodic_axes(&boundary_opts, 2).expect("boundary axes");
5267 let boundary_periods =
5268 parse_periods(&boundary_opts, &boundary_axes).expect("boundary periods");
5269 assert_eq!(boundary_axes, vec![true, false]);
5270 assert!((boundary_periods[0].unwrap() - 2.0 * std::f64::consts::PI).abs() < 1e-12);
5271 assert_eq!(boundary_periods[1], None);
5272
5273 let mut unicode_opts = BTreeMap::new();
5274 unicode_opts.insert("periodic".to_string(), "[0,1]".to_string());
5275 unicode_opts.insert("period".to_string(), "[2π, τ]".to_string());
5276 let unicode_axes = parse_periodic_axes(&unicode_opts, 2).expect("unicode axes");
5277 let unicode_periods = parse_periods(&unicode_opts, &unicode_axes).expect("unicode periods");
5278 assert_eq!(unicode_axes, vec![true, true]);
5279 assert!((unicode_periods[0].unwrap() - 2.0 * std::f64::consts::PI).abs() < 1e-12);
5280 assert!((unicode_periods[1].unwrap() - std::f64::consts::TAU).abs() < 1e-12);
5281 }
5282
5283 /// The tensor boundary-token guard must ACCEPT `clamped`/`open` (the
5284 /// B-spline-clamped, non-periodic margin spelling) alongside the periodic
5285 /// selectors and the other inert non-periodic markers, and still REJECT a
5286 /// genuine endpoint constraint like `anchored`. This locks the #415 /
5287 /// cylinder fix (`te(theta, z, boundary=['periodic','clamped'])`, mgcv
5288 /// `te(bs=c("cc","ps"))`) in the fast unit lane — the end-to-end cylinder
5289 /// recovery test is R-gated (`run_r` + mgcv), so without this the guard
5290 /// regressing back to rejecting `clamped` would slip through CPU CI.
5291 #[test]
5292 fn tensor_boundary_tokens_accept_clamped_open_reject_anchored() {
5293 fn boundary(raw: &str, dim: usize) -> Result<(), String> {
5294 let mut opts = BTreeMap::new();
5295 opts.insert("boundary".to_string(), raw.to_string());
5296 validate_tensor_boundary_tokens(&opts, dim)
5297 }
5298
5299 // Mixed periodic + clamped (the cylinder) and its bare/case/quote
5300 // variants are all accepted.
5301 for raw in [
5302 "['periodic', 'clamped']",
5303 "['periodic', 'open']",
5304 "['cc', 'clamped']",
5305 "['clamped', 'natural']",
5306 "[Periodic, CLAMPED]",
5307 "c('cc', 'clamped')", // mgcv-style c(...) vector form round-trips
5308 ] {
5309 assert!(
5310 boundary(raw, 2).is_ok(),
5311 "boundary={raw:?} must be accepted (clamped/open/inert non-periodic markers)"
5312 );
5313 }
5314
5315 // `bc=` is an accepted alias for `boundary=`.
5316 let mut bc_opts = BTreeMap::new();
5317 bc_opts.insert("bc".to_string(), "['periodic', 'clamped']".to_string());
5318 assert!(validate_tensor_boundary_tokens(&bc_opts, 2).is_ok());
5319
5320 // A genuine endpoint constraint has no ordinary-margin meaning on a
5321 // tensor and must still be surfaced as a clean unsupported-feature error
5322 // rather than silently dropped.
5323 let err = boundary("['periodic', 'anchored']", 2)
5324 .expect_err("anchored endpoint constraint must be rejected on a tensor margin");
5325 assert!(
5326 err.contains("anchored") && err.contains("not supported"),
5327 "rejection must name the offending token and be an unsupported-feature error: {err}"
5328 );
5329
5330 // Absent boundary/bc is a no-op success.
5331 assert!(validate_tensor_boundary_tokens(&BTreeMap::new(), 2).is_ok());
5332 }
5333
5334 #[test]
5335 fn parse_single_axis_periodic_zero_as_axis_not_false() {
5336 let mut opts = BTreeMap::new();
5337 opts.insert("periodic".to_string(), "[0]".to_string());
5338 opts.insert("period".to_string(), "2*pi".to_string());
5339 opts.insert("origin".to_string(), "0".to_string());
5340 let axes = parse_periodic_axes(&opts, 1).expect("axes");
5341 let periods = parse_periods(&opts, &axes).expect("periods");
5342 let origins = parse_period_origins(&opts, &axes).expect("origins");
5343 assert_eq!(axes, vec![true]);
5344 assert!((periods[0].unwrap() - 2.0 * std::f64::consts::PI).abs() < 1e-12);
5345 assert_eq!(origins[0], Some(0.0));
5346 }
5347
5348 #[test]
5349 fn one_dimensional_bspline_accepts_boundary_periodic() {
5350 let ds = continuous_dataset(
5351 &["y", "theta"],
5352 (0..16)
5353 .map(|i| {
5354 let theta = std::f64::consts::TAU * i as f64 / 16.0;
5355 vec![theta.sin(), theta]
5356 })
5357 .collect(),
5358 );
5359 let parsed = parse_formula("y ~ s(theta, boundary=periodic, period=2*pi, origin=0, k=8)")
5360 .expect("parse");
5361 let col_map = ds.column_map();
5362 let mut notes = Vec::new();
5363 let terms = build_termspec(
5364 &parsed.terms,
5365 &ds,
5366 &col_map,
5367 &mut notes,
5368 &gam_runtime::resource::ResourcePolicy::default_library(),
5369 )
5370 .expect("periodic boundary should build");
5371 let SmoothBasisSpec::BSpline1D { spec, .. } = &terms.smooth_terms[0].basis else {
5372 panic!("expected 1D B-spline");
5373 };
5374 assert!(matches!(
5375 &spec.knotspec,
5376 BSplineKnotSpec::PeriodicUniform {
5377 data_range,
5378 num_basis: 8
5379 } if *data_range == (0.0, std::f64::consts::TAU)
5380 ));
5381 }
5382
5383 #[test]
5384 fn univariate_smooth_accepts_mgcv_cubic_regression_aliases() {
5385 let ds = continuous_dataset(
5386 &["y", "x"],
5387 (0..32)
5388 .map(|i| {
5389 let x = i as f64 / 31.0;
5390 vec![x * x, x]
5391 })
5392 .collect(),
5393 );
5394 let col_map = ds.column_map();
5395
5396 for (selector, expect_double_penalty) in [("cr", false), ("cs", true)] {
5397 let formula = format!("y ~ s(x, bs='{selector}')");
5398 let parsed = parse_formula(&formula).expect("parse cr/cs smooth");
5399 let mut notes = Vec::new();
5400 let terms = build_termspec(
5401 &parsed.terms,
5402 &ds,
5403 &col_map,
5404 &mut notes,
5405 &gam_runtime::resource::ResourcePolicy::default_library(),
5406 )
5407 .unwrap_or_else(|err| panic!("bs='{selector}' must build a 1-D smooth, got: {err:?}"));
5408 let SmoothBasisSpec::BSpline1D { spec, .. } = &terms.smooth_terms[0].basis else {
5409 panic!(
5410 "bs='{selector}' must lower to a BSpline1D; got {:?}",
5411 terms.smooth_terms[0].basis
5412 );
5413 };
5414 assert_eq!(
5415 spec.double_penalty, expect_double_penalty,
5416 "bs='{selector}' must default double_penalty to mgcv's convention \
5417 (cr=no-shrinkage, cs=shrinkage); got double_penalty={}",
5418 spec.double_penalty
5419 );
5420 }
5421 }
5422
5423 #[test]
5424 fn univariate_ps_small_k_degree_reduces_through_build(/* gam#1130 */) {
5425 // mgcv accepts `s(x, bs="ps", k=3)` (and the default cubic-regression
5426 // `s(x, k=3)`) by silently reducing the cubic basis to a quadratic.
5427 // The univariate ps/bspline build path used to reject this with
5428 // "k too small for degree 3"; it must now lower to a degree-2 basis
5429 // with zero internal knots (num_basis = k = 3), matching the te(...)
5430 // margin behaviour fixed in b75f55a91. Verified across the ps alias
5431 // and the default (cr) selector that both route through
5432 // parse_ps_internal_knots.
5433 let ds = continuous_dataset(
5434 &["y", "x"],
5435 (0..32)
5436 .map(|i| {
5437 let x = i as f64 / 31.0;
5438 vec![x * x, x]
5439 })
5440 .collect(),
5441 );
5442 let col_map = ds.column_map();
5443
5444 for formula in ["y ~ s(x, bs='ps', k=3)", "y ~ s(x, k=3)"] {
5445 let parsed = parse_formula(formula).expect("parse small-k ps/cr smooth");
5446 let mut notes = Vec::new();
5447 let terms = build_termspec(
5448 &parsed.terms,
5449 &ds,
5450 &col_map,
5451 &mut notes,
5452 &gam_runtime::resource::ResourcePolicy::default_library(),
5453 )
5454 .unwrap_or_else(|err| {
5455 panic!("`{formula}` must degree-reduce, not error; got: {err:?}")
5456 });
5457 let SmoothBasisSpec::BSpline1D { spec, .. } = &terms.smooth_terms[0].basis else {
5458 panic!(
5459 "`{formula}` must lower to a BSpline1D; got {:?}",
5460 terms.smooth_terms[0].basis
5461 );
5462 };
5463 assert_eq!(
5464 spec.degree, 2,
5465 "`{formula}` must drop the cubic default to a quadratic basis"
5466 );
5467 let num_internal = match &spec.knotspec {
5468 BSplineKnotSpec::Generate {
5469 num_internal_knots, ..
5470 } => *num_internal_knots,
5471 BSplineKnotSpec::Automatic {
5472 num_internal_knots: Some(n),
5473 ..
5474 } => *n,
5475 other => panic!("`{formula}` unexpected knotspec: {other:?}"),
5476 };
5477 assert_eq!(
5478 num_internal, 0,
5479 "`{formula}` must have zero internal knots (num_basis = k = 3)"
5480 );
5481 // Resulting basis dimension is num_internal + degree + 1 = 3 = k.
5482 assert!(
5483 spec.penalty_order >= 1 && spec.penalty_order <= spec.degree,
5484 "`{formula}` penalty_order {} must satisfy 1 <= order <= degree={}",
5485 spec.penalty_order,
5486 spec.degree
5487 );
5488 }
5489 }
5490
5491 #[test]
5492 fn formula_shape_constraint_round_trips_and_rejects_bogus() {
5493 let ds = continuous_dataset(
5494 &["y", "x"],
5495 (0..32)
5496 .map(|i| {
5497 let x = i as f64 / 31.0;
5498 vec![x * x, x]
5499 })
5500 .collect(),
5501 );
5502 let col_map = ds.column_map();
5503
5504 let parsed =
5505 parse_formula("y ~ s(x, shape=monotone_increasing)").expect("parse monotone smooth");
5506 let mut notes = Vec::new();
5507 let terms = build_termspec(
5508 &parsed.terms,
5509 &ds,
5510 &col_map,
5511 &mut notes,
5512 &gam_runtime::resource::ResourcePolicy::default_library(),
5513 )
5514 .expect("monotone smooth should build");
5515 assert_eq!(
5516 terms.smooth_terms[0].shape,
5517 ShapeConstraint::MonotoneIncreasing
5518 );
5519
5520 let parsed_bad = parse_formula("y ~ s(x, shape=bogus)").expect("parse bogus shape");
5521 let mut notes_bad = Vec::new();
5522 let err = build_termspec(
5523 &parsed_bad.terms,
5524 &ds,
5525 &col_map,
5526 &mut notes_bad,
5527 &gam_runtime::resource::ResourcePolicy::default_library(),
5528 )
5529 .expect_err("bogus shape must error");
5530 assert!(
5531 format!("{err:?}").contains("unknown shape constraint"),
5532 "got: {err:?}"
5533 );
5534 }
5535
5536 #[test]
5537 fn default_sphere_smooth_uses_spherical_farthest_point_centers() {
5538 let ds = continuous_dataset(
5539 &["y", "lat", "lon"],
5540 (0..24)
5541 .map(|i| {
5542 let t = i as f64 / 24.0;
5543 let lat = -60.0 + 120.0 * t;
5544 let lon = -180.0 + 360.0 * ((7 * i) % 24) as f64 / 24.0;
5545 vec![lat.to_radians().sin(), lat, lon]
5546 })
5547 .collect(),
5548 );
5549 let parsed = parse_formula("y ~ sphere(lat, lon)").expect("parse");
5550 let col_map = ds.column_map();
5551 let mut notes = Vec::new();
5552 let terms = build_termspec(
5553 &parsed.terms,
5554 &ds,
5555 &col_map,
5556 &mut notes,
5557 &gam_runtime::resource::ResourcePolicy::default_library(),
5558 )
5559 .expect("build sphere termspec");
5560 let SmoothBasisSpec::Sphere { spec, .. } = &terms.smooth_terms[0].basis else {
5561 panic!("expected sphere term");
5562 };
5563 assert!(matches!(
5564 spec.center_strategy,
5565 CenterStrategy::FarthestPoint { .. }
5566 ));
5567 }
5568
5569 #[test]
5570 fn one_dimensional_duchon_defaults_to_scale_free_length_scale() {
5571 let ds = continuous_dataset(
5572 &["y", "x"],
5573 (0..32)
5574 .map(|i| {
5575 let x = i as f64 / 31.0;
5576 vec![(std::f64::consts::TAU * x).sin(), x]
5577 })
5578 .collect(),
5579 );
5580 let parsed = parse_formula("y ~ duchon(x)").expect("parse");
5581 let col_map = ds.column_map();
5582 let mut notes = Vec::new();
5583 let terms = build_termspec(
5584 &parsed.terms,
5585 &ds,
5586 &col_map,
5587 &mut notes,
5588 &gam_runtime::resource::ResourcePolicy::default_library(),
5589 )
5590 .expect("build default duchon termspec");
5591 let SmoothBasisSpec::Duchon { spec, .. } = &terms.smooth_terms[0].basis else {
5592 panic!("expected Duchon term");
5593 };
5594 assert_eq!(spec.length_scale, None);
5595 }
5596
5597 #[test]
5598 fn formula_duchon_default_does_not_enable_collocation_operators() {
5599 let ds = continuous_dataset(
5600 &["y", "x", "z"],
5601 (0..40)
5602 .map(|i| {
5603 let x = (i as f64 / 39.0).fract();
5604 let z = ((7 * i) as f64 / 39.0).fract();
5605 vec![x + z, x, z]
5606 })
5607 .collect(),
5608 );
5609 let parsed = parse_formula("y ~ duchon(x, z)").expect("parse");
5610 let col_map = ds.column_map();
5611 let mut notes = Vec::new();
5612 let terms = build_termspec(
5613 &parsed.terms,
5614 &ds,
5615 &col_map,
5616 &mut notes,
5617 &gam_runtime::resource::ResourcePolicy::default_library(),
5618 )
5619 .expect("build default 2D duchon termspec");
5620 let SmoothBasisSpec::Duchon { spec, .. } = &terms.smooth_terms[0].basis else {
5621 panic!("expected Duchon term");
5622 };
5623 assert!(matches!(
5624 spec.operator_penalties.mass,
5625 OperatorPenaltySpec::Disabled
5626 ));
5627 assert!(matches!(
5628 spec.operator_penalties.tension,
5629 OperatorPenaltySpec::Disabled
5630 ));
5631 assert!(matches!(
5632 spec.operator_penalties.stiffness,
5633 OperatorPenaltySpec::Disabled
5634 ));
5635 }
5636
5637 #[test]
5638 fn one_dimensional_duchon_length_scale_opts_into_hybrid_mode() {
5639 let ds = continuous_dataset(
5640 &["y", "x"],
5641 (0..32)
5642 .map(|i| {
5643 let x = i as f64 / 31.0;
5644 vec![(std::f64::consts::TAU * x).sin(), x]
5645 })
5646 .collect(),
5647 );
5648 let parsed = parse_formula("y ~ duchon(x, length_scale=0.25)").expect("parse");
5649 let col_map = ds.column_map();
5650 let mut notes = Vec::new();
5651 let terms = build_termspec(
5652 &parsed.terms,
5653 &ds,
5654 &col_map,
5655 &mut notes,
5656 &gam_runtime::resource::ResourcePolicy::default_library(),
5657 )
5658 .expect("build hybrid duchon termspec");
5659 let SmoothBasisSpec::Duchon { spec, .. } = &terms.smooth_terms[0].basis else {
5660 panic!("expected Duchon term");
5661 };
5662 assert_eq!(spec.length_scale, Some(0.25));
5663 }
5664
5665 #[test]
5666 fn multidimensional_duchon_default_uses_low_rank_mgcv_sized_basis() {
5667 let ds = continuous_dataset(
5668 &["y", "x1", "x2"],
5669 (0..500)
5670 .map(|i| {
5671 let x1 = 2.0 * (i as f64 / 499.0) - 1.0;
5672 let x2 = (((37 * i) % 500) as f64 / 499.0) * 2.0 - 1.0;
5673 vec![(2.0 * x1).sin() + (1.5 * x2).cos(), x1, x2]
5674 })
5675 .collect(),
5676 );
5677 let parsed = parse_formula("y ~ duchon(x1, x2)").expect("parse");
5678 let col_map = ds.column_map();
5679 let mut notes = Vec::new();
5680 let terms = build_termspec(
5681 &parsed.terms,
5682 &ds,
5683 &col_map,
5684 &mut notes,
5685 &gam_runtime::resource::ResourcePolicy::default_library(),
5686 )
5687 .expect("build default 2D duchon termspec");
5688 let SmoothBasisSpec::Duchon { spec, .. } = &terms.smooth_terms[0].basis else {
5689 panic!("expected Duchon term");
5690 };
5691 let CenterStrategy::Auto(inner) = &spec.center_strategy else {
5692 panic!("expected auto center strategy");
5693 };
5694 assert!(matches!(
5695 inner.as_ref(),
5696 CenterStrategy::FarthestPoint { num_centers: 30 }
5697 ));
5698 }
5699
#[test]
fn parse_matern_nu_accepts_equivalent_half_integer_forms() {
    // Each supported half-integer in several spellings (fraction, decimal,
    // whitespace- or zero-padded, the word "half") paired with the variant it
    // must resolve to.
    let spellings = [
        ("1/2", MaternNu::Half),
        ("  1 / 2  ", MaternNu::Half),
        (".5", MaternNu::Half),
        ("0.50", MaternNu::Half),
        ("half", MaternNu::Half),
        ("3 / 2", MaternNu::ThreeHalves),
        ("1.50", MaternNu::ThreeHalves),
        ("5 / 2", MaternNu::FiveHalves),
        ("2.500000000000", MaternNu::FiveHalves),
        ("7 / 2", MaternNu::SevenHalves),
        ("3.50", MaternNu::SevenHalves),
        ("9 / 2", MaternNu::NineHalves),
        ("4.50", MaternNu::NineHalves),
    ];
    for (raw, expected) in spellings {
        let parsed = parse_matern_nu(raw).expect(raw);
        // Variant-by-variant comparison via patterns, so the check does not
        // rely on an equality impl for `MaternNu`.
        let same_variant = matches!(
            (parsed, expected),
            (MaternNu::Half, MaternNu::Half)
                | (MaternNu::ThreeHalves, MaternNu::ThreeHalves)
                | (MaternNu::FiveHalves, MaternNu::FiveHalves)
                | (MaternNu::SevenHalves, MaternNu::SevenHalves)
                | (MaternNu::NineHalves, MaternNu::NineHalves)
        );
        assert!(
            same_variant,
            "parsed {raw:?} as {parsed:?}, expected {expected:?}"
        );
    }
}
5732
#[test]
fn parse_matern_nu_rejects_unsupported_or_invalid_values() {
    // Integers, an out-of-range half-integer, a division by zero, NaN, and a
    // non-numeric word: every one must be refused with the same diagnostic.
    const REJECTED: [&str; 6] = ["1", "2", "11/2", "1/0", "nan", "fast"];
    for raw in REJECTED {
        let err = parse_matern_nu(raw).expect_err(raw);
        let names_supported_set = err.contains("supported half-integer values");
        assert!(names_supported_set, "unexpected error for {raw:?}: {err}");
    }
}
5743
#[test]
fn parse_ps_k_promotes_underexpressive_cubic_basis() {
    // (requested `k`, `expect` label, internal-knot count that must come back)
    // for a cubic (`degree = 3`) request. The degree must stay 3 and the knot
    // count must be reported as user-specified (not inferred) in every case.
    let cases = [("4", "k=4", 2), ("6", "k=6", 2), ("10", "k=10", 6)];

    let mut opts = BTreeMap::new();
    for (k, label, expected_internal) in cases {
        // Re-inserting under the same key overwrites the previous request.
        opts.insert("k".to_string(), k.to_string());
        let (internal, inferred, eff_degree) =
            parse_ps_internal_knots(&opts, 3, 20).expect(label);
        assert_eq!(internal, expected_internal);
        assert_eq!(eff_degree, 3);
        assert!(!inferred);
    }
}
5765
#[test]
fn parse_ps_internal_knots_drops_degree_for_small_k() {
    // Options map carrying nothing but the requested basis size.
    let opts_with_k = |k: &str| {
        let mut opts = BTreeMap::new();
        opts.insert("k".to_string(), k.to_string());
        opts
    };

    // mgcv's `s(x, bs="ps", k=3)` with the default cubic basis silently
    // falls back to a quadratic (`degree=2`) marginal, so `k=3, degree=3`
    // must produce a quadratic basis with no internal knots
    // (`num_basis = k = 3`).
    let opts = opts_with_k("3");
    let (internal, inferred, eff_degree) = parse_ps_internal_knots(&opts, 3, 20).expect("k=3");
    assert_eq!(eff_degree, 2);
    assert_eq!(internal, 0);
    assert!(!inferred);

    // `k=2` falls back further to a linear (`degree=1`) marginal, the
    // smallest non-trivial spline basis.
    let opts = opts_with_k("2");
    let (internal, inferred, eff_degree) = parse_ps_internal_knots(&opts, 3, 20).expect("k=2");
    assert_eq!(eff_degree, 1);
    assert_eq!(internal, 0);
    assert!(!inferred);

    // Below two functions there is no B-spline basis left to fall back to,
    // so even the degree-reducing path must reject the request.
    let opts = opts_with_k("1");
    let err = parse_ps_internal_knots(&opts, 3, 20)
        .expect_err("k=1 is below the irreducible spline floor");
    assert!(err.contains("requires k >= 2"), "unexpected error: {err}");

    // A request that already satisfies `k >= degree+1` must come back with
    // its knot geometry untouched.
    let opts = opts_with_k("4");
    let (internal, inferred, eff_degree) = parse_ps_internal_knots(&opts, 3, 20).expect("k=4");
    assert_eq!(eff_degree, 3);
    assert_eq!(internal, 2);
    assert!(!inferred);
}
5803
#[test]
fn factor_smooth_marginal_degree_reduces_for_small_k() {
    let ds = factor_dataset();
    let col_map = ds.column_map();

    // (requested k, degree the `fs` marginal must fall back to)
    let cases: [(usize, usize); 2] = [(3, 2), (2, 1)];
    for (k, expected_degree) in cases {
        let formula = format!("y ~ s(x, g, bs=fs, k={k})");
        let parsed = parse_formula(&formula).expect("parse factor smooth");
        let mut notes = Vec::new();
        let built = build_termspec(
            &parsed.terms,
            &ds,
            &col_map,
            &mut notes,
            &gam_runtime::resource::ResourcePolicy::default_library(),
        );
        let terms = match built {
            Ok(terms) => terms,
            Err(err) => panic!("fs k={k} should degree-reduce, got: {err:?}"),
        };
        let spec = match &terms.smooth_terms[0].basis {
            SmoothBasisSpec::FactorSmooth { spec } => spec,
            other => panic!("expected factor smooth, got {:?}", other),
        };
        let marginal = &spec.marginal;

        assert_eq!(marginal.degree, expected_degree);
        assert!(
            marginal.penalty_order <= marginal.degree,
            "penalty_order {} must be clamped to degree {}",
            marginal.penalty_order,
            marginal.degree
        );

        // An open B-spline basis has `internal knots + degree + 1` functions;
        // that total must equal the requested `k`.
        let basis_size = match &marginal.knotspec {
            BSplineKnotSpec::Generate {
                num_internal_knots, ..
            }
            | BSplineKnotSpec::Automatic {
                num_internal_knots: Some(num_internal_knots),
                ..
            } => *num_internal_knots + marginal.degree + 1,
            other => panic!("unexpected factor-smooth knotspec: {other:?}"),
        };
        assert_eq!(basis_size, k);
    }
}
5847
5848 /// Build a dataset with a ternary continuous covariate `x ∈ {0,1,2}` and a
5849 /// 2-level categorical group `g`, for the low-cardinality cr-cap tests.
5850 fn ternary_factor_dataset() -> Dataset {
5851 let rows = (0..120)
5852 .map(|i| {
5853 let x = (i % 3) as f64;
5854 let g = (i % 2) as f64;
5855 vec![x + g, x, g]
5856 })
5857 .collect::<Vec<_>>();
5858 Dataset {
5859 headers: vec!["y".into(), "x".into(), "g".into()],
5860 values: Array2::from_shape_vec(
5861 (rows.len(), 3),
5862 rows.into_iter().flat_map(|row| row.into_iter()).collect(),
5863 )
5864 .expect("rectangular ternary factor test data"),
5865 schema: DataSchema {
5866 columns: vec![
5867 SchemaColumn {
5868 name: "y".into(),
5869 kind: ColumnKindTag::Continuous,
5870 levels: vec![],
5871 },
5872 SchemaColumn {
5873 name: "x".into(),
5874 kind: ColumnKindTag::Continuous,
5875 levels: vec![],
5876 },
5877 SchemaColumn {
5878 name: "g".into(),
5879 kind: ColumnKindTag::Categorical,
5880 levels: vec!["a".into(), "b".into()],
5881 },
5882 ],
5883 },
5884 column_kinds: vec![
5885 ColumnKindTag::Continuous,
5886 ColumnKindTag::Continuous,
5887 ColumnKindTag::Categorical,
5888 ],
5889 }
5890 }
5891
#[test]
fn univariate_cr_smooth_caps_knots_to_data_support() {
    // #1541: on a ternary covariate (3 distinct values) `s(x, bs=cr, k=10)`
    // must NOT hard-fail in cr-knot selection ("cubic regression spline with
    // k=10 requires at least 10 distinct values, got 3"). The cr basis is
    // instead capped to the data support — exactly 3 value-knots at {0,1,2} —
    // which is full-rank for the data and can still represent any 3 group
    // means.
    let rows = (0..90)
        .map(|i| {
            let level = (i % 3) as f64;
            vec![level, level]
        })
        .collect();
    let ds = continuous_dataset(&["y", "x"], rows);
    let col_map = ds.column_map();
    let parsed = parse_formula("y ~ s(x, bs=cr, k=10)").expect("parse cr smooth");
    let policy = gam_runtime::resource::ResourcePolicy::default_library();
    let mut notes = Vec::new();
    let terms = build_termspec(&parsed.terms, &ds, &col_map, &mut notes, &policy)
        .expect("cr k=10 must cap to data support instead of erroring");

    let spec = match &terms.smooth_terms[0].basis {
        SmoothBasisSpec::BSpline1D { spec, .. } => spec,
        _ => panic!("expected BSpline1D for s(x, bs=cr)"),
    };
    let knots = match &spec.knotspec {
        BSplineKnotSpec::NaturalCubicRegression { knots } => knots,
        other => panic!("expected cr knotspec, got {:?}", other),
    };

    // The knots are exactly the 3 distinct covariate values.
    assert_eq!(knots.len(), 3, "cr basis not capped to 3 distinct values");
    assert_eq!(knots.as_slice().unwrap(), &[0.0, 1.0, 2.0]);

    // The reduction must be reported to the user (mgcv warns in the same
    // situation).
    let cap_reported = notes.iter().any(|n| n.contains("data-support cap"));
    assert!(
        cap_reported,
        "cap not reported in inference notes: {notes:?}"
    );
}
5931
#[test]
fn univariate_cr_smooth_binary_covariate_degrades_to_bspline() {
    // #1541: a BINARY covariate offers only 2 distinct values, too few for
    // ANY cr spline (which needs >= 3). `s(x, bs=cr)` must therefore fall
    // back to a B-spline marginal — the default basis that already fits this
    // data — rather than hard-fail.
    let rows = (0..80)
        .map(|i| {
            let bit = (i % 2) as f64;
            vec![bit, bit]
        })
        .collect();
    let ds = continuous_dataset(&["y", "x"], rows);
    let col_map = ds.column_map();
    let parsed = parse_formula("y ~ s(x, bs=cr, k=10)").expect("parse cr smooth");
    let policy = gam_runtime::resource::ResourcePolicy::default_library();
    let mut notes = Vec::new();
    let terms = build_termspec(&parsed.terms, &ds, &col_map, &mut notes, &policy)
        .expect("binary cr must degrade to B-spline instead of erroring");

    let spec = match &terms.smooth_terms[0].basis {
        SmoothBasisSpec::BSpline1D { spec, .. } => spec,
        _ => panic!("expected BSpline1D for s(x, bs=cr)"),
    };

    // No cr knotspec may survive on a two-valued covariate.
    let built_cr = matches!(
        spec.knotspec,
        BSplineKnotSpec::NaturalCubicRegression { .. }
    );
    assert!(
        !built_cr,
        "binary covariate must NOT build a cr basis, got {:?}",
        spec.knotspec
    );

    // The fallback must be announced in the inference notes.
    let degradation_reported = notes
        .iter()
        .any(|n| n.contains("Degraded to the linear B-spline"));
    assert!(
        degradation_reported,
        "degradation not reported in inference notes: {notes:?}"
    );
}
5972
#[test]
fn sz_factor_smooth_low_cardinality_uses_bspline_marginal() {
    // #1605: the `sz` factor-smooth marginal is the SAME penalized B-spline
    // its `fs` sibling uses — NOT a natural cubic regression (`cr`) marginal,
    // whose hard natural boundary conditions f''=0 bias curved deviations (a
    // consistency failure). That subsumes #1542, the original reason for this
    // test: with a B-spline marginal a low-cardinality covariate needs no
    // special cr data-support cap and cannot hard-fail the way the old
    // cr-marginal `sz` spelling did. The build simply succeeds, as `fs`
    // already does on identical data.
    let ds = ternary_factor_dataset();
    let col_map = ds.column_map();
    let parsed = parse_formula("y ~ s(x, g, bs=sz, k=10)").expect("parse sz factor smooth");
    let policy = gam_runtime::resource::ResourcePolicy::default_library();
    let mut notes = Vec::new();
    let terms = build_termspec(&parsed.terms, &ds, &col_map, &mut notes, &policy)
        .expect("sz on a ternary covariate must build (B-spline marginal), not hard-fail");

    let spec = match &terms.smooth_terms[0].basis {
        SmoothBasisSpec::FactorSmooth { spec } => spec,
        _ => panic!("expected FactorSmooth for s(x, g, bs=sz)"),
    };
    let marginal_is_cr = matches!(
        spec.marginal.knotspec,
        BSplineKnotSpec::NaturalCubicRegression { .. }
    );
    assert!(
        !marginal_is_cr,
        "sz marginal must be a B-spline (curvature-capable), not the \
         natural-BC cr basis; got {:?}",
        spec.marginal.knotspec
    );
}
6008
6009 /// A dataset with a genuinely continuous covariate `x` (many distinct
6010 /// values) and a `L`-level grouping factor `g`, suitable for building a
6011 /// real factor-smooth marginal with a non-trivial {const, linear} null
6012 /// space. `y` is unused by the structural penalty checks below.
6013 fn continuous_x_factor_dataset(n: usize, n_groups: usize) -> Dataset {
6014 let rows = (0..n)
6015 .map(|i| {
6016 let x = i as f64 / (n as f64 - 1.0);
6017 let g = (i % n_groups) as f64;
6018 vec![x + g, x, g]
6019 })
6020 .collect::<Vec<_>>();
6021 let levels: Vec<String> = (0..n_groups).map(|k| format!("g{k}")).collect();
6022 Dataset {
6023 headers: vec!["y".into(), "x".into(), "g".into()],
6024 values: Array2::from_shape_vec(
6025 (rows.len(), 3),
6026 rows.into_iter().flat_map(|row| row.into_iter()).collect(),
6027 )
6028 .expect("rectangular continuous-x factor data"),
6029 schema: DataSchema {
6030 columns: vec![
6031 SchemaColumn {
6032 name: "y".into(),
6033 kind: ColumnKindTag::Continuous,
6034 levels: vec![],
6035 },
6036 SchemaColumn {
6037 name: "x".into(),
6038 kind: ColumnKindTag::Continuous,
6039 levels: vec![],
6040 },
6041 SchemaColumn {
6042 name: "g".into(),
6043 kind: ColumnKindTag::Categorical,
6044 levels,
6045 },
6046 ],
6047 },
6048 column_kinds: vec![
6049 ColumnKindTag::Continuous,
6050 ColumnKindTag::Continuous,
6051 ColumnKindTag::Categorical,
6052 ],
6053 }
6054 }
6055
6056 fn factor_smooth_spec_for(formula: &str, ds: &Dataset) -> FactorSmoothSpec {
6057 let col_map = ds.column_map();
6058 let parsed = parse_formula(formula).expect("parse factor smooth formula");
6059 let mut notes = Vec::new();
6060 let terms = build_termspec(
6061 &parsed.terms,
6062 ds,
6063 &col_map,
6064 &mut notes,
6065 &gam_runtime::resource::ResourcePolicy::default_library(),
6066 )
6067 .expect("build factor smooth term");
6068 let SmoothBasisSpec::FactorSmooth { spec } = &terms.smooth_terms[0].basis else {
6069 panic!("expected FactorSmooth basis for `{formula}`");
6070 };
6071 spec.clone()
6072 }
6073
/// #1605: the sum-to-zero factor smooth `s(x, g, bs="sz")` under-fit data
/// drawn from its own model class. Its deviation blocks carried ONLY the
/// marginal wiggliness penalty, leaving the {const, linear} null space of
/// every deviation curve completely unpenalized; the single combined
/// wiggliness λ could then not separate per-group intercept/slope variance
/// from curvature variance, and REML parked it over-smoothed (same defect
/// class as the closed #700, more severe). mgcv's `bs="fs"` sibling avoids
/// the gap with a SEPARATE per-null-dimension ridge (one λ each), the
/// double-penalty `I_L ⊗ S_j` structure. The fix gives `sz` the same
/// null-space-ridge structure, mapped into the zero-sum CONTRAST space so
/// that the constraint (and `sz`'s distinctness from `fs`) survives.
///
/// This test pins the structural defect: the `sz` deviation build must carry
/// MORE than its wiggliness penalty(s) — the pooled null-space ridges that
/// `fs` carries — while keeping the narrower `(L-1)·p` zero-sum design (NOT
/// the `L·p` full-rank `fs` design). Before the fix `sz` carried only the
/// wiggliness penalties and this test fails.
#[test]
fn sz_factor_smooth_carries_null_space_ridge_like_fs() {
    let ds = continuous_x_factor_dataset(180, 4);
    let mut workspace = crate::basis::BasisWorkspace::new();

    // Build both flavours on identical data with the same `k`.
    let sz_spec = factor_smooth_spec_for("y ~ s(x, g, bs=sz, k=8)", &ds);
    let sz_built = crate::smooth::build_factor_smooth(
        ds.values.view(),
        &sz_spec,
        "sz_term",
        &mut workspace,
    )
    .expect("build sz factor smooth");

    let fs_spec = factor_smooth_spec_for("y ~ s(x, g, bs=fs, k=8)", &ds);
    let fs_built = crate::smooth::build_factor_smooth(
        ds.values.view(),
        &fs_spec,
        "fs_term",
        &mut workspace,
    )
    .expect("build fs factor smooth");

    let sz_count = sz_built.penalties.len();
    let fs_count = fs_built.penalties.len();

    // Penalty structure (#1074 + #1605). `fs` is the exchangeable
    // random-effect smooth: all `L` level blocks share ONE wiggliness λ per
    // marginal penalty, plus one rank-1 null-space ridge per marginal null
    // direction (the #1605 double penalty). `sz` is the sum-to-zero factor
    // smooth, and mgcv's `smooth.construct.sz` emits ONE penalty matrix PER
    // LEVEL — `L` independent curvature smoothing parameters — so REML can
    // shrink a low-amplitude group's deviation hard while leaving a busy
    // group nearly unpenalized. We mirror that: the single marginal
    // wiggliness penalty is split into its `L` independent zero-sum-contrast
    // summands (`L-1` free per-group blocks `(e_k e_kᵀ)⊗S` plus the reference
    // coupling block `(11ᵀ)⊗S`), each with its own λ, while the null-space
    // ridges stay POOLED (the per-group intercept/slope shrinkage mgcv pools
    // under one variance even for `sz`).
    //
    // With `nw` marginal wiggliness penalties and `nn` marginal null
    // directions, fs therefore has `nw + nn` penalties and sz has
    // `L·nw + nn`: sz must carry strictly MORE penalties than fs, and the
    // surplus must be exactly `(L-1)·nw`.
    let n_levels = sz_spec
        .group_frozen_levels
        .as_ref()
        .map_or(4, |levels| levels.len());
    assert!(n_levels >= 3, "test needs >=3 groups, got {n_levels}");

    // The B-spline marginal has a single wiggliness penalty (one
    // difference/curvature operator), so `nw == 1` and the per-group split
    // adds exactly `(L-1)` penalties on top of fs's count.
    let nw = 1usize;
    let surplus = (n_levels - 1) * nw;
    let expected_sz = fs_count + surplus;
    assert_eq!(
        sz_count,
        expected_sz,
        "sz must split its wiggliness penalty per level (#1074): expected \
         fs_count {} + (L-1)·nw {} = {}, but sz had {}",
        fs_count,
        surplus,
        expected_sz,
        sz_count,
    );
    assert!(
        sz_count > fs_count,
        "sz must carry strictly more penalties than fs after the per-group \
         split (sz={}, fs={})",
        sz_count,
        fs_count,
    );

    // The null-space ridges must still be there (the #1605 property that
    // keeps the deviation curvature from being over-smoothed). Whatever is
    // left after the `L` per-group wiggliness blocks is the pooled null
    // ridges, and at least one must exist because a B-spline marginal has a
    // non-empty {const, linear} null space.
    let n_wiggliness = n_levels * nw;
    assert!(
        sz_count > n_wiggliness,
        "sz deviation block carries no null-space ridge (penalties={}, \
         wiggliness blocks={}); the null space is unpenalized and REML \
         over-smooths the deviations",
        sz_count,
        n_wiggliness,
    );

    // The zero-sum constraint must survive: the sz design stays the NARROWER
    // `(L-1)·p` contrast design, strictly narrower than the full-rank `L·p`
    // fs design. This guards against "fixing" sz by making it identical to
    // fs, which would break identifiability / sum-to-zero.
    assert!(
        sz_built.dim < fs_built.dim,
        "sz design width {} must be strictly less than fs width {} \
         (zero-sum contrast drops one level block)",
        sz_built.dim,
        fs_built.dim,
    );

    // Every per-penalty metadata vector must stay parallel to `penalties`
    // (a length invariant the downstream REML assembly relies on).
    assert_eq!(sz_built.penalties.len(), sz_built.nullspaces.len());
    assert_eq!(sz_built.penalties.len(), sz_built.penaltyinfo.len());
    assert_eq!(sz_built.penalties.len(), sz_built.null_eigenvectors.len());
}
6197
6198 /// #1457: `y ~ s(x, by=g) + g` with a BARE categorical `g` must NOT lower to
6199 /// two `g` design blocks. The bare `+ g` is auto-promoted to a single
6200 /// penalized random-effect block owning the factor's full level offsets; the
6201 /// `by=` branch must then recognize that owner and skip adding its own
6202 /// unpenalized treatment-coded main effect. Before the fix the dedup guard
6203 /// recognized only explicit `group(g)` (a `ParsedTerm::RandomEffect`), so the
6204 /// auto-promoted bare-`+ g` block slipped past and a spurious second `g`
6205 /// block (plus an extra smoothing parameter) was added. Assert exactly ONE
6206 /// `g` random/categorical block, and that adding the bare `+ g` introduces no
6207 /// extra `g` blocks beyond `y ~ s(x, by=g)` alone.
6208 fn factor_dataset_l3() -> Dataset {
6209 // `g` is categorical with THREE levels (encoded 0.0/1.0/2.0).
6210 let rows = (0..30)
6211 .map(|i| {
6212 let x = i as f64 / 29.0;
6213 let g = (i % 3) as f64;
6214 vec![x + g, x, g]
6215 })
6216 .collect::<Vec<_>>();
6217 Dataset {
6218 headers: vec!["y".into(), "x".into(), "g".into()],
6219 values: Array2::from_shape_vec(
6220 (rows.len(), 3),
6221 rows.into_iter().flat_map(|row| row.into_iter()).collect(),
6222 )
6223 .expect("rectangular L=3 factor test data"),
6224 schema: DataSchema {
6225 columns: vec![
6226 SchemaColumn {
6227 name: "y".into(),
6228 kind: ColumnKindTag::Continuous,
6229 levels: vec![],
6230 },
6231 SchemaColumn {
6232 name: "x".into(),
6233 kind: ColumnKindTag::Continuous,
6234 levels: vec![],
6235 },
6236 SchemaColumn {
6237 name: "g".into(),
6238 kind: ColumnKindTag::Categorical,
6239 levels: vec!["a".into(), "b".into(), "c".into()],
6240 },
6241 ],
6242 },
6243 column_kinds: vec![
6244 ColumnKindTag::Continuous,
6245 ColumnKindTag::Continuous,
6246 ColumnKindTag::Categorical,
6247 ],
6248 }
6249 }
6250
6251 #[test]
6252 fn factor_by_smooth_plus_bare_categorical_does_not_duplicate_factor_block() {
6253 let ds = factor_dataset_l3();
6254 let col_map = ds.column_map();
6255
6256 let g_blocks = |formula: &str| -> usize {
6257 let parsed = parse_formula(formula).expect("parse by-smooth formula");
6258 let mut notes = Vec::new();
6259 let terms = build_termspec(
6260 &parsed.terms,
6261 &ds,
6262 &col_map,
6263 &mut notes,
6264 &ResourcePolicy::default_library(),
6265 )
6266 .unwrap_or_else(|err| panic!("`{formula}` must build, got: {err:?}"));
6267 terms
6268 .random_effect_terms
6269 .iter()
6270 .filter(|rt| rt.name == "g")
6271 .count()
6272 };
6273
6274 // Baseline: the standalone factor-by smooth carries exactly ONE `g`
6275 // block (the unpenalized treatment-coded factor main effect added by the
6276 // `by=` branch).
6277 let by_only = g_blocks("y ~ s(x, by=g, k=10)");
6278 assert_eq!(
6279 by_only, 1,
6280 "`y ~ s(x, by=g)` must produce exactly one `g` design block"
6281 );
6282
6283 // The bug: adding a bare `+ g` (auto-promoted to a penalized random
6284 // block owning the same level offsets) must NOT introduce a second `g`
6285 // block. Before the fix this was 2.
6286 let by_plus_bare = g_blocks("y ~ s(x, by=g, k=10) + g");
6287 assert_eq!(
6288 by_plus_bare, 1,
6289 "`y ~ s(x, by=g) + g` must collapse to ONE `g` block (#1457): the bare \
6290 `+ g` already owns the factor's level offsets, so the `by=` branch \
6291 must not add a second, treatment-coded main effect"
6292 );
6293
6294 // The bare `+ g` adds no spurious extra `g` block versus the baseline.
6295 assert_eq!(
6296 by_plus_bare, by_only,
6297 "the bare `+ g` collision must add zero extra `g` blocks (#1457)"
6298 );
6299 }
6300
#[test]
fn parse_tensor_periods_and_origins_aliases() {
    // Both axes are declared periodic through `boundary`, with the plural
    // `periods` / `origins` spellings supplying the per-axis values.
    let opts: BTreeMap<String, String> = [
        ("boundary", "['periodic', 'periodic']"),
        ("periods", "[7, 24]"),
        ("origins", "[0, -12]"),
    ]
    .into_iter()
    .map(|(key, value)| (key.to_string(), value.to_string()))
    .collect();

    let axes = parse_periodic_axes(&opts, 2).expect("axes");
    let periods = parse_periods(&opts, &axes).expect("periods");
    let origins = parse_period_origins(&opts, &axes).expect("origins");

    assert_eq!(axes, vec![true, true]);
    assert_eq!(periods, vec![Some(7.0), Some(24.0)]);
    assert_eq!(origins, vec![Some(0.0), Some(-12.0)]);
}
6317
#[test]
fn tensor_smooth_honors_per_margin_k_list() {
    // 20 rows: an angle `theta` stepping around the circle and a height `h`
    // cycling through five evenly spaced values on [-1, 1].
    let rows = (0..20)
        .map(|i| {
            let theta = std::f64::consts::TAU * i as f64 / 20.0;
            let h = -1.0 + 2.0 * (i % 5) as f64 / 4.0;
            vec![theta.cos() + h, theta, h]
        })
        .collect();
    let ds = continuous_dataset(&["y", "theta", "h"], rows);

    let parsed = parse_formula(
        "y ~ te(theta, h, periodic=[0], period=[2*pi, None], origin=[0, None], k=[9,5])",
    )
    .expect("parse tensor formula");
    let col_map = ds.column_map();
    let policy = gam_runtime::resource::ResourcePolicy::default_library();
    let mut notes = Vec::new();
    let terms = build_termspec(&parsed.terms, &ds, &col_map, &mut notes, &policy)
        .expect("build tensor terms");

    let spec = match &terms.smooth_terms[0].basis {
        SmoothBasisSpec::TensorBSpline { spec, .. } => spec,
        _ => panic!("expected tensor B-spline"),
    };

    // Decode each margin's knotspec to the number of basis functions it
    // yields; the list-valued `k=[9,5]` must arrive per axis, in order.
    let dims: Vec<_> = spec
        .marginalspecs
        .iter()
        .map(|m| match &m.knotspec {
            BSplineKnotSpec::PeriodicUniform { num_basis, .. } => *num_basis,
            BSplineKnotSpec::Generate {
                num_internal_knots, ..
            } => *num_internal_knots + m.degree + 1,
            // The mgcv-default `cr` margin (#1074) has as many basis
            // functions as value-knots placed.
            BSplineKnotSpec::NaturalCubicRegression { knots } => knots.len(),
            _ => panic!("unexpected tensor marginal knotspec"),
        })
        .collect();
    assert_eq!(dims, vec![9, 5]);
}
6363
#[test]
fn tensor_smooth_honors_per_margin_k_axis_aliases() {
    // 12 rows on the unit interval: `x = t`, `y = 1 - t`, response `resp = t`.
    let rows = (0..12)
        .map(|i| {
            let t = i as f64 / 11.0;
            vec![t, t, 1.0 - t]
        })
        .collect();
    let ds = continuous_dataset(&["resp", "x", "y"], rows);

    // `k_x` / `k_y` name the margins by covariate instead of by position.
    let sizes = tensor_margin_basis_sizes(&ds, "resp ~ te(x, y, k_x=9, k_y=5)");
    assert_eq!(
        sizes,
        vec![9, 5],
        "k_<margin> aliases should materialize requested per-margin values"
    );
}
6381
#[test]
fn tensor_smooth_low_cardinality_axis_falls_back_to_lower_degree_basis() {
    // mgcv-style: `te(x, b, k=c(5, 2))` with a BINARY second margin (values
    // {0, 1} only) is a legitimate request, because the binary axis can hold
    // at most a 2-function linear basis. `k=2` must NOT be rejected with a
    // "k too small for degree 3" config error; the spline degree on the
    // binary axis drops to `k_axis - 1` (here 1, linear) while the continuous
    // margin keeps the requested degree=3, k=5.
    let rows = (0..40)
        .map(|i| {
            let x = i as f64 / 39.0;
            let b = (i % 2) as f64;
            vec![x.sin() + 0.5 * b, x, b]
        })
        .collect();
    let ds = continuous_dataset(&["y", "x", "b"], rows);

    let parsed = parse_formula("y ~ te(x, b, k=[5, 2])").expect("parse tensor with k=[5,2]");
    let col_map = ds.column_map();
    let policy = gam_runtime::resource::ResourcePolicy::default_library();
    let mut notes = Vec::new();
    let terms = build_termspec(&parsed.terms, &ds, &col_map, &mut notes, &policy)
        .expect("build tensor with binary margin");

    let spec = match &terms.smooth_terms[0].basis {
        SmoothBasisSpec::TensorBSpline { spec, .. } => spec,
        _ => panic!("expected tensor B-spline for te(x, b)"),
    };

    // The continuous margin keeps degree=3 and k=5; the binary margin drops
    // to degree=1 (linear), so the requested k=2 gives exactly two basis
    // functions before tensor-product identifiability is applied.
    let continuous = &spec.marginalspecs[0];
    let binary = &spec.marginalspecs[1];
    assert_eq!(continuous.degree, 3);
    assert_eq!(binary.degree, 1);
    assert!(
        (1..=binary.degree).contains(&binary.penalty_order),
        "binary margin penalty_order {} must satisfy 1 <= order <= degree={}",
        binary.penalty_order,
        binary.degree
    );

    // Number of basis functions a margin's knotspec yields.
    let basis_size = |m: &BSplineBasisSpec| match &m.knotspec {
        BSplineKnotSpec::PeriodicUniform { num_basis, .. } => *num_basis,
        BSplineKnotSpec::Generate {
            num_internal_knots, ..
        }
        | BSplineKnotSpec::Automatic {
            num_internal_knots: Some(num_internal_knots),
            ..
        } => *num_internal_knots + m.degree + 1,
        // The mgcv-default `cr` margin (#1074) has as many basis functions
        // as value-knots placed.
        BSplineKnotSpec::NaturalCubicRegression { knots } => knots.len(),
        _ => panic!("unexpected tensor marginal knotspec"),
    };
    assert_eq!(basis_size(continuous), 5);
    assert_eq!(basis_size(binary), 2);
}
6444
#[test]
fn tensor_smooth_uniform_k_is_capped_to_a_low_cardinality_margins_distinct_values() {
    // Regression: a SINGLE `k=5` applied to every axis of `te(x, b, k=5)`
    // with a BINARY second margin (`b ∈ {0, 1}`) must build a valid tensor
    // and NOT hard-fail in cr-knot selection ("cubic regression spline with
    // k=5 requires at least 5 distinct values, got 2"). mgcv caps a margin's
    // basis to its data support: the binary axis becomes the 2-function
    // (linear) margin while the continuous axis keeps the requested k=5.
    // This is the `te(age, badh, k=5)` real-data case that used to error.
    let rows = (0..40)
        .map(|i| {
            let x = i as f64 / 39.0;
            let b = (i % 2) as f64;
            vec![x.sin() + 0.5 * b, x, b]
        })
        .collect();
    let ds = continuous_dataset(&["y", "x", "b"], rows);

    let parsed = parse_formula("y ~ te(x, b, k=5)").expect("parse tensor with uniform k=5");
    let col_map = ds.column_map();
    let policy = gam_runtime::resource::ResourcePolicy::default_library();
    let mut notes = Vec::new();
    let terms = build_termspec(&parsed.terms, &ds, &col_map, &mut notes, &policy)
        .expect("uniform k=5 must auto-cap the binary margin instead of erroring");

    let spec = match &terms.smooth_terms[0].basis {
        SmoothBasisSpec::TensorBSpline { spec, .. } => spec,
        _ => panic!("expected tensor B-spline for te(x, b)"),
    };

    // Number of basis functions a margin's knotspec yields.
    let basis_size = |margin: &BSplineBasisSpec| match &margin.knotspec {
        BSplineKnotSpec::NaturalCubicRegression { knots } => knots.len(),
        BSplineKnotSpec::PeriodicUniform { num_basis, .. } => *num_basis,
        BSplineKnotSpec::Generate {
            num_internal_knots, ..
        }
        | BSplineKnotSpec::Automatic {
            num_internal_knots: Some(num_internal_knots),
            ..
        } => *num_internal_knots + margin.degree + 1,
        other => panic!("unexpected tensor marginal knotspec: {other:?}"),
    };

    // The binary margin shrinks to the 2-function linear basis its data
    // supports (k capped from 5 to 2, degree dropped to 1).
    let binary = &spec.marginalspecs[1];
    assert_eq!(basis_size(binary), 2);
    assert_eq!(binary.degree, 1);
    // The cap leaves the continuous margin alone (40 distinct values).
    assert_eq!(basis_size(&spec.marginalspecs[0]), 5);
}
6498
#[test]
fn tensor_all_tp_margins_with_per_margin_k_routes_to_bspline_tensor() {
    // `te(x1, x2, bs=c('tp','tp'), k=c(5,5))` is mgcv's per-margin tp tensor
    // with per-margin basis sizes: a tensor product of two 1-D bases of
    // dimension 5 each. `parse_tensor_k_list` honors the list-valued
    // `k=c(5,5)`, giving one penalized B-spline margin per axis (each
    // spanning the requested per-axis thin-plate function space). This is
    // the same anisotropic-tensor routing the scalar/no-`k` case takes: a
    // `te()` request is ALWAYS a tensor product, never a silent isotropic
    // thin-plate substitution.
    let rows = (0..32)
        .map(|i| {
            let t = i as f64 / 31.0;
            vec![t.sin(), t, 1.0 - t]
        })
        .collect();
    let ds = continuous_dataset(&["y", "x1", "x2"], rows);

    let parsed =
        parse_formula("y ~ te(x1, x2, bs=c('tp','tp'), k=c(5,5))").expect("parse tensor");
    let col_map = ds.column_map();
    let policy = gam_runtime::resource::ResourcePolicy::default_library();
    let mut notes = Vec::new();
    let terms = build_termspec(&parsed.terms, &ds, &col_map, &mut notes, &policy)
        .expect("build tensor terms with per-margin k");

    let spec = match &terms.smooth_terms[0].basis {
        SmoothBasisSpec::TensorBSpline { spec, .. } => spec,
        other => panic!(
            "expected B-spline tensor when k=c(5,5) is supplied with bs=c('tp','tp'), got {:?}",
            other
        ),
    };

    // Since #1074 a `tp` tensor margin (k >= 3) is realized as a
    // Lancaster–Salkauskas natural cubic-regression margin (cr basis
    // dimension == knot count) rather than an open `Generate` B-spline. It
    // is still a `TensorBSpline` spec with one penalized 1-D margin per
    // axis, so only the per-margin knotspec variant differs. The match below
    // names every variant instead of using a catch-all, decoding each to its
    // basis dimension (mirroring the `tensor_margin_basis_sizes` helper).
    let dims: Vec<_> = spec
        .marginalspecs
        .iter()
        .map(|m| {
            let order = m.degree + 1;
            match &m.knotspec {
                BSplineKnotSpec::Generate {
                    num_internal_knots, ..
                }
                | BSplineKnotSpec::Automatic {
                    num_internal_knots: Some(num_internal_knots),
                    ..
                } => *num_internal_knots + order,
                BSplineKnotSpec::PeriodicUniform { num_basis, .. } => *num_basis,
                BSplineKnotSpec::Provided(knots) => knots.len().saturating_sub(order),
                BSplineKnotSpec::NaturalCubicRegression { knots } => knots.len(),
                BSplineKnotSpec::Automatic {
                    num_internal_knots: None,
                    ..
                } => panic!("test cannot infer automatic knot count"),
            }
        })
        .collect();
    assert_eq!(dims, vec![5, 5]);
}
6567
#[test]
fn tensor_all_tp_margins_without_per_margin_k_builds_anisotropic_tensor() {
    // Requesting `te(x1, x2, bs=c('tp','tp'))` asks for a tensor product, so the
    // builder has to produce a real anisotropic tensor (one smoothing parameter
    // per margin). Quietly swapping in a multi-D isotropic thin-plate radial
    // smooth would fit a different model (`s(x1,x2,bs='tp')`). Routing no
    // longer depends on whether `k` is list-valued: every tp margin is realized
    // as a 1-D penalized B-spline margin spanning the same per-axis thin-plate
    // function space (#1082).
    let rows: Vec<Vec<f64>> = (0..32)
        .map(|i| {
            let t = i as f64 / 31.0;
            vec![t.sin(), t, 1.0 - t]
        })
        .collect();
    let ds = continuous_dataset(&["y", "x1", "x2"], rows);

    let formula = parse_formula("y ~ te(x1, x2, bs=c('tp','tp'))").expect("parse tensor");
    let columns = ds.column_map();
    let mut build_notes = Vec::new();
    let policy = gam_runtime::resource::ResourcePolicy::default_library();
    let termspec = build_termspec(&formula.terms, &ds, &columns, &mut build_notes, &policy)
        .expect("build tensor terms without per-margin k");

    // The first (and only) smooth must be the tensor variant; anything else is
    // reported together with the basis that was actually built.
    let tensor = match &termspec.smooth_terms[0].basis {
        SmoothBasisSpec::TensorBSpline { spec, .. } => spec,
        other => panic!(
            "te(...,bs=c('tp','tp')) must route to an anisotropic tensor product, not a \
             silent isotropic thin-plate substitution; got {:?}",
            other
        ),
    };
    assert_eq!(
        tensor.marginalspecs.len(),
        2,
        "tp tensor must carry one penalized B-spline margin per axis"
    );
}
6610
#[test]
fn explicit_basis_sizes_are_not_small_n_clamped() {
    // Twelve rows shared by five smooths is a small-n setting. The explicit
    // `k=10` on the first smooth must still be honored: the generated knot
    // spec is expected to carry 6 internal knots.
    let rows: Vec<Vec<f64>> = (0..12)
        .map(|i| {
            let x = i as f64 / 11.0;
            vec![x.sin(), x, x * x, x + 0.1, 1.0 - x, (2.0 * x).sin()]
        })
        .collect();
    let ds = continuous_dataset(&["y", "x1", "x2", "x3", "x4", "x5"], rows);

    let formula = parse_formula("y ~ s(x1, k=10) + s(x2) + s(x3) + s(x4) + s(x5)")
        .expect("parse multi-smooth formula");
    let columns = ds.column_map();
    let mut build_notes = Vec::new();
    let policy = gam_runtime::resource::ResourcePolicy::default_library();
    let termspec = build_termspec(&formula.terms, &ds, &columns, &mut build_notes, &policy)
        .expect("build multi-smooth terms");

    let spec = match &termspec.smooth_terms[0].basis {
        SmoothBasisSpec::BSpline1D { spec, .. } => spec,
        _ => panic!("expected first smooth to be B-spline"),
    };
    assert!(matches!(
        &spec.knotspec,
        BSplineKnotSpec::Generate {
            num_internal_knots: 6,
            ..
        }
    ));
}
6645
#[test]
fn explicit_duchon_centers_are_not_small_n_bumped() {
    let rows: Vec<Vec<f64>> = (0..12)
        .map(|i| {
            let x = i as f64 / 11.0;
            vec![x.sin(), x, x * x, x + 0.1, 1.0 - x, (2.0 * x).sin()]
        })
        .collect();
    let ds = continuous_dataset(&["y", "x1", "x2", "x3", "x4", "x5"], rows);

    // Why `centers=3`: a pure 1D Duchon at default options resolves its
    // nullspace to Linear (2s < d forces escalation), i.e. 2 polynomial
    // nullspace columns. The well-posedness gate demands
    // num_centers > polynomial_cols, making 3 the smallest valid request. That
    // is still far below the small-N bump target of polynomial_cols + 4 = 6,
    // so the test really does exercise the "explicit value is honored" path
    // named in the test title.
    let formula = parse_formula("y ~ duchon(x1, centers=3) + s(x2) + s(x3) + s(x4) + s(x5)")
        .expect("parse multi-smooth formula");
    let columns = ds.column_map();
    let mut build_notes = Vec::new();
    let policy = gam_runtime::resource::ResourcePolicy::default_library();
    let termspec = build_termspec(&formula.terms, &ds, &columns, &mut build_notes, &policy)
        .expect("build multi-smooth terms");

    let spec = match &termspec.smooth_terms[0].basis {
        SmoothBasisSpec::Duchon { spec, .. } => spec,
        _ => panic!("expected first smooth to be Duchon"),
    };
    assert!(matches!(
        spec.center_strategy,
        CenterStrategy::FarthestPoint { num_centers: 3 }
    ));
}
6683
#[test]
fn inferred_tensor_basis_cap_uses_coordinate_support_not_duplicate_rows() {
    // A 50 x 16 grid of distinct (theta, h) coordinates, with the response
    // y = cos(theta) + h stored in the first column.
    let unique_rows: Vec<Vec<f64>> = (0..50)
        .flat_map(|i| {
            let theta = i as f64 / 50.0;
            (0..16).map(move |j| {
                let h = -1.0 + 2.0 * (j as f64) / 15.0;
                let y = theta.cos() + h;
                vec![y, theta, h]
            })
        })
        .collect();
    // The same grid stacked 12 times: many more rows, no new coordinates.
    let repeated_rows: Vec<Vec<f64>> = (0..12)
        .flat_map(|_| unique_rows.iter().cloned())
        .collect();

    let unique = continuous_dataset(&["y", "theta", "h"], unique_rows);
    let repeated = continuous_dataset(&["y", "theta", "h"], repeated_rows);

    let unique_basis = inferred_tensor_basis_product(&unique);
    let repeated_basis = inferred_tensor_basis_product(&repeated);

    assert_eq!(
        unique_basis, repeated_basis,
        "duplicating existing tensor coordinates must not inflate inferred basis width"
    );
}
6711
#[test]
fn inferred_three_dim_tensor_basis_stays_bounded_for_reml_selection() {
    // Regression for gam#813: the inferred per-margin k must be
    // dimension-aware so the 3-D tensor width p = ∏ k_d does not explode.
    // With the old 1-D-per-margin rule a 3-D `te` defaulted to 7³=343 at
    // small n and 20³=8000 at larger n, making the (non-Kronecker-factorable)
    // full-tensor sum-to-zero penalty's O(p³) REML reparameterization a
    // multi-minute stall. The dimension-aware budget keeps the product near
    // mgcv's te default (≈5³=125) regardless of n.

    // Upper bound on the total tensor width ∏ k_d accepted below (216 = 6³).
    const MAX_TENSOR_WIDTH: usize = 216;

    // Builds an `n`-row dataset, constructs `y ~ te(x1, x2, x3)` and returns
    // the product of the per-margin basis sizes of the resulting tensor.
    let make = |n: usize| -> usize {
        let rows: Vec<Vec<f64>> = (0..n)
            .map(|i| {
                let f = i as f64 / n as f64;
                vec![f.sin(), f, (2.0 * f).cos(), (3.0 * f) % 1.0]
            })
            .collect();
        let ds = continuous_dataset(&["y", "x1", "x2", "x3"], rows);
        let parsed = parse_formula("y ~ te(x1, x2, x3)").expect("parse 3-D tensor");
        let col_map = ds.column_map();
        let mut notes = Vec::new();
        let terms = build_termspec(
            &parsed.terms,
            &ds,
            &col_map,
            &mut notes,
            &ResourcePolicy::default_library(),
        )
        .expect("build 3-D tensor termspec");
        let SmoothBasisSpec::TensorBSpline { spec, .. } = &terms.smooth_terms[0].basis else {
            panic!("expected tensor smooth");
        };
        spec.marginalspecs
            .iter()
            .map(|m| match m.knotspec {
                BSplineKnotSpec::Generate {
                    num_internal_knots, ..
                } => num_internal_knots + m.degree + 1,
                BSplineKnotSpec::Automatic {
                    num_internal_knots: Some(num_internal_knots),
                    ..
                } => num_internal_knots + m.degree + 1,
                // The mgcv-default `cr` margin (#1074) reports its basis size
                // as the number of value-knots placed.
                BSplineKnotSpec::NaturalCubicRegression { ref knots } => knots.len(),
                _ => panic!("unexpected tensor margin knotspec"),
            })
            .product()
    };

    // Small-n case, built here with n=60 rows. (The issue itself reported
    // n=30, where the old rule gave 7³=343.) Each width is computed once so a
    // failing assertion does not rebuild the whole term spec just to format
    // its message.
    let small_n_width = make(60);
    assert!(
        small_n_width <= MAX_TENSOR_WIDTH,
        "3-D te at small n must stay near the mgcv te default, got {}",
        small_n_width
    );
    // Larger n must NOT grow the product toward n³ (was 20³=8000).
    let large_n_width = make(2000);
    assert!(
        large_n_width <= MAX_TENSOR_WIDTH,
        "3-D te at large n must not blow ∏k toward the data size, got {}",
        large_n_width
    );
}
6773
#[test]
fn parse_bspline_boundary_conditions_and_side_selector() {
    // Local convenience: turn literal key/value pairs into the option map that
    // `parse_bspline_boundary_conditions` consumes.
    let options = |pairs: &[(&str, &str)]| -> BTreeMap<String, String> {
        pairs
            .iter()
            .map(|&(key, value)| (key.to_string(), value.to_string()))
            .collect()
    };

    // Case 1: a non-zero anchor is refused while parsing. The diagnostic has to
    // mention both the side and the value, which also proves that `side=left`
    // steers the global `anchor=` value to the left endpoint rather than the
    // right one.
    let left_opts = options(&[
        ("boundary_conditions", "anchored"),
        ("side", "left"),
        ("anchor", "2.5"),
    ]);
    let left_err = parse_bspline_boundary_conditions(&left_opts)
        .expect_err("non-zero left anchor must be rejected")
        .to_string();
    assert!(
        left_err.contains("left") && left_err.contains("2.5"),
        "rejection should name the affected side and value: {err}",
        err = left_err
    );

    // Case 2: the per-side aliases (`start_bc`/`end_bc`) combined with the
    // per-side anchor key (`right_anchor`) must land the value on the right
    // endpoint; again checked via the rejection text.
    let right_opts = options(&[
        ("start_bc", "clamped"),
        ("end_bc", "zero"),
        ("right_anchor", "-1.0"),
    ]);
    let right_err = parse_bspline_boundary_conditions(&right_opts)
        .expect_err("non-zero right anchor must be rejected")
        .to_string();
    assert!(
        right_err.contains("right") && right_err.contains("-1"),
        "rejection should name the affected side and value: {err}",
        err = right_err
    );

    // Case 3: without a non-zero anchor the configuration is accepted, so the
    // same aliases produce `Clamped` on the left and a zero-valued `Anchored`
    // on the right.
    let accepted_opts = options(&[("start_bc", "clamped"), ("end_bc", "zero")]);
    let parsed = parse_bspline_boundary_conditions(&accepted_opts).expect("boundary conditions");
    assert!(matches!(
        parsed.left,
        BSplineEndpointBoundaryCondition::Clamped
    ));
    assert!(matches!(
        parsed.right,
        BSplineEndpointBoundaryCondition::Anchored { value } if value.abs() < 1e-12
    ));
}
6823
#[test]
fn categorical_by_numeric_interaction_expands_treatment_coded_cells() {
    // NOTE: despite the historical test name, this pins DUMMY coding.
    //
    // `y ~ x:g` is an interaction-only numeric-by-factor model. With no `x`
    // main effect, the marginal parent that would identify a dropped reference
    // level is missing, so the expansion has to be marginality-aware
    // (gam#1158) and keep ALL levels of `g`: the "common intercept, separate
    // slopes" design with one x-slope column per group. Dropping the reference
    // level here would force the reference group's slope to zero, which is a
    // rank-deficient fit, and that is the regression this test protects
    // against. The treatment-coded path (with the `x` parent present) is
    // covered by
    // `categorical_by_numeric_interaction_keeps_treatment_coding_with_parent`.
    let ds = factor_dataset();
    // `g` is categorical with two levels (encoded 0.0 → "a", 1.0 → "b").
    let formula = parse_formula("y ~ x:g").expect("parse `y ~ x:g`");
    let columns = ds.column_map();
    let mut build_notes = Vec::new();
    let policy = ResourcePolicy::default_library();
    let termspec = build_termspec(&formula.terms, &ds, &columns, &mut build_notes, &policy)
        .expect("factor-aware `x:g` interaction must build, not error");

    assert_eq!(
        termspec.linear_terms.len(),
        2,
        "interaction-only `x:g` keeps ALL factor levels (full dummy coding): one slope column per group"
    );

    let x_col = columns.get("x").copied().expect("x column");
    let g_col = columns.get("g").copied().expect("g column");

    // Each level gate has to show up exactly once over the two cell columns,
    // and every cell multiplies in `x` as a product factor (g is never a raw
    // column).
    let mut seen_bits = std::collections::HashSet::new();
    for term in &termspec.linear_terms {
        assert!(
            term.is_interaction(),
            "the categorical-by-numeric cell is a Wilkinson-Rogers interaction"
        );
        assert_eq!(term.feature_cols, vec![x_col]);
        assert_eq!(term.categorical_levels.len(), 1);
        let (gate_col, gate_bits) = term.categorical_levels[0];
        assert_eq!(gate_col, g_col);
        assert!(seen_bits.insert(gate_bits), "each level appears once");

        // The realized column must equal `1[g == gate_bits] * x` on every row.
        let cell = term
            .realized_design_column(ds.values.view())
            .expect("realize cell column");
        let n_rows = ds.values.nrows();
        assert_eq!(cell.len(), n_rows);
        for row in 0..n_rows {
            let x = ds.values[[row, x_col]];
            let g = ds.values[[row, g_col]];
            let gate_open = g.to_bits() == gate_bits;
            let expected = if gate_open { x } else { 0.0 };
            assert!(
                (cell[row] - expected).abs() < 1e-12,
                "row {row}: g={g}, x={x}, expected {expected}, got {}",
                cell[row]
            );
        }
    }
    // The reference level "a" (0.0) survives alongside the non-reference level
    // "b" (1.0): nothing is dropped in the interaction-only form.
    assert!(seen_bits.contains(&0.0_f64.to_bits()));
    assert!(seen_bits.contains(&1.0_f64.to_bits()));
}
6895
#[test]
fn categorical_by_numeric_interaction_keeps_treatment_coding_with_parent() {
    // In `y ~ x + x:g` the `x` main effect IS present, so the marginal parent
    // that identifies a dropped reference level exists. `x:g` therefore keeps
    // its historical treatment coding: reference level "a" is dropped and only
    // the slope-deviation column for the non-reference level "b" is emitted.
    // This pins that the marginality-aware fix (gam#1158) leaves the
    // parent-present form untouched; it must remain column-space-identical to
    // mgcv's `x + x:g`.
    let ds = factor_dataset();
    let formula = parse_formula("y ~ x + x:g").expect("parse `y ~ x + x:g`");
    let columns = ds.column_map();
    let mut build_notes = Vec::new();
    let policy = ResourcePolicy::default_library();
    let termspec = build_termspec(&formula.terms, &ds, &columns, &mut build_notes, &policy)
        .expect("`x + x:g` must build");

    // Expected layout: one main-effect `x` column plus a single
    // treatment-coded interaction cell. Only the interaction cells are
    // inspected here.
    let x_col = columns.get("x").copied().expect("x column");
    let g_col = columns.get("g").copied().expect("g column");
    let mut interaction_cells = Vec::new();
    for candidate in &termspec.linear_terms {
        if candidate.is_interaction() {
            interaction_cells.push(candidate);
        }
    }
    assert_eq!(
        interaction_cells.len(),
        1,
        "with `x` present, `x:g` is treatment-coded → one cell (reference dropped)"
    );

    let cell = interaction_cells[0];
    assert_eq!(cell.feature_cols, vec![x_col]);
    assert_eq!(cell.categorical_levels.len(), 1);
    let (gate_col, gate_bits) = cell.categorical_levels[0];
    assert_eq!(gate_col, g_col);
    // Reference "a" (0.0) is the dropped level, so the surviving gate is "b" (1.0).
    assert_eq!(gate_bits, 1.0_f64.to_bits());
}
6939
#[test]
fn categorical_by_categorical_interaction_expands_full_cross_cells() {
    // `y ~ f:g` is an interaction-only factor-by-factor model. Neither `f` nor
    // `g` is a main effect, so no marginal parent exists and BOTH factors must
    // be dummy-coded (gam#1159). The right design is the saturated cell-means
    // model: the full cross of all levels (3 * 2 = 6 cells) minus ONE reference
    // cell (the lexicographically-first level of every factor, here f0:g0),
    // which the intercept absorbs. That gives 6-1 = 5 cell columns plus the
    // intercept, column-space-identical to `f*g`. The old behaviour
    // treatment-coded both factors, kept only (3-1)*(2-1) = 2 cells and folded
    // the rest into the intercept (a rank-deficient fit); this test guards
    // against that bug.
    let n = 30usize;

    // Row-major buffer of (y, f, g) triples: f cycles through 3 levels
    // (0,1,2), g through 2 levels (0,1).
    let mut flat = Vec::with_capacity(n * 3);
    for i in 0..n {
        let y = (i as f64).sin();
        let f = (i % 3) as f64;
        let g = (i % 2) as f64;
        flat.extend_from_slice(&[y, f, g]);
    }
    let values = Array2::from_shape_vec((n, 3), flat).expect("rectangular cross-factor data");

    let schema = DataSchema {
        columns: vec![
            SchemaColumn {
                name: "y".into(),
                kind: ColumnKindTag::Continuous,
                levels: vec![],
            },
            SchemaColumn {
                name: "f".into(),
                kind: ColumnKindTag::Categorical,
                levels: vec!["f0".into(), "f1".into(), "f2".into()],
            },
            SchemaColumn {
                name: "g".into(),
                kind: ColumnKindTag::Categorical,
                levels: vec!["g0".into(), "g1".into()],
            },
        ],
    };
    let ds = Dataset {
        headers: vec!["y".into(), "f".into(), "g".into()],
        values,
        schema,
        column_kinds: vec![
            ColumnKindTag::Continuous,
            ColumnKindTag::Categorical,
            ColumnKindTag::Categorical,
        ],
    };

    let formula = parse_formula("y ~ f:g").expect("parse `y ~ f:g`");
    let columns = ds.column_map();
    let mut build_notes = Vec::new();
    let policy = ResourcePolicy::default_library();
    let termspec = build_termspec(&formula.terms, &ds, &columns, &mut build_notes, &policy)
        .expect("factor-by-factor `f:g` interaction must build, not error");

    assert_eq!(
        termspec.linear_terms.len(),
        5,
        "saturated 3*2 = 6 cross cells minus one reference cell (f0:g0) = 5"
    );

    let f_col = columns.get("f").copied().expect("f column");
    let g_col = columns.get("g").copied().expect("g column");
    // The dropped reference cell combines each factor's
    // lexicographically-first level, f0 (0.0) with g0 (0.0). That cell must be
    // absent from the emitted terms while every other cross cell is present.
    let f0 = 0.0_f64.to_bits();
    let g0 = 0.0_f64.to_bits();
    let mut emitted = std::collections::HashSet::new();
    for term in &termspec.linear_terms {
        // With no numeric operand the realized column is a pure cell indicator.
        assert!(term.feature_cols.is_empty());
        assert_eq!(term.categorical_levels.len(), 2);
        // Column -> level-bits lookup for this cell's two gates.
        let gates: std::collections::HashMap<_, _> =
            term.categorical_levels.iter().copied().collect();
        let f_bits = gates.get(&f_col).copied().expect("f gate present");
        let g_bits = gates.get(&g_col).copied().expect("g gate present");
        // The reference cell f0:g0 must have been dropped.
        assert!(
            !(f_bits == f0 && g_bits == g0),
            "the reference cell f0:g0 must be absorbed by the intercept, not emitted"
        );
        emitted.insert((f_bits, g_bits));

        // The realized column is 1 on rows inside this cell and 0 elsewhere.
        let cell = term
            .realized_design_column(ds.values.view())
            .expect("realize cross cell");
        for row in 0..n {
            let f_here = ds.values[[row, f_col]].to_bits();
            let g_here = ds.values[[row, g_col]].to_bits();
            let inside_cell = f_here == f_bits && g_here == g_bits;
            let expected = if inside_cell { 1.0 } else { 0.0 };
            assert!(
                (cell[row] - expected).abs() < 1e-12,
                "row {row}: expected {expected}, got {}",
                cell[row]
            );
        }
        assert!(
            cell.iter().any(|&v| v == 1.0),
            "each cross cell must be observed in the data"
        );
    }

    // All 6 cross cells except f0:g0 must have been emitted. Combined with the
    // count of 5 asserted above, each one appears exactly once.
    let f_levels = [0.0_f64.to_bits(), 1.0_f64.to_bits(), 2.0_f64.to_bits()];
    let g_levels = [0.0_f64.to_bits(), 1.0_f64.to_bits()];
    for &fb in &f_levels {
        for &gb in &g_levels {
            let is_reference = fb == f0 && gb == g0;
            if !is_reference {
                assert!(
                    emitted.contains(&(fb, gb)),
                    "saturated cross cell must be present"
                );
            }
        }
    }
}
7075}