Skip to main content

gam_terms/
term_builder.rs

//! Term construction: bridge from parsed formula terms to `TermCollectionSpec`.
//!
//! This module takes the AST produced by `inference::formula_dsl` and a loaded
//! dataset, resolves column references, infers knot counts and center strategies,
//! and produces a `TermCollectionSpec` ready for `build_term_collection_design`.
6
7use std::collections::{BTreeMap, BTreeSet, HashMap};
8use std::path::PathBuf;
9
10use ndarray::{Array2, ArrayView1};
11
12use crate::basis::{
13    BSplineBasisSpec, BSplineBoundaryConditions, BSplineEndpointBoundaryCondition,
14    BSplineIdentifiability, BSplineKnotSpec, CenterCountRequest, CenterStrategy,
15    ConstantCurvatureBasisSpec, ConstantCurvatureIdentifiability, DuchonBasisSpec,
16    DuchonNullspaceOrder, DuchonOperatorPenaltySpec, MaternBasisSpec, MaternIdentifiability,
17    MaternNu, MeasureJetBasisSpec, MeasureJetIdentifiability, OneDimensionalBoundary,
18    SpatialIdentifiability, SphereMethod, SphereWahbaKernel, SphericalSplineBasisSpec,
19    SphericalSplineIdentifiability, ThinPlateBasisSpec, auto_spatial_center_strategy,
20    default_num_centers, default_spatial_center_strategy, default_spherical_harmonic_degree,
21    plan_spatial_basis, thin_plate_penalty_order,
22};
23use crate::inference::formula_dsl::{
24    ParsedTerm, SmoothKind, option_bool, option_f64, option_f64_strict, option_usize,
25    option_usize_any, option_usize_any_strict, option_usize_strict, strip_quotes,
26};
27use crate::smooth::{
28    BySmoothKind, ByVarKind, ByVariableSpec, FactorSmoothFlavour, FactorSmoothSpec,
29    LinearCoefficientGeometry, LinearTermSpec, RandomEffectTermSpec, ShapeConstraint,
30    SmoothBasisSpec, SmoothTermSpec, TensorBSplineIdentifiability,
31    TensorBSplinePenaltyDecomposition, TensorBSplineSpec, TermCollectionSpec,
32};
33use gam_data::{ColumnKindTag, DataError, EncodedDataset as Dataset};
34use gam_problem::types::ColIdx;
35use gam_runtime::resource::ResourcePolicy;
36
/// B-spline degree used when a smooth carries no `degree=` option. Degree 3
/// (cubic) is the conventional GAM choice: C² continuity from a small number
/// of knots.
const DEFAULT_BSPLINE_DEGREE: usize = 3;

/// Difference-penalty order used when a smooth carries neither
/// `penalty_order=` nor its alias `m=`. Order 2 penalizes curvature, which is
/// the usual P-spline convention.
const DEFAULT_PENALTY_ORDER: usize = 2;

/// Basis dimension used by default for one-dimensional cyclic cubic P-splines.
///
/// A periodic smooth has no free endpoints to spend coefficients on, so it
/// does not inherit the larger knot ceiling of open B-splines. The value is
/// only a default: `k=` requests a richer periodic space.
const CYCLIC_DEFAULT_BASIS_DIM: usize = 12;

/// Shared-marginal basis dimension used by default for `bs="fs"` / `bs="sz"`
/// factor smooths; mirrors mgcv's factor-smooth default of `k=10`. One
/// marginal is shared by every level, and a modest basis captures that shared
/// signal without fitting each group's within-group noise (gam#903). An
/// explicit `k`/`basis_dim` takes precedence.
const FACTOR_SMOOTH_DEFAULT_BASIS_DIM: usize = 10;

/// Row-chunk size for the out-of-core PCA-basis smooth when no `chunk_size=`
/// option is given. The design is streamed in row blocks so peak memory stays
/// bounded regardless of how many rows the dataset has.
const DEFAULT_PCA_CHUNK_SIZE: usize = 4_096;
65
66// ---------------------------------------------------------------------------
67// Typed errors
68// ---------------------------------------------------------------------------
69
/// Structured error type produced by the term-builder helpers.
///
/// Every variant except `ColumnNotFound` wraps a single pre-rendered `reason`
/// string; the `Display` impl prints that string verbatim so code that
/// string-matches on messages (tests, log assertions) keeps seeing the same
/// text. Public-API functions may still return `Result<_, String>` and convert
/// at their boundary with `.to_string()`.
#[derive(Debug, Clone)]
pub enum TermBuilderError {
    /// Internal column-resolution or column-kind lookup failure (column-kind
    /// table out of sync, alias map missing an entry, ...). A formula that
    /// names a column absent from the data is reported through
    /// `ColumnNotFound` instead, so the FFI boundary can raise a Python
    /// `ColumnNotFoundError` from structured fields rather than parsed prose.
    MissingColumn { reason: String },
    /// The formula referenced a column that the input data does not contain.
    /// The fields mirror `DataError::ColumnNotFound` one-for-one, so moving
    /// the payload between the two types needs no re-derivation or string
    /// parsing, and `Display` yields the same text as the data-layer error.
    ColumnNotFound {
        name: String,
        role: Option<String>,
        available: Vec<String>,
        similar: Vec<String>,
        tsv_hint: bool,
    },
    /// The user's configuration contradicts itself: too few variables for the
    /// smooth type, conflicting size options, a requested basis dimension
    /// smaller than the polynomial nullspace, and similar.
    IncompatibleConfig { reason: String },
    /// An option failed to parse: malformed numeric expression, unknown option
    /// key, out-of-range integer, list-length mismatch, and similar.
    InvalidOption { reason: String },
    /// The request names something deliberately unsupported: an unknown smooth
    /// type / method / kernel / identifiability, a non-zero anchor, an
    /// internal-only token, and similar.
    UnsupportedFeature { reason: String },
    /// The data cannot support the requested term (constant column,
    /// non-finite categorical entries, ...).
    DegenerateData { reason: String },
    /// A formula node that the caller should have resolved upstream reached
    /// the term-collection builder.
    MalformedFormula { reason: String },
}
114
115impl std::fmt::Display for TermBuilderError {
116    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
117        match self {
118            TermBuilderError::MissingColumn { reason }
119            | TermBuilderError::IncompatibleConfig { reason }
120            | TermBuilderError::InvalidOption { reason }
121            | TermBuilderError::UnsupportedFeature { reason }
122            | TermBuilderError::DegenerateData { reason }
123            | TermBuilderError::MalformedFormula { reason } => f.write_str(reason),
124            // Delegate to the canonical `DataError::ColumnNotFound` formatter
125            // so a single source of truth defines the human text. The
126            // intermediate `DataError` constructed here owns its strings only
127            // for the duration of the Display call — no allocation cost
128            // beyond the original payload that this variant already holds.
129            TermBuilderError::ColumnNotFound {
130                name,
131                role,
132                available,
133                similar,
134                tsv_hint,
135            } => {
136                let canonical = DataError::ColumnNotFound {
137                    name: name.clone(),
138                    role: role.clone(),
139                    available: available.clone(),
140                    similar: similar.clone(),
141                    tsv_hint: *tsv_hint,
142                };
143                std::fmt::Display::fmt(&canonical, f)
144            }
145        }
146    }
147}
148
149impl From<TermBuilderError> for String {
150    fn from(err: TermBuilderError) -> String {
151        err.to_string()
152    }
153}
154
155/// Catchall lift for the term-builder's internal `Result<_, String>` helpers
156/// (numeric expression parsing, option lookup, boundary-condition parsing,
157/// ...) that flow into `build_termspec` via `?`. Maps to
158/// `IncompatibleConfig`, which is the most appropriate generic bucket for
159/// option/config-style failures — leaf sites that emit structured payloads
160/// (`From<DataError>` for column-not-found) bypass this fallback.
161impl From<String> for TermBuilderError {
162    fn from(reason: String) -> Self {
163        Self::IncompatibleConfig { reason }
164    }
165}
166
167/// Typed lift from data-layer errors. `DataError::ColumnNotFound` becomes
168/// `TermBuilderError::ColumnNotFound` field-for-field — no stringification,
169/// no information loss — so the FFI boundary downstream can dispatch on
170/// the typed variant. Other `DataError` variants degrade into
171/// `MissingColumn` since they describe column-resolution-time failures
172/// without a dedicated structured destination.
173impl From<DataError> for TermBuilderError {
174    fn from(err: DataError) -> Self {
175        match err {
176            DataError::ColumnNotFound {
177                name,
178                role,
179                available,
180                similar,
181                tsv_hint,
182            } => Self::ColumnNotFound {
183                name,
184                role,
185                available,
186                similar,
187                tsv_hint,
188            },
189            DataError::SchemaMismatch { reason }
190            | DataError::ParseError { reason }
191            | DataError::EncodingFailure { reason }
192            | DataError::EmptyInput { reason }
193            | DataError::InvalidValue { reason } => Self::MissingColumn { reason },
194        }
195    }
196}
197
198// Constructor helpers — keep error-site code compact and consistent.
199impl TermBuilderError {
200    #[inline]
201    fn missing_column(reason: impl Into<String>) -> Self {
202        TermBuilderError::MissingColumn {
203            reason: reason.into(),
204        }
205    }
206    #[inline]
207    fn incompatible_config(reason: impl Into<String>) -> Self {
208        TermBuilderError::IncompatibleConfig {
209            reason: reason.into(),
210        }
211    }
212    #[inline]
213    fn invalid_option(reason: impl Into<String>) -> Self {
214        TermBuilderError::InvalidOption {
215            reason: reason.into(),
216        }
217    }
218    #[inline]
219    fn unsupported_feature(reason: impl Into<String>) -> Self {
220        TermBuilderError::UnsupportedFeature {
221            reason: reason.into(),
222        }
223    }
224    #[inline]
225    fn degenerate_data(reason: impl Into<String>) -> Self {
226        TermBuilderError::DegenerateData {
227            reason: reason.into(),
228        }
229    }
230    #[inline]
231    fn malformed_formula(reason: impl Into<String>) -> Self {
232        TermBuilderError::MalformedFormula {
233            reason: reason.into(),
234        }
235    }
236}
237
238// ---------------------------------------------------------------------------
239// Column resolution
240// ---------------------------------------------------------------------------
241
242/// Resolve a bare column name to its index, returning a typed
243/// `DataError::ColumnNotFound` on miss so the FFI boundary can surface a
244/// structured `gamfit.ColumnNotFoundError(column=…, available=…)` rather
245/// than rely on string-classification of human prose. Internal callers that
246/// still flow `Result<_, String>` get byte-identical text via
247/// `From<DataError> for String`.
248pub fn resolve_col(col_map: &HashMap<String, usize>, name: &str) -> Result<usize, DataError> {
249    col_map
250        .get(name)
251        .copied()
252        .ok_or_else(|| DataError::column_not_found(col_map, name, None))
253}
254
255/// Like `resolve_col` but tags the missing-column payload with a role label
256/// (`"response"`, `"entry"`, `"exit"`, `"event"`, `"z"`, `"id"`, …) so the
257/// boundary-side Python exception can disambiguate which formula slot held
258/// the bad reference.
259pub fn resolve_role_col(
260    col_map: &HashMap<String, usize>,
261    name: &str,
262    role: &str,
263) -> Result<usize, DataError> {
264    col_map
265        .get(name)
266        .copied()
267        .ok_or_else(|| DataError::column_not_found(col_map, name, Some(role)))
268}
269
270fn encoded_levels_for_column(ds: &Dataset, col: ColIdx) -> Vec<(u64, String)> {
271    let mut seen = BTreeSet::<u64>::new();
272    for value in ds.values.column(col.get()) {
273        if value.is_finite() {
274            seen.insert(value.to_bits());
275        }
276    }
277    let schema_levels = ds
278        .schema
279        .columns
280        .get(col.get())
281        .map(|column| column.levels.as_slice())
282        .unwrap_or(&[]);
283    seen.into_iter()
284        .enumerate()
285        .map(|(idx, bits)| {
286            let fallback = format!("level{}", idx + 1);
287            let label = schema_levels.get(idx).cloned().unwrap_or(fallback);
288            (bits, label)
289        })
290        .collect()
291}
292
/// Return a copy of `col_map` in which `alias` also resolves to the index of
/// `target_column`.
///
/// The alias is added only when `target_column` is present in `col_map` and
/// `alias` is not already a key; an existing entry named `alias` keeps its
/// original index. In every other case the result is a plain clone of
/// `col_map`. The input map is never modified.
pub fn column_map_with_alias(
    col_map: &HashMap<String, usize>,
    alias: &str,
    target_column: &str,
) -> HashMap<String, usize> {
    let mut extended = col_map.clone();
    match col_map.get(target_column) {
        Some(&target_index) if !extended.contains_key(alias) => {
            extended.insert(alias.to_owned(), target_index);
        }
        // Unknown target, or the alias name is already taken: leave as cloned.
        _ => {}
    }
    extended
}
304
305// ---------------------------------------------------------------------------
306// ParsedTerm[] + Dataset → TermCollectionSpec
307// ---------------------------------------------------------------------------
308
309pub fn build_termspec(
310    terms: &[ParsedTerm],
311    ds: &Dataset,
312    col_map: &HashMap<String, usize>,
313    inference_notes: &mut Vec<String>,
314    policy: &ResourcePolicy,
315) -> Result<TermCollectionSpec, TermBuilderError> {
316    let mut linear_terms = Vec::<LinearTermSpec>::new();
317    let mut random_terms = Vec::<RandomEffectTermSpec>::new();
318    let mut smooth_terms = Vec::<SmoothTermSpec>::new();
319    let smooth_coordinate_count = terms
320        .iter()
321        .map(|term| match term {
322            ParsedTerm::Smooth { vars, .. } => vars.len(),
323            _ => 0,
324        })
325        .sum::<usize>();
326
327    for t in terms {
328        match t {
329            ParsedTerm::Linear {
330                name,
331                explicit,
332                coefficient_min,
333                coefficient_max,
334            } => {
335                let col = resolve_col(col_map, name)?;
336                let auto_kind = ds.column_kinds.get(col).copied().ok_or_else(|| {
337                    TermBuilderError::missing_column(format!(
338                        "internal column-kind lookup failed for '{name}'"
339                    ))
340                    .to_string()
341                })?;
342                if *explicit {
343                    linear_terms.push(LinearTermSpec {
344                        name: name.clone(),
345                        feature_col: col,
346                        feature_cols: vec![col],
347                        categorical_levels: vec![],
348                        // Parametric linear terms are unpenalized by default
349                        // (MLE, matching mgcv/glm); see #749.
350                        double_penalty: false,
351                        coefficient_geometry: LinearCoefficientGeometry::Unconstrained,
352                        coefficient_min: *coefficient_min,
353                        coefficient_max: *coefficient_max,
354                    });
355                } else {
356                    match auto_kind {
357                        ColumnKindTag::Continuous | ColumnKindTag::Binary => {
358                            linear_terms.push(LinearTermSpec {
359                                name: name.clone(),
360                                feature_col: col,
361                                feature_cols: vec![col],
362                                categorical_levels: vec![],
363                                // Unpenalized parametric effect by default (#749).
364                                double_penalty: false,
365                                coefficient_geometry: LinearCoefficientGeometry::Unconstrained,
366                                coefficient_min: *coefficient_min,
367                                coefficient_max: *coefficient_max,
368                            });
369                        }
370                        ColumnKindTag::Categorical => {
371                            if coefficient_min.is_some() || coefficient_max.is_some() {
372                                return Err(TermBuilderError::incompatible_config(format!(
373                                    "coefficient constraints are not supported for categorical auto-random-effect term '{name}'; use group({name}) or an unconstrained numeric term"
374                                )));
375                            }
376                            random_terms.push(RandomEffectTermSpec {
377                                name: name.clone(),
378                                feature_col: col,
379                                drop_first_level: false,
380                                penalized: true,
381                                frozen_levels: None,
382                            });
383                        }
384                    }
385                }
386            }
387            ParsedTerm::BoundedLinear {
388                name,
389                min,
390                max,
391                prior,
392            } => {
393                let col = resolve_col(col_map, name)?;
394                let auto_kind = ds.column_kinds.get(col).copied().ok_or_else(|| {
395                    TermBuilderError::missing_column(format!(
396                        "internal column-kind lookup failed for '{name}'"
397                    ))
398                    .to_string()
399                })?;
400                if !matches!(auto_kind, ColumnKindTag::Continuous | ColumnKindTag::Binary) {
401                    return Err(TermBuilderError::incompatible_config(format!(
402                        "bounded() currently supports only numeric columns, got categorical '{name}'"
403                    )));
404                }
405                linear_terms.push(LinearTermSpec {
406                    name: name.clone(),
407                    feature_col: col,
408                    feature_cols: vec![col],
409                    categorical_levels: vec![],
410                    double_penalty: false,
411                    coefficient_geometry: LinearCoefficientGeometry::Bounded {
412                        min: *min,
413                        max: *max,
414                        prior: prior.clone(),
415                    },
416                    coefficient_min: None,
417                    coefficient_max: None,
418                });
419            }
420            ParsedTerm::RandomEffect { name } => {
421                let col = resolve_col(col_map, name)?;
422                random_terms.push(RandomEffectTermSpec {
423                    name: name.clone(),
424                    feature_col: col,
425                    drop_first_level: false,
426                    penalized: true,
427                    frozen_levels: None,
428                });
429            }
430            ParsedTerm::Smooth {
431                label,
432                vars,
433                kind,
434                options,
435            } => {
436                let smooth_vars = vars.clone();
437                let by_name = options.get("by").cloned();
438                // `bs="sz"` (sum-to-zero), like `bs="fs"`/`bs="re"`, is a
439                // factor-smooth family handled natively by `build_smooth_basis`'s
440                // fs/sz/re path: it detects the categorical factor among the
441                // variables and emits a `SmoothBasisSpec::FactorSmooth { Sz }`
442                // with the correct single-penalty marginal and modest default
443                // basis. Route sz straight through `build_smooth_basis` rather
444                // than intercepting it into a legacy `FactorSumToZero` envelope
445                // here (which left `sz(fac, x)` mis-typed as `FactorSumToZero`
446                // instead of the expected `FactorSmooth { Sz }`).
447                let cols = smooth_vars
448                    .iter()
449                    .map(|v| resolve_col(col_map, v))
450                    .collect::<Result<Vec<_>, _>>()?;
451                let mut inner_options = options.clone();
452                inner_options.remove("by");
453                // `ordered=` is consumed here (ByVarKind::Factor routing) and
454                // must not propagate to the inner basis builder, which has no
455                // allow-list entry for it and would reject it as an unknown option.
456                inner_options.remove("ordered");
457                // Pop the shape constraint before `build_smooth_basis` runs so
458                // it never reaches the per-kind `validate_known_options`
459                // allow-lists (the constraint is a property of the smooth term,
460                // not of any one basis kind). Basis-incompatible requests still
461                // fail loudly downstream via `shape_supports_basis`.
462                let shape = match inner_options.remove("shape") {
463                    None => ShapeConstraint::None,
464                    Some(raw) => crate::smooth::parse_shape_constraint(&raw)
465                        .map_err(TermBuilderError::invalid_option)?,
466                };
467                let inner_basis = build_smooth_basis(
468                    *kind,
469                    &smooth_vars,
470                    &cols,
471                    &inner_options,
472                    ds,
473                    inference_notes,
474                    policy,
475                    smooth_coordinate_count,
476                )?;
477                let inner_basis = match inner_basis {
478                    SmoothBasisSpec::FactorSmooth {
479                        spec:
480                            FactorSmoothSpec {
481                                continuous_cols,
482                                group_col,
483                                marginal,
484                                flavour: FactorSmoothFlavour::Sz,
485                                frozen_global_orthogonality,
486                                ..
487                            },
488                    } => {
489                        if continuous_cols.len() != 1 {
490                            return Err(TermBuilderError::incompatible_config(format!(
491                                "sz factor-smooth currently expects exactly one continuous covariate, found {}",
492                                continuous_cols.len()
493                            )));
494                        }
495                        SmoothBasisSpec::FactorSumToZero {
496                            inner: Box::new(SmoothBasisSpec::BSpline1D {
497                                feature_col: continuous_cols[0],
498                                spec: marginal,
499                            }),
500                            by_col: group_col,
501                            levels: encoded_levels_for_column(ds, ColIdx::new(group_col))
502                                .into_iter()
503                                .map(|(bits, _)| bits)
504                                .collect(),
505                            frozen_global_orthogonality,
506                        }
507                    }
508                    other => other,
509                };
510                if let Some(by_name) = by_name {
511                    let by_col = resolve_col(col_map, &by_name)?;
512                    match ds.column_kinds.get(by_col).copied().ok_or_else(|| {
513                        format!("internal column-kind lookup failed for by variable '{by_name}'")
514                    })? {
515                        ColumnKindTag::Categorical => {
516                            let levels = encoded_levels_for_column(ds, ColIdx::new(by_col));
517                            // A penalized random block for this factor already
518                            // owns its full level offsets when EITHER an explicit
519                            // `group(factor)` appears, OR a *bare* categorical
520                            // `+ factor` does — the latter is auto-promoted to a
521                            // penalized random-effect block (see the
522                            // `ParsedTerm::Linear` / `ColumnKindTag::Categorical`
523                            // arm above, `penalized: true`). Both representations
524                            // carry the same per-level offsets, so #1457: the
525                            // `by=` branch must NOT additionally add its own
526                            // unpenalized treatment-coded main effect, which would
527                            // double-represent the factor (two `g` design blocks +
528                            // a spurious extra smoothing parameter).
529                            let penalized_group_owner_present =
530                                terms.iter().any(|other| match other {
531                                    ParsedTerm::RandomEffect { name } => name == &by_name,
532                                    ParsedTerm::Linear {
533                                        name,
534                                        explicit: false,
535                                        ..
536                                    } if name == &by_name => col_map
537                                        .get(name)
538                                        .and_then(|c| ds.column_kinds.get(*c).copied())
539                                        .map(|kind| matches!(kind, ColumnKindTag::Categorical))
540                                        .unwrap_or(false),
541                                    _ => false,
542                                });
543                            // Add an unpenalized treatment-coded fixed main
544                            // effect for a standalone factor-by smooth, unless
545                            // the same factor already has an explicit
546                            // `group(factor)` term OR a bare categorical `+
547                            // factor` that was auto-promoted to a penalized
548                            // random block (#1457).  In those mixed-model forms
549                            // the penalized random intercept is the coherent
550                            // owner of level offsets; adding a no-pooling fixed
551                            // factor effect would bypass random-effect
552                            // shrinkage and degrade BLUP-style predictions.
553                            if !random_terms.iter().any(|rt| rt.name == by_name)
554                                && !penalized_group_owner_present
555                            {
556                                random_terms.push(RandomEffectTermSpec {
557                                    name: by_name.clone(),
558                                    feature_col: by_col,
559                                    drop_first_level: true,
560                                    penalized: false,
561                                    frozen_levels: None,
562                                });
563                            }
564                            // Unordered factor-by smooths are independent
565                            // level-specific smooths. Preserve that
566                            // term-spec structure explicitly so later
567                            // hierarchy/identifiability passes can see the
568                            // per-level ownership rather than a generic
569                            // BySmooth envelope.
570                            for (level_bits, level_label) in levels {
571                                smooth_terms.push(SmoothTermSpec {
572                                    name: format!("{label}:by={by_name}[{level_label}]"),
573                                    basis: SmoothBasisSpec::ByVariable {
574                                        inner: Box::new(inner_basis.clone()),
575                                        by_col,
576                                        kind: BySmoothKind::Level { level_bits },
577                                        by: ByVariableSpec::Level {
578                                            value_bits: level_bits,
579                                            label: level_label,
580                                        },
581                                    },
582                                    shape: shape.clone(),
583                                    joint_null_rotation: None,
584                                });
585                            }
586                        }
587                        ColumnKindTag::Binary | ColumnKindTag::Continuous => {
588                            smooth_terms.push(SmoothTermSpec {
589                                name: label.clone(),
590                                basis: SmoothBasisSpec::ByVariable {
591                                    inner: Box::new(inner_basis),
592                                    by_col,
593                                    kind: BySmoothKind::Numeric,
594                                    by: ByVariableSpec::Numeric,
595                                },
596                                shape,
597                                joint_null_rotation: None,
598                            });
599                        }
600                    }
601                } else {
602                    smooth_terms.push(SmoothTermSpec {
603                        name: label.clone(),
604                        basis: inner_basis,
605                        shape,
606                        joint_null_rotation: None,
607                    });
608                }
609            }
610            ParsedTerm::LinkWiggle { .. }
611            | ParsedTerm::TimeWiggle { .. }
612            | ParsedTerm::LinkConfig { .. }
613            | ParsedTerm::SurvivalConfig { .. } => {
614                // Consumed at formula level, not design terms.
615            }
616            ParsedTerm::LogSlopeSurface { .. } => {
617                return Err(TermBuilderError::malformed_formula(
618                    "logslope(...) declarations must be resolved by the marginal-slope formula path before building a term spec",
619                ));
620            }
621            ParsedTerm::Interaction { vars } => {
622                // A linear `:` interaction realizes one design column equal to
623                // the elementwise product of its operands. Numeric (continuous/
624                // binary) operands multiply directly; a categorical operand is
625                // a factor, so the product is expanded factor-aware: one design
626                // column per surviving cell of the factor(s), each an indicator
627                // `1[factor == level]` gating the numeric product.
628                //
629                // Coding is MARGINALITY-AWARE (gam#1158, gam#1159). A categorical
630                // operand `g` is treatment-coded (its lexicographically first
631                // reference level dropped) ONLY when the lower-order term obtained
632                // by removing `g` from this interaction is also present in the
633                // model — that lower-order term is what makes the dropped level
634                // identifiable, exactly mgcv's marginality rule. When that parent
635                // is ABSENT (the interaction-only form), dropping the reference
636                // level instead pins a group to the reference fit (a rank-deficient
637                // design), so we keep ALL levels (full dummy coding) and rely on a
638                // single intercept cell-drop below for identifiability:
639                //   * `y ~ x:g` with no `x` main effect → "common intercept,
640                //     separate slopes": every group keeps its own x-slope.
641                //   * `y ~ g:h` with no `g`/`h` main effects → the saturated
642                //     cell-means model: full cross of all levels minus one
643                //     reference cell absorbed by the intercept.
644                // When the parents ARE present (`x + x:g`, or `g*h` = `g + h +
645                // g:h`), the historical treatment coding is preserved so those
646                // forms stay correct.
647                //
648                // A main effect for var V is a `Linear`/`BoundedLinear`/
649                // `RandomEffect` ParsedTerm whose referenced name is V (an
650                // auto-detected categorical `Linear` becomes a RandomEffect main
651                // effect; either spelling counts). We only treat such standalone
652                // main-effect terms as parents — not V appearing inside another
653                // interaction.
654                let main_effect_present = |target: &str| -> bool {
655                    terms.iter().any(|other| match other {
656                        ParsedTerm::Linear { name, .. }
657                        | ParsedTerm::BoundedLinear { name, .. }
658                        | ParsedTerm::RandomEffect { name } => name == target,
659                        _ => false,
660                    })
661                };
662                // The lower-order parent of dropping operand `drop_var` from this
663                // interaction is present iff EVERY other operand is a main effect.
664                // For the two cases we care about (`x:g`, `g:h`) the interaction
665                // has two operands, so this reduces to "is the single remaining
666                // operand a main effect"; the general form handles any arity.
667                let parent_present = |drop_var: &str| -> bool {
668                    vars.iter()
669                        .filter(|v| v.as_str() != drop_var)
670                        .all(|v| main_effect_present(v))
671                };
672
673                let mut numeric_cols = Vec::<usize>::new();
674                // Per categorical operand: (var name, col, kept levels, was the
675                // reference level dropped / treatment-coded?).
676                let mut categorical_factors =
677                    Vec::<(String, usize, Vec<(u64, String)>, bool)>::new();
678                for var in vars {
679                    let col = resolve_col(col_map, var)?;
680                    let kind = ds.column_kinds.get(col).copied().ok_or_else(|| {
681                        TermBuilderError::missing_column(format!(
682                            "internal column-kind lookup failed for '{var}'"
683                        ))
684                        .to_string()
685                    })?;
686                    match kind {
687                        ColumnKindTag::Continuous | ColumnKindTag::Binary => numeric_cols.push(col),
688                        ColumnKindTag::Categorical => {
689                            let mut levels = encoded_levels_for_column(ds, ColIdx::new(col));
690                            // Treatment-code (drop the reference level) only when
691                            // the marginal parent that identifies it is present;
692                            // otherwise keep every level (full dummy coding).
693                            let treatment_coded = parent_present(var);
694                            if treatment_coded && levels.len() > 1 {
695                                levels.remove(0);
696                            }
697                            if levels.is_empty() {
698                                return Err(TermBuilderError::incompatible_config(format!(
699                                    "interaction `{}` references categorical column `{var}` with no usable levels",
700                                    vars.join(":")
701                                )));
702                            }
703                            categorical_factors.push((var.clone(), col, levels, treatment_coded));
704                        }
705                    }
706                }
707
708                let label = vars.join(":");
709
710                if categorical_factors.is_empty() {
711                    // Pure numeric `:` interaction — single product column,
712                    // identical to the historical behaviour.
713                    linear_terms.push(LinearTermSpec {
714                        name: label,
715                        feature_col: numeric_cols[0],
716                        feature_cols: numeric_cols,
717                        categorical_levels: vec![],
718                        // Parametric `:` interaction column is unpenalized by
719                        // default, same as any other linear term (#749).
720                        double_penalty: false,
721                        coefficient_geometry: LinearCoefficientGeometry::Unconstrained,
722                        coefficient_min: None,
723                        coefficient_max: None,
724                    });
725                    inference_notes.push(format!(
726                        "wired linear interaction `{}` as product of numeric columns",
727                        vars.join(":")
728                    ));
729                } else {
730                    // Factor-aware expansion: cartesian product over the kept
731                    // levels of every categorical operand. Each cell yields one
732                    // column gating the numeric product (or, with no numeric
733                    // operand, a pure cell indicator).
734                    let mut cells: Vec<Vec<(usize, u64, String)>> = vec![Vec::new()];
735                    for (_var, col, levels, _treatment_coded) in &categorical_factors {
736                        let mut next = Vec::with_capacity(cells.len() * levels.len());
737                        for cell in &cells {
738                            for (bits, level_label) in levels {
739                                let mut extended = cell.clone();
740                                extended.push((*col, *bits, level_label.clone()));
741                                next.push(extended);
742                            }
743                        }
744                        cells = next;
745                    }
746
747                    // Intercept-identifiability cell drop. When the cells are PURE
748                    // INDICATORS (no numeric operand) and at least one factor was
749                    // dummy-coded (kept all its levels), the full set of cell
750                    // columns sums to the all-ones intercept and is rank-deficient
751                    // against it. Drop exactly ONE reference cell — the cell where
752                    // every factor sits at its reference (lexicographically first)
753                    // level — so the remaining saturated cells are identifiable
754                    // (rank n_g*n_h - 1 cells + intercept). With a numeric operand
755                    // the cells gate `x` and sum to `x`, not the intercept, so no
756                    // cell is dropped (the collinearity there is with the absent
757                    // `x` main effect, which is exactly why full coding is right).
758                    let any_dummy_coded = categorical_factors
759                        .iter()
760                        .any(|(_, _, _, treatment_coded)| !*treatment_coded);
761                    if numeric_cols.is_empty() && any_dummy_coded {
762                        // The reference cell pairs each factor's column with the
763                        // bits of its lexicographically-first (index 0) level.
764                        let reference_cell: Vec<(usize, u64)> = categorical_factors
765                            .iter()
766                            .map(|(_, col, _, _)| {
767                                let levels = encoded_levels_for_column(ds, ColIdx::new(*col));
768                                (*col, levels[0].0)
769                            })
770                            .collect();
771                        cells.retain(|cell| {
772                            !reference_cell.iter().all(|(rcol, rbits)| {
773                                cell.iter()
774                                    .any(|(col, bits, _)| col == rcol && bits == rbits)
775                            })
776                        });
777                    }
778
779                    let n_cells = cells.len();
780                    for cell in cells {
781                        let cell_suffix = cell
782                            .iter()
783                            .map(|(_, _, level_label)| level_label.as_str())
784                            .collect::<Vec<_>>()
785                            .join(":");
786                        let categorical_levels =
787                            cell.iter().map(|(col, bits, _)| (*col, *bits)).collect();
788                        // `feature_col` is required to point at a real column;
789                        // use the first numeric operand when present, otherwise
790                        // the first categorical column (its raw value is never
791                        // multiplied — `realized_design_column` starts from ones
792                        // and only gates by the level indicators).
793                        let feature_col = numeric_cols
794                            .first()
795                            .copied()
796                            .unwrap_or(categorical_factors[0].1);
797                        linear_terms.push(LinearTermSpec {
798                            name: format!("{label}:{cell_suffix}"),
799                            feature_col,
800                            feature_cols: numeric_cols.clone(),
801                            categorical_levels,
802                            double_penalty: false,
803                            coefficient_geometry: LinearCoefficientGeometry::Unconstrained,
804                            coefficient_min: None,
805                            coefficient_max: None,
806                        });
807                    }
808                    let all_treatment_coded = !any_dummy_coded;
809                    let coding = if all_treatment_coded {
810                        "treatment-coded"
811                    } else {
812                        "marginality-aware (full dummy / saturated)"
813                    };
814                    inference_notes.push(format!(
815                        "wired factor-aware linear interaction `{}` as {} {} cell column(s)",
816                        vars.join(":"),
817                        n_cells,
818                        coding
819                    ));
820                }
821            }
822        }
823    }
824
825    Ok(TermCollectionSpec {
826        linear_terms,
827        random_effect_terms: random_terms,
828        smooth_terms,
829    })
830}
831
/// Split a list-valued option into its trimmed, non-empty entries.
///
/// A matched wrapper is peeled before the body is split on commas:
/// - the Python/JSON list form `[a, b]`, or
/// - mgcv's R-vector forms `c(a, b)` / `C(a, b)` / bare `(a, b)`.
///
/// An unmatched wrapper (an opener without its closer) is left in place and the
/// whole trimmed input is split as-is. Entries keep their original case and any
/// quotes; only surrounding whitespace is removed.
fn split_list_option(raw: &str) -> Vec<String> {
    let text = raw.trim();

    // Square brackets win; otherwise try the R-vector openers in order. Peeling
    // `c(...)` here is what keeps `k=c(5,5)` from being split into `c(5` / `5)`.
    let body = match text
        .strip_prefix('[')
        .and_then(|rest| rest.strip_suffix(']'))
    {
        Some(list_body) => list_body,
        None => {
            let after_open = text
                .strip_prefix("c(")
                .or_else(|| text.strip_prefix("C("))
                .or_else(|| text.strip_prefix('('));
            after_open
                .and_then(|rest| rest.strip_suffix(')'))
                .unwrap_or(text)
        }
    };

    let mut entries = Vec::new();
    for piece in body.split(',') {
        let piece = piece.trim();
        if !piece.is_empty() {
            entries.push(piece.to_string());
        }
    }
    entries
}
856
/// Evaluate a small numeric expression: a `*`-separated product of factors.
///
/// Spaces are ignored. Each factor is one of:
/// - a float literal accepted by `f64`'s `FromStr` (`6.28`, `1e-3`, `-2`),
/// - a named constant `pi` / `π` or `tau` / `τ` (ASCII names match
///   case-insensitively),
/// - a literal coefficient fused onto a constant (`2pi`, `0.5tau`, `2PI`), or
/// - a bare sign fused onto a constant (`-pi`, `+tau`).
///
/// # Errors
///
/// Returns `Err` when the whole input is `none` (any ASCII case), when a factor
/// is empty (empty input, leading/trailing/doubled `*`), or when a factor or a
/// fused coefficient is not a valid float literal.
fn parse_numeric_expr(raw: &str) -> Result<f64, String> {
    // Split a factor into `(coefficient_text, constant_value)` when it ends in
    // one of the named constants. The suffix comparison is ASCII
    // case-insensitive so that `2PI` is accepted just like the bare `PI`.
    fn split_constant(factor: &str) -> Option<(&str, f64)> {
        const CONSTANTS: [(&str, f64); 4] = [
            ("pi", std::f64::consts::PI),
            ("π", std::f64::consts::PI),
            ("tau", std::f64::consts::TAU),
            ("τ", std::f64::consts::TAU),
        ];
        CONSTANTS.iter().find_map(|&(name, value)| {
            let split = factor.len().checked_sub(name.len())?;
            // Guard the slice: a multi-byte character may straddle `split`.
            if !factor.is_char_boundary(split) {
                return None;
            }
            let (coefficient, tail) = factor.split_at(split);
            tail.eq_ignore_ascii_case(name)
                .then_some((coefficient, value))
        })
    }

    let normalized = raw.replace(' ', "");
    if normalized.eq_ignore_ascii_case("none") {
        return Err("None is not numeric".to_string());
    }

    let invalid =
        |err: std::num::ParseFloatError| format!("invalid numeric expression '{raw}': {err}");

    let mut product = 1.0f64;
    for factor in normalized.split('*') {
        if factor.is_empty() {
            return Err(format!("invalid numeric expression '{raw}'"));
        }
        let value = match split_constant(factor) {
            Some((coefficient, constant)) => {
                let multiplier = match coefficient {
                    // A bare constant, optionally signed: `pi`, `+pi`, `-pi`.
                    // Without the sign arms `-pi` would try to parse "-" as a
                    // float and fail, even though `-1*pi` is accepted.
                    "" | "+" => 1.0,
                    "-" => -1.0,
                    literal => literal.parse::<f64>().map_err(&invalid)?,
                };
                multiplier * constant
            }
            None => factor.parse::<f64>().map_err(&invalid)?,
        };
        product *= value;
    }
    Ok(product)
}
904
905/// Read an endpoint/period option as a numeric *expression* (`2*pi`, `tau`,
906/// `0.5*tau`, `6.283185307179586`, ...) — the same grammar that `period=` and
907/// `origin=` already accept via [`parse_numeric_expr`].
908///
909/// Returns `Ok(None)` when the key is absent, `Ok(Some(v))` when it parses, and
910/// a hard `Err` when the key is *present but unparseable*. The crucial contrast
911/// is with the lenient [`option_f64`], which collapses an unparseable value to
912/// `None` and lets the caller silently substitute the data range — wrapping a
913/// cyclic smooth at the wrong period with no diagnostic (the #815 failure mode).
914fn option_numeric_expr(
915    options: &BTreeMap<String, String>,
916    key: &str,
917) -> Result<Option<f64>, String> {
918    match options.get(key) {
919        None => Ok(None),
920        Some(raw) => parse_numeric_expr(raw)
921            .map(Some)
922            .map_err(|err| format!("option `{key}={raw}` is not a valid numeric value: {err}")),
923    }
924}
925
926fn parse_periods_option(
927    options: &BTreeMap<String, String>,
928    dim: usize,
929) -> Result<Option<Vec<Option<f64>>>, String> {
930    let Some(raw) = options.get("period") else {
931        return Ok(None);
932    };
933    let values = split_list_option(raw);
934    let mut periods = vec![None; dim];
935    if values.len() == 1 && dim == 1 {
936        periods[0] = Some(parse_numeric_expr(&values[0])?);
937    } else {
938        if values.len() != dim {
939            return Err(format!(
940                "period list length {} must match smooth dimension {}",
941                values.len(),
942                dim
943            ));
944        }
945        for (i, v) in values.iter().enumerate() {
946            if v.eq_ignore_ascii_case("none") {
947                continue;
948            }
949            periods[i] = Some(parse_numeric_expr(v)?);
950        }
951    }
952    Ok(Some(periods))
953}
954
955fn parse_periodic_axes_option(
956    options: &BTreeMap<String, String>,
957    dim: usize,
958) -> Result<Option<Vec<Option<f64>>>, String> {
959    let Some(raw_axes) = options.get("periodic") else {
960        return Ok(None);
961    };
962    let mut periods = parse_periods_option(options, dim)?.unwrap_or_else(|| vec![None; dim]);
963    // Scalar boolean form (`periodic=true` / `false`, `yes` / `no`) applies to
964    // every axis — the documented per-axis-flag broadcast (see the doc on
965    // `parse_periodic_axes`, the tensor sibling that already accepts it). A
966    // 1-D `duchon(x, periodic=true)` lands here: the cyclic *domain* is then
967    // resolved from the data range by `parse_cyclic_boundary` (the 1-D builder
968    // consults `boundary` first), so a finite explicit period is NOT required —
969    // we only need to NOT mis-read "true" as an axis index (#1074). `false`
970    // means no axis is periodic.
971    let lowered = raw_axes.trim().to_ascii_lowercase();
972    match lowered.as_str() {
973        "true" | "yes" | "y" => return Ok(Some(periods)),
974        // `false` means NO axis is periodic. Return `None` — NOT
975        // `Some(vec![None; dim])` — because the radial 1-D consumer treats a
976        // `Some([None])` as "periodicity requested, derive the wrap period from
977        // the data range" (see the Duchon builder arm below, which back-fills
978        // `axes[0] = data_span` for a lone `None`) and the 1-D builder routes on
979        // `spec.periodic.is_some()`. Emitting `Some([None])` here therefore
980        // silently produced a *periodic* smooth for an explicit `periodic=false`
981        // — the exact regression this arm now avoids, matching the bracketed
982        // `[false]` form handled by the per-axis boolean block below.
983        "false" | "no" | "n" => return Ok(None),
984        _ => {}
985    }
986    let axes = split_list_option(raw_axes);
987    if axes.is_empty() {
988        return Ok(Some(periods));
989    }
990
991    // Boolean forms `periodic=true` / `periodic=[true, false, ...]`, mirroring
992    // `parse_tensor_periodic_axes`. The radial 1-D builders (`duchon`/`tps`/
993    // `matern`) intentionally DERIVE the wrap period from the closed center
994    // lattice when none is supplied (`prepare_periodic_duchon_centers_1d_with_period`,
995    // gam#580: `None => span`), so a boolean-selected periodic axis legitimately
996    // omits `period`. Without this branch, `duchon(x, periodic=true)`-style
997    // radial formulas failed with the misleading "invalid periodic axis 'true'".
998    let is_bool = |t: &str| {
999        matches!(
1000            t.to_ascii_lowercase().as_str(),
1001            "true" | "yes" | "y" | "false" | "no" | "n"
1002        )
1003    };
1004    let is_truthy = |t: &str| matches!(t.to_ascii_lowercase().as_str(), "true" | "yes" | "y");
1005
1006    // Scalar boolean: `periodic=true` / `periodic=false`.
1007    if axes.len() == 1 && is_bool(&axes[0]) {
1008        if !is_truthy(&axes[0]) {
1009            // Non-periodic: return None so the 1-D builder (which routes on
1010            // `spec.periodic.is_some()`) does NOT take the periodic path.
1011            return Ok(None);
1012        }
1013        // Every axis periodic; honor any explicit per-axis period, else leave
1014        // `None` for the caller (formula arm) / builder to derive the span.
1015        return Ok(Some(periods));
1016    }
1017
1018    // Per-axis boolean list: `periodic=[true, false, ...]` (length must match dim).
1019    if axes.iter().all(|a| is_bool(a)) {
1020        if axes.len() != dim {
1021            return Err(format!(
1022                "periodic flag list length {} must match smooth dimension {dim}",
1023                axes.len()
1024            ));
1025        }
1026        if !axes.iter().any(|a| is_truthy(a)) {
1027            return Ok(None);
1028        }
1029        for (i, a) in axes.iter().enumerate() {
1030            if !is_truthy(a) {
1031                periods[i] = None;
1032            }
1033        }
1034        return Ok(Some(periods));
1035    }
1036
1037    // Index-list form: `periodic=[0, 2]`. Each listed axis must carry an
1038    // explicit finite period — an index gives no per-axis span-derive hint.
1039    for a in &axes {
1040        let axis = a
1041            .parse::<usize>()
1042            .map_err(|err| format!("invalid periodic axis '{a}': {err}"))?;
1043        if axis >= dim {
1044            return Err(format!(
1045                "periodic axis {axis} out of range for {dim}D smooth"
1046            ));
1047        }
1048        if periods[axis].is_none() {
1049            return Err(format!(
1050                "periodic axis {axis} requires period[{axis}] to be finite"
1051            ));
1052        }
1053    }
1054    // Axes not listed are non-periodic even if period list has a finite placeholder.
1055    let listed: std::collections::BTreeSet<usize> = axes
1056        .iter()
1057        .filter_map(|a| a.parse::<usize>().ok())
1058        .collect();
1059    for i in 0..dim {
1060        if !listed.contains(&i) {
1061            periods[i] = None;
1062        }
1063    }
1064    Ok(Some(periods))
1065}
1066
1067// ---------------------------------------------------------------------------
1068// Smooth basis spec construction
1069// ---------------------------------------------------------------------------
1070
/// Split a list-valued option into normalized tokens.
///
/// A matched wrapper is peeled first: the Python/JSON list form `[a, b]`, or
/// mgcv's R-vector forms `c(a, b)` / `C(a, b)` / bare `(a, b)` (mgcv writes
/// per-margin options as `bs=c('tp','tp')` / `m=c(2,2)`). The body is then
/// split on commas and every piece is:
/// 1. trimmed of surrounding whitespace,
/// 2. stripped of surrounding double quotes, then surrounding single quotes,
/// 3. lowercased (ASCII).
///
/// Pieces that end up empty are dropped.
fn parse_option_list(raw: &str) -> Vec<String> {
    let text = raw.trim();

    let bracketed = text
        .strip_prefix('[')
        .and_then(|rest| rest.strip_suffix(']'));
    let body = match bracketed {
        Some(body) => body,
        // First opener that matches wins; it only counts if `)` closes it.
        None => ["c(", "C(", "("]
            .iter()
            .find_map(|&open| text.strip_prefix(open))
            .and_then(|rest| rest.strip_suffix(')'))
            .unwrap_or(text),
    };

    let mut tokens = Vec::new();
    for piece in body.split(',') {
        let cleaned = piece.trim().trim_matches('"').trim_matches('\'');
        if cleaned.is_empty() {
            continue;
        }
        tokens.push(cleaned.to_ascii_lowercase());
    }
    tokens
}
1100
1101fn parse_periodic_axes(
1102    options: &BTreeMap<String, String>,
1103    dim: usize,
1104) -> Result<Vec<bool>, String> {
1105    let mut axes = vec![false; dim];
1106    if let Some(raw) = options.get("periodic").or_else(|| options.get("cyclic")) {
1107        let lowered = raw.trim().to_ascii_lowercase();
1108        match lowered.as_str() {
1109            "true" | "yes" | "y" => {
1110                axes.fill(true);
1111                return Ok(axes);
1112            }
1113            "false" | "no" | "n" => return Ok(axes),
1114            _ => {}
1115        }
1116        for axis_raw in parse_option_list(raw) {
1117            let axis = axis_raw
1118                .parse::<usize>()
1119                .map_err(|err| format!("invalid periodic axis '{axis_raw}': {err}"))?;
1120            if axis >= dim {
1121                return Err(format!(
1122                    "periodic axis {axis} out of range for {dim}D smooth"
1123                ));
1124            }
1125            axes[axis] = true;
1126        }
1127    }
1128    if let Some(raw) = options.get("boundary").or_else(|| options.get("bc")) {
1129        let boundary = parse_option_list(raw);
1130        if boundary.len() == dim {
1131            for (axis, value) in boundary.iter().enumerate() {
1132                if matches!(value.as_str(), "periodic" | "cyclic" | "cc") {
1133                    axes[axis] = true;
1134                }
1135            }
1136        } else if dim == 1
1137            && matches!(
1138                boundary.first().map(String::as_str),
1139                Some("periodic" | "cyclic" | "cc")
1140            )
1141        {
1142            axes[0] = true;
1143        }
1144    }
1145    Ok(axes)
1146}
1147
1148fn parse_optional_numeric_list(
1149    options: &BTreeMap<String, String>,
1150    keys: &[&str],
1151    dim: usize,
1152) -> Result<Vec<Option<f64>>, String> {
1153    let Some(raw) = keys.iter().find_map(|key| options.get(*key)) else {
1154        return Ok(vec![None; dim]);
1155    };
1156    let values = split_list_option(raw);
1157    let mut out = vec![None; dim];
1158    if values.len() == 1 && dim == 1 {
1159        if !values[0].eq_ignore_ascii_case("none") {
1160            out[0] = Some(parse_numeric_expr(&values[0])?);
1161        }
1162        return Ok(out);
1163    }
1164    if values.len() != dim {
1165        return Err(format!(
1166            "numeric option list length {} must match smooth dimension {}",
1167            values.len(),
1168            dim
1169        ));
1170    }
1171    for (i, value) in values.iter().enumerate() {
1172        if !value.eq_ignore_ascii_case("none") {
1173            out[i] = Some(parse_numeric_expr(value)?);
1174        }
1175    }
1176    Ok(out)
1177}
1178
1179fn parse_periods(
1180    options: &BTreeMap<String, String>,
1181    periodic_axes: &[bool],
1182) -> Result<Vec<Option<f64>>, String> {
1183    let dim = periodic_axes.len();
1184    // Broadcast a single-element `period=[v]` onto the lone periodic axis
1185    // of a multi-axis smooth (e.g. `te(th, h, bc=['periodic','natural'],
1186    // period=[2*pi])`): with only one periodic margin, the value can only
1187    // belong there.
1188    let lone_periodic_broadcast = options
1189        .get("period")
1190        .or_else(|| options.get("periods"))
1191        .and_then(|raw| {
1192            let values = split_list_option(raw);
1193            if values.len() != 1 || dim <= 1 {
1194                return None;
1195            }
1196            let mut iter = periodic_axes.iter().enumerate().filter(|(_, p)| **p);
1197            let first = iter.next()?;
1198            if iter.next().is_some() {
1199                return None;
1200            }
1201            Some((first.0, values.into_iter().next().unwrap()))
1202        });
1203    let periods = if let Some((axis, value)) = lone_periodic_broadcast {
1204        let mut out = vec![None; dim];
1205        if !value.eq_ignore_ascii_case("none") {
1206            out[axis] = Some(parse_numeric_expr(&value)?);
1207        }
1208        out
1209    } else {
1210        parse_optional_numeric_list(options, &["period", "periods"], dim)?
1211    };
1212    for (axis, (periodic, period)) in periodic_axes.iter().zip(periods.iter()).enumerate() {
1213        if *periodic
1214            && let Some(value) = period
1215            && (!value.is_finite() || *value <= 0.0)
1216        {
1217            return Err(format!(
1218                "period for periodic axis {axis} must be finite and positive, got {value}"
1219            ));
1220        }
1221    }
1222    Ok(periods)
1223}
1224
1225fn parse_period_origins(
1226    options: &BTreeMap<String, String>,
1227    periodic_axes: &[bool],
1228) -> Result<Vec<Option<f64>>, String> {
1229    parse_optional_numeric_list(
1230        options,
1231        &[
1232            "origin",
1233            "origins",
1234            "period_origin",
1235            "period-origin",
1236            "domain_origin",
1237        ],
1238        periodic_axes.len(),
1239    )
1240}
1241
1242/// Parse a per-axis periodic flag list for tensor smooths. Accepts three forms:
1243/// - `periodic=true` / `periodic=false` (scalar applied to every axis),
1244/// - `periodic=[true, false, ...]` (one flag per axis, length `dim`),
1245/// - `periodic=c(1, 1)` / `c(0, 0)` (a length-`dim` 0/1 mask, mgcv's
1246///   per-margin spelling — distinguished from an axis-index list by the
1247///   repeated 0/1 value), and
1248/// - `periodic=[0, 2, ...]` (axis indices that are periodic; others are not).
1249///
1250/// `boundary=[..., "periodic"/"cyclic"/"cc", ...]` may also flip individual
1251/// axes on; non-matching tokens leave the existing flag unchanged.
1252fn parse_tensor_periodic_axes(
1253    options: &BTreeMap<String, String>,
1254    dim: usize,
1255) -> Result<Vec<bool>, String> {
1256    let mut axes = vec![false; dim];
1257    if let Some(raw) = options.get("periodic").or_else(|| options.get("cyclic")) {
1258        let lowered = raw.trim().to_ascii_lowercase();
1259        match lowered.as_str() {
1260            "true" | "yes" | "y" => {
1261                axes.fill(true);
1262            }
1263            "false" | "no" | "n" => {
1264                // Already false; allow `boundary=` below to flip axes if set.
1265            }
1266            _ => {
1267                let entries = parse_option_list(raw);
1268                let all_bool = !entries.is_empty()
1269                    && entries.iter().all(|v| {
1270                        matches!(
1271                            v.as_str(),
1272                            "true" | "yes" | "y" | "false" | "no" | "n" | "none"
1273                        )
1274                    });
1275                // mgcv writes per-margin flag vectors as `periodic=c(1,1)` /
1276                // `periodic=c(0,0)` — a length-`dim` mask where each entry is a
1277                // 0/1 flag for THAT margin, not an axis index. A bare axis-index
1278                // list (`periodic=[0,1]`, `periodic=[0]`) lists DISTINCT margin
1279                // indices to turn on. The two collide only when the list is all
1280                // 0/1 of length `dim`; disambiguate by the repeated-value
1281                // signature `c(1,1)`/`c(0,0)` (a valid axis-index set never
1282                // repeats an index), which is the canonical mask spelling. This
1283                // is what makes the leading tensor margin honor its periodic flag
1284                // (#1751: `periodic=c(1,1)` previously parsed `1,1` as axis
1285                // indices, marking only axis 1 and dropping axis 0).
1286                let all_zero_one =
1287                    !entries.is_empty() && entries.iter().all(|v| v == "0" || v == "1");
1288                let has_repeat = {
1289                    let mut seen = std::collections::BTreeSet::new();
1290                    !entries.iter().all(|v| seen.insert(v.clone()))
1291                };
1292                let numeric_mask = all_zero_one && entries.len() == dim && has_repeat;
1293                if all_bool || numeric_mask {
1294                    if entries.len() != dim {
1295                        return Err(format!(
1296                            "periodic list length {} must match smooth dimension {}",
1297                            entries.len(),
1298                            dim
1299                        ));
1300                    }
1301                    for (i, v) in entries.iter().enumerate() {
1302                        axes[i] = matches!(v.as_str(), "true" | "yes" | "y" | "1");
1303                    }
1304                } else {
1305                    for axis_raw in entries {
1306                        let axis = axis_raw
1307                            .parse::<usize>()
1308                            .map_err(|err| format!("invalid periodic axis '{axis_raw}': {err}"))?;
1309                        if axis >= dim {
1310                            return Err(format!(
1311                                "periodic axis {axis} out of range for {dim}D smooth"
1312                            ));
1313                        }
1314                        axes[axis] = true;
1315                    }
1316                }
1317            }
1318        }
1319    }
1320    if let Some(raw) = options.get("boundary").or_else(|| options.get("bc")) {
1321        let boundary = parse_option_list(raw);
1322        if boundary.len() == dim {
1323            for (axis, value) in boundary.iter().enumerate() {
1324                if matches!(value.as_str(), "periodic" | "cyclic" | "cc") {
1325                    axes[axis] = true;
1326                }
1327            }
1328        }
1329    }
1330    // A per-margin basis vector (`bs=c('cc','ps')` / `type=[...]`) declares each
1331    // margin's basis family, and a cyclic family (`cc`/`cp`/`cyclic`) makes THAT
1332    // margin periodic — exactly as the 1-D `s(x, bs='cc')` smooth wraps its lone
1333    // axis. Without this, the per-margin `cc` token was validated but discarded:
1334    // every `bs=c(...)` spelling collapsed to the same open B-spline tensor
1335    // (#1752). Only honor the vector form here; a scalar `bs='cc'` on a tensor is
1336    // ambiguous about which margins wrap, so it does not flip any axis on.
1337    if let Some(raw) = options.get("bs").or_else(|| options.get("type"))
1338        && bs_selector_is_vector(raw)
1339    {
1340        let per_margin = parse_option_list(raw);
1341        if per_margin.len() == dim {
1342            for (axis, margin_bs) in per_margin.iter().enumerate() {
1343                if matches!(canonicalize_smooth_type(margin_bs), "cc" | "cp" | "cyclic") {
1344                    axes[axis] = true;
1345                }
1346            }
1347        }
1348    }
1349    Ok(axes)
1350}
1351
1352/// Validate the per-margin `boundary=`/`bc=` tokens on a tensor-product smooth.
1353///
1354/// The tensor `boundary`/`bc` list selects, per margin, whether the margin
1355/// *wraps* (a `periodic`/`cyclic`/`cc` token, consumed by
1356/// [`parse_tensor_periodic_axes`]) or is an ordinary non-periodic margin. In the
1357/// tensor DSL a *non-periodic* margin is spelled `clamped` — in the B-spline
1358/// sense of a **clamped knot vector**, i.e. the standard open spline that is
1359/// free at its two ends and does not wrap (exactly how the callers document it:
1360/// "non-periodic / clamped … free at the two ends, no wrap"). It is therefore an
1361/// inert marker here, not a zero-derivative endpoint reparameterization: a
1362/// cylinder `te(theta, z, boundary=['periodic','clamped'], …)` is a cyclic θ
1363/// margin tensor-producted with an ordinary open z margin, the direct analog of
1364/// mgcv `te(bs=c("cc","ps"))` / `te(bs=c("cc","cr"))`.
1365///
1366/// The periodic selectors and the inert non-periodic markers
1367/// (`clamped`/`open`/`natural`/`free`/`none`/empty) are accepted; anything else
1368/// (e.g. a genuine `anchored` zero-value endpoint constraint, which has no
1369/// ordinary-margin meaning in a tensor) is surfaced as a clean
1370/// unsupported-feature error rather than silently dropped. Previously `clamped`
1371/// itself was rejected, so the cylinder/torus mixed-boundary tensors — the exact
1372/// construction the manifold quality suite builds — could not be fit at all.
1373fn validate_tensor_boundary_tokens(
1374    options: &BTreeMap<String, String>,
1375    dim: usize,
1376) -> Result<(), String> {
1377    let Some(raw) = options.get("boundary").or_else(|| options.get("bc")) else {
1378        return Ok(());
1379    };
1380    let entries = parse_option_list(raw);
1381    for (axis, value) in entries.iter().enumerate() {
1382        let inert = matches!(
1383            value.trim().to_ascii_lowercase().as_str(),
1384            "clamped" | "open" | "natural" | "free" | "none" | "" | "periodic" | "cyclic" | "cc"
1385        );
1386        if !inert {
1387            return Err(TermBuilderError::unsupported_feature(format!(
1388                "tensor smooth margin {axis} boundary token '{value}' is not supported \
1389                 (got bc/boundary={raw:?} on a {dim}-D tensor); tensor margins accept the periodic \
1390                 selectors (periodic/cyclic/cc) or the non-periodic markers (clamped/open/natural/free). \
1391                 Apply anchored/zero-value endpoint constraints with a 1-D s(x, bc=...) term instead."
1392            ))
1393            .to_string());
1394        }
1395    }
1396    Ok(())
1397}
1398
1399fn tensor_k_axis_option_axis(
1400    key: &str,
1401    cols: &[usize],
1402    ds: &Dataset,
1403) -> Result<Option<usize>, String> {
1404    let Some(suffix) = key.strip_prefix("k_") else {
1405        return Ok(None);
1406    };
1407    if suffix.is_empty() {
1408        return Err("tensor k axis option must be named k_<axis> or k_<variable>".to_string());
1409    }
1410    if let Ok(axis) = suffix.parse::<usize>() {
1411        return if axis < cols.len() {
1412            Ok(Some(axis))
1413        } else {
1414            Err(format!(
1415                "tensor k axis option `{key}` references axis {axis}, but the smooth has {} margins",
1416                cols.len()
1417            ))
1418        };
1419    }
1420
1421    let mut matches = cols
1422        .iter()
1423        .enumerate()
1424        .filter(|(_, col)| ds.headers.get(**col).is_some_and(|name| name == suffix))
1425        .map(|(axis, _)| axis);
1426    let first = matches.next();
1427    if matches.next().is_some() {
1428        return Err(format!(
1429            "tensor k axis option `{key}` matches more than one margin named `{suffix}`"
1430        ));
1431    }
1432    first.map(Some).ok_or_else(|| {
1433        let margin_names = cols
1434            .iter()
1435            .enumerate()
1436            .map(|(axis, col)| {
1437                let name = ds
1438                    .headers
1439                    .get(*col)
1440                    .map(String::as_str)
1441                    .unwrap_or("<unnamed>");
1442                format!("{axis}:{name}")
1443            })
1444            .collect::<Vec<_>>()
1445            .join(", ");
1446        format!(
1447            "tensor k axis option `{key}` does not match a margin index or name; tensor margins are [{margin_names}]"
1448        )
1449    })
1450}
1451
/// Is `key` shaped like a tensor per-axis basis-dimension alias, i.e. the
/// literal prefix `k_` followed by at least one more character?
///
/// This is a purely syntactic test; it does not check that the suffix names a
/// real margin.
fn is_tensor_k_axis_option_key(key: &str) -> bool {
    matches!(key.strip_prefix("k_"), Some(rest) if !rest.is_empty())
}
1456
1457/// Parse a per-margin basis dimension list (`k=<scalar>`, `k=[k0, k1, ...]`,
1458/// or axis aliases like `k_x=...` / `k_0=...`). A scalar is broadcast across
1459/// all axes; `None` returns the heuristic from the data column.
1460fn parse_tensor_k_list(
1461    options: &BTreeMap<String, String>,
1462    cols: &[usize],
1463    ds: &Dataset,
1464) -> Result<(Vec<usize>, bool), String> {
1465    let mut axis_values = vec![None; cols.len()];
1466    let mut saw_axis_alias = false;
1467    for (key, value) in options {
1468        let Some(axis) = tensor_k_axis_option_axis(key, cols, ds)? else {
1469            continue;
1470        };
1471        saw_axis_alias = true;
1472        if axis_values[axis].is_some() {
1473            return Err(format!("tensor k axis {axis} is specified more than once"));
1474        }
1475        let k: usize = value
1476            .parse()
1477            .map_err(|err| format!("invalid tensor k option `{key}={value}`: {err}"))?;
1478        axis_values[axis] = Some(k);
1479    }
1480
1481    let raw = options
1482        .get("k")
1483        .or_else(|| options.get("basis_dim"))
1484        .or_else(|| options.get("basis-dim"))
1485        .or_else(|| options.get("basisdim"));
1486    if saw_axis_alias {
1487        if raw.is_some() {
1488            return Err(
1489                "tensor k axis aliases cannot be combined with k= or basis_dim=".to_string(),
1490            );
1491        }
1492        if let Some(missing_axis) = axis_values.iter().position(Option::is_none) {
1493            let margin_name = cols
1494                .get(missing_axis)
1495                .and_then(|col| ds.headers.get(*col))
1496                .map(String::as_str)
1497                .unwrap_or("<unnamed>");
1498            return Err(format!(
1499                "tensor k axis aliases must specify every margin; missing axis {missing_axis} ({margin_name})"
1500            ));
1501        }
1502        return Ok((
1503            axis_values
1504                .into_iter()
1505                .map(|k| k.expect("missing axis values rejected above"))
1506                .collect(),
1507            false,
1508        ));
1509    }
1510    let Some(raw) = raw else {
1511        let inferred = heuristic_tensor_margin_knots(cols, ds);
1512        return Ok((inferred, true));
1513    };
1514    let entries = split_list_option(raw);
1515    if entries.len() == 1 {
1516        let k: usize = entries[0]
1517            .parse()
1518            .map_err(|err| format!("invalid tensor k '{}': {err}", entries[0]))?;
1519        return Ok((vec![k; cols.len()], false));
1520    }
1521    if entries.len() != cols.len() {
1522        return Err(format!(
1523            "tensor k list length {} must match smooth dimension {}",
1524            entries.len(),
1525            cols.len()
1526        ));
1527    }
1528    let mut out = Vec::with_capacity(entries.len());
1529    for entry in entries {
1530        let k: usize = entry
1531            .parse()
1532            .map_err(|err| format!("invalid tensor k '{entry}': {err}"))?;
1533        out.push(k);
1534    }
1535    Ok((out, false))
1536}
1537
1538/// Parse the `identifiability=` option for tensor-product smooths. Mirrors the
1539/// vocabulary of the Matern/Duchon parsers so the formula DSL is consistent.
1540///
1541/// `kind` selects the default identifiability when no explicit
1542/// `identifiability=` option is supplied: `te(...)` ([`SmoothKind::Te`]) keeps
1543/// the full-tensor sum-to-zero default, while `ti(...)` ([`SmoothKind::Ti`])
1544/// defaults to per-margin sum-to-zero so the marginal main effects are excluded
1545/// (the mgcv tensor-interaction semantics). An explicit option always wins.
1546fn parse_tensor_identifiability(
1547    options: &BTreeMap<String, String>,
1548    kind: SmoothKind,
1549) -> Result<TensorBSplineIdentifiability, String> {
1550    let Some(raw) = options.get("identifiability").map(String::as_str) else {
1551        return Ok(match kind {
1552            SmoothKind::Ti => TensorBSplineIdentifiability::MarginalSumToZero,
1553            _ => TensorBSplineIdentifiability::default(),
1554        });
1555    };
1556    match raw.trim().to_ascii_lowercase().as_str() {
1557        "none" => Ok(TensorBSplineIdentifiability::None),
1558        "sum_tozero" | "sum-to-zero" | "center_sum_tozero" | "center-sum-to-zero" | "centered"
1559        | "sumtozero" => Ok(TensorBSplineIdentifiability::SumToZero),
1560        "marginal_sum_tozero" | "marginal-sum-to-zero" | "marginal_sumtozero"
1561        | "marginalsumtozero" | "interaction" => {
1562            Ok(TensorBSplineIdentifiability::MarginalSumToZero)
1563        }
1564        other => Err(TermBuilderError::unsupported_feature(format!(
1565            "invalid tensor identifiability '{other}'; expected one of: none, sum_tozero, marginal_sum_tozero"
1566        ))
1567        .to_string()),
1568    }
1569}
1570
1571fn bspline_boundary_declares_periodic_axis(options: &BTreeMap<String, String>) -> bool {
1572    options
1573        .get("boundary")
1574        .or_else(|| options.get("bc"))
1575        .map(|raw| {
1576            parse_option_list(raw)
1577                .into_iter()
1578                .any(|value| matches!(value.as_str(), "periodic" | "cyclic" | "cc"))
1579        })
1580        .unwrap_or(false)
1581}
1582
/// Map a user-facing `bs=`/`type=` smooth selector to its canonical name.
///
/// User-facing names — including mgcv-compatible spellings whose semantics
/// match an existing gamfit smooth exactly — collapse to the engine-internal
/// canonical names that the dispatch in [`build_smooth_basis`] switches on.
/// Adding a new exactly-equivalent alias is a one-line entry in the table
/// below; the dispatch itself stays the single place where types are handled.
///
/// Every alias MUST be a true semantic equivalent of its target, never an
/// approximation. mgcv names whose semantics differ from any gamfit smooth
/// (e.g. `bs="ts"` shrinkage thin-plate, `bs="ad"` adaptive) are deliberately
/// NOT mapped, so they reach the unsupported-type path and the user gets a real
/// diagnostic instead of a silent substitution. mgcv's `bs="cr"`/`"cs"` (cubic
/// regression and its shrinkage twin) are handled directly in the
/// [`build_smooth_basis`] dispatch rather than aliased here, because the
/// `cr`/`cs` distinction controls a default (`double_penalty`) that this layer
/// cannot see.
///
/// The comparison is exact (case-sensitive). Unrecognised input is returned
/// unchanged so the dispatch can emit its usual "unsupported smooth type" error
/// for genuine typos.
pub(crate) fn canonicalize_smooth_type(raw: &str) -> &str {
    const ALIASES: [(&str, &str); 8] = [
        // Thin-plate spline. mgcv `bs="tp"` is the default thin-plate
        // regression spline — exact semantic equivalent of gamfit's `"tps"`.
        ("tp", "tps"),
        // Gaussian process / Matérn. mgcv `bs="gp"` defaults to a Matérn
        // covariance kernel with REML smoothing parameter selection, which
        // matches gamfit's `"matern"` exactly (same kernel-Gram identity,
        // same REML route).
        ("gp", "matern"),
        // Constant-curvature (M_κ) geodesic-kernel smooth (#944). All aliases
        // collapse to one canonical type so `bs="curv"`/`bs="mkappa"` cannot
        // diverge from `curv(...)`.
        ("curv", "curvature"),
        ("constant_curvature", "curvature"),
        ("mkappa", "curvature"),
        // Measure-jet spline: multiscale local-jet-residual energy of the
        // empirical measure. No mgcv equivalent (mgcv has no measure-learned
        // geometry smooth), so no mgcv alias is mapped.
        ("mjs", "measurejet"),
        ("measure_jet", "measurejet"),
        ("web", "measurejet"),
    ];
    ALIASES
        .iter()
        .find(|(alias, _)| *alias == raw)
        .map_or(raw, |(_, canonical)| *canonical)
}
1625
1626/// Is `margin_bs` a per-margin basis name that the tensor builder realizes as a
1627/// penalized 1-D B-spline margin?
1628///
1629/// gam's tensor product is built from penalized B-spline marginals. mgcv's
1630/// thin-plate (`tp`/`tps`), P-spline (`ps`), B-spline (`bs`), cubic-regression
1631/// (`cr`/`cs`), and cyclic (`cc`/`cp`/`cyclic`) marginals are all penalized
1632/// splines spanning the same per-axis smoothing space, so a B-spline margin
1633/// reproduces the same tensor smoothing class. Margin kinds with fundamentally
1634/// different structure (adaptive, random-effect, sphere) are NOT accepted as
1635/// tensor margins.
1636pub(crate) fn tensor_margin_bs_is_supported(margin_bs: &str) -> bool {
1637    matches!(
1638        canonicalize_smooth_type(margin_bs),
1639        "tps" | "ps" | "bs" | "bspline" | "cr" | "cs" | "cc" | "cp" | "cyclic"
1640    )
1641}
1642
/// Do the smooth's options ask for a periodic/cyclic axis?
///
/// Factored out so the type resolver and `build_smooth_basis` share one notion
/// of "periodic requested". The answer is `true` when:
///
/// * a `periodic` or `cyclic` key is present (its value is not inspected), or
/// * the `boundary` option — or `bc` when `boundary` is absent — contains the
///   substring `periodic` or `cyclic`, compared case-insensitively.
///
/// NOTE(review): unlike the token-level boundary readers elsewhere in this
/// file, a bare `cc` boundary token is not recognized here, and
/// `periodic=false` still counts as a request — confirm both are intended.
pub(crate) fn smooth_options_declare_periodic(options: &BTreeMap<String, String>) -> bool {
    if options.contains_key("periodic") || options.contains_key("cyclic") {
        return true;
    }
    let boundary = match options.get("boundary").or_else(|| options.get("bc")) {
        Some(raw) => raw.to_ascii_lowercase(),
        None => return false,
    };
    ["periodic", "cyclic"]
        .iter()
        .any(|needle| boundary.contains(*needle))
}
1660
1661/// Resolve the canonical engine-internal smooth-type name for a term.
1662///
1663/// Reads the user-facing `type=`/`bs=` selector and collapses mgcv-compatible
1664/// aliases (`tp`→`tps`, `gp`→`matern`) via [`canonicalize_smooth_type`], or
1665/// derives the default from the smooth kind/arity when no selector is given.
1666/// This is the single source of truth for the dispatch in
1667/// [`build_smooth_basis`]; other call sites (e.g. predictor-specific basis
1668/// policy) use it so the classification never drifts from the dispatch.
1669/// Is the raw `bs=`/`type=` selector a vector literal (`c('tp','tp')`,
1670/// `['tp','tp']`, `(tp, tp)`) rather than a scalar smooth-type name?
1671///
1672/// mgcv's tensor smooths take a *per-margin* basis vector
1673/// (`te(x1, x2, bs=c('tp','tp'))`). Such a value is not a scalar canonical
1674/// type and must not be fed through [`canonicalize_smooth_type`] — it has to be
1675/// recognized as a tensor request and split into per-margin types. A scalar
1676/// selector (`bs="tp"`) is left untouched.
1677pub(crate) fn bs_selector_is_vector(raw: &str) -> bool {
1678    let trimmed = raw.trim();
1679    let bracketed = (trimmed.starts_with('[') && trimmed.ends_with(']'))
1680        || (trimmed.starts_with("c(") || trimmed.starts_with("C(")) && trimmed.ends_with(')')
1681        || (trimmed.starts_with('(') && trimmed.ends_with(')'));
1682    bracketed && !parse_option_list(trimmed).is_empty()
1683}
1684
1685pub fn resolve_smooth_type_name(
1686    kind: SmoothKind,
1687    n_cols: usize,
1688    options: &BTreeMap<String, String>,
1689) -> String {
1690    let selector = options.get("type").or_else(|| options.get("bs"));
1691    // A per-margin basis vector is a tensor request, never a scalar type. Route
1692    // it to the tensor builder, which reads the per-margin types out of the
1693    // same `bs=` option. (A vector on a non-tensor smooth is ill-formed and
1694    // falls through to the scalar path below so the existing diagnostic fires.)
1695    if let Some(raw) = selector
1696        && bs_selector_is_vector(raw)
1697        && matches!(kind, SmoothKind::Te | SmoothKind::Ti | SmoothKind::T2)
1698    {
1699        return "tensor".to_string();
1700    }
1701    selector
1702        .map(|s| canonicalize_smooth_type(&s.to_ascii_lowercase()).to_string())
1703        .unwrap_or_else(|| match kind {
1704            SmoothKind::Te | SmoothKind::Ti | SmoothKind::T2 => "tensor".to_string(),
1705            SmoothKind::S if n_cols == 1 => "bspline".to_string(),
1706            // Mixed periodic Euclidean radial kernels are not separable on the
1707            // cylinder. Use a tensor product with a cyclic margin so s(theta,h)
1708            // honors seam continuity while preserving the formula-level s(...).
1709            SmoothKind::S if smooth_options_declare_periodic(options) => "tensor".to_string(),
1710            SmoothKind::S => "tps".to_string(),
1711        })
1712}
1713
/// Is `canonical_type` one of the smooth types whose default basis size comes
/// from the generous spatial center heuristic
/// ([`crate::basis::default_num_centers`])?
///
/// Only the radial spatial bases (thin-plate, Matérn/GP, Duchon) route their
/// default basis dimension through `plan_spatial_basis(.., Default, ..)`. The
/// B-spline, cyclic, tensor, and factor-smooth bases use their own modest
/// knot-based defaults, so secondary-predictor basis-parsimony adjustments
/// (#501) must leave them untouched.
///
/// The argument is compared exactly, so it must already be canonical.
pub fn smooth_type_uses_spatial_center_heuristic(canonical_type: &str) -> bool {
    const RADIAL_SPATIAL_TYPES: [&str; 3] = ["tps", "matern", "duchon"];
    RADIAL_SPATIAL_TYPES
        .iter()
        .any(|name| *name == canonical_type)
}
1725
1726pub fn build_smooth_basis(
1727    kind: SmoothKind,
1728    vars: &[String],
1729    cols: &[usize],
1730    options: &BTreeMap<String, String>,
1731    ds: &Dataset,
1732    inference_notes: &mut Vec<String>,
1733    policy: &ResourcePolicy,
1734    smooth_coordinate_count: usize,
1735) -> Result<SmoothBasisSpec, String> {
1736    // Fail fast on degenerate input: a smooth whose (non-categorical) coordinate
1737    // columns collapse to a SINGLE distinct point can only ever fit the response
1738    // mean — its design matrix is rank-1. For a UNIVARIATE smooth this is exactly
1739    // "the one column is constant": `smooth(x)`/`matern(x)` on constant `x` would
1740    // otherwise silently fit the mean of `y` with no visible cue (Duchon already
1741    // errors loudly via the basis layer; this makes the diagnosis explicit and
1742    // uniform). For a MULTIVARIATE smooth (tensor, sphere, tps, ...) a single
1743    // constant coordinate is NOT degenerate — the basis still varies along the
1744    // other coordinate(s) and the penalty absorbs the rank-deficient direction
1745    // (e.g. a constant-longitude meridian arc on the sphere is a well-posed 1-D
1746    // slice of S²). Such a term is degenerate only when EVERY coordinate is
1747    // constant at once, i.e. the joint input is a single point. Test the JOINT
1748    // cardinality, not each column independently, so the loud diagnosis still
1749    // fires for the genuinely rank-1 case without rejecting well-posed
1750    // lower-dimensional slices.
1751    let coord_cols: Vec<(&String, usize)> = vars
1752        .iter()
1753        .zip(cols.iter().copied())
1754        .filter(|(_, col)| !matches!(ds.column_kinds.get(*col), Some(ColumnKindTag::Categorical)))
1755        .collect();
1756    if !coord_cols.is_empty() {
1757        let views: Vec<ArrayView1<'_, f64>> = coord_cols
1758            .iter()
1759            .map(|(_, col)| ds.values.column(*col))
1760            .collect();
1761        let n_rows = views[0].len();
1762        let mut distinct_points = std::collections::HashSet::<Vec<u64>>::new();
1763        for r in 0..n_rows {
1764            let key: Vec<u64> = views
1765                .iter()
1766                .map(|v| {
1767                    let x = v[r];
1768                    let norm = if x == 0.0 { 0.0 } else { x };
1769                    norm.to_bits()
1770                })
1771                .collect();
1772            distinct_points.insert(key);
1773            if distinct_points.len() > 1 {
1774                break;
1775            }
1776        }
1777        if distinct_points.len() <= 1 {
1778            return Err(TermBuilderError::degenerate_data(if coord_cols.len() == 1 {
1779                let var = coord_cols[0].0;
1780                format!(
1781                    "smooth term over '{var}' has only one unique value in the training data \
1782                     — a smooth on a constant column is degenerate and would only fit the response mean. \
1783                     Remove `{var}` from the smooth, drop the term, or check the data."
1784                )
1785            } else {
1786                let names = coord_cols
1787                    .iter()
1788                    .map(|(v, _)| v.as_str())
1789                    .collect::<Vec<_>>()
1790                    .join(", ");
1791                format!(
1792                    "smooth term over ({names}) has only one unique joint coordinate in the training \
1793                     data — every coordinate is constant, so the smooth is degenerate and would only \
1794                     fit the response mean. Drop the term or check the data."
1795                )
1796            })
1797            .to_string());
1798        }
1799    }
1800    if let Some(by_name) = options.get("by").cloned() {
1801        let by_col = options
1802            .get("__by_col")
1803            .and_then(|raw| raw.parse::<usize>().ok())
1804            .or_else(|| vars.iter().position(|v| v == &by_name).map(|idx| cols[idx]))
1805            .ok_or_else(|| format!("unknown by= column '{by_name}'"))?;
1806        let mut inner_options = options.clone();
1807        inner_options.remove("by");
1808        inner_options.remove("__by_col");
1809        inner_options.remove("id");
1810        let inner = build_smooth_basis(
1811            kind,
1812            vars,
1813            cols,
1814            &inner_options,
1815            ds,
1816            inference_notes,
1817            policy,
1818            smooth_coordinate_count,
1819        )?;
1820        let by_kind = match ds.column_kinds.get(by_col).copied() {
1821            Some(ColumnKindTag::Categorical) => ByVarKind::Factor {
1822                feature_col: by_col,
1823                ordered: option_bool(options, "ordered").unwrap_or(false),
1824                frozen_levels: None,
1825            },
1826            Some(ColumnKindTag::Continuous | ColumnKindTag::Binary) => ByVarKind::Numeric {
1827                feature_col: by_col,
1828            },
1829            None => {
1830                return Err(format!(
1831                    "internal column-kind lookup failed for by='{by_name}'"
1832                ));
1833            }
1834        };
1835        return Ok(SmoothBasisSpec::BySmooth {
1836            smooth: Box::new(inner),
1837            by_kind,
1838        });
1839    }
1840
1841    let smooth_double_penalty = option_bool(options, "double_penalty").unwrap_or(true);
1842    let type_opt = resolve_smooth_type_name(kind, cols.len(), options);
1843
1844    if matches!(type_opt.as_str(), "fs" | "sz" | "re") {
1845        validate_known_options(
1846            type_opt.as_str(),
1847            options,
1848            &[
1849                "type",
1850                "bs",
1851                "k",
1852                "basis_dim",
1853                "basis-dim",
1854                "basisdim",
1855                "knots",
1856                "knot_placement",
1857                "knot-placement",
1858                "knotplacement",
1859                "degree",
1860                "penalty_order",
1861                "m",
1862                "double_penalty",
1863                "ordered",
1864            ],
1865        )?;
1866        if cols.len() != 2 {
1867            return Err(format!(
1868                "{} factor-smooth currently expects exactly two variables (one numeric, one categorical)",
1869                type_opt
1870            ));
1871        }
1872        let kinds = cols
1873            .iter()
1874            .map(|&c| ds.column_kinds.get(c).copied())
1875            .collect::<Vec<_>>();
1876        let (cont_idx, group_idx) = if type_opt == "re" {
1877            // mgcv random-slope examples are often s(g, x, bs="re").
1878            match (kinds[0], kinds[1]) {
1879                (Some(ColumnKindTag::Categorical), _) => (1usize, 0usize),
1880                (_, Some(ColumnKindTag::Categorical)) => (0usize, 1usize),
1881                _ => (1usize, 0usize),
1882            }
1883        } else {
1884            match (kinds[0], kinds[1]) {
1885                (_, Some(ColumnKindTag::Categorical)) => (0usize, 1usize),
1886                (Some(ColumnKindTag::Categorical), _) => (1usize, 0usize),
1887                _ => {
1888                    return Err(format!(
1889                        "{} factor-smooth requires one categorical factor variable",
1890                        type_opt
1891                    ));
1892                }
1893            }
1894        };
1895        let c = cols[cont_idx];
1896        let (minv, maxv) = col_minmax(ds.values.column(c))?;
1897        let degree = if type_opt == "re" {
1898            1
1899        } else {
1900            option_usize(options, "degree").unwrap_or(DEFAULT_BSPLINE_DEGREE)
1901        };
1902        // For a factor smooth every group's curve is fit from THAT group's rows
1903        // alone, so the marginal's flexibility must respect the least-resolved
1904        // group, not the pooled column. The pooled heuristic can hand the marginal
1905        // a basis that saturates (or exceeds) a small group's sample — e.g. the
1906        // sleepstudy panel has 8 training days per subject, and a default cubic
1907        // basis of 8 functions interpolates each subject's 8 points, leaving no
1908        // room for the wiggliness penalty to collapse the curve toward the
1909        // per-subject line. The factor smooth then fits within-group noise and
1910        // extrapolates badly (held-out forecast worse than the population mean).
1911        //
1912        // Cap the marginal basis below the minimum per-group covariate resolution
1913        // so the penalty always retains residual degrees of freedom to shrink each
1914        // group's curvature toward its linear null space (the random-slope
1915        // estimand). This small-group cap composes with a separate upper bound at
1916        // mgcv's factor-smooth default k=10 (FACTOR_SMOOTH_DEFAULT_BASIS_DIM,
1917        // applied below), so even ample-data groups get the modest SHARED marginal
1918        // a factor smooth wants rather than the full pooled basis. The explicit
1919        // `re` random-effect form takes neither cap: it is a raw linear `[1, x]`
1920        // random effect (0 internal knots), handled in the branch above.
1921        let pooled_internal = heuristic_knots_for_column(ds.values.column(c));
1922        let default_internal = if type_opt == "re" {
1923            // `bs="re"` is a PARAMETRIC random effect, not a smooth of the
1924            // covariate: `s(x, g, bs="re")` is the mgcv random intercept+slope
1925            // `(1 + x | g)`, i.e. a per-group line `[1, x]`, penalized by an iid
1926            // ridge. A degree-1 marginal with ZERO internal knots spans exactly
1927            // that linear space (2 coefficients per group). Using the pooled
1928            // knot heuristic here instead turned the marginal into a
1929            // piecewise-linear B-spline (e.g. 6 functions/group on sleepstudy),
1930            // i.e. a *smooth* with kinks rather than a random slope — many extra
1931            // collinear-across-levels coefficients that ill-condition the joint
1932            // Newton/REML solve (minutes-long fits, and a singular block when
1933            // combined with a separate random intercept `s(g, bs="re")`). The
1934            // raw linear basis is both the correct `re` semantics and fast.
1935            0
1936        } else {
1937            let min_group_resolution =
1938                min_per_group_unique_count(ds.values.column(c), ds.values.column(cols[group_idx]));
1939            // Per-group basis dim = degree + 1 + internal. Hold it well below the
1940            // smallest group's resolution (leave at least two residual points per
1941            // group) so the smooth cannot interpolate that group and the
1942            // wiggliness penalty retains the room to collapse each curve toward
1943            // its linear null space. Never drop below `degree + 2`, which keeps
1944            // exactly the linear span plus a single curvature direction — the
1945            // minimal smoother that can still bend if the data demand it.
1946            let basis_cap = min_group_resolution.saturating_sub(2).max(degree + 2);
1947            let internal_cap = basis_cap.saturating_sub(degree + 1);
1948            let capped = pooled_internal.min(internal_cap.max(1));
1949            // A factor smooth (`fs` AND `sz`) shares ONE marginal across ALL
1950            // levels, each level's curve fit from that group's rows alone. The
1951            // pooled knot heuristic (driven by the full column's sample) hands it
1952            // a much richer basis than the shared signal needs — ~24
1953            // functions/group on the gam#903 factor-smooth-recovery fixtures — so
1954            // REML has the capacity to fit within-group noise and over-fits the
1955            // shared shape (fs: edf 58 vs mgcv's k=10/edf 39; sz: gam 0.068 vs
1956            // mgcv 0.046 truth RMSE), losing the truth-recovery head-to-head with
1957            // the mature tool. mgcv's factor-smooth default `k=10` embodies the
1958            // right convention: a modest shared marginal. Cap the marginal there
1959            // (basis ≈ degree+1+internal ≈ 10) for both flavours when the
1960            // small-group cap above is not already tighter, so REML is not handed
1961            // noise-fitting capacity it does not need. An explicit `k`/`basis_dim`
1962            // overrides this (parse_ps_internal_knots); `re` is the raw linear
1963            // effect handled above.
1964            let fs_default_internal = FACTOR_SMOOTH_DEFAULT_BASIS_DIM
1965                .saturating_sub(degree + 1)
1966                .max(1);
1967            capped.min(fs_default_internal)
1968        };
1969        let (n_knots, _, effective_degree) =
1970            parse_ps_internal_knots(options, degree, default_internal)?;
1971        let penalty_order = option_usize(options, "penalty_order")
1972            .unwrap_or(if effective_degree > 1 { 2 } else { 1 })
1973            .min(effective_degree);
1974        // All factor-smooth flavours (`fs`, `sz`, `re`) place their per-level
1975        // marginal on the SAME penalized B-spline (P-spline) basis. The flavours
1976        // differ ONLY in their penalty/constraint structure (handled below) —
1977        // sz: zero-sum deviation blocks with the per-level null space left
1978        // unpenalized; fs: random-effect double penalty; re: identity ridge.
1979        //
1980        // `sz` USED to route its default-degree marginal to a NATURAL cubic
1981        // regression spline (`cr`), on the belief that mgcv's `bs="sz"` does the
1982        // same and that cr recovers smooth signals more efficiently than the
1983        // (then uncapped) B-spline margin (#1074). That introduced a consistency
1984        // failure (#1605): the `cr` basis enforces the natural boundary
1985        // conditions f''(x_1)=f''(x_k)=0 and extrapolates linearly past the end
1986        // knots, so it CANNOT represent a per-group deviation curve with non-zero
1987        // curvature at the data boundary. Phase-shifted deviation shapes
1988        // (f''(0) = -(2π)² sin(φ) ≠ 0) are then biased toward "free linear +
1989        // anchored wiggle", under-shooting the amplitude — a bias that does NOT
1990        // vanish as n→∞ (n-independent: a genuine consistency failure, not
1991        // finite-sample shrinkage). The earlier #700/#1074 sz fixtures used
1992        // d_g ∝ sin(2πx), whose f'' happens to vanish at x=0 and x=1, so they
1993        // accidentally satisfied the natural BC and never exposed the gap; the
1994        // `fs` sibling, on this very B-spline marginal, recovers the SAME
1995        // phase-shifted data to the noise floor.
1996        //
1997        // The penalized B-spline marginal makes no boundary assumption, so it
1998        // represents arbitrary deviation shapes, and — with the
1999        // FACTOR_SMOOTH_DEFAULT_BASIS_DIM cap above already removing the
2000        // noise-fitting capacity that originally motivated leaving B-splines —
2001        // it recovers the BC-satisfying #700/#1074 signals just as well. Sharing
2002        // one marginal basis across all flavours also lets the B-spline degree/
2003        // knot degradation handle low-cardinality covariates uniformly (what
2004        // `fs` already does), so the `sz`-only cr data-support cap (#1541/#1542)
2005        // — and the asymmetry where only the cr-marginal `sz` spelling hard-
2006        // failed a 3-level ordinal — is no longer needed.
2007        let marginal_knotspec = resolve_nonperiodic_bspline_knotspec(
2008            options,
2009            ds.values.column(c),
2010            (minv, maxv),
2011            effective_degree,
2012            n_knots,
2013        )?;
2014        let marginal = BSplineBasisSpec {
2015            degree: effective_degree,
2016            penalty_order,
2017            knotspec: marginal_knotspec,
2018            // mgcv's `bs="fs"` is a random-effect-style smooth: EVERY per-level
2019            // coefficient, including the marginal null space, is penalized so
2020            // unobserved groups can be predicted — so `fs` keeps the null-space
2021            // (double) penalty. mgcv's `bs="sz"` is a pure across-level
2022            // *deviation* smooth that, under the default `select=FALSE`, leaves
2023            // the per-level null space UNPENALIZED; carrying the double penalty
2024            // there shrinks the genuine deviation signal and over-smooths the
2025            // recovered curves relative to mgcv (gam#700). `re` carries its own
2026            // identity ridge below and ignores this flag. Honour an explicit
2027            // user `double_penalty=` either way.
2028            double_penalty: option_bool(options, "double_penalty")
2029                .unwrap_or(type_opt.as_str() != "sz"),
2030            identifiability: BSplineIdentifiability::None,
2031            boundary_conditions: Default::default(),
2032            boundary: OneDimensionalBoundary::Open,
2033        };
2034        let flavour = match type_opt.as_str() {
2035            "fs" => FactorSmoothFlavour::Fs {
2036                m_null_penalty_orders: vec![
2037                    option_usize(options, "m").unwrap_or(DEFAULT_PENALTY_ORDER),
2038                ],
2039            },
2040            "sz" => FactorSmoothFlavour::Sz,
2041            "re" => FactorSmoothFlavour::Re,
2042            // Outer `matches!` already restricts to fs/sz/re.
2043            other => {
2044                return Err(format!(
2045                    "internal: factor-smooth flavour dispatch reached unexpected type `{}`",
2046                    other
2047                ));
2048            }
2049        };
2050        return Ok(SmoothBasisSpec::FactorSmooth {
2051            spec: FactorSmoothSpec {
2052                continuous_cols: vec![c],
2053                group_col: cols[group_idx],
2054                marginal,
2055                flavour,
2056                group_frozen_levels: None,
2057                frozen_global_orthogonality: None,
2058            },
2059        });
2060    }
2061
2062    match type_opt.as_str() {
2063        "cyclic" | "cc" | "cp" | "cyclic-ps" => {
2064            validate_known_options(
2065                "cyclic",
2066                options,
2067                &[
2068                    "type",
2069                    "bs",
2070                    "by",
2071                    "k",
2072                    "basis_dim",
2073                    "basis-dim",
2074                    "basisdim",
2075                    "degree",
2076                    "penalty_order",
2077                    "period",
2078                    "periods",
2079                    "period_start",
2080                    "period_end",
2081                    "start",
2082                    "end",
2083                    "origin",
2084                    "origins",
2085                    "period_origin",
2086                    "period-origin",
2087                    "domain_origin",
2088                    "double_penalty",
2089                    "id",
2090                    "__by_col",
2091                    "identifiability",
2092                ],
2093            )?;
2094            if cols.len() != 1 {
2095                return Err(format!(
2096                    "periodic smooth expects one variable, got {}",
2097                    cols.len()
2098                ));
2099            }
2100            let c = cols[0];
2101            let (minv, maxv) = col_minmax(ds.values.column(c))?;
2102            let degree = option_usize(options, "degree").unwrap_or(DEFAULT_BSPLINE_DEGREE);
2103            let mut default_internal = heuristic_knots_for_column(ds.values.column(c));
2104            if ds.values.nrows() <= 32 && smooth_coordinate_count >= 5 {
2105                default_internal = default_internal.min(1);
2106            }
2107            // A periodic cubic spline has no free endpoint behaviour to spend
2108            // degrees of freedom on: the wrap constraint removes the ordinary
2109            // boundary wiggle, and the cyclic second-difference penalty leaves
2110            // only the constant direction (handled by the smooth
2111            // identifiability constraint).  An over-rich default would give
2112            // small binomial/continuation-ratio fits a large penalized nuisance
2113            // space whose REML/LAML optimum is driven by finite-sample Bernoulli
2114            // noise rather than the low-frequency periodic signal.  Cap the
2115            // cyclic default in the mgcv `bs="cc"` spirit: a modest basis unless
2116            // the caller explicitly requests `k=...`; high-frequency periodic
2117            // structure remains available through that explicit contract.  Since
2118            // gam#1680 lowered the open-spline univariate default to ≈12
2119            // functions this cap and the open-spline default coincide, so it now
2120            // acts as an explicit ceiling/guard that keeps the cyclic default lean
2121            // even if the open-spline heuristic is later widened.
2122            let cyclic_default_basis_cap = CYCLIC_DEFAULT_BASIS_DIM.max(degree + 1);
2123            let default_basis = (default_internal + degree + 1).min(cyclic_default_basis_cap);
2124            let num_basis = option_usize_any(options, &["k", "basis_dim", "basis-dim", "basisdim"])
2125                .unwrap_or(default_basis);
2126            if num_basis < degree + 1 {
2127                return Err(format!(
2128                    "periodic smooth: k={} too small for degree {}; expected k >= {}",
2129                    num_basis,
2130                    degree,
2131                    degree + 1
2132                ));
2133            }
2134            // The cyclic arm is periodic on its single axis by construction, so
2135            // resolve the period exactly the way the `s()`/`ps` arm does: honour
2136            // `period=`/`periods=` first (with `origin=` setting the domain
2137            // start), and fall back to the `period_start`/`period_end` endpoint
2138            // form only when `period=` is absent. Previously this arm jumped
2139            // straight to `parse_periodic_domain_1d`, so a `period=<v>`
2140            // declaration was silently dropped and the smooth wrapped at the
2141            // data range (#816). All three helpers route through
2142            // `parse_numeric_expr`, so `period=2*pi` and `period_end=2*pi` parse
2143            // identically (#815).
2144            let periodic_axes = [true];
2145            let periods = parse_periods(options, &periodic_axes)?;
2146            let origins = parse_period_origins(options, &periodic_axes)?;
2147            // Distinguish a *cyclic basis selector* (`bs='cc'`/`'cp'`/`'cyclic'`,
2148            // this whole arm) from a generic B-spline forced periodic by a
2149            // `periodic=`/`boundary=` flag (the `ps`/`bspline` arm). Only the
2150            // latter carries the sample-dependent off-by-ε seam that #1771's
2151            // guard in `parse_periodic_domain_1d` requires an explicit period
2152            // to avoid. A bare `s(x, bs='cc')` opts INTO mgcv's `bs="cc"`
2153            // semantics — the wrap IS the observed data range — exactly like
2154            // the tensor cc-margin fallback (`te(x, z, bs=c('cc','cc'))`). The
2155            // cyclic arm was left routing through the now-strict helper when
2156            // #1771 tightened it, so a bare cyclic smooth hard-errored with
2157            // "periodic B-spline smooth requires an explicit period" even
2158            // though its period is well-defined. Honor `period=`/`periods=`
2159            // first, then the half-open `period_start`/`period_end` endpoint
2160            // form, and only otherwise wrap at the observed `[min, max]` span.
2161            let has_endpoint_decl = ["period_start", "start", "period_end", "end"]
2162                .iter()
2163                .any(|key| options.contains_key(*key));
2164            let (domain_start, period) = if let Some(p) = periods[0] {
2165                (origins[0].unwrap_or(minv), p)
2166            } else if has_endpoint_decl {
2167                parse_periodic_domain_1d(options, minv, maxv)?
2168            } else {
2169                let span = maxv - minv;
2170                if !(span.is_finite() && span > 0.0) {
2171                    return Err(format!(
2172                        "cyclic smooth requires a positive observed data range to derive \
2173                         its period, got [{minv}, {maxv}]"
2174                    ));
2175                }
2176                (origins[0].unwrap_or(minv), span)
2177            };
2178            Ok(SmoothBasisSpec::BSpline1D {
2179                feature_col: c,
2180                spec: BSplineBasisSpec {
2181                    degree,
2182                    penalty_order: option_usize(options, "penalty_order")
2183                        .unwrap_or(DEFAULT_PENALTY_ORDER),
2184                    knotspec: BSplineKnotSpec::PeriodicUniform {
2185                        data_range: (domain_start, domain_start + period),
2186                        num_basis,
2187                    },
2188                    double_penalty: smooth_double_penalty,
2189                    identifiability: BSplineIdentifiability::default(),
2190                    boundary_conditions: Default::default(),
2191                    boundary: OneDimensionalBoundary::Cyclic {
2192                        start: domain_start,
2193                        end: domain_start + period,
2194                    },
2195                },
2196            })
2197        }
2198        "bspline" | "ps" | "p-spline" | "cr" | "cs" => {
2199            // mgcv's `bs="cr"` (cubic regression spline) and `bs="cs"` (its
2200            // shrinkage twin) are penalized cubic-regression smooths that span
2201            // the same per-axis function space as gamfit's `bspline` (cubic
2202            // B-spline, second-derivative penalty). Route both through the
2203            // 1-D B-spline arm; the only semantic difference is whether the
2204            // null space is shrunk: `cr` is the no-shrinkage form (mgcv's
2205            // default) and `cs` is the shrinkage form (mgcv's `cs`/gamfit's
2206            // double_penalty). Without this route, a stand-alone
2207            // `s(x, bs='cr')` (which is otherwise a routine 1-D smooth in
2208            // mgcv-compatible formulae) reached the dispatch's default arm
2209            // and aborted the whole fit with `unsupported smooth type 'cr'`,
2210            // even though the same name was already recognized as a tensor
2211            // margin (`tensor_margin_bs_is_supported`).
2212            let validation_name = match type_opt.as_str() {
2213                "cr" => "cr",
2214                "cs" => "cs",
2215                _ => "bspline",
2216            };
2217            validate_known_options(
2218                validation_name,
2219                options,
2220                &[
2221                    "type",
2222                    "bs",
2223                    "by",
2224                    "k",
2225                    "basis_dim",
2226                    "basis-dim",
2227                    "basisdim",
2228                    "knots",
2229                    "knot_placement",
2230                    "knot-placement",
2231                    "knotplacement",
2232                    "degree",
2233                    "penalty_order",
2234                    "boundary",
2235                    "bc",
2236                    "boundary_conditions",
2237                    "bc_left",
2238                    "bc_right",
2239                    "left_bc",
2240                    "right_bc",
2241                    "start_bc",
2242                    "end_bc",
2243                    "side",
2244                    "anchor",
2245                    "anchor_value",
2246                    "value",
2247                    "anchor_left",
2248                    "left_anchor",
2249                    "anchor_right",
2250                    "right_anchor",
2251                    "periodic",
2252                    "period",
2253                    "periods",
2254                    "period_start",
2255                    "period_end",
2256                    "origin",
2257                    "double_penalty",
2258                    "by",
2259                    "id",
2260                    "__by_col",
2261                    "identifiability",
2262                    "by",
2263                ],
2264            )?;
2265            if cols.len() != 1 {
2266                return Err(TermBuilderError::incompatible_config(format!(
2267                    "bspline smooth expects one variable, got {}",
2268                    cols.len()
2269                ))
2270                .to_string());
2271            }
2272            let c = cols[0];
2273            let (minv, maxv) = col_minmax(ds.values.column(c))?;
2274            let degree = option_usize(options, "degree").unwrap_or(DEFAULT_BSPLINE_DEGREE);
2275            let default_internal = heuristic_knots_for_column(ds.values.column(c));
2276            let (mut n_knots, inferred, effective_degree) =
2277                parse_ps_internal_knots(options, degree, default_internal)?;
2278            let periodic_axes = parse_periodic_axes(options, 1).map_err(|e| e.to_string())?;
2279            // Periodic margins still need enough basis functions to wrap, so
2280            // surface the per-axis degree reduction as a config error when the
2281            // user explicitly asked for a periodic-but-too-small basis. The
2282            // non-periodic path silently degrades degree to match mgcv.
2283            if periodic_axes[0] && effective_degree != degree {
2284                return Err(TermBuilderError::invalid_option(format!(
2285                    "periodic smooth: k={} too small for degree {}; expected k >= {}",
2286                    effective_degree + 1,
2287                    degree,
2288                    degree + 1
2289                ))
2290                .to_string());
2291            }
2292            if inferred && ds.values.nrows() <= 32 && smooth_coordinate_count >= 5 {
2293                n_knots = n_knots.min(1);
2294            }
2295            if inferred {
2296                let unique = unique_count_column(ds.values.column(c));
2297                let ceiling = ((unique as f64).cbrt() as usize).max(20);
2298                inference_notes.push(format!(
2299                    "Automatically set {} internal knots for smooth '{}' from {} unique values (rule: clamp(unique/4, 4..max(20, cbrt(unique))) = clamp(unique/4, 4..{})). Override with knots=... or k=....",
2300                    n_knots,
2301                    vars.join(","),
2302                    unique,
2303                    ceiling,
2304                ));
2305            }
2306            let boundary_conditions =
2307                if periodic_axes[0] && bspline_boundary_declares_periodic_axis(options) {
2308                    BSplineBoundaryConditions::default()
2309                } else {
2310                    parse_bspline_boundary_conditions(options).map_err(|e| e.to_string())?
2311                };
2312            let periods = parse_periods(options, &periodic_axes).map_err(|e| e.to_string())?;
2313            let origins =
2314                parse_period_origins(options, &periodic_axes).map_err(|e| e.to_string())?;
2315            let (knotspec, boundary) = if periodic_axes[0] {
2316                if !boundary_conditions.is_free() {
2317                    return Err(TermBuilderError::incompatible_config(
2318                        "periodic B-splines cannot also declare endpoint boundary conditions",
2319                    )
2320                    .to_string());
2321                }
2322                {
2323                    let (domain_start, p_value) = if periods[0].is_some() {
2324                        (origins[0].unwrap_or(minv), periods[0].unwrap())
2325                    } else {
2326                        parse_periodic_domain_1d(options, minv, maxv).map_err(|e| e.to_string())?
2327                    };
2328                    let domain_end = domain_start + p_value;
2329                    (
2330                        BSplineKnotSpec::PeriodicUniform {
2331                            data_range: (domain_start, domain_end),
2332                            num_basis: n_knots + effective_degree + 1,
2333                        },
2334                        OneDimensionalBoundary::Cyclic {
2335                            start: domain_start,
2336                            end: domain_end,
2337                        },
2338                    )
2339                }
2340            } else if type_opt == "cr" || type_opt == "cs" {
2341                // mgcv `bs="cr"`/`"cs"`: a natural cubic regression spline whose
2342                // basis is indexed by `k` values at quantile-placed knots (#1074),
2343                // NOT a B-spline knot vector. Match gam's `k=` convention by
2344                // requesting the same total basis size the B-spline arm would
2345                // produce (`n_knots` internal + degree + 1), floored at the cr
2346                // minimum of 3 knots. `cr` vs `cs` (shrinkage) is carried by the
2347                // `double_penalty` flag resolved below, which the cr builder reads.
2348                //
2349                // Cap that request to the covariate's data support (#1541): a cr
2350                // basis cannot place more value-knots than there are distinct
2351                // covariate values, so an unclamped `k` on a low-cardinality
2352                // predictor (binary indicator, 3-level ordinal, small count) used
2353                // to hard-fail in `select_cr_knots` instead of reducing like mgcv
2354                // and gam's tensor path. Below the cr minimum (a binary covariate)
2355                // degrade to the B-spline marginal the default `s(x, k=..)` basis
2356                // already fits on the same data — never a hard error.
2357                let k_cr = (n_knots + effective_degree + 1).max(CR_MIN_KNOTS);
2358                let knotspec = match capped_cr_marginal_knotspec(
2359                    ds.values.column(c),
2360                    k_cr,
2361                    &vars.join(","),
2362                    inference_notes,
2363                )? {
2364                    Some(cr_knotspec) => cr_knotspec,
2365                    None => resolve_nonperiodic_bspline_knotspec(
2366                        options,
2367                        ds.values.column(c),
2368                        (minv, maxv),
2369                        effective_degree,
2370                        n_knots,
2371                    )?,
2372                };
2373                (knotspec, parse_cyclic_boundary(options, minv, maxv)?)
2374            } else {
2375                (
2376                    resolve_nonperiodic_bspline_knotspec(
2377                        options,
2378                        ds.values.column(c),
2379                        (minv, maxv),
2380                        effective_degree,
2381                        n_knots,
2382                    )?,
2383                    parse_cyclic_boundary(options, minv, maxv)?,
2384                )
2385            };
2386            // mgcv `bs="cr"` does not shrink the linear null space; only `cs`
2387            // (and the gamfit-flavoured `bspline`/`ps`) do. Honour an explicit
2388            // `double_penalty=` either way.
2389            let double_penalty = if type_opt == "cr" {
2390                option_bool(options, "double_penalty").unwrap_or(false)
2391            } else {
2392                smooth_double_penalty
2393            };
2394            // Clamp the marginal difference penalty to `<= effective_degree`
2395            // so it stays well-defined when the per-axis degree was reduced
2396            // (mirrors the tensor margin path: `create_difference_penalty_matrix`
2397            // requires order < num_basis_functions).
2398            let penalty_order = option_usize(options, "penalty_order")
2399                .unwrap_or(DEFAULT_PENALTY_ORDER)
2400                .min(effective_degree);
2401            Ok(SmoothBasisSpec::BSpline1D {
2402                feature_col: c,
2403                spec: BSplineBasisSpec {
2404                    degree: effective_degree,
2405                    penalty_order,
2406                    knotspec,
2407                    double_penalty,
2408                    identifiability: BSplineIdentifiability::default(),
2409                    boundary,
2410                    boundary_conditions,
2411                },
2412            })
2413        }
2414        "tps" | "thinplate" | "thin-plate" => {
2415            validate_known_options(
2416                "thinplate",
2417                options,
2418                &[
2419                    SECONDARY_CENTER_CAP_OPTION,
2420                    "type",
2421                    "bs",
2422                    "by",
2423                    "length_scale",
2424                    "centers",
2425                    "k",
2426                    "basis_dim",
2427                    "basis-dim",
2428                    "basisdim",
2429                    "knots",
2430                    "include_intercept",
2431                    "double_penalty",
2432                    "by",
2433                    "id",
2434                    "__by_col",
2435                    "identifiability",
2436                    "by",
2437                    "periodic",
2438                    "cyclic",
2439                    "period",
2440                    "period_start",
2441                    "period_end",
2442                    "scale_dims",
2443                ],
2444            )?;
2445            let plan = plan_spatial_basis(
2446                ds.values.nrows(),
2447                cols.len(),
2448                CenterCountRequest::Default,
2449                DuchonNullspaceOrder::Linear,
2450                option_bool(options, "scale_dims").unwrap_or(false),
2451                policy,
2452            )
2453            .map_err(|e| e.to_string())?;
2454            // #1074: the mgcv-sized basis cap (`k = 10·3^(d-1)`) that used to live
2455            // here was DELETED. It masked the real defect — the n-scaling default
2456            // over-sizes a thin-plate field, producing a weakly-identified
2457            // two-penalty ρ-surface the outer optimizer stalls on (row-order
2458            // dependent, #1378), and surplus columns REML can't penalize away on
2459            // weak-signal fits. Capping the basis hid that stall instead of fixing
2460            // it. The default now uses the generic spatial center heuristic; the
2461            // root fix (a well-identified ρ-surface / optimizer that doesn't stall)
2462            // is tracked separately. Explicit `k`/`centers` still take full effect.
2463            let default_centers = plan.centers;
2464            let centers = parse_countwith_basis_alias(
2465                options,
2466                "centers",
2467                cap_default_spatial_centers(options, default_centers),
2468            )?;
2469            let center_strategy = if has_explicit_countwith_basis_alias(options, "centers") {
2470                spatial_center_strategy_for_dimension(centers, cols.len())
2471            } else {
2472                auto_spatial_center_strategy(centers, cols.len())
2473            };
2474            Ok(SmoothBasisSpec::ThinPlate {
2475                feature_cols: cols.to_vec(),
2476                spec: ThinPlateBasisSpec {
2477                    center_strategy,
2478                    periodic: parse_periodic_axes_option(options, cols.len())?,
2479                    // Sentinel: leave at 0.0 when the user didn't pass an
2480                    // explicit length_scale so `auto_init_length_scale_in_place`
2481                    // can replace it with a data-derived initialization. The
2482                    // old hard-coded 1.0 was the documented basin (see
2483                    // smooth.rs `auto_init_length_scale_in_place`) that the
2484                    // spatial optimizer could not escape, leaving TPS terms
2485                    // initialized off the data scale.
2486                    length_scale: option_f64(options, "length_scale").unwrap_or(0.0),
2487                    double_penalty: smooth_double_penalty,
2488                    identifiability: parse_spatial_identifiability(options)
2489                        .map_err(|e| e.to_string())?,
2490                    radial_reparam: None,
2491                },
2492                input_scales: None,
2493            })
2494        }
2495        "sphere" | "s2" | "sos" => {
2496            validate_known_options(
2497                "sphere",
2498                options,
2499                &[
2500                    "type",
2501                    "bs",
2502                    "by",
2503                    "centers",
2504                    "k",
2505                    "basis_dim",
2506                    "basis-dim",
2507                    "basisdim",
2508                    "knots",
2509                    "penalty_order",
2510                    "m",
2511                    "double_penalty",
2512                    "id",
2513                    "__by_col",
2514                    "kernel",
2515                    "method",
2516                    "radians",
2517                    "units",
2518                    "degree",
2519                    "l",
2520                    "max_degree",
2521                    "max-degree",
2522                ],
2523            )?;
2524            if cols.len() != 2 {
2525                return Err(format!(
2526                    "sphere smooth expects exactly two variables (lat, lon), got {}",
2527                    cols.len()
2528                ));
2529            }
2530            let radians = option_bool(options, "radians").unwrap_or_else(|| {
2531                options
2532                    .get("units")
2533                    .map(|u| u.eq_ignore_ascii_case("radian") || u.eq_ignore_ascii_case("radians"))
2534                    .unwrap_or(false)
2535            });
2536            // An explicit `degree`/`l`/`max_degree` names a spherical-harmonic
2537            // truncation, so with no explicit kernel/method it selects the
2538            // Harmonic construction (the Wahba kernel ignores `degree` and would
2539            // silently emit a 1-column kernel design). An explicit kernel/method
2540            // still wins.
2541            let degree_requested = options.contains_key("degree")
2542                || options.contains_key("l")
2543                || options.contains_key("max_degree")
2544                || options.contains_key("max-degree");
2545            let kernel = options
2546                .get("kernel")
2547                .or_else(|| options.get("method"))
2548                .map(|raw| strip_quotes(raw).trim().to_ascii_lowercase())
2549                .unwrap_or_else(|| {
2550                    if degree_requested {
2551                        "harmonic".to_string()
2552                    } else {
2553                        "sobolev".to_string()
2554                    }
2555                });
2556            let (method, wahba_kernel) = match kernel.as_str() {
2557                "sobolev" | "wahba" | "wahba_sobolev" | "wahba-sobolev" => {
2558                    (SphereMethod::Wahba, SphereWahbaKernel::Sobolev)
2559                }
2560                "pseudo" | "mgcv" | "sos" | "wahba_pseudo" | "wahba-pseudo" => {
2561                    (SphereMethod::Wahba, SphereWahbaKernel::Pseudo)
2562                }
2563                "harmonic" | "spherical_harmonic" | "spherical-harmonic" => {
2564                    (SphereMethod::Harmonic, SphereWahbaKernel::Sobolev)
2565                }
2566                other => {
2567                    return Err(format!(
2568                        "unsupported sphere kernel '{other}'; expected sobolev, pseudo, or harmonic"
2569                    ));
2570                }
2571            };
2572            let max_degree = if matches!(method, SphereMethod::Harmonic) {
2573                let degree =
2574                    option_usize_any(options, &["degree", "l", "max_degree", "max-degree"])
2575                        .or_else(|| option_usize(options, "centers"))
2576                        .or_else(|| {
2577                            option_usize_any(options, &["k", "basis_dim", "basis-dim", "basisdim"])
2578                                .and_then(|k| (1..=128).find(|&l| l * (l + 2) >= k))
2579                        })
2580                        .unwrap_or_else(|| default_spherical_harmonic_degree(ds.values.nrows()));
2581                if degree == 0 {
2582                    return Err("sphere smooth requires degree/max_degree >= 1".to_string());
2583                }
2584                if degree > 32 {
2585                    return Err(format!(
2586                        "sphere smooth max_degree={} is too large for the dense harmonic engine (limit 32)",
2587                        degree
2588                    ));
2589                }
2590                Some(degree)
2591            } else {
2592                None
2593            };
2594            let penalty_order = option_usize(options, "penalty_order")
2595                .or_else(|| option_usize(options, "m"))
2596                .unwrap_or(DEFAULT_PENALTY_ORDER);
2597            let center_strategy = if matches!(method, SphereMethod::Wahba) {
2598                let mut centers = parse_countwith_basis_alias(
2599                    options,
2600                    "centers",
2601                    default_num_centers(ds.values.nrows(), cols.len()),
2602                )?;
2603                if penalty_order >= 4 {
2604                    centers = centers.max(30);
2605                }
2606                CenterStrategy::FarthestPoint {
2607                    num_centers: centers,
2608                }
2609            } else {
2610                CenterStrategy::FarthestPoint { num_centers: 0 }
2611            };
2612            Ok(SmoothBasisSpec::Sphere {
2613                feature_cols: cols.to_vec(),
2614                spec: SphericalSplineBasisSpec {
2615                    center_strategy,
2616                    penalty_order,
2617                    double_penalty: smooth_double_penalty,
2618                    radians,
2619                    method,
2620                    max_degree,
2621                    wahba_kernel,
2622                    identifiability: SphericalSplineIdentifiability::CenterSumToZero,
2623                },
2624            })
2625        }
2626        "curvature" => {
2627            // Constant-curvature (M_κ) geodesic-kernel smooth (#944): the
2628            // κ-generic sibling of the intrinsic S² smooth above. The feature
2629            // columns are κ-stereographic chart coordinates; `kappa=` is the
2630            // fixed sectional curvature (default 0 = flat), and the geometry
2631            // comes from `geometry::constant_curvature::ConstantCurvature`.
2632            validate_known_options(
2633                "curvature",
2634                options,
2635                &[
2636                    "type",
2637                    "bs",
2638                    "by",
2639                    "centers",
2640                    "k",
2641                    "basis_dim",
2642                    "basis-dim",
2643                    "basisdim",
2644                    "knots",
2645                    "kappa",
2646                    "length_scale",
2647                    "double_penalty",
2648                    "id",
2649                    "__by_col",
2650                ],
2651            )?;
2652            let kappa = option_f64(options, "kappa").unwrap_or(0.0);
2653            if !kappa.is_finite() {
2654                return Err("curvature smooth requires a finite kappa".to_string());
2655            }
2656            let length_scale = option_f64(options, "length_scale").unwrap_or(0.0);
2657            if !length_scale.is_finite() || length_scale < 0.0 {
2658                return Err(format!(
2659                    "curvature smooth length_scale must be positive (or omitted for auto); got {length_scale}"
2660                ));
2661            }
2662            let centers = parse_countwith_basis_alias(
2663                options,
2664                "centers",
2665                default_num_centers(ds.values.nrows(), cols.len()),
2666            )?;
2667            if centers < 2 {
2668                return Err("curvature smooth requires at least 2 centers".to_string());
2669            }
2670            Ok(SmoothBasisSpec::ConstantCurvature {
2671                feature_cols: cols.to_vec(),
2672                spec: ConstantCurvatureBasisSpec {
2673                    center_strategy: CenterStrategy::FarthestPoint {
2674                        num_centers: centers,
2675                    },
2676                    kappa,
2677                    // 0.0 sentinel = κ-independent auto initialization in the
2678                    // basis builder (median chart center spacing, doubled).
2679                    length_scale,
2680                    // Curvature smooth defaults to NO double-penalty ridge
2681                    // (#1464): the curvature-blind ridge `I` absorbs the data fit
2682                    // independently of κ and rails the fitted curvature to the
2683                    // +chart bound (hyperbolic truth recovered as spherical). The
2684                    // RKHS Gram penalty is already full-rank PD, so the ridge adds
2685                    // no stability. Honour an EXPLICIT `double_penalty=` only.
2686                    double_penalty: option_bool(options, "double_penalty").unwrap_or(false),
2687                    identifiability: ConstantCurvatureIdentifiability::CenterSumToZero,
2688                },
2689            })
2690        }
2691        "measurejet" => {
2692            // Measure-jet spline: multiscale local-jet-residual energy of the
2693            // empirical measure. The feature columns are ambient coordinates
2694            // of data concentrated near an unknown low-dimensional set; the
2695            // geometry (centers, masses, scale band) is read off the measure
2696            // at build time — magic by default, every option optional.
2697            validate_known_options(
2698                "measurejet",
2699                options,
2700                &[
2701                    "type",
2702                    "bs",
2703                    "by",
2704                    "centers",
2705                    "k",
2706                    "basis_dim",
2707                    "basis-dim",
2708                    "basisdim",
2709                    "knots",
2710                    "s",
2711                    "alpha",
2712                    "tau",
2713                    "scales",
2714                    "length_scale",
2715                    "double_penalty",
2716                    "multiscale",
2717                    "learn_length_scale",
2718                    "id",
2719                    "__by_col",
2720                ],
2721            )?;
2722            let order_s = option_f64(options, "s").unwrap_or(0.0);
2723            // 0.0 = auto sentinel; explicit values must sit inside the
2724            // admissible order interval of the affine-jet (r = 2) energy.
2725            if !(order_s.is_finite() && (order_s == 0.0 || (order_s > 0.0 && order_s < 2.0))) {
2726                return Err(format!(
2727                    "measurejet smooth s must lie in (0, 2) (or be omitted for auto); got {order_s}"
2728                ));
2729            }
2730            // Default to the spec Default (α = 1, density-WEIGHTED Hessian
2731            // energy — the module-header default). The density-free α = 3/2
2732            // (q^{−2}) over-smooths low-intrinsic-dimension manifolds where the
2733            // local mass q is tiny and varies along the stratum (#1116:
2734            // 13×-worse-than-matérn on a 1-D curve in 3-D); α = 1's q^{−1} is
2735            // gentler and robust across intrinsic dimensions. An explicit
2736            // `alpha=` still overrides for full-dimensional density-free use.
2737            let alpha =
2738                option_f64(options, "alpha").unwrap_or(MeasureJetBasisSpec::default().alpha);
2739            if !alpha.is_finite() {
2740                return Err("measurejet smooth requires a finite alpha".to_string());
2741            }
2742            let tau0 = option_f64(options, "tau").unwrap_or(1e-3);
2743            if !(tau0.is_finite() && tau0 >= 0.0) {
2744                return Err(format!(
2745                    "measurejet smooth tau must be finite and nonnegative; got {tau0}"
2746                ));
2747            }
2748            let num_scales = option_usize(options, "scales").unwrap_or(0);
2749            let length_scale = option_f64(options, "length_scale").unwrap_or(0.0);
2750            if !length_scale.is_finite() || length_scale < 0.0 {
2751                return Err(format!(
2752                    "measurejet smooth length_scale must be positive (or omitted for auto); got {length_scale}"
2753                ));
2754            }
2755            let centers = parse_countwith_basis_alias(
2756                options,
2757                "centers",
2758                default_num_centers(ds.values.nrows(), cols.len()),
2759            )?;
2760            if centers < 3 {
2761                return Err("measurejet smooth requires at least 3 centers".to_string());
2762            }
2763            // Multiscale (per-scale spectral split + (α, lnτ) ψ dials + the
2764            // affine-preserving ridge) is an explicit opt-in (#1116): default
2765            // single-scale at any center count, the Duchon/Matérn footprint.
2766            let multiscale = option_bool(options, "multiscale").unwrap_or(false);
2767            // REML-learning the representer range ℓ is an explicit opt-in.
2768            // The stable default freezes ℓ at the auto/user value; the
2769            // design-moving coordinate is expensive and can overfit low-signal
2770            // surfaces when enabled implicitly.
2771            let learn_length_scale = option_bool(options, "learn_length_scale").unwrap_or(false);
2772            Ok(SmoothBasisSpec::MeasureJet {
2773                feature_cols: cols.to_vec(),
2774                spec: MeasureJetBasisSpec {
2775                    center_strategy: CenterStrategy::FarthestPoint {
2776                        num_centers: centers,
2777                    },
2778                    order_s,
2779                    alpha,
2780                    tau0,
2781                    num_scales,
2782                    // 0.0 sentinel = auto initialization in the basis builder
2783                    // (median nearest-center spacing).
2784                    length_scale,
2785                    double_penalty: smooth_double_penalty,
2786                    learn_length_scale,
2787                    multiscale,
2788                    identifiability: MeasureJetIdentifiability::CenterSumToZero,
2789                    frozen_quadrature: None,
2790                },
2791                input_scales: None,
2792            })
2793        }
2794        "matern" => {
2795            // Catch typos like `lengt_scale=` / `nyu=` / `centerz=` before
2796            // they get silently ignored and the user wonders why their
2797            // option had no effect. The matern() term accepts exactly
2798            // these options.
2799            validate_known_options(
2800                "matern",
2801                options,
2802                &[
2803                    SECONDARY_CENTER_CAP_OPTION,
2804                    "type",
2805                    "bs",
2806                    "by",
2807                    "nu",
2808                    "length_scale",
2809                    "centers",
2810                    "k",
2811                    "basis_dim",
2812                    "basis-dim",
2813                    "basisdim",
2814                    "knots",
2815                    "include_intercept",
2816                    "double_penalty",
2817                    "by",
2818                    "id",
2819                    "__by_col",
2820                    "identifiability",
2821                    "by",
2822                    "periodic",
2823                    "cyclic",
2824                    "period",
2825                    "period_start",
2826                    "period_end",
2827                    "scale_dims",
2828                ],
2829            )?;
2830            let plan = plan_spatial_basis(
2831                ds.values.nrows(),
2832                cols.len(),
2833                CenterCountRequest::Default,
2834                DuchonNullspaceOrder::Zero,
2835                option_bool(options, "scale_dims").unwrap_or(false),
2836                policy,
2837            )
2838            .map_err(|e| e.to_string())?;
2839            let centers = parse_countwith_basis_alias(
2840                options,
2841                "centers",
2842                cap_default_spatial_centers(
2843                    options,
2844                    default_matern_center_count(ds.values.nrows(), cols.len(), plan.centers),
2845                ),
2846            )?;
2847            let center_strategy = if has_explicit_countwith_basis_alias(options, "centers") {
2848                spatial_center_strategy_for_dimension(centers, cols.len())
2849            } else {
2850                auto_spatial_center_strategy(centers, cols.len())
2851            };
2852            let nu = parse_matern_nu(options.get("nu").map(String::as_str).unwrap_or("5/2"))?;
2853            // The exponential (ν = 1/2) Matérn kernel has a singular Laplacian
2854            // at zero in d ≥ 2, so the operator-collocation penalty machinery
2855            // hits a non-invertible matrix during fit. Surface the cause
2856            // up-front instead of letting the user see the generic
2857            // "Matrix conditioning issue detected" wrapper from PIRLS.
2858            if matches!(nu, MaternNu::Half) && cols.len() >= 2 {
2859                return Err(TermBuilderError::unsupported_feature(format!(
2860                    "matern() with nu=1/2 is not supported for d>=2 (got {} covariates): \
2861                     the exponential kernel's Laplacian is singular at center collisions, \
2862                     which makes the operator-collocation penalty non-invertible. \
2863                     Choose nu>=3/2 (e.g. nu=3/2 or the default nu=5/2) for multi-dimensional smooths.",
2864                    cols.len()
2865                ))
2866                .to_string());
2867            }
2868            let aniso_log_scales = if option_bool(options, "scale_dims").unwrap_or(false) {
2869                Some(vec![0.0; cols.len()])
2870            } else {
2871                None
2872            };
2873            Ok(SmoothBasisSpec::Matern {
2874                feature_cols: cols.to_vec(),
2875                spec: MaternBasisSpec {
2876                    center_strategy,
2877                    periodic: parse_periodic_axes_option(options, cols.len())?,
2878                    // Sentinel: leave at 0.0 when the user didn't pass an
2879                    // explicit length_scale so the planner's
2880                    // `auto_init_length_scale_in_place` can replace it with the
2881                    // SAME data-derived wiggly-side initialization the thin-plate
2882                    // path uses (`max_range / sqrt(n)`), then let the κ-optimizer
2883                    // refine from there.
2884                    //
2885                    // gam#1629: the previous `default_matern_length_scale` seeded
2886                    // the FULL data diameter — the maximally over-smoothed corner.
2887                    // Because that value is non-zero, the `0.0`-gated auto-init was
2888                    // a no-op for Matérn, so the κ-optimizer started in the flat
2889                    // over-smoothed basin and parked there, leaving high-frequency
2890                    // 2-D surfaces unresolved (truth-RMSE ~6× worse than
2891                    // thin-plate/tensor on identical data, and insensitive to `k`).
2892                    // Routing Matérn through the same `0.0` sentinel as thin-plate
2893                    // (see the ThinPlate branch above) starts REML in the resolving
2894                    // regime it can actually escape from.
2895                    length_scale: option_f64(options, "length_scale").unwrap_or(0.0),
2896                    nu,
2897                    include_intercept: option_bool(options, "include_intercept").unwrap_or(false),
2898                    double_penalty: smooth_double_penalty,
2899                    identifiability: parse_matern_identifiability(options)
2900                        .map_err(|e| e.to_string())?,
2901                    aniso_log_scales,
2902                    // Cold build: let the bootstrap-κ spectral test decide whether
2903                    // the double-penalty nullspace shrinkage survives; the freeze
2904                    // step then pins that decision into the FrozenTransform so the
2905                    // κ-optimizer's rebuilds keep the count invariant (gam#787/#860).
2906                    nullspace_shrinkage_survived: None,
2907                },
2908                input_scales: None,
2909            })
2910        }
2911        "duchon" | "ds" => {
2912            validate_known_options(
2913                "duchon",
2914                options,
2915                &[
2916                    SECONDARY_CENTER_CAP_OPTION,
2917                    "type",
2918                    "bs",
2919                    "by",
2920                    "length_scale",
2921                    "centers",
2922                    "k",
2923                    "basis_dim",
2924                    "basis-dim",
2925                    "basisdim",
2926                    "knots",
2927                    "power",
2928                    "p",
2929                    "nullspace_order",
2930                    "order",
2931                    "identifiability",
2932                    "by",
2933                    "periodic",
2934                    "cyclic",
2935                    "period",
2936                    "period_start",
2937                    "period_end",
2938                    "scale_dims",
2939                    "double_penalty",
2940                    "by",
2941                    "id",
2942                    "__by_col",
2943                ],
2944            )?;
2945            if options.contains_key("double_penalty") {
2946                return Err(TermBuilderError::incompatible_config(format!(
2947                    "Duchon smooth '{}' does not support double_penalty; the Duchon smoother already ships its native reproducing-norm penalty plus a null-space shrinkage ridge.",
2948                    vars.join(", ")
2949                ))
2950                .to_string());
2951            }
2952            let requested_nullspace_order = parse_duchon_order(options)?;
2953            let length_scale = option_f64_strict(options, "length_scale")?;
2954            // Resolve `(nullspace_order, power)`. The default (magic) path is a
2955            // structural amplitude/slope/curvature smoother: an affine (`Linear`)
2956            // polynomial nullspace and spectral power `s = (d - 1)/2`, giving the
2957            // cubic kernel `r^3` in 1D. There is no nullspace-order escalation —
2958            // the structural cubic smoother is well-defined for every dimension.
2959            //
2960            // Explicit `power=...` honors the user's value verbatim against their
2961            // requested nullspace order; the kernel validator emits a precise
2962            // diagnostic for any inadmissible combination. In the scale-free
2963            // (non-hybrid) regime fractional powers are admitted and threaded as
2964            // `f64`. The hybrid Duchon-Matérn kernel (`length_scale=Some`) is
2965            // restricted to integer powers.
2966            let (nullspace_order, power) = match parse_duchon_power_policy(options)? {
2967                DuchonPowerPolicy::Explicit(req_power) => {
2968                    if length_scale.is_some() && req_power.fract() != 0.0 {
2969                        return Err(TermBuilderError::incompatible_config(format!(
2970                            "hybrid Duchon-Matern smooth '{}' (length_scale=...) requires an integer power, got power={}; \
2971                             drop length_scale to use the scale-free structural kernel with a fractional power.",
2972                            vars.join(", "),
2973                            req_power,
2974                        ))
2975                        .to_string());
2976                    }
2977                    (requested_nullspace_order, req_power)
2978                }
2979                DuchonPowerPolicy::CubicStructuralDefault => {
2980                    // Magic cubic rule (REQUEST-LAYER default): no explicit power ⇒
2981                    // affine null space + fractional spectral power s = (d-1)/2, i.e.
2982                    // the Duchon kernel φ(r)=r³ in every dimension. An EXPLICIT
2983                    // `power=0` is handled above and is honored as the s=0 Duchon
2984                    // kernel (r²·log r ≡ the thin-plate kernel in even d) — the magic
2985                    // default lives here, not in the basis builder.
2986                    match length_scale {
2987                        None => crate::basis::duchon_cubic_default(cols.len()),
2988                        Some(_) => {
2989                            // The hybrid Matérn-blended kernel (`length_scale=Some`)
2990                            // requires an INTEGER spectral power `s` (the partial-
2991                            // fraction split `1/(ρ^{2p}(κ²+ρ²)^s)` is only defined for
2992                            // integer `s`). The fractional cubic default `s=(d-1)/2` is
2993                            // a half-integer for even `d`, and the basis builder's
2994                            // `power_as_usize` maps a NON-integer to `0` (not its
2995                            // floor) — so for even `d ≥ 4` the realized kernel has
2996                            // `2(p+s) = 2p = 4 ≤ d`, which is non-finite at the origin
2997                            // and crashes the fit (historically a non-finite
2998                            // eigendecomposition; now a fit-time validation error).
2999                            //
3000                            // Rather than emit the fractional cubic and let it truncate
3001                            // into an inadmissible kernel, resolve the SMALLEST
3002                            // admissible integer `(nullspace, s)` at the requested
3003                            // nullspace order. The formula default is the same
3004                            // native-Gram Duchon smoother as the scale-free path, so
3005                            // there is no collocation-operator floor to honor here.
3006                            // Users that opt into operator penalties get the stricter
3007                            // gate at basis-build time from the requested operators.
3008                            let max_op = crate::basis::duchon_max_active_operator_derivative_order(
3009                                &DuchonOperatorPenaltySpec::all_disabled(),
3010                            );
3011                            let (ns, s) = crate::basis::resolve_duchon_orders(
3012                                cols.len(),
3013                                requested_nullspace_order,
3014                                max_op,
3015                                length_scale,
3016                            );
3017                            (ns, s as f64)
3018                        }
3019                    }
3020                }
3021            };
3022            let plan = plan_spatial_basis(
3023                ds.values.nrows(),
3024                cols.len(),
3025                CenterCountRequest::Default,
3026                nullspace_order,
3027                option_bool(options, "scale_dims").unwrap_or(false),
3028                policy,
3029            )
3030            .map_err(|e| e.to_string())?;
3031            let centers_explicit = has_explicit_countwith_basis_alias(options, "centers");
3032            let polynomial_cols = match nullspace_order {
3033                DuchonNullspaceOrder::Zero => 1,
3034                DuchonNullspaceOrder::Linear => cols.len() + 1,
3035                DuchonNullspaceOrder::Degree(degree) => {
3036                    crate::basis::duchon_nullspace_dimension(cols.len(), degree)
3037                }
3038            };
3039            let default_centers = default_duchon_center_count(
3040                ds.values.nrows(),
3041                cols.len(),
3042                plan.centers,
3043                polynomial_cols,
3044            );
3045            let requested_centers = parse_countwith_basis_alias(
3046                options,
3047                "centers",
3048                cap_default_spatial_centers(options, default_centers),
3049            )?;
3050            if requested_centers <= polynomial_cols {
3051                return Err(TermBuilderError::incompatible_config(format!(
3052                    "Duchon smooth '{}' requested basis dimension {} but order={:?} in {}D needs {} polynomial null-space columns; choose centers/k > {}",
3053                    vars.join(", "),
3054                    requested_centers,
3055                    nullspace_order,
3056                    cols.len(),
3057                    polynomial_cols,
3058                    polynomial_cols,
3059                ))
3060                .to_string());
3061            }
3062            let mut centers = requested_centers;
3063            if !centers_explicit && ds.values.nrows() <= 32 && smooth_coordinate_count >= 5 {
3064                centers = centers.max(polynomial_cols + 4);
3065            }
3066            let center_strategy = if centers_explicit {
3067                spatial_center_strategy_for_dimension(centers, cols.len())
3068            } else {
3069                auto_spatial_center_strategy(centers, cols.len())
3070            };
3071            let aniso_log_scales = if option_bool(options, "scale_dims").unwrap_or(false) {
3072                Some(vec![0.0; cols.len()])
3073            } else {
3074                None
3075            };
3076            // Formula-level `duchon(...)` is the native Duchon reproducing-norm
3077            // smoother: the always-on Primary Gram plus the polynomial trend
3078            // ridge. Do not silently add collocated mass/tension penalties here.
3079            // They add extra REML hyperparameters and an O(k)-support quadrature
3080            // build to the default 2-D path, making `duchon(x, z)` materially
3081            // slower than the equivalent thin-plate fit without a principled
3082            // accuracy gain (gam#1718). Lower-order Hilbert-scale penalties remain
3083            // available to callers that construct an explicit DuchonBasisSpec.
3084            let operator_penalties = DuchonOperatorPenaltySpec::all_disabled();
3085            // For a 1-D periodic Duchon with no EXPLICIT period, anchor the wrap
3086            // to the covariate DATA range rather than letting the basis builder
3087            // derive it from the (k-subsampled) center span. The center span is a
3088            // strict subset of the data and undershoots the true period, seaming
3089            // the curve (f(0) ≠ f(2π)); the data range is the caller's actual
3090            // domain. Honors any explicit `period=` (parse_periodic_axes_option
3091            // already threaded it) and leaves multi-D / non-periodic untouched.
3092            let mut periodic = parse_periodic_axes_option(options, cols.len())?;
3093            if cols.len() == 1
3094                && let Some(axes) = periodic.as_mut()
3095                && axes.len() == 1
3096                && axes[0].is_none()
3097            {
3098                let (minv, maxv) = col_minmax(ds.values.column(cols[0]))?;
3099                if maxv > minv {
3100                    axes[0] = Some(maxv - minv);
3101                }
3102            }
3103            Ok(SmoothBasisSpec::Duchon {
3104                feature_cols: cols.to_vec(),
3105                spec: DuchonBasisSpec {
3106                    center_strategy,
3107                    periodic,
3108                    length_scale,
3109                    power,
3110                    nullspace_order,
3111                    identifiability: parse_spatial_identifiability(options)
3112                        .map_err(|e| e.to_string())?,
3113                    aniso_log_scales,
3114                    operator_penalties,
3115                    boundary: if cols.len() == 1 {
3116                        let c = cols[0];
3117                        let (minv, maxv) = col_minmax(ds.values.column(c))?;
3118                        parse_cyclic_boundary(options, minv, maxv)?
3119                    } else {
3120                        OneDimensionalBoundary::Open
3121                    },
3122                    radial_reparam: None,
3123                },
3124                input_scales: None,
3125            })
3126        }
3127        "tensor" | "te" | "ti" | "t2" => {
3128            validate_known_options(
3129                "tensor",
3130                options,
3131                &[
3132                    "type",
3133                    "bs",
3134                    "by",
3135                    "k",
3136                    "basis_dim",
3137                    "basis-dim",
3138                    "basisdim",
3139                    "knot_placement",
3140                    "knot-placement",
3141                    "knotplacement",
3142                    "degree",
3143                    "penalty_order",
3144                    "double_penalty",
3145                    "periodic",
3146                    "cyclic",
3147                    "period",
3148                    "periods",
3149                    "period_start",
3150                    "period_end",
3151                    "origin",
3152                    "origins",
3153                    "period_origin",
3154                    "period-origin",
3155                    "domain_origin",
3156                    "boundary",
3157                    "bc",
3158                    "identifiability",
3159                    "id",
3160                    "__by_col",
3161                ],
3162            )?;
3163            if cols.len() < 2 {
3164                return Err(TermBuilderError::incompatible_config(format!(
3165                    "tensor smooth expects at least 2 variables, got {}",
3166                    cols.len()
3167                ))
3168                .to_string());
3169            }
3170            let dim = cols.len();
3171
3172            // Tensor-product contract (#1082). `te(x1, x2, ...)` ALWAYS builds a
3173            // genuine anisotropic tensor product of per-margin bases (the arm
3174            // below), exactly as mgcv's `te()` does — one smoothing parameter per
3175            // margin, a marginal-Kronecker-sum penalty, and the bilinear null
3176            // space left unpenalized under the default `select = FALSE`. A margin
3177            // vector `bs=c('tp','tp')` requests a thin-plate FUNCTION SPACE per
3178            // axis; the tensor realizes each axis as a 1-D penalized B-spline
3179            // margin spanning that same per-axis space (tp/ps/cr/bs/cc all share
3180            // it). We deliberately do NOT silently swap the requested tensor for a
3181            // single multi-D ISOTROPIC thin-plate radial smooth (`s(x,y,bs='tp')`):
3182            // that is a different model — one isotropic smoothing parameter, no
3183            // per-margin anisotropy — and substituting it while the user wrote a
3184            // tensor formula is dishonest. A user who genuinely wants the isotropic
3185            // radial smooth asks for it directly with `s(x1, x2, bs='tp')`.
3186            // Per-margin basis vector (`bs=c('tp','tp')` / `bs=['ps','cr']`):
3187            // validate each requested margin is a penalized-spline basis that
3188            // the tensor product realizes as a 1-D B-spline margin. mgcv's
3189            // `tp`/`ps`/`cr`/`bs`/`cc` margins are all penalized splines over
3190            // the same per-axis function space, so a B-spline margin recovers
3191            // the same tensor smoothing space; genuinely different margin kinds
3192            // (e.g. adaptive `ad`, random `re`) are rejected loudly rather than
3193            // silently substituted.
3194            if let Some(raw) = options.get("bs").or_else(|| options.get("type"))
3195                && bs_selector_is_vector(raw)
3196            {
3197                let per_margin = parse_option_list(raw);
3198                if per_margin.len() != dim {
3199                    return Err(TermBuilderError::invalid_option(format!(
3200                        "tensor smooth per-margin bs vector has {} entries but the smooth has {} margins",
3201                        per_margin.len(),
3202                        dim
3203                    ))
3204                    .to_string());
3205                }
3206                for (axis, margin_bs) in per_margin.iter().enumerate() {
3207                    if !tensor_margin_bs_is_supported(margin_bs) {
3208                        return Err(TermBuilderError::unsupported_feature(format!(
3209                            "tensor smooth margin {axis} basis '{margin_bs}' is not a supported penalized-spline margin; \
3210                             tensor margins accept tp/tps/ps/bs/cr/cc"
3211                        ))
3212                        .to_string());
3213                    }
3214                }
3215            }
3216            let periodic_axes = parse_tensor_periodic_axes(options, dim)?;
3217            validate_tensor_boundary_tokens(options, dim)?;
3218            let periods_opt = parse_periods(options, &periodic_axes)?;
3219            let origins_opt = parse_period_origins(options, &periodic_axes)?;
3220            let degree = option_usize(options, "degree").unwrap_or(DEFAULT_BSPLINE_DEGREE);
3221            let penalty_order =
3222                option_usize(options, "penalty_order").unwrap_or(if degree > 1 { 2 } else { 1 });
3223            let (mut k_list, k_inferred) = parse_tensor_k_list(options, cols, ds)?;
3224            if ds.values.nrows() <= 32 && smooth_coordinate_count >= 5 {
3225                for k in &mut k_list {
3226                    *k = (*k).min(degree + 2);
3227                }
3228            }
3229            if k_inferred {
3230                inference_notes.push(format!(
3231                    "Automatically set per-margin basis sizes {:?} for tensor smooth '{}' \
3232                     (dimension-aware tensor budget: total ∏k kept near the mgcv-te default \
3233                     and within the data support, distributed geometrically across margins and \
3234                     capped per margin by each column's resolution). \
3235                     Override with k=<int> or k=[k0,k1,...].",
3236                    k_list,
3237                    vars.join(",")
3238                ));
3239            }
3240            // Per-axis requested marginal basis family. mgcv's `te()`/`ti()`
3241            // default marginal basis is the cubic regression spline (`cr`), and
3242            // the te_3d quality gap (#1074) is precisely the marginal-basis
3243            // resolution at small `k`: a `cr` margin places k value-knots at
3244            // data quantiles (finer interior resolution under natural boundary
3245            // constraints) where the cubic B-spline margin has only
3246            // `k-degree-1` interior knots. Resolve each axis to either an
3247            // explicit per-margin `bs` (vector `bs=c('cr','ps')`), a single
3248            // scalar `bs`, or the unset default — and route
3249            // `cr`/`cs`/unset/`tp`/`tps` margins through the natural cubic
3250            // regression builder (`NaturalCubicRegression` knotspec), keeping
3251            // explicit `ps`/`bs`/`bspline` on the B-spline margin.
3252            let per_axis_bs: Vec<Option<String>> =
3253                match options.get("bs").or_else(|| options.get("type")) {
3254                    Some(raw) if bs_selector_is_vector(raw) => {
3255                        let list = parse_option_list(raw);
3256                        (0..dim).map(|a| list.get(a).cloned()).collect()
3257                    }
3258                    Some(raw) => {
3259                        let scalar = raw
3260                            .trim()
3261                            .trim_matches('"')
3262                            .trim_matches('\'')
3263                            .to_ascii_lowercase();
3264                        vec![Some(scalar); dim]
3265                    }
3266                    None => vec![None; dim],
3267                };
3268            // A margin is realized as a natural cubic regression spline when it
3269            // is the (unset) mgcv default, an explicit `cr`/`cs`, or a
3270            // `tp`/`tps` (same per-axis penalized-spline space). Explicit
3271            // B-spline-family margins (`ps`/`bs`/`bspline`/`p-spline`) keep the
3272            // open B-spline margin.
3273            let margin_wants_cr = |bs: &Option<String>| -> bool {
3274                matches!(
3275                    bs.as_deref(),
3276                    None | Some("cr") | Some("cs") | Some("tp") | Some("tps")
3277                )
3278            };
3279            let requested_knot_placement = parse_knot_placement(options)?;
3280            let mut margins: Vec<BSplineBasisSpec> = Vec::with_capacity(dim);
3281            let mut emitted_periods: Vec<Option<f64>> = Vec::with_capacity(dim);
3282            for axis in 0..dim {
3283                let c = cols[axis];
3284                let (data_min, data_max) = col_minmax(ds.values.column(c))?;
3285                // mgcv reduces a tensor margin's basis dimension to what its data
3286                // can support: a cr or B-spline margin cannot place more value
3287                // knots / basis functions than there are DISTINCT covariate
3288                // values on that axis. Without this cap an explicit `k` on a
3289                // low-cardinality margin — e.g. the binary `badh ∈ {0,1}` in
3290                // `te(age, badh, k=5)` — hard-failed in `select_cr_knots` ("cubic
3291                // regression spline with k=5 requires at least 5 distinct values,
3292                // got 2") instead of degrading to the 2-function (linear) margin
3293                // mgcv builds there. The auto-`k` path already caps per margin via
3294                // `heuristic_tensor_margin_knots`; mirror that for explicit `k`.
3295                // The cap propagates correctly: every per-axis quantity below
3296                // (effective degree, knot set, penalty order) is derived from
3297                // `k_axis`, and the marginal basis size is read from the resulting
3298                // knot spec — never from `k_list`. Floor at 2 so a margin still
3299                // carries at least a linear basis (tensor margins require k >= 2).
3300                let k_requested = k_list[axis];
3301                let n_distinct_axis = unique_count_column(ds.values.column(c));
3302                let k_axis = k_requested.min(n_distinct_axis).max(2);
3303                if k_axis < k_requested {
3304                    log::info!(
3305                        "tensor smooth: margin axis {axis} requested k={k_requested}, but the \
3306                         covariate has only {n_distinct_axis} distinct value(s); reducing this \
3307                         margin to k={k_axis} (mgcv-style data-support cap on the per-axis basis)."
3308                    );
3309                }
3310                // Per-axis effective spline degree. The B-spline basis with `k`
3311                // functions is well-defined for any `degree <= k - 1`; mgcv's
3312                // `te(...)` exploits this so a binary tensor margin
3313                // (`k=2` → linear basis) or a ternary margin (`k=3` → quadratic)
3314                // can coexist with a smoother continuous margin under one
3315                // shared `degree=` request. We mirror that: if the caller
3316                // explicitly asks for `k < degree + 1`, drop the degree on
3317                // THAT axis only to the largest feasible spline, and track the
3318                // penalty order so the marginal difference penalty stays
3319                // well-defined (`order < num_basis_functions` is required by
3320                // `create_difference_penalty_matrix`). Apply the same
3321                // per-margin degree shrinkage to periodic tensor margins too:
3322                // a cyclic marginal basis with k=3 cannot be cubic, but it is
3323                // still a valid lower-degree cyclic margin with dimension k,
3324                // matching mgcv's small-k tensor-margin behavior.
3325                if k_axis < 2 {
3326                    return Err(TermBuilderError::invalid_option(format!(
3327                        "tensor smooth: k[{axis}]={k_axis} too small; tensor margins require k >= 2"
3328                    ))
3329                    .to_string());
3330                }
3331                let effective_degree = degree.min(k_axis - 1).max(1);
3332                let effective_penalty_order = penalty_order.min(effective_degree);
3333                // A `cc`/`cp`/`cyclic` per-margin basis declares periodicity
3334                // without necessarily supplying a `period=`: mgcv's `bs="cc"`
3335                // wraps at the covariate's observed data range. Mirror the 1-D
3336                // cyclic fallback (`parse_periodic_domain_1d`) here so a bare
3337                // `te(x, z, bs=c('cc','cc'))` wraps each margin on its own
3338                // [min, max] span instead of hard-erroring (#1752).
3339                let margin_is_cc = matches!(
3340                    canonicalize_smooth_type(per_axis_bs[axis].as_deref().unwrap_or("")),
3341                    "cc" | "cp" | "cyclic"
3342                );
3343                let (knotspec, boundary, axis_period) = if periodic_axes[axis] {
3344                    // A `cc`/`cp`/`cyclic` per-margin basis declares periodicity
3345                    // without necessarily supplying a `period=`; in that case wrap
3346                    // at the covariate's observed [min, max] span, mirroring the
3347                    // 1-D cyclic fallback (`parse_periodic_domain_1d`) so a bare
3348                    // `te(x, z, bs=c('cc','cc'))` wraps each margin on its own
3349                    // range instead of hard-erroring (#1752). An axis made
3350                    // periodic by an explicit `periodic=`/`boundary=` selector
3351                    // (not a cyclic margin basis) still requires an explicit
3352                    // `period=`: a data-derived period there is a sample-dependent
3353                    // off-by-ε seam and is not inferred.
3354                    let (domain_start, period_value) = match periods_opt[axis] {
3355                        Some(period_value) => {
3356                            if !period_value.is_finite() || period_value <= 0.0 {
3357                                return Err(format!(
3358                                    "tensor smooth axis {axis}: period must be a positive finite value, got {period_value}"
3359                                ));
3360                            }
3361                            (origins_opt[axis].unwrap_or(data_min), period_value)
3362                        }
3363                        None if margin_is_cc => {
3364                            let span = data_max - data_min;
3365                            if !span.is_finite() || span <= 0.0 {
3366                                return Err(format!(
3367                                    "tensor smooth axis {axis}: cyclic margin requires a positive \
3368                                     observed data range to derive its period, got [{data_min}, {data_max}]"
3369                                ));
3370                            }
3371                            (origins_opt[axis].unwrap_or(data_min), span)
3372                        }
3373                        None => {
3374                            return Err(format!(
3375                                "tensor smooth axis {axis} is periodic but requires an explicit \
3376                                 period: pass period=<value> (scalar) or period=[..., <value>, ...]. \
3377                                 Deriving the period from the observed data range is sample-dependent \
3378                                 (off-by-ε seam), so it is not inferred."
3379                            ));
3380                        }
3381                    };
3382                    let domain_end = domain_start + period_value;
3383                    (
3384                        BSplineKnotSpec::PeriodicUniform {
3385                            data_range: (domain_start, domain_end),
3386                            num_basis: k_axis,
3387                        },
3388                        OneDimensionalBoundary::Cyclic {
3389                            start: domain_start,
3390                            end: domain_end,
3391                        },
3392                        Some(period_value),
3393                    )
3394                } else if margin_wants_cr(&per_axis_bs[axis])
3395                    && requested_knot_placement != crate::basis::BSplineKnotPlacement::Quantile
3396                    && k_axis >= 3
3397                {
3398                    // mgcv `te()`/`ti()` default cr margin: place exactly
3399                    // `k_axis` Lancaster–Salkauskas value-knots at data
3400                    // quantiles. The cr basis dimension equals the knot count,
3401                    // so this reproduces the requested per-margin `k` directly.
3402                    // A natural cubic regression spline needs at least 3 knots
3403                    // (one interior); a `k_axis < 3` margin (e.g. a binary
3404                    // tensor axis requesting a linear margin) falls through to
3405                    // the B-spline branch below, exactly as before #1074 — mgcv
3406                    // likewise does not build a `cr` margin below k=3. An
3407                    // explicit `knot_placement=quantile` also falls through:
3408                    // that option selects the generated B-spline knot strategy
3409                    // represented by `Automatic { Quantile }`, whereas the cr
3410                    // margin has already materialized its quantile value-knots.
3411                    let cr_knots = crate::basis::select_cr_knots(ds.values.column(c), k_axis)
3412                        .map_err(|e| e.to_string())?;
3413                    (
3414                        BSplineKnotSpec::NaturalCubicRegression { knots: cr_knots },
3415                        OneDimensionalBoundary::Open,
3416                        None,
3417                    )
3418                } else {
3419                    // `num_internal_knots = k - degree - 1` reproduces the
3420                    // requested basis size exactly when degree was reduced for
3421                    // a low-cardinality margin; keep the legacy `.max(1)`
3422                    // floor on the un-reduced path so the existing knot
3423                    // geometry is unchanged whenever the user already passed
3424                    // k >= degree + 1.
3425                    let num_internal_knots = if effective_degree < degree {
3426                        k_axis.saturating_sub(effective_degree + 1)
3427                    } else {
3428                        k_axis.saturating_sub(degree + 1).max(1)
3429                    };
3430                    let knotspec = match requested_knot_placement {
3431                        crate::basis::BSplineKnotPlacement::Uniform => BSplineKnotSpec::Generate {
3432                            data_range: (data_min, data_max),
3433                            num_internal_knots,
3434                        },
3435                        crate::basis::BSplineKnotPlacement::Quantile => {
3436                            crate::basis::auto_knot_vector_1d_quantile(
3437                                ds.values.column(c),
3438                                num_internal_knots,
3439                                effective_degree,
3440                            )
3441                            .map_err(|e| e.to_string())?;
3442                            BSplineKnotSpec::Automatic {
3443                                num_internal_knots: Some(num_internal_knots),
3444                                placement: crate::basis::BSplineKnotPlacement::Quantile,
3445                            }
3446                        }
3447                    };
3448                    (knotspec, OneDimensionalBoundary::Open, None)
3449                };
3450                // A `cr` margin fixes cubic regression geometry; the cr builder
3451                // reads only the knot set + `double_penalty`. Enable null-space
3452                // shrinkage for an explicit `cs` margin. B-spline margins keep
3453                // the resolved effective degree / penalty order with no extra
3454                // null-space penalty (mgcv `select = FALSE` tensor default).
3455                let is_cr_margin =
3456                    matches!(knotspec, BSplineKnotSpec::NaturalCubicRegression { .. });
3457                let margin_double_penalty =
3458                    is_cr_margin && matches!(per_axis_bs[axis].as_deref(), Some("cs"));
3459                margins.push(BSplineBasisSpec {
3460                    degree: effective_degree,
3461                    penalty_order: effective_penalty_order,
3462                    knotspec,
3463                    double_penalty: margin_double_penalty,
3464                    identifiability: BSplineIdentifiability::None,
3465                    boundary,
3466                    boundary_conditions: BSplineBoundaryConditions::default(),
3467                });
3468                emitted_periods.push(axis_period);
3469            }
3470            // #1593: canonicalize the margin order so a tensor smooth is invariant
3471            // to the typed order of its covariates. `te(x, z)` and `te(z, x)` span
3472            // the IDENTICAL tensor-product space under the identical per-margin
3473            // penalty family, but the design is the Khatri–Rao product
3474            // `B_first ⊙ B_second`, so the typed order permutes the design columns
3475            // (and the per-margin penalty blocks `S_first⊗I`, `I⊗S_second`). That
3476            // permutation is a pure relabelling in exact arithmetic — REML is
3477            // invariant to it — yet it reorders the penalized normal-equation / REML
3478            // eigen/Cholesky linear algebra, and the resulting sub-ULP differences
3479            // route the outer λ optimizer to a different terminal point in te's flat
3480            // REML valley (the over-smoothed margin rails to the ρ bound while the
3481            // other lands on a materially different λ̂). So the shipped surface
3482            // drifted ~2–6 % of range with a cosmetic swap of the covariate order
3483            // (the #1378 row-permutation / #1456 rotation flat-valley gauge family).
3484            // Sorting the margins by their source feature-column index makes the same
3485            // physical model build the identical problem regardless of typed order,
3486            // so the fit — and every prediction rebuilt from the resolved spec — is
3487            // genuinely order-invariant. `ti`/`t2` share this arm and become exactly
3488            // invariant too (they were already ~1e-5 by centring each margin
3489            // separately; canonicalization makes the swap bit-identical).
3490            let canon_cols: Vec<usize> = {
3491                let mut perm: Vec<usize> = (0..dim).collect();
3492                perm.sort_by_key(|&a| cols[a]);
3493                if perm.iter().enumerate().any(|(i, &a)| i != a) {
3494                    margins = perm.iter().map(|&a| margins[a].clone()).collect();
3495                    emitted_periods = perm.iter().map(|&a| emitted_periods[a]).collect();
3496                }
3497                perm.iter().map(|&a| cols[a]).collect()
3498            };
3499            let any_periodic = emitted_periods.iter().any(|p| p.is_some());
3500            let periods_vec = if any_periodic {
3501                emitted_periods
3502            } else {
3503                Vec::new()
3504            };
3505            // Tensor smooths (`te`/`ti`/`t2`) must match mgcv's DEFAULT
3506            // `select = FALSE`: the joint null space of the per-margin
3507            // penalties — the bilinear, low-order interaction directions that
3508            // no marginal roughness operator can see — is left UNPENALIZED.
3509            // mgcv only adds a null-space shrinkage penalty there under the
3510            // opt-in `select = TRUE` (which gam exposes as `double_penalty`).
3511            //
3512            // The general smooth default (`smooth_double_penalty`, true) is
3513            // calibrated for 1-D `s()` terms; carrying it into tensors silently
3514            // shrinks the genuinely-present bilinear interaction signal, so
3515            // REML places positive weight on the extra ridge and systematically
3516            // OVER-SMOOTHS the recovered surface relative to mgcv's plain
3517            // `te`/`ti` (gam#700/#701/#702/#703). Default tensors to no extra
3518            // null-space penalty; an explicit user `double_penalty=`/`select=`
3519            // still wins.
3520            let tensor_double_penalty = option_bool(options, "double_penalty").unwrap_or(false);
3521            Ok(SmoothBasisSpec::TensorBSpline {
3522                feature_cols: canon_cols,
3523                spec: TensorBSplineSpec {
3524                    marginalspecs: margins,
3525                    periods: periods_vec,
3526                    double_penalty: tensor_double_penalty,
3527                    identifiability: parse_tensor_identifiability(options, kind)?,
3528                    // `t2` selects mgcv's separable (Wood, Scheipl & Faraway
3529                    // 2013) decomposition. It can arrive either as the `t2(...)`
3530                    // function form (`SmoothKind::T2`) or as a `type="t2"` /
3531                    // `bs="t2"` option on an `s(...)`/`te(...)` term, in which
3532                    // case `kind` is *not* `T2` but the resolved type string is
3533                    // "t2". Keying only off `kind` silently aliased the option
3534                    // form to `te`'s Kronecker-sum penalty (gam#1185); key off
3535                    // the resolved type string as well so both routes build the
3536                    // separable penalty.
3537                    penalty_decomposition: if matches!(kind, SmoothKind::T2)
3538                        || type_opt.as_str() == "t2"
3539                    {
3540                        TensorBSplinePenaltyDecomposition::Separable
3541                    } else {
3542                        TensorBSplinePenaltyDecomposition::MarginalKroneckerSum
3543                    },
3544                },
3545            })
3546        }
3547        "pca" => {
3548            validate_known_options(
3549                "pca",
3550                options,
3551                &[
3552                    "type",
3553                    "bs",
3554                    "by",
3555                    "k",
3556                    "basis_dim",
3557                    "basis-dim",
3558                    "basisdim",
3559                    "lazy_path",
3560                    "path",
3561                    "pca_basis_path",
3562                    "chunk_size",
3563                    "smooth_penalty",
3564                    "centered",
3565                    "double_penalty",
3566                    "id",
3567                    "__by_col",
3568                ],
3569            )?;
3570            let path = options
3571                .get("lazy_path")
3572                .or_else(|| options.get("pca_basis_path"))
3573                .or_else(|| options.get("path"))
3574                .map(|raw| PathBuf::from(strip_quotes(raw)));
3575            let Some(path) = path else {
3576                return Err(TermBuilderError::incompatible_config(
3577                    "pca smooth requires lazy_path=... on the formula path",
3578                )
3579                .to_string());
3580            };
3581            let k = option_usize_any(options, &["k", "basis_dim", "basis-dim", "basisdim"])
3582                .unwrap_or(0);
3583            let chunk_size = option_usize(options, "chunk_size").unwrap_or(DEFAULT_PCA_CHUNK_SIZE);
3584            Ok(SmoothBasisSpec::Pca {
3585                feature_cols: cols.to_vec(),
3586                basis_matrix: Array2::<f64>::zeros((cols.len(), k)),
3587                centered: option_bool(options, "centered").unwrap_or(true),
3588                smooth_penalty: option_f64(options, "smooth_penalty").unwrap_or(1.0),
3589                center_mean: None,
3590                pca_basis_path: Some(path),
3591                chunk_size,
3592            })
3593        }
3594        other => Err(TermBuilderError::unsupported_feature(format!(
3595            "unsupported smooth type '{other}'"
3596        ))
3597        .to_string()),
3598    }
3599}
3600
3601/// Initialise per-axis anisotropic log-scales on eligible spatial smooth specs.
3602pub fn enable_scale_dimensions(spec: &mut TermCollectionSpec) {
3603    for smooth in spec.smooth_terms.iter_mut() {
3604        // A multi-axis thin-plate term cannot carry per-axis anisotropy on its
3605        // single curvature penalty, so `scale_dimensions` was historically a
3606        // silent no-op for `bs="tp"` (gam#1676). Rewrite it to the
3607        // mathematically-equivalent anisotropic s=0 Duchon spline first; the
3608        // Duchon arm below then sees an already-seeded `aniso_log_scales` and
3609        // leaves it untouched.
3610        promote_thin_plate_for_scale_dimensions(&mut smooth.basis);
3611        match &mut smooth.basis {
3612            SmoothBasisSpec::Matern {
3613                feature_cols,
3614                spec: matern,
3615                ..
3616            } => {
3617                if matern.aniso_log_scales.is_none() {
3618                    let d = feature_cols.len();
3619                    matern.aniso_log_scales = Some(vec![0.0; d]);
3620                }
3621            }
3622            SmoothBasisSpec::Duchon {
3623                feature_cols,
3624                spec: duchon,
3625                ..
3626            } => {
3627                if duchon.aniso_log_scales.is_none() {
3628                    let d = feature_cols.len();
3629                    duchon.aniso_log_scales = Some(vec![0.0; d]);
3630                }
3631            }
3632            _ => {}
3633        }
3634    }
3635}
3636
3637/// Rewrite a multi-axis thin-plate term into the mathematically-equivalent
3638/// anisotropic s=0 Duchon spline so that `scale_dimensions` genuinely engages
3639/// (gam#1676).
3640///
3641/// ## Why a rewrite rather than a new field on the TPS builder
3642///
3643/// A canonical thin-plate regression spline carries a *single* curvature
3644/// penalty — the exact `∫|Dᵐ f|²` reproducing-kernel Gram. That penalty has no
3645/// per-axis structure to make one direction more or less relevant than another,
3646/// so per-axis anisotropy (`scale_dimensions`) cannot be expressed on it. The
3647/// flag was therefore a silent no-op for `bs="tp"` while it engaged for
3648/// `duchon()`/`matern()`.
3649///
3650/// The thin-plate kernel `r^{2m−d}` (the `r²·log r` log-case in even `d`) is
3651/// *exactly* the s=0 Duchon kernel (`DuchonBasisSpec::power = 0`,
3652/// `length_scale = None`) at the matching polynomial null-space order
3653/// `m = thin_plate_penalty_order(d)`. The Duchon polyharmonic family already
3654/// carries the per-axis tension ARD that `scale_dimensions` requests: its
3655/// isotropic first-order roughness penalty `Σ‖∇f‖²` splits into `d` directional
3656/// penalties `Σ(∂f/∂x_a)²`, each with its own REML `λ_a`
3657/// (`duchon_operator_penalty_candidates`). So the well-posed *anisotropic
3658/// thin-plate spline is the anisotropic s=0 Duchon spline*. Rewriting to that
3659/// representation reuses the battle-tested Duchon anisotropy / ψ-derivative /
3660/// freeze / predict machinery instead of duplicating it onto the TPS metadata
3661/// path, and keeps the polyharmonic family internally consistent. The codebase
3662/// already promotes infeasible-`k` TPS to Duchon for the same reason (the
3663/// canonical TPS single curvature penalty cannot deliver a requested
3664/// capability); per-axis anisotropy is another such capability.
3665///
3666/// This fires *only* when the user opts into `scale_dimensions`; the default
3667/// thin-plate path (`scale_dimensions` off) is left bit-for-bit unchanged.
3668/// A 1-D thin-plate term is left untouched — anisotropy is meaningless on a
3669/// single axis (its `Σ η = 0` contrast vector is empty), exactly as for a 1-D
3670/// Matérn/Duchon term.
3671fn promote_thin_plate_for_scale_dimensions(basis: &mut SmoothBasisSpec) {
3672    let SmoothBasisSpec::ThinPlate {
3673        feature_cols,
3674        spec,
3675        input_scales,
3676    } = &*basis
3677    else {
3678        return;
3679    };
3680    let d = feature_cols.len();
3681    if d <= 1 {
3682        return;
3683    }
3684    // m = thin_plate_penalty_order(d) is the TPS penalty order; the Duchon
3685    // null-space order naming is `Zero → m=1`, `Linear → m=2`,
3686    // `Degree(g) → m=g+1`, so the s=0 Duchon kernel exponent
3687    // `2(p+s) − d = 2m − d` reproduces the TPS kernel exactly.
3688    let m = thin_plate_penalty_order(d);
3689    let nullspace_order = match m {
3690        0 | 1 => DuchonNullspaceOrder::Zero,
3691        2 => DuchonNullspaceOrder::Linear,
3692        _ => DuchonNullspaceOrder::Degree(m - 1),
3693    };
3694    let duchon_spec = DuchonBasisSpec {
3695        center_strategy: spec.center_strategy.clone(),
3696        periodic: spec.periodic.clone(),
3697        // Pure, scale-free Duchon — the thin-plate kernel has no length scale
3698        // (a global TPS kernel scale is non-identifiable once REML learns the
3699        // smoothing penalty: gam#718/#721/#731/#732). The per-axis relevance
3700        // the user asked for is carried by the tension-ARD `λ_a`, not a κ axis.
3701        length_scale: None,
3702        // s = 0  ⇒  thin-plate kernel `r^{2m−d}`.
3703        power: 0.0,
3704        nullspace_order,
3705        identifiability: spec.identifiability.clone(),
3706        // All-zero geometry seed sentinel: `auto_seed_aniso_contrasts` resolves
3707        // it from the (standardized) knot cloud, and the per-axis tension split
3708        // engages on `aniso.is_some()`.
3709        aniso_log_scales: Some(vec![0.0; d]),
3710        operator_penalties: DuchonOperatorPenaltySpec::default(),
3711        boundary: OneDimensionalBoundary::Open,
3712        radial_reparam: None,
3713    };
3714    let feature_cols = feature_cols.clone();
3715    let input_scales = input_scales.clone();
3716    // All borrows of `*basis` (the `&*basis` destructure above) end with the
3717    // clones on the two preceding lines, so the reassignment is sound.
3718    *basis = SmoothBasisSpec::Duchon {
3719        feature_cols,
3720        spec: duchon_spec,
3721        input_scales,
3722    };
3723}
3724
3725// ---------------------------------------------------------------------------
3726// Data-aware helpers
3727// ---------------------------------------------------------------------------
3728
3729pub fn spatial_center_strategy_for_dimension(num_centers: usize, d: usize) -> CenterStrategy {
3730    if d <= 3 {
3731        // In low-dimensional spatial smooths, an explicit `k` is a resolution
3732        // request rather than a request for marginal quantile-midpoint centers.
3733        // Use deterministic maximin geometry so Matérn/GP and Duchon REML see a
3734        // well-resolved native kernel block with small fill distance instead of
3735        // compensating for holes or endpoint under-resolution by over-smoothing
3736        // low-noise signals (#504).
3737        CenterStrategy::FarthestPoint { num_centers }
3738    } else {
3739        default_spatial_center_strategy(num_centers, d)
3740    }
3741}
3742
3743pub fn col_minmax(col: ArrayView1<'_, f64>) -> Result<(f64, f64), String> {
3744    let min = col.iter().fold(f64::INFINITY, |a, &b| a.min(b));
3745    let max = col.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
3746    if !min.is_finite() || !max.is_finite() {
3747        return Err(TermBuilderError::degenerate_data(
3748            "non-finite data encountered while inferring knot range",
3749        )
3750        .to_string());
3751    }
3752    if (max - min).abs() < 1e-12 {
3753        Ok((min, min + 1e-6))
3754    } else {
3755        Ok((min, max))
3756    }
3757}
3758
3759pub fn unique_count_column(col: ArrayView1<'_, f64>) -> usize {
3760    use std::collections::HashSet;
3761    let mut set = HashSet::<u64>::with_capacity(col.len());
3762    for &v in col {
3763        let norm = if v == 0.0 { 0.0 } else { v };
3764        set.insert(norm.to_bits());
3765    }
3766    set.len().max(1)
3767}
3768
/// Minimum knot count for a natural cubic regression (`cr`) spline.
///
/// `select_cr_knots` places one value-knot per basis function and needs at
/// least an interior knot, so the sparsest representable cr basis is
/// `{const, linear, curvature}` at three knots. Below this a cr spline is not
/// constructible and the caller must degrade to the linear B-spline marginal.
pub(crate) const CR_MIN_KNOTS: usize = 3;
3775
3776/// Build a cubic-regression marginal knot spec capped to the covariate's data
3777/// support, mgcv-style.
3778///
3779/// A `cr`/`cs`/`sz` marginal places exactly one basis function per value-knot,
3780/// so `select_cr_knots` cannot place more knots than the covariate has DISTINCT
3781/// values — it `bail`s with "cubic regression spline with k=N requires at least
3782/// N distinct values" otherwise. An unclamped `k` on an ordinary low-cardinality
3783/// covariate (a binary indicator, a 3-level ordinal/Likert score, a small count)
3784/// therefore hard-failed the whole fit instead of reducing the basis the way
3785/// mgcv — and gam's own tensor-margin path (996f829d7, `term_builder.rs:2986` /
3786/// the `k_axis >= 3` cr gate at `:3047`) — do. This is the univariate / factor-
3787/// smooth sibling of that tensor cap (#1541, #1542).
3788///
3789/// Returns:
3790/// - `Some(NaturalCubicRegression { .. })` with `k = min(k_requested, n_distinct)`
3791///   value-knots when the data supports a cr spline (`n_distinct >= CR_MIN_KNOTS`).
3792///   A cr basis of exactly `n_distinct` knots is full-rank for the data — it can
3793///   represent any per-distinct-value structure (e.g. 3 arbitrary group means on
3794///   a ternary covariate) — so the cap never costs recoverable signal.
3795/// - `None` when `n_distinct < CR_MIN_KNOTS` (a binary covariate): too few
3796///   distinct values for ANY cr spline, so the caller degrades to the linear
3797///   B-spline marginal — exactly what the default `s(x, k=..)` basis already
3798///   builds on the same data, and what the tensor path's `< 3` branch builds.
3799///
3800/// `inference_notes` records any reduction so the user sees that `k` was capped
3801/// (mgcv emits a warning in the same situation).
3802fn capped_cr_marginal_knotspec(
3803    col: ArrayView1<'_, f64>,
3804    k_cr_requested: usize,
3805    label: &str,
3806    inference_notes: &mut Vec<String>,
3807) -> Result<Option<BSplineKnotSpec>, String> {
3808    let n_distinct = unique_count_column(col);
3809    let k_cr = k_cr_requested.min(n_distinct);
3810    if k_cr < CR_MIN_KNOTS {
3811        inference_notes.push(format!(
3812            "Smooth '{label}': cubic-regression ('cr'/'cs'/'sz') basis requested k={k_cr_requested}, \
3813             but the covariate has only {n_distinct} distinct value(s) — too few to support a cubic \
3814             regression spline (needs >= {CR_MIN_KNOTS} distinct values). Degraded to the linear \
3815             B-spline marginal the default basis builds on the same data."
3816        ));
3817        return Ok(None);
3818    }
3819    if k_cr < k_cr_requested {
3820        inference_notes.push(format!(
3821            "Smooth '{label}': cubic-regression ('cr'/'cs'/'sz') basis reduced from k={k_cr_requested} \
3822             to k={k_cr} to match the covariate's {n_distinct} distinct value(s) (mgcv-style \
3823             data-support cap; a cr basis cannot place more value-knots than the data has)."
3824        ));
3825    }
3826    let cr_knots = crate::basis::select_cr_knots(col, k_cr).map_err(|e| e.to_string())?;
3827    Ok(Some(BSplineKnotSpec::NaturalCubicRegression {
3828        knots: cr_knots,
3829    }))
3830}
3831
3832/// Smallest number of distinct covariate values seen within any single group
3833/// of `group_col`. For a factor smooth this is the resolution that bounds the
3834/// marginal basis: a group with `m` distinct covariate values can only inform
3835/// `m` basis coefficients, so a marginal richer than that interpolates the
3836/// group instead of estimating a penalized trend. Bits are compared exactly so
3837/// integer-valued covariates (days, dose levels) collapse to their true count.
3838fn min_per_group_unique_count(
3839    feature_col: ArrayView1<'_, f64>,
3840    group_col: ArrayView1<'_, f64>,
3841) -> usize {
3842    use std::collections::{HashMap, HashSet};
3843    let mut per_group: HashMap<u64, HashSet<u64>> = HashMap::new();
3844    for (xi, gi) in feature_col.iter().zip(group_col.iter()) {
3845        let xnorm = if *xi == 0.0 { 0.0 } else { *xi };
3846        let gnorm = if *gi == 0.0 { 0.0 } else { *gi };
3847        per_group
3848            .entry(gnorm.to_bits())
3849            .or_default()
3850            .insert(xnorm.to_bits());
3851    }
3852    per_group
3853        .values()
3854        .map(|s| s.len())
3855        .min()
3856        .unwrap_or(1)
3857        .max(1)
3858}
3859
3860/// Default internal-knot count for an *additive* univariate smooth, derived
3861/// from the column's unique-value count.
3862///
3863/// The basis dimension is `internal_knots + degree + 1`, so the cap below maps
3864/// to a default cubic basis of ~12 functions — deliberately close to mgcv's
3865/// univariate default (`k = 10`). A penalized smooth controls its wiggliness
3866/// through the *penalty*, not the basis size: REML/LAML shrinks a too-rich
3867/// basis toward the null, but it cannot do so cleanly when the basis is so
3868/// over-sized that the design becomes weakly identified. Growing the basis with
3869/// `n` (the old `n^(1/3)`-ceilinged `unique/4` rule, which pinned to 20 internal
3870/// knots ⇒ a 24-function basis for any column with ≥80 unique values) therefore
3871/// *hurts* recovery on finite, weak-signal fits: a 4-smooth additive model on
3872/// n=120 asks for ~92 coefficients, the outer optimizer stalls on the resulting
3873/// flat two-penalty (range + null-space) REML surface, and the truth leaks into
3874/// surplus columns the penalty can't shrink away (gam#1680; the same defect was
3875/// documented for thin-plate fields in gam#1074). A k-sweep on the #1680 design
3876/// confirms a basis of ~10–15 recovers truth at RMSE ≈ 0.12 while the old
3877/// 24-function default lands at ≈ 0.39 (~3× worse) — *whether or not* the
3878/// covariates are collinear, so this is basis over-richness, not collinearity.
3879///
3880/// The cap is flat in `n`: a user who genuinely needs a wigglier fit raises `k`
3881/// explicitly (mgcv's contract — opt *in* to more flexibility), and the SPEC
3882/// requires the default to allow recovering the null rather than forcing the
3883/// user to opt out of overfitting. The 4-knot floor stays put because we still
3884/// need enough basis functions to fit a non-trivial smooth at all, and the
3885/// `unique/4` growth below the cap keeps small/sparse columns (n ≤ 32, where
3886/// `unique/4 ≤ 8`) on exactly their previous knot count.
3887pub fn heuristic_knots_for_column(col: ArrayView1<'_, f64>) -> usize {
3888    /// Default cubic basis ≈ `MAX_DEFAULT_INTERNAL_KNOTS + degree + 1` = 12
3889    /// functions, matching mgcv's lean univariate default.
3890    const MAX_DEFAULT_INTERNAL_KNOTS: usize = 8;
3891    let unique = unique_count_column(col);
3892    (unique / 4).clamp(4, MAX_DEFAULT_INTERNAL_KNOTS)
3893}
3894
3895/// Per-margin basis sizes for a tensor-product smooth (`te`/`ti`/`t2`).
3896///
3897/// The 1-D heuristic [`heuristic_knots_for_column`] is calibrated for an
3898/// *additive* margin: a well-resolved column asks for the lean univariate
3899/// default (≈12 basis functions, the mgcv-like cap of 8 internal knots; see
3900/// gam#1680), which is sensible for a single `s(x)` term.
3901/// A tensor product, however, multiplies the per-margin sizes:
3902/// `p = ∏_d k_d`. Reusing the 1-D rule per margin makes `p` explode with the
3903/// tensor dimension — a 3-D `te(x,y,z)` at the 1-D ceiling of 12/margin is
3904/// `12³ ≈ 1728` columns, and every REML evaluation pays an O(p³) dense
3905/// penalty reparameterization (the full-tensor sum-to-zero constraint is not
3906/// Kronecker-factorable), turning model selection over tensor candidates into
3907/// a multi-minute single-threaded stall (gam#813). It also requests far more
3908/// coefficients than the data can identify whenever `p ≫ n`.
3909///
3910/// mgcv's `te(...)` uses a small per-margin default (`k = 5`, i.e. `5^d`).
3911/// We match that spirit while staying data-adaptive: budget the *total* tensor
3912/// column count `p_target` and distribute it geometrically across the margins
3913/// so `∏ k_d ≈ p_target`, never asking a margin for more functions than its
3914/// own unique values (and the data set) can support.
3915fn heuristic_tensor_margin_knots(cols: &[usize], ds: &Dataset) -> Vec<usize> {
3916    let d = cols.len().max(1);
3917    let degree = DEFAULT_BSPLINE_DEGREE;
3918    let min_k = degree + 2; // smallest margin that carries a difference penalty
3919    let n = ds.values.nrows();
3920
3921    // Per-margin 1-D ceiling: never request more basis functions than the
3922    // margin's own resolution (unique values) supports. This caps each axis
3923    // independently before the joint budget is applied.
3924    let per_margin_cap: Vec<usize> = cols
3925        .iter()
3926        .map(|&c| heuristic_knots_for_column(ds.values.column(c)).max(min_k))
3927        .collect();
3928
3929    // Total-basis budget. A tensor with ∏k ≫ n coefficients is rank-deficient
3930    // and pure REML cost; cap the product at a generous fraction of n while
3931    // honoring mgcv's small default for the common small-d case. The budget
3932    // grows with n but the geometric split below keeps each margin modest.
3933    //   d=2 → up to ~7²=49 (mgcv-`te`-like), d=3 → ~5³=125, larger d shrinks
3934    // per-margin further so the product never blows past the data support.
3935    let mgcv_like_per_margin = match d {
3936        2 => 7usize,
3937        3 => 5usize,
3938        _ => 4usize,
3939    };
3940    let mgcv_like_total = (mgcv_like_per_margin as f64).powi(d as i32);
3941    let data_budget = (n as f64) * 0.8;
3942    let p_target = mgcv_like_total
3943        .max(min_k.pow(d as u32) as f64)
3944        .min(data_budget);
3945
3946    // Geometric per-margin target so ∏k ≈ p_target, then clamp each margin to
3947    // its own 1-D resolution cap and the difference-penalty floor.
3948    let geo_per_margin = p_target.powf(1.0 / d as f64).round() as usize;
3949    let unclamped: Vec<usize> = per_margin_cap
3950        .iter()
3951        .map(|&cap| geo_per_margin.clamp(min_k, cap))
3952        .collect();
3953
3954    // The per-margin clamps can pull some axes below `geo_per_margin` (a
3955    // low-resolution column), leaving headroom in the joint budget. Redistribute
3956    // that headroom to the margins that can still grow, so the realized ∏k stays
3957    // close to p_target instead of systematically under-shooting it.
3958    let mut k_list = unclamped;
3959    loop {
3960        let product: f64 = k_list.iter().map(|&k| k as f64).product();
3961        if product >= p_target {
3962            break;
3963        }
3964        // Grow the axis with the most remaining headroom (cap − current),
3965        // breaking ties toward the largest cap. Stop when none can grow.
3966        let Some(idx) = k_list
3967            .iter()
3968            .zip(per_margin_cap.iter())
3969            .enumerate()
3970            .filter(|&(_, (k, cap))| k < cap)
3971            .max_by_key(|&(_, (k, cap))| (cap - k, *cap))
3972            .map(|(i, _)| i)
3973        else {
3974            break;
3975        };
3976        k_list[idx] += 1;
3977    }
3978    k_list
3979}
3980
/// Heuristic default center count: forwards `(n, d)` unchanged to
/// `default_num_centers`.
///
/// NOTE(review): `n` and `d` are presumably the number of observations and the
/// smooth's input dimension — confirm against `default_num_centers`.
pub fn heuristic_centers(n: usize, d: usize) -> usize {
    default_num_centers(n, d)
}
3984
3985// ---------------------------------------------------------------------------
3986// Smooth option parsers
3987// ---------------------------------------------------------------------------
3988
3989fn parse_endpoint_side(
3990    value: &str,
3991    context: &str,
3992) -> Result<BSplineEndpointBoundaryCondition, String> {
3993    match value.trim().to_ascii_lowercase().as_str() {
3994        "" | "none" | "open" | "unconstrained" | "free" => {
3995            Ok(BSplineEndpointBoundaryCondition::Free)
3996        }
3997        "clamped" | "clamp" | "zero_derivative" | "zero-derivative" => {
3998            Ok(BSplineEndpointBoundaryCondition::Clamped)
3999        }
4000        "anchored" | "anchor" | "zero" | "zero_value" | "zero-value" => {
4001            Ok(BSplineEndpointBoundaryCondition::Anchored { value: 0.0 })
4002        }
4003        other => Err(format!(
4004            "unsupported {context} boundary condition '{other}'; expected free, clamped, or anchored"
4005        )),
4006    }
4007}
4008
4009fn boundary_anchor_value(
4010    options: &BTreeMap<String, String>,
4011    side: &str,
4012    fallback: Option<f64>,
4013) -> Option<f64> {
4014    [
4015        format!("anchor_{side}"),
4016        format!("{side}_anchor"),
4017        format!("anchor-value-{side}"),
4018    ]
4019    .iter()
4020    .find_map(|key| option_f64(options, key))
4021    .or(fallback)
4022}
4023
4024fn apply_anchor_value(
4025    cond: BSplineEndpointBoundaryCondition,
4026    value: Option<f64>,
4027) -> BSplineEndpointBoundaryCondition {
4028    match cond {
4029        BSplineEndpointBoundaryCondition::Anchored { .. } => {
4030            BSplineEndpointBoundaryCondition::Anchored {
4031                value: value.unwrap_or(0.0),
4032            }
4033        }
4034        other => other,
4035    }
4036}
4037
4038fn parse_bspline_boundary_conditions(
4039    options: &BTreeMap<String, String>,
4040) -> Result<BSplineBoundaryConditions, String> {
4041    let fallback_anchor = option_f64(options, "anchor")
4042        .or_else(|| option_f64(options, "anchor_value"))
4043        .or_else(|| option_f64(options, "value"));
4044    let global_boundary_conditions = options
4045        .get("boundary_conditions")
4046        .or_else(|| options.get("bc"));
4047    let mut boundary_conditions = BSplineBoundaryConditions::default();
4048
4049    if let Some(raw_boundary_conditions) = global_boundary_conditions {
4050        let cond = parse_endpoint_side(raw_boundary_conditions, "boundary_conditions")?;
4051        let side = options
4052            .get("side")
4053            .map(|s| s.trim().to_ascii_lowercase())
4054            .unwrap_or_else(|| "both".to_string());
4055        match side.as_str() {
4056            "both" | "all" | "endpoints" => {
4057                boundary_conditions.left = cond;
4058                boundary_conditions.right = cond;
4059            }
4060            "left" | "start" | "lower" => boundary_conditions.left = cond,
4061            "right" | "end" | "upper" => boundary_conditions.right = cond,
4062            other => {
4063                return Err(format!(
4064                    "unsupported B-spline boundary side '{other}'; expected left, right, or both"
4065                ));
4066            }
4067        }
4068    }
4069
4070    if let Some(raw) = options
4071        .get("bc_left")
4072        .or_else(|| options.get("left_bc"))
4073        .or_else(|| options.get("bc_start"))
4074        .or_else(|| options.get("start_bc"))
4075    {
4076        boundary_conditions.left = parse_endpoint_side(raw, "left endpoint")?;
4077    }
4078    if let Some(raw) = options
4079        .get("bc_right")
4080        .or_else(|| options.get("right_bc"))
4081        .or_else(|| options.get("bc_end"))
4082        .or_else(|| options.get("end_bc"))
4083    {
4084        boundary_conditions.right = parse_endpoint_side(raw, "right endpoint")?;
4085    }
4086
4087    boundary_conditions.left = apply_anchor_value(
4088        boundary_conditions.left,
4089        boundary_anchor_value(options, "left", fallback_anchor),
4090    );
4091    boundary_conditions.right = apply_anchor_value(
4092        boundary_conditions.right,
4093        boundary_anchor_value(options, "right", fallback_anchor),
4094    );
4095
4096    // Non-zero anchors require an affine offset term that the current basis
4097    // builder does not synthesize (see `build_bspline_basis_1d` in
4098    // src/terms/basis.rs). Surface the rejection at parse time with the side
4099    // and value in the diagnostic, instead of letting the value-only error
4100    // emerge deep inside the basis builder where the user has no context
4101    // about which anchor key (`anchor`, `left_anchor`, `right_anchor`, …)
4102    // routed into which endpoint.
4103    reject_nonzero_anchor("left", boundary_conditions.left)?;
4104    reject_nonzero_anchor("right", boundary_conditions.right)?;
4105
4106    Ok(boundary_conditions)
4107}
4108
4109fn reject_nonzero_anchor(side: &str, cond: BSplineEndpointBoundaryCondition) -> Result<(), String> {
4110    if let BSplineEndpointBoundaryCondition::Anchored { value } = cond {
4111        if value.abs() > 1e-12 {
4112            return Err(format!(
4113                "non-zero {side} anchor {value} requires an affine offset term that is not yet supported; only anchored value 0 is accepted at parse time"
4114            ));
4115        }
4116    }
4117    Ok(())
4118}
4119
4120/// Resolve the requested internal-knot count and effective spline degree for
4121/// a 1-D penalized B-spline smooth. This mirrors the tensor-margin per-axis
4122/// degree-reduction policy: a 1-D B-spline basis with `k` functions
4123/// is well-defined for any `degree <= k - 1`, so an explicit
4124/// `s(x, bs="ps", k=3)` with default `degree=3` is interpreted as the
4125/// largest representable spline (`effective_degree = k - 1 = 2`, quadratic)
4126/// rather than rejected. The `penalty_order` carried by the caller must be
4127/// clamped to `<= effective_degree` so the marginal difference penalty
4128/// stays well-defined; the returned `effective_degree` makes that explicit.
4129///
4130/// Mirrors the tensor margin treatment in the `te(...)` builder so a
4131/// standalone smooth, a factor smooth, and a tensor margin all interpret
4132/// "small k" the same way.
4133fn parse_ps_internal_knots(
4134    options: &BTreeMap<String, String>,
4135    degree: usize,
4136    default_internal_knots: usize,
4137) -> Result<(usize, bool, usize), String> {
4138    const MIN_EXPRESSIVE_INTERNAL_KNOTS: usize = 2;
4139    // Strict variants: reject `k=-1`, `k=1.5`, `knots=-2` etc. with a
4140    // focused error instead of silently dropping the value and using the
4141    // default. Lenient `option_usize` / `option_usize_any` silently swallow
4142    // unparseable values, which leaves the user thinking they configured
4143    // something when they did not.
4144    // A list-valued `knots=[...]` carries explicit internal positions, not a
4145    // count; it is consumed by `parse_explicit_internal_knots`. Treat it as
4146    // "count not specified" here so the strict integer parse does not reject
4147    // the bracketed value (the Provided path ignores the returned count).
4148    let knots_internal = if knots_option_is_list(options) {
4149        None
4150    } else {
4151        option_usize_strict(options, "knots")?
4152    };
4153    let basis_dim = option_usize_any_strict(options, &["k", "basis_dim", "basis-dim", "basisdim"])?;
4154    if knots_internal.is_some() && basis_dim.is_some() {
4155        return Err(TermBuilderError::incompatible_config(
4156            "ps/bspline smooth: specify either knots=<internal_knots> or k=<basis_dim> (not both)",
4157        )
4158        .to_string());
4159    }
4160    if let Some(k) = basis_dim {
4161        if k < 2 {
4162            return Err(TermBuilderError::invalid_option(format!(
4163                "ps/bspline smooth: k={} too small; B-spline basis requires k >= 2",
4164                k
4165            ))
4166            .to_string());
4167        }
4168        // `degree <= k - 1` is required for the B-spline basis to be
4169        // well-defined; reduce on this axis only when the user asked for
4170        // a smaller k than the cubic default supports. This matches mgcv's
4171        // behaviour (e.g. `s(x, bs="ps", k=3)` becomes a quadratic basis)
4172        // and the per-axis reduction the tensor builder already does.
4173        let effective_degree = degree.min(k - 1).max(1);
4174        let num_internal_knots = if effective_degree < degree {
4175            // Reproduce the requested basis size exactly when degree was
4176            // reduced for a low-cardinality axis: num_basis = k.
4177            k.saturating_sub(effective_degree + 1)
4178        } else {
4179            (k - degree - 1).max(MIN_EXPRESSIVE_INTERNAL_KNOTS)
4180        };
4181        Ok((num_internal_knots, false, effective_degree))
4182    } else {
4183        Ok((
4184            knots_internal.unwrap_or(default_internal_knots),
4185            knots_internal.is_none(),
4186            degree,
4187        ))
4188    }
4189}
4190
/// Report whether the `knots` option holds a *list* literal rather than a
/// scalar count.
///
/// mgcv's `knots=` accepts both forms: a single integer is an internal-knot
/// count, a vector is explicit internal knot positions. The two are told apart
/// purely by wrapper syntax — the trimmed value starting with `[`, `c(`, `C(`
/// or `(` — so a bare `knots=5` keeps its historical count meaning. Returns
/// `false` when the option is absent.
fn knots_option_is_list(options: &BTreeMap<String, String>) -> bool {
    const LIST_OPENERS: [&str; 4] = ["[", "c(", "C(", "("];

    let Some(raw) = options.get("knots") else {
        return false;
    };
    let trimmed = raw.trim();
    LIST_OPENERS
        .iter()
        .any(|&opener| trimmed.starts_with(opener))
}
4205
4206/// Parse `knots=[k0, k1, ...]` (or `c(...)` / `(...)`) into explicit internal
4207/// knot positions. Returns `Ok(None)` when `knots` is absent or a scalar count
4208/// (handled by [`parse_ps_internal_knots`]); `Ok(Some(positions))` when it is a
4209/// non-empty numeric list; and an error for an empty or unparseable list.
4210fn parse_explicit_internal_knots(
4211    options: &BTreeMap<String, String>,
4212) -> Result<Option<Vec<f64>>, String> {
4213    if !knots_option_is_list(options) {
4214        return Ok(None);
4215    }
4216    let raw = options
4217        .get("knots")
4218        .expect("knots_option_is_list implies the key is present");
4219    let tokens = split_list_option(raw);
4220    if tokens.is_empty() {
4221        return Err(TermBuilderError::invalid_option(format!(
4222            "knots={raw} is an empty list; supply at least one internal knot position \
4223             (e.g. knots=[0.2, 0.5, 0.8]) or a scalar count (e.g. knots=8)"
4224        ))
4225        .to_string());
4226    }
4227    let mut positions = Vec::with_capacity(tokens.len());
4228    for tok in &tokens {
4229        let value = parse_numeric_expr(tok).map_err(|err| {
4230            TermBuilderError::invalid_option(format!(
4231                "knots list entry '{tok}' is not a numeric position: {err}"
4232            ))
4233            .to_string()
4234        })?;
4235        positions.push(value);
4236    }
4237    Ok(Some(positions))
4238}
4239
4240/// Resolve the `knot_placement=` option for an automatically generated knot
4241/// vector. Accepts `"uniform"` (the default, equal spacing on the data range)
4242/// and `"quantile"` (interior knots at empirical data quantiles, better for
4243/// skewed covariates). Unknown values are rejected so typos do not silently
4244/// fall back to uniform.
4245fn parse_knot_placement(
4246    options: &BTreeMap<String, String>,
4247) -> Result<crate::basis::BSplineKnotPlacement, String> {
4248    use crate::basis::BSplineKnotPlacement;
4249    match options
4250        .get("knot_placement")
4251        .or_else(|| options.get("knot-placement"))
4252        .or_else(|| options.get("knotplacement"))
4253    {
4254        None => Ok(BSplineKnotPlacement::Uniform),
4255        Some(raw) => match raw
4256            .trim()
4257            .trim_matches('"')
4258            .trim_matches('\'')
4259            .to_ascii_lowercase()
4260            .as_str()
4261        {
4262            "uniform" | "even" | "equal" => Ok(BSplineKnotPlacement::Uniform),
4263            "quantile" | "quantiles" | "data" | "empirical" => Ok(BSplineKnotPlacement::Quantile),
4264            other => Err(TermBuilderError::invalid_option(format!(
4265                "knot_placement={other} is not recognised; expected \"uniform\" or \"quantile\""
4266            ))
4267            .to_string()),
4268        },
4269    }
4270}
4271
4272/// Build the non-periodic 1D B-spline knot spec for the `ps`/`bspline` and
4273/// factor-smooth marginal paths, honoring (in priority order):
4274///   1. `knots=[...]` explicit internal positions  → [`BSplineKnotSpec::Provided`]
4275///   2. `knot_placement="quantile"`                 → [`BSplineKnotSpec::Automatic`]
4276///   3. uniform generation                          → [`BSplineKnotSpec::Generate`]
4277///
4278/// `data` is the covariate column (used to clamp explicit positions to the
4279/// observed range and to drive quantile placement); `n_knots` is the resolved
4280/// internal-knot count from [`parse_ps_internal_knots`] used for the automatic
4281/// strategies.
4282fn resolve_nonperiodic_bspline_knotspec(
4283    options: &BTreeMap<String, String>,
4284    data: ArrayView1<'_, f64>,
4285    data_range: (f64, f64),
4286    degree: usize,
4287    n_knots: usize,
4288) -> Result<BSplineKnotSpec, String> {
4289    use crate::basis::{BSplineKnotPlacement, clamped_knot_vector_from_internal_positions};
4290    if let Some(positions) = parse_explicit_internal_knots(options)? {
4291        if option_usize_any_strict(options, &["k", "basis_dim", "basis-dim", "basisdim"])?.is_some()
4292        {
4293            return Err(TermBuilderError::incompatible_config(
4294                "ps/bspline smooth: specify either explicit knots=[...] positions or \
4295                 k=<basis_dim> (not both); the basis size is fixed by the knot vector",
4296            )
4297            .to_string());
4298        }
4299        let knots = clamped_knot_vector_from_internal_positions(data_range, &positions, degree)
4300            .map_err(|e| e.to_string())?;
4301        return Ok(BSplineKnotSpec::Provided(knots));
4302    }
4303    match parse_knot_placement(options)? {
4304        BSplineKnotPlacement::Uniform => Ok(BSplineKnotSpec::Generate {
4305            data_range,
4306            num_internal_knots: n_knots,
4307        }),
4308        BSplineKnotPlacement::Quantile => {
4309            // Validate the column up-front so an unfittable request surfaces a
4310            // user-correctable error at parse time rather than deep in basis
4311            // construction. The same data drives the eventual quantile knots.
4312            crate::basis::auto_knot_vector_1d_quantile(data, n_knots, degree)
4313                .map_err(|e| e.to_string())?;
4314            Ok(BSplineKnotSpec::Automatic {
4315                num_internal_knots: Some(n_knots),
4316                placement: BSplineKnotPlacement::Quantile,
4317            })
4318        }
4319    }
4320}
4321
4322/// Reject unknown option keys with a focused error that names the term and
4323/// the offending key, plus suggests near-matches from the known-key list.
4324/// Without this, typos like `lengt_scale=0.1` or `nyu=5/2` are silently
4325/// dropped, the term uses the default, and the user has no idea why their
4326/// option had no effect.
4327pub fn validate_known_options(
4328    term_name: &str,
4329    options: &BTreeMap<String, String>,
4330    known: &[&str],
4331) -> Result<(), String> {
4332    let known_set: std::collections::BTreeSet<&&str> = known.iter().collect();
4333    for key in options.keys() {
4334        if !known_set.contains(&key.as_str()) {
4335            if term_name == "tensor" && is_tensor_k_axis_option_key(key) {
4336                continue;
4337            }
4338            // Suggest near-matches (substring or shared prefix ≥ 3).
4339            let key_l = key.to_ascii_lowercase();
4340            let mut suggestions: Vec<&str> = known
4341                .iter()
4342                .filter(|k| {
4343                    let kl = k.to_ascii_lowercase();
4344                    kl.contains(&key_l) || key_l.contains(&kl) || {
4345                        let n = kl
4346                            .chars()
4347                            .zip(key_l.chars())
4348                            .take_while(|(a, b)| a == b)
4349                            .count();
4350                        n >= 3
4351                    }
4352                })
4353                .copied()
4354                .collect();
4355            suggestions.sort_unstable();
4356            suggestions.dedup();
4357            let hint = if suggestions.is_empty() {
4358                String::new()
4359            } else {
4360                format!(" — did you mean one of [{}]?", suggestions.join(", "))
4361            };
4362            return Err(TermBuilderError::invalid_option(format!(
4363                "{term_name}() does not accept option `{key}`{hint}. Valid options: [{}]",
4364                {
4365                    let mut sorted = known.to_vec();
4366                    sorted.sort_unstable();
4367                    sorted.join(", ")
4368                }
4369            ))
4370            .to_string());
4371        }
4372    }
4373    Ok(())
4374}
4375
/// Private, engine-injected option key that caps the *default* spatial center
/// count for a secondary (distributional) predictor's smooth — see
/// `solver::fit_orchestration::apply_secondary_predictor_basis_parsimony` and #501.
///
/// The key is deliberately NOT one of the user-facing count aliases recognised
/// by [`has_explicit_countwith_basis_alias`], so its presence never flips the
/// spatial basis onto the explicit (hard) center-placement strategy: the cap
/// only lowers the *default* count while the `Auto` strategy is retained, so
/// the count is still softly reduced when the data can't support it.
pub const SECONDARY_CENTER_CAP_OPTION: &str = "__secondary_center_cap";
4386
4387/// Apply the secondary-predictor center cap to a *default* spatial center
4388/// count. A no-op when the cap option is absent (the common case) or when the
4389/// user supplied an explicit count (then `default_count` is ignored downstream
4390/// by [`parse_countwith_basis_alias`] anyway).
4391pub(crate) fn cap_default_spatial_centers(
4392    options: &BTreeMap<String, String>,
4393    default_count: usize,
4394) -> usize {
4395    match option_usize(options, SECONDARY_CENTER_CAP_OPTION) {
4396        Some(cap) => default_count.min(cap),
4397        None => default_count,
4398    }
4399}
4400
/// Default Matérn center count: the planned count, raised to a small-sample
/// floor of `min(d + 4, n)` and never below 1.
///
/// #1074: the mgcv-sized basis cap (`k = 10·3^(d-1)`) was DELETED here too — it
/// masked the same over-sizing/under-penalization defect by shrinking the basis
/// rather than fixing the optimizer, so the default simply follows the generic
/// n-scaling plan passed in as `planned_count`. The floor is kept as a
/// legitimate degenerate guard against a numerically-fragile two-column kernel
/// block. Explicit `k`/`centers` still take full effect upstream.
fn default_matern_center_count(n: usize, d: usize, planned_count: usize) -> usize {
    let small_sample_floor = std::cmp::min(d + 4, n);
    let floored = std::cmp::max(planned_count, small_sample_floor);
    std::cmp::max(floored, 1)
}
4411
/// Default Duchon center count: the planned count capped at the mgcv-style
/// `10 * 3^(d - 1)` and then raised to `min(polynomial_cols + 1, n)` (at
/// least 1).
///
/// Duchon fits pay a larger setup cost than Matérn/TPS because the constrained
/// radial block is rotated through its center Gram and several
/// operator-collocation penalties. The old generic spatial default handed a
/// 2-D Gaussian Duchon at n≈500 more than one hundred centers, so cold fits
/// spent most of their time in dense O(k³) eigensolves even though the REML
/// smoother uses a low-rank basis. mgcv's Duchon spline default is the
/// thin-plate-style `k = 10 * 3^(d - 1)` (30 in 2-D); that value serves as the
/// implicit low-rank cap, while an explicit `centers=`/`k=` request is honoured
/// upstream. The polynomial null space must still fit, so tiny high-order
/// bases are raised to the smallest admissible count.
fn default_duchon_center_count(
    n: usize,
    d: usize,
    planned_count: usize,
    polynomial_cols: usize,
) -> usize {
    // Saturating arithmetic keeps the cap well-defined for large `d`.
    let exponent = d.saturating_sub(1) as u32;
    let mgcv_cap = 3usize.saturating_pow(exponent).saturating_mul(10);

    let nullspace_floor = std::cmp::max(std::cmp::min(polynomial_cols + 1, n), 1);
    let capped = std::cmp::min(planned_count, mgcv_cap);
    std::cmp::max(capped, nullspace_floor)
}
4432
4433pub fn parse_countwith_basis_alias(
4434    options: &BTreeMap<String, String>,
4435    primarykey: &str,
4436    default_count: usize,
4437) -> Result<usize, String> {
4438    // Strict: reject unparseable values (e.g. `centers=many`, `centers=-1`,
4439    // `centers=1.5`) instead of silently dropping them and falling through
4440    // to the default. Without this the user gets the auto-inferred count
4441    // silently and never realizes their explicit option was ignored.
4442    let primary = option_usize_strict(options, primarykey)?;
4443    let basis_dim = option_usize_any_strict(
4444        options,
4445        &["k", "basis_dim", "basis-dim", "basisdim", "knots"],
4446    )?;
4447    if primary.is_some() && basis_dim.is_some() {
4448        return Err(TermBuilderError::incompatible_config(format!(
4449            "specify either {}=<count> or k=<basis_dim> (not both)",
4450            primarykey
4451        ))
4452        .to_string());
4453    }
4454    Ok(primary.or(basis_dim).unwrap_or(default_count))
4455}
4456
/// Report whether the user spelled out a count, i.e. whether `options`
/// contains `primarykey` or any of the basis-dimension aliases `k`,
/// `basis_dim`, `basis-dim`, `basisdim`, `knots`.
///
/// Only key presence is inspected; the values are not parsed here.
pub fn has_explicit_countwith_basis_alias(
    options: &BTreeMap<String, String>,
    primarykey: &str,
) -> bool {
    const COUNT_ALIASES: [&str; 5] = ["k", "basis_dim", "basis-dim", "basisdim", "knots"];
    std::iter::once(primarykey)
        .chain(COUNT_ALIASES.iter().copied())
        .any(|key| options.contains_key(key))
}
4466
4467pub fn parse_cyclic_boundary(
4468    options: &BTreeMap<String, String>,
4469    minv: f64,
4470    maxv: f64,
4471) -> Result<OneDimensionalBoundary, String> {
4472    let cyclic = option_bool(options, "cyclic")
4473        .or_else(|| option_bool(options, "periodic"))
4474        .unwrap_or(false);
4475    if !cyclic {
4476        return Ok(OneDimensionalBoundary::Open);
4477    }
4478    let start = match option_numeric_expr(options, "period_start")? {
4479        Some(v) => v,
4480        None => option_numeric_expr(options, "start")?.unwrap_or(minv),
4481    };
4482    let end = match option_numeric_expr(options, "period_end")? {
4483        Some(v) => v,
4484        None => option_numeric_expr(options, "end")?.unwrap_or(maxv),
4485    };
4486    if end <= start {
4487        return Err(format!(
4488            "cyclic smooth requires period_end/end ({end}) > period_start/start ({start})"
4489        ));
4490    }
4491    Ok(OneDimensionalBoundary::Cyclic { start, end })
4492}
4493
4494/// Parse the periodic-uniform domain for a one-dimensional cyclic smooth.
4495///
4496/// Returns the `(domain_start, period)` pair derived from
4497/// `period_start` / `start`, `period_end` / `end`, falling back to the
4498/// data range `[minv, maxv)` when neither bound is provided. The period
4499/// must be strictly positive.
4500pub fn parse_periodic_domain_1d(
4501    options: &BTreeMap<String, String>,
4502    minv: f64,
4503    maxv: f64,
4504) -> Result<(f64, f64), String> {
4505    let start_opt = match option_numeric_expr(options, "period_start")? {
4506        Some(v) => Some(v),
4507        None => option_numeric_expr(options, "start")?,
4508    };
4509    let end_opt = match option_numeric_expr(options, "period_end")? {
4510        Some(v) => Some(v),
4511        None => option_numeric_expr(options, "end")?,
4512    };
4513    // Reject the pure data-range fallback. A B-spline periodic smooth that takes
4514    // its wrap from the observed [min, max] is sample-dependent and silently
4515    // wrong: uniform draws on a true period of 2π land on [ε, 2π−ε], so using
4516    // (max−min) as the period seams the curve with an off-by-ε discontinuity and
4517    // the fit drifts with the sample. (Unlike the radial closed-lattice Duchon
4518    // path, whose centers DO tile a full period, so its span-derive is exact —
4519    // see `parse_periodic_axes_option`.) Require the caller to name the period
4520    // explicitly via `period=`/`period_end`. The end is only defaulted to `maxv`
4521    // when a `period_start`/`start` was given (a half-open declaration); a bare
4522    // periodic smooth with neither bound is an error.
4523    if end_opt.is_none() && start_opt.is_none() {
4524        return Err(
4525            "periodic B-spline smooth requires an explicit period: pass period=<value> \
4526             (e.g. period=2*pi) or period_start=/period_end=. Deriving the period from the \
4527             observed data range is sample-dependent and produces an off-by-ε seam, so it is \
4528             not inferred."
4529                .to_string(),
4530        );
4531    }
4532    let start = start_opt.unwrap_or(minv);
4533    let end = end_opt.unwrap_or(maxv);
4534    if !(start.is_finite() && end.is_finite()) {
4535        return Err(format!(
4536            "periodic smooth domain requires finite endpoints, got ({start}, {end})"
4537        ));
4538    }
4539    if end <= start {
4540        return Err(format!(
4541            "periodic smooth requires period_end/end ({end}) > period_start/start ({start})"
4542        ));
4543    }
4544    Ok((start, end - start))
4545}
4546
4547fn parse_matern_nu(raw: &str) -> Result<MaternNu, String> {
4548    let trimmed = raw.trim();
4549    let lowered = trimmed.to_ascii_lowercase();
4550    match lowered.as_str() {
4551        "1/2" | "0.5" | "half" => return Ok(MaternNu::Half),
4552        "3/2" | "1.5" => return Ok(MaternNu::ThreeHalves),
4553        "5/2" | "2.5" => return Ok(MaternNu::FiveHalves),
4554        "7/2" | "3.5" => return Ok(MaternNu::SevenHalves),
4555        "9/2" | "4.5" => return Ok(MaternNu::NineHalves),
4556        _ => {}
4557    }
4558
4559    let value = if let Some((num, den)) = trimmed.split_once('/') {
4560        let num = num
4561            .trim()
4562            .parse::<f64>()
4563            .map_err(|err| format!("{}: {err}", unsupported_matern_nu_message(raw)))?;
4564        let den = den
4565            .trim()
4566            .parse::<f64>()
4567            .map_err(|err| format!("{}: {err}", unsupported_matern_nu_message(raw)))?;
4568        if den == 0.0 || !num.is_finite() || !den.is_finite() {
4569            return Err(unsupported_matern_nu_message(raw));
4570        }
4571        num / den
4572    } else {
4573        trimmed
4574            .parse::<f64>()
4575            .map_err(|err| format!("{}: {err}", unsupported_matern_nu_message(raw)))?
4576    };
4577
4578    const TOL: f64 = 1e-12;
4579    if (value - 0.5).abs() <= TOL {
4580        Ok(MaternNu::Half)
4581    } else if (value - 1.5).abs() <= TOL {
4582        Ok(MaternNu::ThreeHalves)
4583    } else if (value - 2.5).abs() <= TOL {
4584        Ok(MaternNu::FiveHalves)
4585    } else if (value - 3.5).abs() <= TOL {
4586        Ok(MaternNu::SevenHalves)
4587    } else if (value - 4.5).abs() <= TOL {
4588        Ok(MaternNu::NineHalves)
4589    } else {
4590        Err(unsupported_matern_nu_message(raw))
4591    }
4592}
4593
4594fn unsupported_matern_nu_message(raw: &str) -> String {
4595    TermBuilderError::unsupported_feature(format!(
4596        "unsupported Matern nu '{raw}'; supported half-integer values are 1/2, 3/2, 5/2, 7/2, and 9/2"
4597    ))
4598    .to_string()
4599}
4600
/// How the Duchon kernel power of a smooth is chosen, as resolved from the
/// term's `power=` option by `parse_duchon_power_policy`.
#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
pub enum DuchonPowerPolicy {
    /// The user supplied `power=<value>`; the payload is the parsed value.
    /// When produced by `parse_duchon_power_policy` it is finite and
    /// non-negative.
    Explicit(f64),
    /// No explicit `power=` given: defer to the cubic structural default, which
    /// the builder resolves dimension-aware as `s = (d − 1)/2` (so `φ(r) = r³`
    /// in every dimension). There is no triple-operator minimum any more.
    CubicStructuralDefault,
}
4609
4610pub fn parse_duchon_power_policy(
4611    options: &BTreeMap<String, String>,
4612) -> Result<DuchonPowerPolicy, String> {
4613    if let Some(raw_nu) = options.get("nu") {
4614        return Err(TermBuilderError::incompatible_config(format!(
4615            "Duchon smooths use power=<number>, not nu='{}'. Use power=1.5, power=2, etc.",
4616            raw_nu
4617        ))
4618        .to_string());
4619    }
4620    match options.get("power") {
4621        Some(raw) => {
4622            let value = raw.parse::<f64>().map_err(|err| {
4623                TermBuilderError::invalid_option(format!(
4624                    "invalid Duchon power '{}'; expected a non-negative number such as power=1.5 or power=2: {}",
4625                    raw, err
4626                ))
4627                .to_string()
4628            })?;
4629            if !value.is_finite() || value < 0.0 {
4630                return Err(TermBuilderError::invalid_option(format!(
4631                    "invalid Duchon power '{}'; expected a finite non-negative number such as power=1.5 or power=2",
4632                    raw
4633                ))
4634                .to_string());
4635            }
4636            Ok(DuchonPowerPolicy::Explicit(value))
4637        }
4638        None => Ok(DuchonPowerPolicy::CubicStructuralDefault),
4639    }
4640}
4641
4642pub fn parse_duchon_power(options: &BTreeMap<String, String>) -> Result<f64, String> {
4643    match parse_duchon_power_policy(options)? {
4644        DuchonPowerPolicy::Explicit(power) => Ok(power),
4645        // Context-free placeholder: the bare option parser has no column count,
4646        // so it cannot compute the dimension-aware cubic power `s = (d − 1)/2`.
4647        // The dimension-aware resolution happens later in `build_smooth_basis`;
4648        // this 1.5 is only a stand-in for callers that need a concrete number
4649        // without data context (e.g. round-trip parser tests).
4650        DuchonPowerPolicy::CubicStructuralDefault => Ok(1.5),
4651    }
4652}
4653
4654pub fn parse_duchon_order(
4655    options: &BTreeMap<String, String>,
4656) -> Result<DuchonNullspaceOrder, String> {
4657    match options.get("order") {
4658        // Structural cubic Duchon is affine-by-default: an unspecified order is
4659        // the `Linear` (constant + linear) null space, matching the magic
4660        // default. An explicit `order=0` still selects the constant-only space.
4661        None => Ok(DuchonNullspaceOrder::Linear),
4662        Some(raw) => match raw.parse::<usize>() {
4663            Ok(0) => Ok(DuchonNullspaceOrder::Zero),
4664            Ok(1) => Ok(DuchonNullspaceOrder::Linear),
4665            Ok(other) => Ok(DuchonNullspaceOrder::Degree(other)),
4666            Err(_) => Err(TermBuilderError::invalid_option(format!(
4667                "invalid Duchon order '{}'; expected a non-negative integer such as order=0, order=1, or order=2",
4668                raw
4669            ))
4670            .to_string()),
4671        },
4672    }
4673}
4674
4675fn parse_matern_identifiability(
4676    options: &BTreeMap<String, String>,
4677) -> Result<MaternIdentifiability, TermBuilderError> {
4678    let Some(raw) = options.get("identifiability").map(String::as_str) else {
4679        return Ok(MaternIdentifiability::default());
4680    };
4681    match raw.trim().to_ascii_lowercase().as_str() {
4682        "none" => Ok(MaternIdentifiability::None),
4683        "sum_tozero" | "sum-to-zero" | "center_sum_tozero" | "center-sum-to-zero" | "centered" => {
4684            Ok(MaternIdentifiability::CenterSumToZero)
4685        }
4686        "linear" | "center_linear_orthogonal" | "center-linear-orthogonal" => {
4687            Ok(MaternIdentifiability::CenterLinearOrthogonal)
4688        }
4689        other => Err(TermBuilderError::unsupported_feature(format!(
4690            "invalid Matérn identifiability '{other}'; expected one of: none, sum_tozero, linear"
4691        ))),
4692    }
4693}
4694
4695fn parse_spatial_identifiability(
4696    options: &BTreeMap<String, String>,
4697) -> Result<SpatialIdentifiability, TermBuilderError> {
4698    let Some(raw) = options.get("identifiability").map(String::as_str) else {
4699        return Ok(SpatialIdentifiability::default());
4700    };
4701    match raw.trim().to_ascii_lowercase().as_str() {
4702        "none" => Ok(SpatialIdentifiability::None),
4703        "orthogonal"
4704        | "orthogonal_to_parametric"
4705        | "orthogonal-to-parametric"
4706        | "parametric_orthogonal" => Ok(SpatialIdentifiability::OrthogonalToParametric),
4707        "frozen" => Err(TermBuilderError::unsupported_feature(
4708            "spatial identifiability 'frozen' is internal-only; use none or orthogonal_to_parametric",
4709        )),
4710        other => Err(TermBuilderError::unsupported_feature(format!(
4711            "invalid spatial identifiability '{other}'; expected one of: none, orthogonal_to_parametric"
4712        ))),
4713    }
4714}
4715
4716#[cfg(test)]
4717mod tests {
4718    use super::*;
4719    use crate::basis::OperatorPenaltySpec;
4720    use crate::inference::formula_dsl::parse_formula;
4721    use gam_data::{DataSchema, SchemaColumn};
4722    use ndarray::Array2;
4723    use std::collections::BTreeMap;
4724
4725    fn continuous_dataset(headers: &[&str], rows: Vec<Vec<f64>>) -> Dataset {
4726        let nrows = rows.len();
4727        let ncols = headers.len();
4728        let values = Array2::from_shape_vec(
4729            (nrows, ncols),
4730            rows.into_iter().flat_map(|row| row.into_iter()).collect(),
4731        )
4732        .expect("rectangular test data");
4733        Dataset {
4734            headers: headers.iter().map(|name| name.to_string()).collect(),
4735            values,
4736            schema: DataSchema {
4737                columns: headers
4738                    .iter()
4739                    .map(|name| SchemaColumn {
4740                        name: name.to_string(),
4741                        kind: ColumnKindTag::Continuous,
4742                        levels: vec![],
4743                    })
4744                    .collect(),
4745            },
4746            column_kinds: vec![ColumnKindTag::Continuous; ncols],
4747        }
4748    }
4749
4750    fn factor_dataset() -> Dataset {
4751        let rows = (0..24)
4752            .map(|i| {
4753                let x = i as f64 / 23.0;
4754                let g = (i % 2) as f64;
4755                vec![x + g, x, g]
4756            })
4757            .collect::<Vec<_>>();
4758        Dataset {
4759            headers: vec!["y".into(), "x".into(), "g".into()],
4760            values: Array2::from_shape_vec(
4761                (rows.len(), 3),
4762                rows.into_iter().flat_map(|row| row.into_iter()).collect(),
4763            )
4764            .expect("rectangular factor test data"),
4765            schema: DataSchema {
4766                columns: vec![
4767                    SchemaColumn {
4768                        name: "y".into(),
4769                        kind: ColumnKindTag::Continuous,
4770                        levels: vec![],
4771                    },
4772                    SchemaColumn {
4773                        name: "x".into(),
4774                        kind: ColumnKindTag::Continuous,
4775                        levels: vec![],
4776                    },
4777                    SchemaColumn {
4778                        name: "g".into(),
4779                        kind: ColumnKindTag::Categorical,
4780                        levels: vec!["a".into(), "b".into()],
4781                    },
4782                ],
4783            },
4784            column_kinds: vec![
4785                ColumnKindTag::Continuous,
4786                ColumnKindTag::Continuous,
4787                ColumnKindTag::Categorical,
4788            ],
4789        }
4790    }
4791
4792    /// #1378: the DEFAULT univariate `s(x, bs="tp")` must build a *modest*
4793    /// mgcv-sized basis, not the n-scaled spatial heuristic. The oversized
4794    /// default basis left the two-penalty REML ρ-surface with a flat valley
4795    /// whose optimizer landing point depended on row order, breaking
4796    /// row-permutation invariance. Pin the default 1-D center count so a
4797    /// regression that reinstates the n-scaled default trips here, fast, with
4798    /// no fit/optimizer in the loop.
4799    #[test]
4800    fn default_univariate_thinplate_basis_dim_is_modest() {
4801        // n = 300 (the #1378 scenario): the n-scaled spatial heuristic would
4802        // request ~75 centers here. The modest default must stay near k = 10.
4803        let n = 300usize;
4804        let rows: Vec<Vec<f64>> = (0..n)
4805            .map(|i| {
4806                let x = -3.0 + 6.0 * (i as f64) / ((n - 1) as f64);
4807                vec![x.sin(), x]
4808            })
4809            .collect();
4810        let ds = continuous_dataset(&["y", "x"], rows);
4811
4812        let mut options = BTreeMap::new();
4813        options.insert("bs".to_string(), "tp".to_string());
4814
4815        let mut notes = Vec::new();
4816        let basis = build_smooth_basis(
4817            SmoothKind::S,
4818            &["x".to_string()],
4819            &[1],
4820            &options,
4821            &ds,
4822            &mut notes,
4823            &ResourcePolicy::default_library(),
4824            1,
4825        )
4826        .expect("build default univariate tp smooth");
4827
4828        let centers = match &basis {
4829            SmoothBasisSpec::ThinPlate { spec, .. } => match &spec.center_strategy {
4830                CenterStrategy::Auto(inner) => match inner.as_ref() {
4831                    CenterStrategy::FarthestPoint { num_centers }
4832                    | CenterStrategy::EqualMass { num_centers }
4833                    | CenterStrategy::EqualMassCovarRepresentative { num_centers }
4834                    | CenterStrategy::KMeans { num_centers, .. } => *num_centers,
4835                    other => panic!("unexpected auto inner center strategy: {other:?}"),
4836                },
4837                CenterStrategy::FarthestPoint { num_centers }
4838                | CenterStrategy::EqualMass { num_centers }
4839                | CenterStrategy::EqualMassCovarRepresentative { num_centers }
4840                | CenterStrategy::KMeans { num_centers, .. } => *num_centers,
4841                other => panic!("unexpected center strategy: {other:?}"),
4842            },
4843            other => panic!("expected ThinPlate basis, got {other:?}"),
4844        };
4845
4846        // #1074: the mgcv-sized basis-dim ceiling assertion was removed with the
4847        // cap it tested. The default tp basis is now n-scaled; we only assert it
4848        // still builds a usable basis.
4849        assert!(
4850            centers >= 1,
4851            "default univariate tp must still build a usable basis (centers={centers})",
4852        );
4853    }
4854
4855    /// gam#1629: a default 2-D `matern(x1, x2)` (no explicit `length_scale`)
4856    /// must leave the length-scale at the `0.0` auto sentinel — NOT the full
4857    /// data diameter — so the planner's `auto_init_length_scale_in_place` seeds
4858    /// it on the wiggly/resolving side (`max_range / sqrt(n)`), the same regime
4859    /// thin-plate uses. The previous `default_matern_length_scale` returned the
4860    /// full diameter, which is non-zero, so the `0.0`-gated auto-init was a
4861    /// no-op and the κ-optimizer started in the over-smoothed corner and parked
4862    /// there (truth-RMSE ~6× worse than thin-plate/tensor on identical
4863    /// high-frequency 2-D surfaces, insensitive to `k`). This pins the corrected
4864    /// seed geometry without a fit/optimizer in the loop.
4865    #[test]
4866    fn default_matern_2d_seeds_resolving_length_scale_not_overscaled_diameter() {
4867        // A fine multi-frequency 2-D grid (the #1629 reproduction shape): the
4868        // data diameter is O(1.4) in each axis; the resolving seed must be far
4869        // smaller than the diameter so high-frequency structure stays reachable.
4870        let side = 24usize; // n = 576
4871        let mut rows: Vec<Vec<f64>> = Vec::with_capacity(side * side);
4872        for i in 0..side {
4873            for j in 0..side {
4874                let x1 = i as f64 / (side - 1) as f64; // [0, 1]
4875                let x2 = j as f64 / (side - 1) as f64; // [0, 1]
4876                let y = (6.0 * x1).sin() * (6.0 * x2).cos();
4877                rows.push(vec![y, x1, x2]);
4878            }
4879        }
4880        let n = rows.len();
4881        let ds = continuous_dataset(&["y", "x1", "x2"], rows);
4882
4883        let mut options = BTreeMap::new();
4884        options.insert("bs".to_string(), "gp".to_string()); // gp ⇒ Matérn
4885        let mut notes = Vec::new();
4886        let mut basis = build_smooth_basis(
4887            SmoothKind::S,
4888            &["x1".to_string(), "x2".to_string()],
4889            &[1, 2],
4890            &options,
4891            &ds,
4892            &mut notes,
4893            &ResourcePolicy::default_library(),
4894            1,
4895        )
4896        .expect("build default 2-D matern smooth");
4897
4898        // (1) The builder must emit the auto sentinel, not a baked-in diameter.
4899        let (feature_cols, seeded_length_scale) = match &basis {
4900            SmoothBasisSpec::Matern {
4901                feature_cols, spec, ..
4902            } => (feature_cols.clone(), spec.length_scale),
4903            other => panic!("expected Matern basis, got {other:?}"),
4904        };
4905        assert_eq!(
4906            seeded_length_scale, 0.0,
4907            "default matern() must leave length_scale at the 0.0 auto sentinel \
4908             (got {seeded_length_scale}); a non-zero diameter default re-enters the \
4909             over-smoothed basin and disables the planner's wiggly-side auto-init",
4910        );
4911
4912        // (2) After the shared auto-init runs, the realized length-scale must
4913        // land in the resolving regime: `max_range / sqrt(n)`, far below the
4914        // data diameter. This is the seed the κ-optimizer starts REML from.
4915        crate::smooth::auto_init_length_scale_in_basis(ds.values.view(), &mut basis);
4916        let realized = match &basis {
4917            SmoothBasisSpec::Matern { spec, .. } => spec.length_scale,
4918            other => panic!("expected Matern basis after auto-init, got {other:?}"),
4919        };
4920        let expected = crate::smooth::auto_initial_length_scale(ds.values.view(), &feature_cols);
4921        assert!(
4922            (realized - expected).abs() <= 1e-12,
4923            "auto-init must seed the wiggly-side length scale max_range/sqrt(n) \
4924             (expected {expected}, got {realized})",
4925        );
4926
4927        // Sanity: the resolving seed is well below the per-axis range (≈1.0).
4928        // Before the fix the seed was the full diameter (≈√2 ≈ 1.414); the
4929        // resolving seed here is ≈ 1.0 / sqrt(576) ≈ 0.042, ~30× smaller.
4930        let max_range = 1.0_f64; // each axis spans [0, 1]
4931        assert!(
4932            realized < max_range / 4.0,
4933            "matern seed length_scale {realized} must be in the resolving regime, \
4934             not the over-smoothed diameter corner (n={n}, max_range≈{max_range})",
4935        );
4936    }
4937
4938    /// gam#1778: `matern(..., periodic=true)` and `thinplate(..., periodic=true)`
4939    /// must be ACCEPTED. The squash-merge that wired periodic support into the
4940    /// matern/thinplate basis specs forgot to add the periodic option keys to
4941    /// those two builders' `validate_known_options` whitelists (only `duchon`
4942    /// got both), so `periodic=`/`period=`/`cyclic=`/`period_start=`/`period_end=`
4943    /// were rejected as unknown options even though the spec/builder consume them.
4944    /// Before the whitelist fix this returned an "unknown option" error.
4945    #[test]
4946    fn matern_and_thinplate_accept_periodic_option() {
4947        let n = 200usize;
4948        let rows: Vec<Vec<f64>> = (0..n)
4949            .map(|i| {
4950                let x = -3.0 + 6.0 * (i as f64) / ((n - 1) as f64);
4951                vec![x.sin(), x]
4952            })
4953            .collect();
4954        let ds = continuous_dataset(&["y", "x"], rows);
4955
4956        // matern() with periodic=true must build without an unknown-option error.
4957        let mut matern_opts = BTreeMap::new();
4958        matern_opts.insert("bs".to_string(), "gp".to_string()); // gp ⇒ Matérn
4959        matern_opts.insert("periodic".to_string(), "true".to_string());
4960        let mut notes = Vec::new();
4961        let matern_basis = build_smooth_basis(
4962            SmoothKind::S,
4963            &["x".to_string()],
4964            &[1],
4965            &matern_opts,
4966            &ds,
4967            &mut notes,
4968            &ResourcePolicy::default_library(),
4969            1,
4970        )
4971        .expect("matern(x, periodic=true) must be accepted");
4972        match &matern_basis {
4973            SmoothBasisSpec::Matern { spec, .. } => assert!(
4974                spec.periodic.is_some(),
4975                "periodic=true must thread a Some(periodic) into the matern spec",
4976            ),
4977            other => panic!("expected Matern basis, got {other:?}"),
4978        }
4979
4980        // thinplate()/tps() with periodic=true must likewise be accepted.
4981        let mut tps_opts = BTreeMap::new();
4982        tps_opts.insert("bs".to_string(), "tp".to_string());
4983        tps_opts.insert("periodic".to_string(), "true".to_string());
4984        let mut notes = Vec::new();
4985        let tps_basis = build_smooth_basis(
4986            SmoothKind::S,
4987            &["x".to_string()],
4988            &[1],
4989            &tps_opts,
4990            &ds,
4991            &mut notes,
4992            &ResourcePolicy::default_library(),
4993            1,
4994        )
4995        .expect("thinplate(x, periodic=true) must be accepted");
4996        match &tps_basis {
4997            SmoothBasisSpec::ThinPlate { spec, .. } => assert!(
4998                spec.periodic.is_some(),
4999                "periodic=true must thread a Some(periodic) into the thinplate spec",
5000            ),
5001            other => panic!("expected ThinPlate basis, got {other:?}"),
5002        }
5003    }
5004
5005    /// Regression: an explicit scalar `periodic=false` on a radial spatial smooth
5006    /// must build a NON-periodic basis. The scalar-boolean shortcut used to emit
5007    /// `Some(vec![None; dim])`, which the 1-D radial builders route on via
5008    /// `spec.periodic.is_some()` (and the Duchon arm even back-fills the data
5009    /// range into a lone `None`), so `periodic=false` silently produced a
5010    /// *periodic* smooth — the opposite of what was asked. The spec's `periodic`
5011    /// field must be `None` for every radial base (matern / thinplate / duchon),
5012    /// matching the bracketed `[false]` form.
5013    #[test]
5014    fn scalar_periodic_false_builds_non_periodic_radial_smooth() {
5015        let n = 200usize;
5016        let rows: Vec<Vec<f64>> = (0..n)
5017            .map(|i| {
5018                let x = -3.0 + 6.0 * (i as f64) / ((n - 1) as f64);
5019                vec![x.sin(), x]
5020            })
5021            .collect();
5022        let ds = continuous_dataset(&["y", "x"], rows);
5023
5024        let build = |bs: &str| -> SmoothBasisSpec {
5025            let mut opts = BTreeMap::new();
5026            opts.insert("bs".to_string(), bs.to_string());
5027            opts.insert("periodic".to_string(), "false".to_string());
5028            let mut notes = Vec::new();
5029            build_smooth_basis(
5030                SmoothKind::S,
5031                &["x".to_string()],
5032                &[1],
5033                &opts,
5034                &ds,
5035                &mut notes,
5036                &ResourcePolicy::default_library(),
5037                1,
5038            )
5039            .unwrap_or_else(|e| panic!("s(x, bs={bs}, periodic=false) must be accepted: {e}"))
5040        };
5041
5042        match &build("gp") {
5043            SmoothBasisSpec::Matern { spec, .. } => assert!(
5044                spec.periodic.is_none(),
5045                "periodic=false must leave the matern spec non-periodic, got {:?}",
5046                spec.periodic
5047            ),
5048            other => panic!("expected Matern basis, got {other:?}"),
5049        }
5050        match &build("tp") {
5051            SmoothBasisSpec::ThinPlate { spec, .. } => assert!(
5052                spec.periodic.is_none(),
5053                "periodic=false must leave the thinplate spec non-periodic, got {:?}",
5054                spec.periodic
5055            ),
5056            other => panic!("expected ThinPlate basis, got {other:?}"),
5057        }
5058        match &build("duchon") {
5059            SmoothBasisSpec::Duchon { spec, .. } => assert!(
5060                spec.periodic.is_none(),
5061                "periodic=false must leave the duchon spec non-periodic (no data-range \
5062                 back-fill), got {:?}",
5063                spec.periodic
5064            ),
5065            other => panic!("expected Duchon basis, got {other:?}"),
5066        }
5067    }
5068
5069    fn inferred_tensor_basis_product(ds: &Dataset) -> usize {
5070        let parsed = parse_formula("y ~ te(theta, h)").expect("parse tensor formula");
5071        let col_map = ds.column_map();
5072        let mut notes = Vec::new();
5073        let terms = build_termspec(
5074            &parsed.terms,
5075            ds,
5076            &col_map,
5077            &mut notes,
5078            &ResourcePolicy::default_library(),
5079        )
5080        .expect("build tensor termspec");
5081        let SmoothBasisSpec::TensorBSpline { spec, .. } = &terms.smooth_terms[0].basis else {
5082            panic!("expected tensor smooth");
5083        };
5084        spec.marginalspecs
5085            .iter()
5086            .map(|marginal| match marginal.knotspec {
5087                BSplineKnotSpec::Generate {
5088                    num_internal_knots, ..
5089                } => num_internal_knots + marginal.degree + 1,
5090                BSplineKnotSpec::PeriodicUniform { num_basis, .. } => num_basis,
5091                BSplineKnotSpec::Automatic {
5092                    num_internal_knots: Some(num_internal_knots),
5093                    ..
5094                } => num_internal_knots + marginal.degree + 1,
5095                BSplineKnotSpec::Automatic {
5096                    num_internal_knots: None,
5097                    ..
5098                } => panic!("test helper cannot infer automatic knot count"),
5099                BSplineKnotSpec::Provided(ref knots) => {
5100                    knots.len().saturating_sub(marginal.degree + 1)
5101                }
5102                // cr basis dimension equals the knot count (no degree offset).
5103                BSplineKnotSpec::NaturalCubicRegression { ref knots } => knots.len(),
5104            })
5105            .product()
5106    }
5107
5108    fn tensor_margin_basis_sizes(ds: &Dataset, formula: &str) -> Vec<usize> {
5109        let parsed = parse_formula(formula).expect("parse tensor formula");
5110        let col_map = ds.column_map();
5111        let mut notes = Vec::new();
5112        let terms = build_termspec(
5113            &parsed.terms,
5114            ds,
5115            &col_map,
5116            &mut notes,
5117            &ResourcePolicy::default_library(),
5118        )
5119        .expect("build tensor termspec");
5120        let SmoothBasisSpec::TensorBSpline { spec, .. } = &terms.smooth_terms[0].basis else {
5121            panic!("expected tensor smooth");
5122        };
5123        spec.marginalspecs
5124            .iter()
5125            .map(|marginal| match marginal.knotspec {
5126                BSplineKnotSpec::Generate {
5127                    num_internal_knots, ..
5128                } => num_internal_knots + marginal.degree + 1,
5129                BSplineKnotSpec::PeriodicUniform { num_basis, .. } => num_basis,
5130                BSplineKnotSpec::Automatic {
5131                    num_internal_knots: Some(num_internal_knots),
5132                    ..
5133                } => num_internal_knots + marginal.degree + 1,
5134                BSplineKnotSpec::Automatic {
5135                    num_internal_knots: None,
5136                    ..
5137                } => panic!("test helper cannot infer automatic knot count"),
5138                BSplineKnotSpec::Provided(ref knots) => {
5139                    knots.len().saturating_sub(marginal.degree + 1)
5140                }
5141                // cr basis dimension equals the knot count (no degree offset).
5142                BSplineKnotSpec::NaturalCubicRegression { ref knots } => knots.len(),
5143            })
5144            .collect()
5145    }
5146
5147    #[test]
5148    fn validate_known_options_lists_valid_option_names_for_unknown_parameter() {
5149        let mut options = BTreeMap::new();
5150        options.insert("lengt_scale".to_string(), "0.25".to_string());
5151        let err = validate_known_options(
5152            "matern",
5153            &options,
5154            &["type", "bs", "length_scale", "centers", "k", "nu"],
5155        )
5156        .expect_err("unknown smooth option should be rejected");
5157        assert!(
5158            err.contains("matern() does not accept option `lengt_scale`"),
5159            "error should name the invalid option, got: {err}"
5160        );
5161        assert!(
5162            err.contains("did you mean one of [length_scale]"),
5163            "error should suggest the closest valid option, got: {err}"
5164        );
5165        assert!(
5166            err.contains("Valid options: ["),
5167            "error should list valid option names, got: {err}"
5168        );
5169    }
5170
5171    #[test]
5172    fn tensor_k_accepts_square_bracket_per_margin_list() {
5173        let ds = continuous_dataset(
5174            &["y", "x", "z"],
5175            (0..40)
5176                .map(|i| {
5177                    let x = i as f64 / 39.0;
5178                    let z = ((i * 7) % 40) as f64 / 39.0;
5179                    vec![x.sin() + z.cos(), x, z]
5180                })
5181                .collect(),
5182        );
5183
5184        assert_eq!(
5185            tensor_margin_basis_sizes(&ds, "y ~ te(x, z, k=[5, 6])"),
5186            vec![5, 6],
5187            "square-bracket k lists should materialize the requested per-margin values"
5188        );
5189    }
5190
5191    /// #1776 / #1752: a bare doubly-cyclic tensor `te(x, z, bs=c('cc','cc'))`
5192    /// with NO explicit `period=` must build — each cyclic margin wraps on its
5193    /// own observed `[min, max]` data span (mirroring mgcv's `bs="cc"` and the
5194    /// 1-D cyclic fallback), instead of hard-erroring "periodic but requires an
5195    /// explicit period". The periodic-radial refactor (c8c3192fa) replaced that
5196    /// fallback with an unconditional `period=`-required error and orphaned the
5197    /// `margin_is_cc` binding that drives it (the #1776 dead-binding `-D
5198    /// warnings` build break). This pins the restored data-range derivation so a
5199    /// regression that drops the `None if margin_is_cc` branch trips here, fast,
5200    /// with no fit/optimizer in the loop.
5201    #[test]
5202    fn bare_doubly_cyclic_tensor_derives_period_from_data_range_1776() {
5203        let ds = continuous_dataset(
5204            &["y", "x", "z"],
5205            (0..40)
5206                .map(|i| {
5207                    let x = i as f64 / 39.0;
5208                    let z = ((i * 7) % 40) as f64 / 39.0;
5209                    vec![x.sin() + z.cos(), x, z]
5210                })
5211                .collect(),
5212        );
5213
5214        let parsed = parse_formula("y ~ te(x, z, bs=c('cc','cc'))")
5215            .expect("parse doubly-cyclic tensor formula");
5216        let col_map = ds.column_map();
5217        let mut notes = Vec::new();
5218        // Must NOT hard-error: the bare cyclic margins derive their period from
5219        // the observed data range (the restored #1752 fallback).
5220        let terms = build_termspec(
5221            &parsed.terms,
5222            &ds,
5223            &col_map,
5224            &mut notes,
5225            &ResourcePolicy::default_library(),
5226        )
5227        .expect(
5228            "bare cc-cc tensor must build via the data-range period fallback (#1776/#1752), \
5229             not hard-error on a missing explicit period",
5230        );
5231        let SmoothBasisSpec::TensorBSpline { spec, .. } = &terms.smooth_terms[0].basis else {
5232            panic!("expected tensor smooth");
5233        };
5234        assert_eq!(
5235            spec.marginalspecs.len(),
5236            2,
5237            "te(x, z) builds exactly two tensor margins"
5238        );
5239        for (axis, marginal) in spec.marginalspecs.iter().enumerate() {
5240            assert!(
5241                matches!(marginal.knotspec, BSplineKnotSpec::PeriodicUniform { .. }),
5242                "cyclic margin {axis} must build a periodic (wrapped) knotspec from the \
5243                 data range, got {:?}",
5244                marginal.knotspec
5245            );
5246        }
5247    }
5248
5249    #[test]
5250    fn parse_cylinder_periodic_options_match_requested_forms() {
5251        let mut opts = BTreeMap::new();
5252        opts.insert("periodic".to_string(), "[0]".to_string());
5253        opts.insert("period".to_string(), "[2*pi, None]".to_string());
5254        let axes = parse_periodic_axes(&opts, 2).expect("axes");
5255        let periods = parse_periods(&opts, &axes).expect("periods");
5256        assert_eq!(axes, vec![true, false]);
5257        assert!((periods[0].unwrap() - 2.0 * std::f64::consts::PI).abs() < 1e-12);
5258        assert_eq!(periods[1], None);
5259
5260        let mut boundary_opts = BTreeMap::new();
5261        boundary_opts.insert(
5262            "boundary".to_string(),
5263            "['periodic', 'natural']".to_string(),
5264        );
5265        boundary_opts.insert("period".to_string(), "[2*pi, None]".to_string());
5266        let boundary_axes = parse_periodic_axes(&boundary_opts, 2).expect("boundary axes");
5267        let boundary_periods =
5268            parse_periods(&boundary_opts, &boundary_axes).expect("boundary periods");
5269        assert_eq!(boundary_axes, vec![true, false]);
5270        assert!((boundary_periods[0].unwrap() - 2.0 * std::f64::consts::PI).abs() < 1e-12);
5271        assert_eq!(boundary_periods[1], None);
5272
5273        let mut unicode_opts = BTreeMap::new();
5274        unicode_opts.insert("periodic".to_string(), "[0,1]".to_string());
5275        unicode_opts.insert("period".to_string(), "[2π, τ]".to_string());
5276        let unicode_axes = parse_periodic_axes(&unicode_opts, 2).expect("unicode axes");
5277        let unicode_periods = parse_periods(&unicode_opts, &unicode_axes).expect("unicode periods");
5278        assert_eq!(unicode_axes, vec![true, true]);
5279        assert!((unicode_periods[0].unwrap() - 2.0 * std::f64::consts::PI).abs() < 1e-12);
5280        assert!((unicode_periods[1].unwrap() - std::f64::consts::TAU).abs() < 1e-12);
5281    }
5282
5283    /// The tensor boundary-token guard must ACCEPT `clamped`/`open` (the
5284    /// B-spline-clamped, non-periodic margin spelling) alongside the periodic
5285    /// selectors and the other inert non-periodic markers, and still REJECT a
5286    /// genuine endpoint constraint like `anchored`. This locks the #415 /
5287    /// cylinder fix (`te(theta, z, boundary=['periodic','clamped'])`, mgcv
5288    /// `te(bs=c("cc","ps"))`) in the fast unit lane — the end-to-end cylinder
5289    /// recovery test is R-gated (`run_r` + mgcv), so without this the guard
5290    /// regressing back to rejecting `clamped` would slip through CPU CI.
5291    #[test]
5292    fn tensor_boundary_tokens_accept_clamped_open_reject_anchored() {
5293        fn boundary(raw: &str, dim: usize) -> Result<(), String> {
5294            let mut opts = BTreeMap::new();
5295            opts.insert("boundary".to_string(), raw.to_string());
5296            validate_tensor_boundary_tokens(&opts, dim)
5297        }
5298
5299        // Mixed periodic + clamped (the cylinder) and its bare/case/quote
5300        // variants are all accepted.
5301        for raw in [
5302            "['periodic', 'clamped']",
5303            "['periodic', 'open']",
5304            "['cc', 'clamped']",
5305            "['clamped', 'natural']",
5306            "[Periodic, CLAMPED]",
5307            "c('cc', 'clamped')", // mgcv-style c(...) vector form round-trips
5308        ] {
5309            assert!(
5310                boundary(raw, 2).is_ok(),
5311                "boundary={raw:?} must be accepted (clamped/open/inert non-periodic markers)"
5312            );
5313        }
5314
5315        // `bc=` is an accepted alias for `boundary=`.
5316        let mut bc_opts = BTreeMap::new();
5317        bc_opts.insert("bc".to_string(), "['periodic', 'clamped']".to_string());
5318        assert!(validate_tensor_boundary_tokens(&bc_opts, 2).is_ok());
5319
5320        // A genuine endpoint constraint has no ordinary-margin meaning on a
5321        // tensor and must still be surfaced as a clean unsupported-feature error
5322        // rather than silently dropped.
5323        let err = boundary("['periodic', 'anchored']", 2)
5324            .expect_err("anchored endpoint constraint must be rejected on a tensor margin");
5325        assert!(
5326            err.contains("anchored") && err.contains("not supported"),
5327            "rejection must name the offending token and be an unsupported-feature error: {err}"
5328        );
5329
5330        // Absent boundary/bc is a no-op success.
5331        assert!(validate_tensor_boundary_tokens(&BTreeMap::new(), 2).is_ok());
5332    }
5333
5334    #[test]
5335    fn parse_single_axis_periodic_zero_as_axis_not_false() {
5336        let mut opts = BTreeMap::new();
5337        opts.insert("periodic".to_string(), "[0]".to_string());
5338        opts.insert("period".to_string(), "2*pi".to_string());
5339        opts.insert("origin".to_string(), "0".to_string());
5340        let axes = parse_periodic_axes(&opts, 1).expect("axes");
5341        let periods = parse_periods(&opts, &axes).expect("periods");
5342        let origins = parse_period_origins(&opts, &axes).expect("origins");
5343        assert_eq!(axes, vec![true]);
5344        assert!((periods[0].unwrap() - 2.0 * std::f64::consts::PI).abs() < 1e-12);
5345        assert_eq!(origins[0], Some(0.0));
5346    }
5347
5348    #[test]
5349    fn one_dimensional_bspline_accepts_boundary_periodic() {
5350        let ds = continuous_dataset(
5351            &["y", "theta"],
5352            (0..16)
5353                .map(|i| {
5354                    let theta = std::f64::consts::TAU * i as f64 / 16.0;
5355                    vec![theta.sin(), theta]
5356                })
5357                .collect(),
5358        );
5359        let parsed = parse_formula("y ~ s(theta, boundary=periodic, period=2*pi, origin=0, k=8)")
5360            .expect("parse");
5361        let col_map = ds.column_map();
5362        let mut notes = Vec::new();
5363        let terms = build_termspec(
5364            &parsed.terms,
5365            &ds,
5366            &col_map,
5367            &mut notes,
5368            &gam_runtime::resource::ResourcePolicy::default_library(),
5369        )
5370        .expect("periodic boundary should build");
5371        let SmoothBasisSpec::BSpline1D { spec, .. } = &terms.smooth_terms[0].basis else {
5372            panic!("expected 1D B-spline");
5373        };
5374        assert!(matches!(
5375            &spec.knotspec,
5376            BSplineKnotSpec::PeriodicUniform {
5377                data_range,
5378                num_basis: 8
5379            } if *data_range == (0.0, std::f64::consts::TAU)
5380        ));
5381    }
5382
5383    #[test]
5384    fn univariate_smooth_accepts_mgcv_cubic_regression_aliases() {
5385        let ds = continuous_dataset(
5386            &["y", "x"],
5387            (0..32)
5388                .map(|i| {
5389                    let x = i as f64 / 31.0;
5390                    vec![x * x, x]
5391                })
5392                .collect(),
5393        );
5394        let col_map = ds.column_map();
5395
5396        for (selector, expect_double_penalty) in [("cr", false), ("cs", true)] {
5397            let formula = format!("y ~ s(x, bs='{selector}')");
5398            let parsed = parse_formula(&formula).expect("parse cr/cs smooth");
5399            let mut notes = Vec::new();
5400            let terms = build_termspec(
5401                &parsed.terms,
5402                &ds,
5403                &col_map,
5404                &mut notes,
5405                &gam_runtime::resource::ResourcePolicy::default_library(),
5406            )
5407            .unwrap_or_else(|err| panic!("bs='{selector}' must build a 1-D smooth, got: {err:?}"));
5408            let SmoothBasisSpec::BSpline1D { spec, .. } = &terms.smooth_terms[0].basis else {
5409                panic!(
5410                    "bs='{selector}' must lower to a BSpline1D; got {:?}",
5411                    terms.smooth_terms[0].basis
5412                );
5413            };
5414            assert_eq!(
5415                spec.double_penalty, expect_double_penalty,
5416                "bs='{selector}' must default double_penalty to mgcv's convention \
5417                 (cr=no-shrinkage, cs=shrinkage); got double_penalty={}",
5418                spec.double_penalty
5419            );
5420        }
5421    }
5422
5423    #[test]
5424    fn univariate_ps_small_k_degree_reduces_through_build(/* gam#1130 */) {
5425        // mgcv accepts `s(x, bs="ps", k=3)` (and the default cubic-regression
5426        // `s(x, k=3)`) by silently reducing the cubic basis to a quadratic.
5427        // The univariate ps/bspline build path used to reject this with
5428        // "k too small for degree 3"; it must now lower to a degree-2 basis
5429        // with zero internal knots (num_basis = k = 3), matching the te(...)
5430        // margin behaviour fixed in b75f55a91. Verified across the ps alias
5431        // and the default (cr) selector that both route through
5432        // parse_ps_internal_knots.
5433        let ds = continuous_dataset(
5434            &["y", "x"],
5435            (0..32)
5436                .map(|i| {
5437                    let x = i as f64 / 31.0;
5438                    vec![x * x, x]
5439                })
5440                .collect(),
5441        );
5442        let col_map = ds.column_map();
5443
5444        for formula in ["y ~ s(x, bs='ps', k=3)", "y ~ s(x, k=3)"] {
5445            let parsed = parse_formula(formula).expect("parse small-k ps/cr smooth");
5446            let mut notes = Vec::new();
5447            let terms = build_termspec(
5448                &parsed.terms,
5449                &ds,
5450                &col_map,
5451                &mut notes,
5452                &gam_runtime::resource::ResourcePolicy::default_library(),
5453            )
5454            .unwrap_or_else(|err| {
5455                panic!("`{formula}` must degree-reduce, not error; got: {err:?}")
5456            });
5457            let SmoothBasisSpec::BSpline1D { spec, .. } = &terms.smooth_terms[0].basis else {
5458                panic!(
5459                    "`{formula}` must lower to a BSpline1D; got {:?}",
5460                    terms.smooth_terms[0].basis
5461                );
5462            };
5463            assert_eq!(
5464                spec.degree, 2,
5465                "`{formula}` must drop the cubic default to a quadratic basis"
5466            );
5467            let num_internal = match &spec.knotspec {
5468                BSplineKnotSpec::Generate {
5469                    num_internal_knots, ..
5470                } => *num_internal_knots,
5471                BSplineKnotSpec::Automatic {
5472                    num_internal_knots: Some(n),
5473                    ..
5474                } => *n,
5475                other => panic!("`{formula}` unexpected knotspec: {other:?}"),
5476            };
5477            assert_eq!(
5478                num_internal, 0,
5479                "`{formula}` must have zero internal knots (num_basis = k = 3)"
5480            );
5481            // Resulting basis dimension is num_internal + degree + 1 = 3 = k.
5482            assert!(
5483                spec.penalty_order >= 1 && spec.penalty_order <= spec.degree,
5484                "`{formula}` penalty_order {} must satisfy 1 <= order <= degree={}",
5485                spec.penalty_order,
5486                spec.degree
5487            );
5488        }
5489    }
5490
5491    #[test]
5492    fn formula_shape_constraint_round_trips_and_rejects_bogus() {
5493        let ds = continuous_dataset(
5494            &["y", "x"],
5495            (0..32)
5496                .map(|i| {
5497                    let x = i as f64 / 31.0;
5498                    vec![x * x, x]
5499                })
5500                .collect(),
5501        );
5502        let col_map = ds.column_map();
5503
5504        let parsed =
5505            parse_formula("y ~ s(x, shape=monotone_increasing)").expect("parse monotone smooth");
5506        let mut notes = Vec::new();
5507        let terms = build_termspec(
5508            &parsed.terms,
5509            &ds,
5510            &col_map,
5511            &mut notes,
5512            &gam_runtime::resource::ResourcePolicy::default_library(),
5513        )
5514        .expect("monotone smooth should build");
5515        assert_eq!(
5516            terms.smooth_terms[0].shape,
5517            ShapeConstraint::MonotoneIncreasing
5518        );
5519
5520        let parsed_bad = parse_formula("y ~ s(x, shape=bogus)").expect("parse bogus shape");
5521        let mut notes_bad = Vec::new();
5522        let err = build_termspec(
5523            &parsed_bad.terms,
5524            &ds,
5525            &col_map,
5526            &mut notes_bad,
5527            &gam_runtime::resource::ResourcePolicy::default_library(),
5528        )
5529        .expect_err("bogus shape must error");
5530        assert!(
5531            format!("{err:?}").contains("unknown shape constraint"),
5532            "got: {err:?}"
5533        );
5534    }
5535
5536    #[test]
5537    fn default_sphere_smooth_uses_spherical_farthest_point_centers() {
5538        let ds = continuous_dataset(
5539            &["y", "lat", "lon"],
5540            (0..24)
5541                .map(|i| {
5542                    let t = i as f64 / 24.0;
5543                    let lat = -60.0 + 120.0 * t;
5544                    let lon = -180.0 + 360.0 * ((7 * i) % 24) as f64 / 24.0;
5545                    vec![lat.to_radians().sin(), lat, lon]
5546                })
5547                .collect(),
5548        );
5549        let parsed = parse_formula("y ~ sphere(lat, lon)").expect("parse");
5550        let col_map = ds.column_map();
5551        let mut notes = Vec::new();
5552        let terms = build_termspec(
5553            &parsed.terms,
5554            &ds,
5555            &col_map,
5556            &mut notes,
5557            &gam_runtime::resource::ResourcePolicy::default_library(),
5558        )
5559        .expect("build sphere termspec");
5560        let SmoothBasisSpec::Sphere { spec, .. } = &terms.smooth_terms[0].basis else {
5561            panic!("expected sphere term");
5562        };
5563        assert!(matches!(
5564            spec.center_strategy,
5565            CenterStrategy::FarthestPoint { .. }
5566        ));
5567    }
5568
5569    #[test]
5570    fn one_dimensional_duchon_defaults_to_scale_free_length_scale() {
5571        let ds = continuous_dataset(
5572            &["y", "x"],
5573            (0..32)
5574                .map(|i| {
5575                    let x = i as f64 / 31.0;
5576                    vec![(std::f64::consts::TAU * x).sin(), x]
5577                })
5578                .collect(),
5579        );
5580        let parsed = parse_formula("y ~ duchon(x)").expect("parse");
5581        let col_map = ds.column_map();
5582        let mut notes = Vec::new();
5583        let terms = build_termspec(
5584            &parsed.terms,
5585            &ds,
5586            &col_map,
5587            &mut notes,
5588            &gam_runtime::resource::ResourcePolicy::default_library(),
5589        )
5590        .expect("build default duchon termspec");
5591        let SmoothBasisSpec::Duchon { spec, .. } = &terms.smooth_terms[0].basis else {
5592            panic!("expected Duchon term");
5593        };
5594        assert_eq!(spec.length_scale, None);
5595    }
5596
5597    #[test]
5598    fn formula_duchon_default_does_not_enable_collocation_operators() {
5599        let ds = continuous_dataset(
5600            &["y", "x", "z"],
5601            (0..40)
5602                .map(|i| {
5603                    let x = (i as f64 / 39.0).fract();
5604                    let z = ((7 * i) as f64 / 39.0).fract();
5605                    vec![x + z, x, z]
5606                })
5607                .collect(),
5608        );
5609        let parsed = parse_formula("y ~ duchon(x, z)").expect("parse");
5610        let col_map = ds.column_map();
5611        let mut notes = Vec::new();
5612        let terms = build_termspec(
5613            &parsed.terms,
5614            &ds,
5615            &col_map,
5616            &mut notes,
5617            &gam_runtime::resource::ResourcePolicy::default_library(),
5618        )
5619        .expect("build default 2D duchon termspec");
5620        let SmoothBasisSpec::Duchon { spec, .. } = &terms.smooth_terms[0].basis else {
5621            panic!("expected Duchon term");
5622        };
5623        assert!(matches!(
5624            spec.operator_penalties.mass,
5625            OperatorPenaltySpec::Disabled
5626        ));
5627        assert!(matches!(
5628            spec.operator_penalties.tension,
5629            OperatorPenaltySpec::Disabled
5630        ));
5631        assert!(matches!(
5632            spec.operator_penalties.stiffness,
5633            OperatorPenaltySpec::Disabled
5634        ));
5635    }
5636
5637    #[test]
5638    fn one_dimensional_duchon_length_scale_opts_into_hybrid_mode() {
5639        let ds = continuous_dataset(
5640            &["y", "x"],
5641            (0..32)
5642                .map(|i| {
5643                    let x = i as f64 / 31.0;
5644                    vec![(std::f64::consts::TAU * x).sin(), x]
5645                })
5646                .collect(),
5647        );
5648        let parsed = parse_formula("y ~ duchon(x, length_scale=0.25)").expect("parse");
5649        let col_map = ds.column_map();
5650        let mut notes = Vec::new();
5651        let terms = build_termspec(
5652            &parsed.terms,
5653            &ds,
5654            &col_map,
5655            &mut notes,
5656            &gam_runtime::resource::ResourcePolicy::default_library(),
5657        )
5658        .expect("build hybrid duchon termspec");
5659        let SmoothBasisSpec::Duchon { spec, .. } = &terms.smooth_terms[0].basis else {
5660            panic!("expected Duchon term");
5661        };
5662        assert_eq!(spec.length_scale, Some(0.25));
5663    }
5664
5665    #[test]
5666    fn multidimensional_duchon_default_uses_low_rank_mgcv_sized_basis() {
5667        let ds = continuous_dataset(
5668            &["y", "x1", "x2"],
5669            (0..500)
5670                .map(|i| {
5671                    let x1 = 2.0 * (i as f64 / 499.0) - 1.0;
5672                    let x2 = (((37 * i) % 500) as f64 / 499.0) * 2.0 - 1.0;
5673                    vec![(2.0 * x1).sin() + (1.5 * x2).cos(), x1, x2]
5674                })
5675                .collect(),
5676        );
5677        let parsed = parse_formula("y ~ duchon(x1, x2)").expect("parse");
5678        let col_map = ds.column_map();
5679        let mut notes = Vec::new();
5680        let terms = build_termspec(
5681            &parsed.terms,
5682            &ds,
5683            &col_map,
5684            &mut notes,
5685            &gam_runtime::resource::ResourcePolicy::default_library(),
5686        )
5687        .expect("build default 2D duchon termspec");
5688        let SmoothBasisSpec::Duchon { spec, .. } = &terms.smooth_terms[0].basis else {
5689            panic!("expected Duchon term");
5690        };
5691        let CenterStrategy::Auto(inner) = &spec.center_strategy else {
5692            panic!("expected auto center strategy");
5693        };
5694        assert!(matches!(
5695            inner.as_ref(),
5696            CenterStrategy::FarthestPoint { num_centers: 30 }
5697        ));
5698    }
5699
5700    #[test]
5701    fn parse_matern_nu_accepts_equivalent_half_integer_forms() {
5702        let cases = [
5703            ("1/2", MaternNu::Half),
5704            (" 1 / 2 ", MaternNu::Half),
5705            (".5", MaternNu::Half),
5706            ("0.50", MaternNu::Half),
5707            ("half", MaternNu::Half),
5708            ("3 / 2", MaternNu::ThreeHalves),
5709            ("1.50", MaternNu::ThreeHalves),
5710            ("5 / 2", MaternNu::FiveHalves),
5711            ("2.500000000000", MaternNu::FiveHalves),
5712            ("7 / 2", MaternNu::SevenHalves),
5713            ("3.50", MaternNu::SevenHalves),
5714            ("9 / 2", MaternNu::NineHalves),
5715            ("4.50", MaternNu::NineHalves),
5716        ];
5717        for (raw, expected) in cases {
5718            let parsed = parse_matern_nu(raw).expect(raw);
5719            assert!(
5720                matches!(
5721                    (parsed, expected),
5722                    (MaternNu::Half, MaternNu::Half)
5723                        | (MaternNu::ThreeHalves, MaternNu::ThreeHalves)
5724                        | (MaternNu::FiveHalves, MaternNu::FiveHalves)
5725                        | (MaternNu::SevenHalves, MaternNu::SevenHalves)
5726                        | (MaternNu::NineHalves, MaternNu::NineHalves)
5727                ),
5728                "parsed {raw:?} as {parsed:?}, expected {expected:?}"
5729            );
5730        }
5731    }
5732
5733    #[test]
5734    fn parse_matern_nu_rejects_unsupported_or_invalid_values() {
5735        for raw in ["1", "2", "11/2", "1/0", "nan", "fast"] {
5736            let err = parse_matern_nu(raw).expect_err(raw);
5737            assert!(
5738                err.contains("supported half-integer values"),
5739                "unexpected error for {raw:?}: {err}"
5740            );
5741        }
5742    }
5743
5744    #[test]
5745    fn parse_ps_k_promotes_underexpressive_cubic_basis() {
5746        let mut opts = BTreeMap::new();
5747        opts.insert("k".to_string(), "4".to_string());
5748        let (internal, inferred, eff_degree) = parse_ps_internal_knots(&opts, 3, 20).expect("k=4");
5749        assert_eq!(internal, 2);
5750        assert_eq!(eff_degree, 3);
5751        assert!(!inferred);
5752
5753        opts.insert("k".to_string(), "6".to_string());
5754        let (internal, inferred, eff_degree) = parse_ps_internal_knots(&opts, 3, 20).expect("k=6");
5755        assert_eq!(internal, 2);
5756        assert_eq!(eff_degree, 3);
5757        assert!(!inferred);
5758
5759        opts.insert("k".to_string(), "10".to_string());
5760        let (internal, inferred, eff_degree) = parse_ps_internal_knots(&opts, 3, 20).expect("k=10");
5761        assert_eq!(internal, 6);
5762        assert_eq!(eff_degree, 3);
5763        assert!(!inferred);
5764    }
5765
5766    #[test]
5767    fn parse_ps_internal_knots_drops_degree_for_small_k() {
5768        // mgcv's `s(x, bs="ps", k=3)` with the default cubic basis silently
5769        // reduces to a quadratic (`degree=2`) marginal. `k=3, degree=3`
5770        // should yield a quadratic basis with zero internal knots
5771        // (`num_basis = k = 3`).
5772        let mut opts = BTreeMap::new();
5773        opts.insert("k".to_string(), "3".to_string());
5774        let (internal, inferred, eff_degree) = parse_ps_internal_knots(&opts, 3, 20).expect("k=3");
5775        assert_eq!(eff_degree, 2);
5776        assert_eq!(internal, 0);
5777        assert!(!inferred);
5778
5779        // `k=2` reduces to a linear (`degree=1`) marginal — the smallest
5780        // non-trivial spline basis.
5781        opts.insert("k".to_string(), "2".to_string());
5782        let (internal, inferred, eff_degree) = parse_ps_internal_knots(&opts, 3, 20).expect("k=2");
5783        assert_eq!(eff_degree, 1);
5784        assert_eq!(internal, 0);
5785        assert!(!inferred);
5786
5787        // The under-2 case is structurally under-specified and rejected even
5788        // by the degree-reducing variant: no B-spline basis has fewer than
5789        // two functions.
5790        opts.insert("k".to_string(), "1".to_string());
5791        let err = parse_ps_internal_knots(&opts, 3, 20)
5792            .expect_err("k=1 is below the irreducible spline floor");
5793        assert!(err.contains("requires k >= 2"), "unexpected error: {err}");
5794
5795        // When the user already passed `k >= degree+1`, the helper must
5796        // preserve the existing knot geometry exactly.
5797        opts.insert("k".to_string(), "4".to_string());
5798        let (internal, inferred, eff_degree) = parse_ps_internal_knots(&opts, 3, 20).expect("k=4");
5799        assert_eq!(eff_degree, 3);
5800        assert_eq!(internal, 2);
5801        assert!(!inferred);
5802    }
5803
5804    #[test]
5805    fn factor_smooth_marginal_degree_reduces_for_small_k() {
5806        let ds = factor_dataset();
5807        let col_map = ds.column_map();
5808
5809        for (k, expected_degree) in [(3usize, 2usize), (2usize, 1usize)] {
5810            let parsed =
5811                parse_formula(&format!("y ~ s(x, g, bs=fs, k={k})")).expect("parse factor smooth");
5812            let mut notes = Vec::new();
5813            let terms = build_termspec(
5814                &parsed.terms,
5815                &ds,
5816                &col_map,
5817                &mut notes,
5818                &gam_runtime::resource::ResourcePolicy::default_library(),
5819            )
5820            .unwrap_or_else(|err| panic!("fs k={k} should degree-reduce, got: {err:?}"));
5821            let SmoothBasisSpec::FactorSmooth { spec } = &terms.smooth_terms[0].basis else {
5822                panic!(
5823                    "expected factor smooth, got {:?}",
5824                    terms.smooth_terms[0].basis
5825                );
5826            };
5827            assert_eq!(spec.marginal.degree, expected_degree);
5828            assert!(
5829                spec.marginal.penalty_order <= spec.marginal.degree,
5830                "penalty_order {} must be clamped to degree {}",
5831                spec.marginal.penalty_order,
5832                spec.marginal.degree
5833            );
5834            let basis_size = match spec.marginal.knotspec {
5835                BSplineKnotSpec::Generate {
5836                    num_internal_knots, ..
5837                } => num_internal_knots + spec.marginal.degree + 1,
5838                BSplineKnotSpec::Automatic {
5839                    num_internal_knots: Some(num_internal_knots),
5840                    ..
5841                } => num_internal_knots + spec.marginal.degree + 1,
5842                ref other => panic!("unexpected factor-smooth knotspec: {other:?}"),
5843            };
5844            assert_eq!(basis_size, k);
5845        }
5846    }
5847
5848    /// Build a dataset with a ternary continuous covariate `x ∈ {0,1,2}` and a
5849    /// 2-level categorical group `g`, for the low-cardinality cr-cap tests.
5850    fn ternary_factor_dataset() -> Dataset {
5851        let rows = (0..120)
5852            .map(|i| {
5853                let x = (i % 3) as f64;
5854                let g = (i % 2) as f64;
5855                vec![x + g, x, g]
5856            })
5857            .collect::<Vec<_>>();
5858        Dataset {
5859            headers: vec!["y".into(), "x".into(), "g".into()],
5860            values: Array2::from_shape_vec(
5861                (rows.len(), 3),
5862                rows.into_iter().flat_map(|row| row.into_iter()).collect(),
5863            )
5864            .expect("rectangular ternary factor test data"),
5865            schema: DataSchema {
5866                columns: vec![
5867                    SchemaColumn {
5868                        name: "y".into(),
5869                        kind: ColumnKindTag::Continuous,
5870                        levels: vec![],
5871                    },
5872                    SchemaColumn {
5873                        name: "x".into(),
5874                        kind: ColumnKindTag::Continuous,
5875                        levels: vec![],
5876                    },
5877                    SchemaColumn {
5878                        name: "g".into(),
5879                        kind: ColumnKindTag::Categorical,
5880                        levels: vec!["a".into(), "b".into()],
5881                    },
5882                ],
5883            },
5884            column_kinds: vec![
5885                ColumnKindTag::Continuous,
5886                ColumnKindTag::Continuous,
5887                ColumnKindTag::Categorical,
5888            ],
5889        }
5890    }
5891
5892    #[test]
5893    fn univariate_cr_smooth_caps_knots_to_data_support() {
5894        // #1541: `s(x, bs=cr, k=10)` on a ternary covariate (3 distinct values)
5895        // must NOT hard-fail in cr-knot selection ("cubic regression spline with
5896        // k=10 requires at least 10 distinct values, got 3"). The cr basis is
5897        // capped to the data support — exactly 3 value-knots at {0,1,2} — which
5898        // is full-rank for the data, so it can still represent any 3 group means.
5899        let ds = continuous_dataset(
5900            &["y", "x"],
5901            (0..90)
5902                .map(|i| vec![(i % 3) as f64, (i % 3) as f64])
5903                .collect(),
5904        );
5905        let col_map = ds.column_map();
5906        let parsed = parse_formula("y ~ s(x, bs=cr, k=10)").expect("parse cr smooth");
5907        let mut notes = Vec::new();
5908        let terms = build_termspec(
5909            &parsed.terms,
5910            &ds,
5911            &col_map,
5912            &mut notes,
5913            &gam_runtime::resource::ResourcePolicy::default_library(),
5914        )
5915        .expect("cr k=10 must cap to data support instead of erroring");
5916        let SmoothBasisSpec::BSpline1D { spec, .. } = &terms.smooth_terms[0].basis else {
5917            panic!("expected BSpline1D for s(x, bs=cr)");
5918        };
5919        let BSplineKnotSpec::NaturalCubicRegression { knots } = &spec.knotspec else {
5920            panic!("expected cr knotspec, got {:?}", spec.knotspec);
5921        };
5922        // Capped to exactly the 3 distinct covariate values.
5923        assert_eq!(knots.len(), 3, "cr basis not capped to 3 distinct values");
5924        assert_eq!(knots.as_slice().unwrap(), &[0.0, 1.0, 2.0]);
5925        // The reduction is surfaced to the user (mgcv warns in the same case).
5926        assert!(
5927            notes.iter().any(|n| n.contains("data-support cap")),
5928            "cap not reported in inference notes: {notes:?}"
5929        );
5930    }
5931
5932    #[test]
5933    fn univariate_cr_smooth_binary_covariate_degrades_to_bspline() {
5934        // #1541: a BINARY covariate has too few distinct values (2) for ANY cr
5935        // spline (needs >= 3 distinct). `s(x, bs=cr)` must degrade to a B-spline
5936        // marginal — the default basis the same data already fits — NOT hard-fail.
5937        let ds = continuous_dataset(
5938            &["y", "x"],
5939            (0..80)
5940                .map(|i| vec![(i % 2) as f64, (i % 2) as f64])
5941                .collect(),
5942        );
5943        let col_map = ds.column_map();
5944        let parsed = parse_formula("y ~ s(x, bs=cr, k=10)").expect("parse cr smooth");
5945        let mut notes = Vec::new();
5946        let terms = build_termspec(
5947            &parsed.terms,
5948            &ds,
5949            &col_map,
5950            &mut notes,
5951            &gam_runtime::resource::ResourcePolicy::default_library(),
5952        )
5953        .expect("binary cr must degrade to B-spline instead of erroring");
5954        let SmoothBasisSpec::BSpline1D { spec, .. } = &terms.smooth_terms[0].basis else {
5955            panic!("expected BSpline1D for s(x, bs=cr)");
5956        };
5957        assert!(
5958            !matches!(
5959                spec.knotspec,
5960                BSplineKnotSpec::NaturalCubicRegression { .. }
5961            ),
5962            "binary covariate must NOT build a cr basis, got {:?}",
5963            spec.knotspec
5964        );
5965        assert!(
5966            notes
5967                .iter()
5968                .any(|n| n.contains("Degraded to the linear B-spline")),
5969            "degradation not reported in inference notes: {notes:?}"
5970        );
5971    }
5972
5973    #[test]
5974    fn sz_factor_smooth_low_cardinality_uses_bspline_marginal() {
5975        // #1605: the `sz` factor-smooth marginal is the SAME penalized B-spline
5976        // the `fs` sibling uses — NOT a natural cubic regression (`cr`) marginal,
5977        // whose hard natural boundary conditions f''=0 bias curved deviations
5978        // (a consistency failure). #1542 (the reason this test exists) is
5979        // subsumed: with a B-spline marginal a low-cardinality covariate no
5980        // longer needs a special cr data-support cap and can never hard-fail the
5981        // way the old cr-marginal `sz` spelling did — the build just succeeds,
5982        // exactly as `fs` already does on the identical data.
5983        let ds = ternary_factor_dataset();
5984        let col_map = ds.column_map();
5985        let parsed = parse_formula("y ~ s(x, g, bs=sz, k=10)").expect("parse sz factor smooth");
5986        let mut notes = Vec::new();
5987        let terms = build_termspec(
5988            &parsed.terms,
5989            &ds,
5990            &col_map,
5991            &mut notes,
5992            &gam_runtime::resource::ResourcePolicy::default_library(),
5993        )
5994        .expect("sz on a ternary covariate must build (B-spline marginal), not hard-fail");
5995        let SmoothBasisSpec::FactorSmooth { spec } = &terms.smooth_terms[0].basis else {
5996            panic!("expected FactorSmooth for s(x, g, bs=sz)");
5997        };
5998        assert!(
5999            !matches!(
6000                spec.marginal.knotspec,
6001                BSplineKnotSpec::NaturalCubicRegression { .. }
6002            ),
6003            "sz marginal must be a B-spline (curvature-capable), not the \
6004             natural-BC cr basis; got {:?}",
6005            spec.marginal.knotspec
6006        );
6007    }
6008
6009    /// A dataset with a genuinely continuous covariate `x` (many distinct
6010    /// values) and a `L`-level grouping factor `g`, suitable for building a
6011    /// real factor-smooth marginal with a non-trivial {const, linear} null
6012    /// space. `y` is unused by the structural penalty checks below.
6013    fn continuous_x_factor_dataset(n: usize, n_groups: usize) -> Dataset {
6014        let rows = (0..n)
6015            .map(|i| {
6016                let x = i as f64 / (n as f64 - 1.0);
6017                let g = (i % n_groups) as f64;
6018                vec![x + g, x, g]
6019            })
6020            .collect::<Vec<_>>();
6021        let levels: Vec<String> = (0..n_groups).map(|k| format!("g{k}")).collect();
6022        Dataset {
6023            headers: vec!["y".into(), "x".into(), "g".into()],
6024            values: Array2::from_shape_vec(
6025                (rows.len(), 3),
6026                rows.into_iter().flat_map(|row| row.into_iter()).collect(),
6027            )
6028            .expect("rectangular continuous-x factor data"),
6029            schema: DataSchema {
6030                columns: vec![
6031                    SchemaColumn {
6032                        name: "y".into(),
6033                        kind: ColumnKindTag::Continuous,
6034                        levels: vec![],
6035                    },
6036                    SchemaColumn {
6037                        name: "x".into(),
6038                        kind: ColumnKindTag::Continuous,
6039                        levels: vec![],
6040                    },
6041                    SchemaColumn {
6042                        name: "g".into(),
6043                        kind: ColumnKindTag::Categorical,
6044                        levels,
6045                    },
6046                ],
6047            },
6048            column_kinds: vec![
6049                ColumnKindTag::Continuous,
6050                ColumnKindTag::Continuous,
6051                ColumnKindTag::Categorical,
6052            ],
6053        }
6054    }
6055
6056    fn factor_smooth_spec_for(formula: &str, ds: &Dataset) -> FactorSmoothSpec {
6057        let col_map = ds.column_map();
6058        let parsed = parse_formula(formula).expect("parse factor smooth formula");
6059        let mut notes = Vec::new();
6060        let terms = build_termspec(
6061            &parsed.terms,
6062            ds,
6063            &col_map,
6064            &mut notes,
6065            &gam_runtime::resource::ResourcePolicy::default_library(),
6066        )
6067        .expect("build factor smooth term");
6068        let SmoothBasisSpec::FactorSmooth { spec } = &terms.smooth_terms[0].basis else {
6069            panic!("expected FactorSmooth basis for `{formula}`");
6070        };
6071        spec.clone()
6072    }
6073
6074    /// #1605: the sum-to-zero factor smooth `s(x, g, bs="sz")` under-fit data
6075    /// drawn from its own model class because its deviation blocks carried ONLY
6076    /// the marginal wiggliness penalty — the {const, linear} null space of every
6077    /// deviation curve was left completely unpenalized, so the single combined
6078    /// wiggliness λ could not separate per-group intercept/slope variance from
6079    /// curvature variance and REML parked it over-smoothed (same defect class as
6080    /// the closed #700, more severe). mgcv's `bs="fs"` sibling avoids the gap by
6081    /// adding a SEPARATE per-null-dimension ridge (one λ each), the
6082    /// double-penalty `I_L ⊗ S_j` structure. The fix gives `sz` the same
6083    /// null-space-ridge structure, mapped into the zero-sum CONTRAST space so the
6084    /// constraint (and `sz`'s distinctness from `fs`) is preserved.
6085    ///
6086    /// This pins the structural defect: after the fix the `sz` deviation build
6087    /// must carry MORE than just its wiggliness penalty(s) — exactly one extra
6088    /// null-space-ridge penalty per marginal null direction, matching the count
6089    /// that `fs` carries — while keeping the narrower `(L-1)·p` zero-sum design
6090    /// (NOT the `L·p` full-rank `fs` design). Before the fix `sz` carried only
6091    /// the wiggliness penalties and this fails.
6092    #[test]
6093    fn sz_factor_smooth_carries_null_space_ridge_like_fs() {
6094        let ds = continuous_x_factor_dataset(180, 4);
6095        let mut workspace = crate::basis::BasisWorkspace::new();
6096
6097        let sz_spec = factor_smooth_spec_for("y ~ s(x, g, bs=sz, k=8)", &ds);
6098        let sz_built = crate::smooth::build_factor_smooth(
6099            ds.values.view(),
6100            &sz_spec,
6101            "sz_term",
6102            &mut workspace,
6103        )
6104        .expect("build sz factor smooth");
6105
6106        let fs_spec = factor_smooth_spec_for("y ~ s(x, g, bs=fs, k=8)", &ds);
6107        let fs_built = crate::smooth::build_factor_smooth(
6108            ds.values.view(),
6109            &fs_spec,
6110            "fs_term",
6111            &mut workspace,
6112        )
6113        .expect("build fs factor smooth");
6114
6115        // Penalty structure (#1074 + #1605). `fs` is the exchangeable
6116        // random-effect smooth: all `L` level blocks share ONE wiggliness λ per
6117        // marginal penalty, plus one rank-1 null-space ridge per marginal null
6118        // direction (the #1605 double penalty). `sz` is the sum-to-zero factor
6119        // smooth and mgcv's `smooth.construct.sz` emits ONE penalty matrix PER
6120        // LEVEL — `L` independent curvature smoothing parameters — so REML can
6121        // shrink a low-amplitude group's deviation hard while leaving a busy
6122        // group nearly unpenalized. We mirror that: the single marginal
6123        // wiggliness penalty is split into its `L` independent zero-sum-contrast
6124        // summands (`L-1` free per-group blocks `(e_k e_kᵀ)⊗S` + the reference
6125        // coupling block `(11ᵀ)⊗S`), each carrying its own λ, and the null-space
6126        // ridges stay POOLED (the per-group intercept/slope shrinkage mgcv pools
6127        // under one variance even for `sz`).
6128        //
6129        // So with `nw` marginal wiggliness penalties and `nn` marginal null
6130        // directions: fs has `nw + nn` penalties; sz has `L·nw + nn`. sz must
6131        // therefore carry strictly MORE penalties than fs (the per-group split),
6132        // and the surplus must be exactly `(L-1)·nw`.
6133        let n_levels = sz_spec
6134            .group_frozen_levels
6135            .as_ref()
6136            .map(|l| l.len())
6137            .unwrap_or(4);
6138        assert!(n_levels >= 3, "test needs >=3 groups, got {n_levels}");
6139
6140        // fs = nw + nn  ⇒  nn = fs_penalties - nw. The marginal has nw==1
6141        // wiggliness penalty (a single difference/curvature operator), so the
6142        // per-group split adds exactly (L-1)·nw = (L-1) extra penalties on top of
6143        // fs's count.
6144        let nw = 1usize; // one marginal wiggliness penalty for the B-spline marginal
6145        let expected_sz = fs_built.penalties.len() + (n_levels - 1) * nw;
6146        assert_eq!(
6147            sz_built.penalties.len(),
6148            expected_sz,
6149            "sz must split its wiggliness penalty per level (#1074): expected \
6150             fs_count {} + (L-1)·nw {} = {}, but sz had {}",
6151            fs_built.penalties.len(),
6152            (n_levels - 1) * nw,
6153            expected_sz,
6154            sz_built.penalties.len(),
6155        );
6156        assert!(
6157            sz_built.penalties.len() > fs_built.penalties.len(),
6158            "sz must carry strictly more penalties than fs after the per-group \
6159             split (sz={}, fs={})",
6160            sz_built.penalties.len(),
6161            fs_built.penalties.len(),
6162        );
6163
6164        // The null-space ridges must still be present (the #1605 property that
6165        // keeps the deviation curvature un-over-smoothed). After removing the `L`
6166        // per-group wiggliness blocks, the remainder are the pooled null ridges,
6167        // and there must be at least one (a B-spline marginal has a non-empty
6168        // {const, linear} null space).
6169        let n_wiggliness = n_levels * nw; // L per-group blocks
6170        assert!(
6171            sz_built.penalties.len() > n_wiggliness,
6172            "sz deviation block carries no null-space ridge (penalties={}, \
6173             wiggliness blocks={}); the null space is unpenalized and REML \
6174             over-smooths the deviations",
6175            sz_built.penalties.len(),
6176            n_wiggliness,
6177        );
6178
6179        // The zero-sum constraint must be preserved: the sz design must stay the
6180        // NARROWER `(L-1)·p` contrast design, strictly narrower than the fs
6181        // full-rank `L·p` design. This guards against "fixing" sz by making it
6182        // identical to fs (which would break identifiability / sum-to-zero).
6183        assert!(
6184            sz_built.dim < fs_built.dim,
6185            "sz design width {} must be strictly less than fs width {} \
6186             (zero-sum contrast drops one level block)",
6187            sz_built.dim,
6188            fs_built.dim,
6189        );
6190
6191        // Every penalty/metadata vector must stay parallel (length invariant the
6192        // downstream REML assembly relies on).
6193        assert_eq!(sz_built.penalties.len(), sz_built.nullspaces.len());
6194        assert_eq!(sz_built.penalties.len(), sz_built.penaltyinfo.len());
6195        assert_eq!(sz_built.penalties.len(), sz_built.null_eigenvectors.len());
6196    }
6197
6198    /// #1457: `y ~ s(x, by=g) + g` with a BARE categorical `g` must NOT lower to
6199    /// two `g` design blocks. The bare `+ g` is auto-promoted to a single
6200    /// penalized random-effect block owning the factor's full level offsets; the
6201    /// `by=` branch must then recognize that owner and skip adding its own
6202    /// unpenalized treatment-coded main effect. Before the fix the dedup guard
6203    /// recognized only explicit `group(g)` (a `ParsedTerm::RandomEffect`), so the
6204    /// auto-promoted bare-`+ g` block slipped past and a spurious second `g`
6205    /// block (plus an extra smoothing parameter) was added. Assert exactly ONE
6206    /// `g` random/categorical block, and that adding the bare `+ g` introduces no
6207    /// extra `g` blocks beyond `y ~ s(x, by=g)` alone.
6208    fn factor_dataset_l3() -> Dataset {
6209        // `g` is categorical with THREE levels (encoded 0.0/1.0/2.0).
6210        let rows = (0..30)
6211            .map(|i| {
6212                let x = i as f64 / 29.0;
6213                let g = (i % 3) as f64;
6214                vec![x + g, x, g]
6215            })
6216            .collect::<Vec<_>>();
6217        Dataset {
6218            headers: vec!["y".into(), "x".into(), "g".into()],
6219            values: Array2::from_shape_vec(
6220                (rows.len(), 3),
6221                rows.into_iter().flat_map(|row| row.into_iter()).collect(),
6222            )
6223            .expect("rectangular L=3 factor test data"),
6224            schema: DataSchema {
6225                columns: vec![
6226                    SchemaColumn {
6227                        name: "y".into(),
6228                        kind: ColumnKindTag::Continuous,
6229                        levels: vec![],
6230                    },
6231                    SchemaColumn {
6232                        name: "x".into(),
6233                        kind: ColumnKindTag::Continuous,
6234                        levels: vec![],
6235                    },
6236                    SchemaColumn {
6237                        name: "g".into(),
6238                        kind: ColumnKindTag::Categorical,
6239                        levels: vec!["a".into(), "b".into(), "c".into()],
6240                    },
6241                ],
6242            },
6243            column_kinds: vec![
6244                ColumnKindTag::Continuous,
6245                ColumnKindTag::Continuous,
6246                ColumnKindTag::Categorical,
6247            ],
6248        }
6249    }
6250
6251    #[test]
6252    fn factor_by_smooth_plus_bare_categorical_does_not_duplicate_factor_block() {
6253        let ds = factor_dataset_l3();
6254        let col_map = ds.column_map();
6255
6256        let g_blocks = |formula: &str| -> usize {
6257            let parsed = parse_formula(formula).expect("parse by-smooth formula");
6258            let mut notes = Vec::new();
6259            let terms = build_termspec(
6260                &parsed.terms,
6261                &ds,
6262                &col_map,
6263                &mut notes,
6264                &ResourcePolicy::default_library(),
6265            )
6266            .unwrap_or_else(|err| panic!("`{formula}` must build, got: {err:?}"));
6267            terms
6268                .random_effect_terms
6269                .iter()
6270                .filter(|rt| rt.name == "g")
6271                .count()
6272        };
6273
6274        // Baseline: the standalone factor-by smooth carries exactly ONE `g`
6275        // block (the unpenalized treatment-coded factor main effect added by the
6276        // `by=` branch).
6277        let by_only = g_blocks("y ~ s(x, by=g, k=10)");
6278        assert_eq!(
6279            by_only, 1,
6280            "`y ~ s(x, by=g)` must produce exactly one `g` design block"
6281        );
6282
6283        // The bug: adding a bare `+ g` (auto-promoted to a penalized random
6284        // block owning the same level offsets) must NOT introduce a second `g`
6285        // block. Before the fix this was 2.
6286        let by_plus_bare = g_blocks("y ~ s(x, by=g, k=10) + g");
6287        assert_eq!(
6288            by_plus_bare, 1,
6289            "`y ~ s(x, by=g) + g` must collapse to ONE `g` block (#1457): the bare \
6290             `+ g` already owns the factor's level offsets, so the `by=` branch \
6291             must not add a second, treatment-coded main effect"
6292        );
6293
6294        // The bare `+ g` adds no spurious extra `g` block versus the baseline.
6295        assert_eq!(
6296            by_plus_bare, by_only,
6297            "the bare `+ g` collision must add zero extra `g` blocks (#1457)"
6298        );
6299    }
6300
6301    #[test]
6302    fn parse_tensor_periods_and_origins_aliases() {
6303        let mut opts = BTreeMap::new();
6304        opts.insert(
6305            "boundary".to_string(),
6306            "['periodic', 'periodic']".to_string(),
6307        );
6308        opts.insert("periods".to_string(), "[7, 24]".to_string());
6309        opts.insert("origins".to_string(), "[0, -12]".to_string());
6310        let axes = parse_periodic_axes(&opts, 2).expect("axes");
6311        let periods = parse_periods(&opts, &axes).expect("periods");
6312        let origins = parse_period_origins(&opts, &axes).expect("origins");
6313        assert_eq!(axes, vec![true, true]);
6314        assert_eq!(periods, vec![Some(7.0), Some(24.0)]);
6315        assert_eq!(origins, vec![Some(0.0), Some(-12.0)]);
6316    }
6317
6318    #[test]
6319    fn tensor_smooth_honors_per_margin_k_list() {
6320        let ds = continuous_dataset(
6321            &["y", "theta", "h"],
6322            (0..20)
6323                .map(|i| {
6324                    let theta = std::f64::consts::TAU * i as f64 / 20.0;
6325                    let h = -1.0 + 2.0 * (i % 5) as f64 / 4.0;
6326                    vec![theta.cos() + h, theta, h]
6327                })
6328                .collect(),
6329        );
6330        let parsed = parse_formula(
6331            "y ~ te(theta, h, periodic=[0], period=[2*pi, None], origin=[0, None], k=[9,5])",
6332        )
6333        .expect("parse tensor formula");
6334        let col_map = ds.column_map();
6335        let mut notes = Vec::new();
6336        let terms = build_termspec(
6337            &parsed.terms,
6338            &ds,
6339            &col_map,
6340            &mut notes,
6341            &gam_runtime::resource::ResourcePolicy::default_library(),
6342        )
6343        .expect("build tensor terms");
6344        let SmoothBasisSpec::TensorBSpline { spec, .. } = &terms.smooth_terms[0].basis else {
6345            panic!("expected tensor B-spline");
6346        };
6347        let dims = spec
6348            .marginalspecs
6349            .iter()
6350            .map(|m| match m.knotspec {
6351                BSplineKnotSpec::PeriodicUniform { num_basis, .. } => num_basis,
6352                BSplineKnotSpec::Generate {
6353                    num_internal_knots, ..
6354                } => num_internal_knots + m.degree + 1,
6355                // The mgcv-default `cr` margin (#1074) reports its basis size as
6356                // the number of value-knots placed.
6357                BSplineKnotSpec::NaturalCubicRegression { ref knots } => knots.len(),
6358                _ => panic!("unexpected tensor marginal knotspec"),
6359            })
6360            .collect::<Vec<_>>();
6361        assert_eq!(dims, vec![9, 5]);
6362    }
6363
6364    #[test]
6365    fn tensor_smooth_honors_per_margin_k_axis_aliases() {
6366        let ds = continuous_dataset(
6367            &["resp", "x", "y"],
6368            (0..12)
6369                .map(|i| {
6370                    let t = i as f64 / 11.0;
6371                    vec![t, t, 1.0 - t]
6372                })
6373                .collect(),
6374        );
6375        assert_eq!(
6376            tensor_margin_basis_sizes(&ds, "resp ~ te(x, y, k_x=9, k_y=5)"),
6377            vec![9, 5],
6378            "k_<margin> aliases should materialize requested per-margin values"
6379        );
6380    }
6381
6382    #[test]
6383    fn tensor_smooth_low_cardinality_axis_falls_back_to_lower_degree_basis() {
6384        // mgcv-style: `te(x, b, k=c(5, 2))` with a BINARY second margin (only
6385        // values {0, 1}) is a legitimate request — the binary axis can hold at
6386        // most a 2-function linear basis. We must NOT reject k=2 with a
6387        // "k too small for degree 3" config error; instead, drop the spline
6388        // degree on the binary axis to k_axis - 1 (here 1, linear) while
6389        // keeping the continuous margin at the requested degree=3, k=5.
6390        let ds = continuous_dataset(
6391            &["y", "x", "b"],
6392            (0..40)
6393                .map(|i| {
6394                    let x = i as f64 / 39.0;
6395                    let b = (i % 2) as f64;
6396                    vec![x.sin() + 0.5 * b, x, b]
6397                })
6398                .collect(),
6399        );
6400        let parsed = parse_formula("y ~ te(x, b, k=[5, 2])").expect("parse tensor with k=[5,2]");
6401        let col_map = ds.column_map();
6402        let mut notes = Vec::new();
6403        let terms = build_termspec(
6404            &parsed.terms,
6405            &ds,
6406            &col_map,
6407            &mut notes,
6408            &gam_runtime::resource::ResourcePolicy::default_library(),
6409        )
6410        .expect("build tensor with binary margin");
6411        let SmoothBasisSpec::TensorBSpline { spec, .. } = &terms.smooth_terms[0].basis else {
6412            panic!("expected tensor B-spline for te(x, b)");
6413        };
6414        // Continuous margin keeps requested degree=3 and k=5; binary margin
6415        // drops to degree=1 (linear) so the requested k=2 yields exactly two
6416        // basis functions before tensor-product identifiability is applied.
6417        let continuous = &spec.marginalspecs[0];
6418        let binary = &spec.marginalspecs[1];
6419        assert_eq!(continuous.degree, 3);
6420        assert_eq!(binary.degree, 1);
6421        assert!(
6422            binary.penalty_order >= 1 && binary.penalty_order <= binary.degree,
6423            "binary margin penalty_order {} must satisfy 1 <= order <= degree={}",
6424            binary.penalty_order,
6425            binary.degree
6426        );
6427        let basis_size = |m: &BSplineBasisSpec| match m.knotspec {
6428            BSplineKnotSpec::PeriodicUniform { num_basis, .. } => num_basis,
6429            BSplineKnotSpec::Generate {
6430                num_internal_knots, ..
6431            } => num_internal_knots + m.degree + 1,
6432            BSplineKnotSpec::Automatic {
6433                num_internal_knots: Some(n),
6434                ..
6435            } => n + m.degree + 1,
6436            // The mgcv-default `cr` margin (#1074) reports its basis size as the
6437            // number of value-knots placed.
6438            BSplineKnotSpec::NaturalCubicRegression { ref knots } => knots.len(),
6439            _ => panic!("unexpected tensor marginal knotspec"),
6440        };
6441        assert_eq!(basis_size(continuous), 5);
6442        assert_eq!(basis_size(binary), 2);
6443    }
6444
6445    #[test]
6446    fn tensor_smooth_uniform_k_is_capped_to_a_low_cardinality_margins_distinct_values() {
6447        // Regression: a SINGLE `k=5` applied to every axis of `te(x, b, k=5)`
6448        // with a BINARY second margin (`b ∈ {0, 1}`) must build a valid tensor,
6449        // NOT hard-fail in cr-knot selection ("cubic regression spline with k=5
6450        // requires at least 5 distinct values, got 2"). mgcv caps a margin's
6451        // basis to its data support; the binary axis becomes the 2-function
6452        // (linear) margin, while the continuous axis keeps the requested k=5.
6453        // This is the `te(age, badh, k=5)` real-data case that previously errored.
6454        let ds = continuous_dataset(
6455            &["y", "x", "b"],
6456            (0..40)
6457                .map(|i| {
6458                    let x = i as f64 / 39.0;
6459                    let b = (i % 2) as f64;
6460                    vec![x.sin() + 0.5 * b, x, b]
6461                })
6462                .collect(),
6463        );
6464        let parsed = parse_formula("y ~ te(x, b, k=5)").expect("parse tensor with uniform k=5");
6465        let col_map = ds.column_map();
6466        let mut notes = Vec::new();
6467        let terms = build_termspec(
6468            &parsed.terms,
6469            &ds,
6470            &col_map,
6471            &mut notes,
6472            &gam_runtime::resource::ResourcePolicy::default_library(),
6473        )
6474        .expect("uniform k=5 must auto-cap the binary margin instead of erroring");
6475        let SmoothBasisSpec::TensorBSpline { spec, .. } = &terms.smooth_terms[0].basis else {
6476            panic!("expected tensor B-spline for te(x, b)");
6477        };
6478        let basis_size = |m: &BSplineBasisSpec| match &m.knotspec {
6479            BSplineKnotSpec::PeriodicUniform { num_basis, .. } => *num_basis,
6480            BSplineKnotSpec::Generate {
6481                num_internal_knots, ..
6482            } => num_internal_knots + m.degree + 1,
6483            BSplineKnotSpec::Automatic {
6484                num_internal_knots: Some(n),
6485                ..
6486            } => n + m.degree + 1,
6487            BSplineKnotSpec::NaturalCubicRegression { knots } => knots.len(),
6488            other => panic!("unexpected tensor marginal knotspec: {other:?}"),
6489        };
6490        let binary = &spec.marginalspecs[1];
6491        // Binary margin is reduced to the 2-function linear basis its data
6492        // supports (k capped from 5 to 2, degree dropped to 1).
6493        assert_eq!(basis_size(binary), 2);
6494        assert_eq!(binary.degree, 1);
6495        // The continuous margin is unaffected by the cap (40 distinct values).
6496        assert_eq!(basis_size(&spec.marginalspecs[0]), 5);
6497    }
6498
6499    #[test]
6500    fn tensor_all_tp_margins_with_per_margin_k_routes_to_bspline_tensor() {
6501        // `te(x1, x2, bs=c('tp','tp'), k=c(5,5))` is mgcv's per-margin tp tensor
6502        // with per-margin basis sizes — a tensor product of two 1-D bases, each
6503        // of dimension 5. The list-valued `k=c(5,5)` is honored by
6504        // `parse_tensor_k_list`, producing one penalized B-spline margin per axis
6505        // (each spanning the requested per-axis thin-plate function space). This
6506        // is the same anisotropic-tensor routing the scalar/no-`k` case takes —
6507        // a `te()` request is ALWAYS a tensor product, never a silent isotropic
6508        // thin-plate substitution.
6509        let ds = continuous_dataset(
6510            &["y", "x1", "x2"],
6511            (0..32)
6512                .map(|i| {
6513                    let t = i as f64 / 31.0;
6514                    vec![t.sin(), t, 1.0 - t]
6515                })
6516                .collect(),
6517        );
6518        let parsed =
6519            parse_formula("y ~ te(x1, x2, bs=c('tp','tp'), k=c(5,5))").expect("parse tensor");
6520        let col_map = ds.column_map();
6521        let mut notes = Vec::new();
6522        let terms = build_termspec(
6523            &parsed.terms,
6524            &ds,
6525            &col_map,
6526            &mut notes,
6527            &gam_runtime::resource::ResourcePolicy::default_library(),
6528        )
6529        .expect("build tensor terms with per-margin k");
6530        let SmoothBasisSpec::TensorBSpline { spec, .. } = &terms.smooth_terms[0].basis else {
6531            panic!(
6532                "expected B-spline tensor when k=c(5,5) is supplied with bs=c('tp','tp'), got {:?}",
6533                terms.smooth_terms[0].basis
6534            );
6535        };
6536        // Since #1074 a `tp` tensor margin (k >= 3) is realized as a
6537        // Lancaster–Salkauskas natural cubic-regression margin (cr basis
6538        // dimension == knot count), not an open `Generate` B-spline. It is
6539        // still a `TensorBSpline` spec with one penalized 1-D margin per axis,
6540        // so the routing assertion above still holds; only the per-margin
6541        // knotspec variant changed. The earlier `_ => panic!` arm pinned the
6542        // pre-#1074 `Generate`-only representation and is stale. Decode every
6543        // margin variant to its basis dimension (mirroring the
6544        // `tensor_margin_basis_sizes` helper).
6545        let dims = spec
6546            .marginalspecs
6547            .iter()
6548            .map(|m| match m.knotspec {
6549                BSplineKnotSpec::Generate {
6550                    num_internal_knots, ..
6551                } => num_internal_knots + m.degree + 1,
6552                BSplineKnotSpec::Automatic {
6553                    num_internal_knots: Some(num_internal_knots),
6554                    ..
6555                } => num_internal_knots + m.degree + 1,
6556                BSplineKnotSpec::PeriodicUniform { num_basis, .. } => num_basis,
6557                BSplineKnotSpec::Provided(ref knots) => knots.len().saturating_sub(m.degree + 1),
6558                BSplineKnotSpec::NaturalCubicRegression { ref knots } => knots.len(),
6559                BSplineKnotSpec::Automatic {
6560                    num_internal_knots: None,
6561                    ..
6562                } => panic!("test cannot infer automatic knot count"),
6563            })
6564            .collect::<Vec<_>>();
6565        assert_eq!(dims, vec![5, 5]);
6566    }
6567
6568    #[test]
6569    fn tensor_all_tp_margins_without_per_margin_k_builds_anisotropic_tensor() {
6570        // `te(x1, x2, bs=c('tp','tp'))` is a tensor-product request and must
6571        // build a genuine anisotropic tensor product (one smoothing parameter
6572        // per margin), NOT a silently-substituted multi-D isotropic thin-plate
6573        // radial smooth — that would be a different model (`s(x1,x2,bs='tp')`).
6574        // The routing is now consistent whether or not `k` is list-valued: a tp
6575        // margin vector always realizes each axis as a 1-D penalized B-spline
6576        // margin spanning the same per-axis thin-plate function space (#1082).
6577        let ds = continuous_dataset(
6578            &["y", "x1", "x2"],
6579            (0..32)
6580                .map(|i| {
6581                    let t = i as f64 / 31.0;
6582                    vec![t.sin(), t, 1.0 - t]
6583                })
6584                .collect(),
6585        );
6586        let parsed = parse_formula("y ~ te(x1, x2, bs=c('tp','tp'))").expect("parse tensor");
6587        let col_map = ds.column_map();
6588        let mut notes = Vec::new();
6589        let terms = build_termspec(
6590            &parsed.terms,
6591            &ds,
6592            &col_map,
6593            &mut notes,
6594            &gam_runtime::resource::ResourcePolicy::default_library(),
6595        )
6596        .expect("build tensor terms without per-margin k");
6597        let SmoothBasisSpec::TensorBSpline { spec, .. } = &terms.smooth_terms[0].basis else {
6598            panic!(
6599                "te(...,bs=c('tp','tp')) must route to an anisotropic tensor product, not a \
6600                 silent isotropic thin-plate substitution; got {:?}",
6601                terms.smooth_terms[0].basis
6602            );
6603        };
6604        assert_eq!(
6605            spec.marginalspecs.len(),
6606            2,
6607            "tp tensor must carry one penalized B-spline margin per axis"
6608        );
6609    }
6610
6611    #[test]
6612    fn explicit_basis_sizes_are_not_small_n_clamped() {
6613        let ds = continuous_dataset(
6614            &["y", "x1", "x2", "x3", "x4", "x5"],
6615            (0..12)
6616                .map(|i| {
6617                    let x = i as f64 / 11.0;
6618                    vec![x.sin(), x, x * x, x + 0.1, 1.0 - x, (2.0 * x).sin()]
6619                })
6620                .collect(),
6621        );
6622        let parsed = parse_formula("y ~ s(x1, k=10) + s(x2) + s(x3) + s(x4) + s(x5)")
6623            .expect("parse multi-smooth formula");
6624        let col_map = ds.column_map();
6625        let mut notes = Vec::new();
6626        let terms = build_termspec(
6627            &parsed.terms,
6628            &ds,
6629            &col_map,
6630            &mut notes,
6631            &gam_runtime::resource::ResourcePolicy::default_library(),
6632        )
6633        .expect("build multi-smooth terms");
6634        let SmoothBasisSpec::BSpline1D { spec, .. } = &terms.smooth_terms[0].basis else {
6635            panic!("expected first smooth to be B-spline");
6636        };
6637        assert!(matches!(
6638            &spec.knotspec,
6639            BSplineKnotSpec::Generate {
6640                num_internal_knots: 6,
6641                ..
6642            }
6643        ));
6644    }
6645
6646    #[test]
6647    fn explicit_duchon_centers_are_not_small_n_bumped() {
6648        let ds = continuous_dataset(
6649            &["y", "x1", "x2", "x3", "x4", "x5"],
6650            (0..12)
6651                .map(|i| {
6652                    let x = i as f64 / 11.0;
6653                    vec![x.sin(), x, x * x, x + 0.1, 1.0 - x, (2.0 * x).sin()]
6654                })
6655                .collect(),
6656        );
6657        // Pure 1D Duchon at default options resolves the nullspace to Linear
6658        // (2s < d forces escalation), giving 2 polynomial nullspace columns;
6659        // the well-posedness gate requires num_centers > polynomial_cols, so
6660        // 3 is the smallest valid count. It is still well below the small-N
6661        // bump target of polynomial_cols + 4 = 6, so this exercises the
6662        // "explicit value is honored" path the test name advertises.
6663        let parsed = parse_formula("y ~ duchon(x1, centers=3) + s(x2) + s(x3) + s(x4) + s(x5)")
6664            .expect("parse multi-smooth formula");
6665        let col_map = ds.column_map();
6666        let mut notes = Vec::new();
6667        let terms = build_termspec(
6668            &parsed.terms,
6669            &ds,
6670            &col_map,
6671            &mut notes,
6672            &gam_runtime::resource::ResourcePolicy::default_library(),
6673        )
6674        .expect("build multi-smooth terms");
6675        let SmoothBasisSpec::Duchon { spec, .. } = &terms.smooth_terms[0].basis else {
6676            panic!("expected first smooth to be Duchon");
6677        };
6678        assert!(matches!(
6679            spec.center_strategy,
6680            CenterStrategy::FarthestPoint { num_centers: 3 }
6681        ));
6682    }
6683
6684    #[test]
6685    fn inferred_tensor_basis_cap_uses_coordinate_support_not_duplicate_rows() {
6686        let mut unique_rows = Vec::new();
6687        for i in 0..50 {
6688            let theta = i as f64 / 50.0;
6689            for j in 0..16 {
6690                let h = -1.0 + 2.0 * (j as f64) / 15.0;
6691                let y = theta.cos() + h;
6692                unique_rows.push(vec![y, theta, h]);
6693            }
6694        }
6695        let mut repeated_rows = Vec::new();
6696        for _ in 0..12 {
6697            repeated_rows.extend(unique_rows.iter().cloned());
6698        }
6699
6700        let unique = continuous_dataset(&["y", "theta", "h"], unique_rows);
6701        let repeated = continuous_dataset(&["y", "theta", "h"], repeated_rows);
6702
6703        let unique_basis = inferred_tensor_basis_product(&unique);
6704        let repeated_basis = inferred_tensor_basis_product(&repeated);
6705
6706        assert_eq!(
6707            unique_basis, repeated_basis,
6708            "duplicating existing tensor coordinates must not inflate inferred basis width"
6709        );
6710    }
6711
6712    #[test]
6713    fn inferred_three_dim_tensor_basis_stays_bounded_for_reml_selection() {
6714        // Regression for gam#813: the inferred per-margin k must be
6715        // dimension-aware so the 3-D tensor width p = ∏ k_d does not explode.
6716        // With the old 1-D-per-margin rule a 3-D `te` defaulted to 7³=343 at
6717        // small n and 20³=8000 at larger n, making the (non-Kronecker-factorable)
6718        // full-tensor sum-to-zero penalty's O(p³) REML reparameterization a
6719        // multi-minute stall. The dimension-aware budget keeps the product near
6720        // mgcv's te default (≈5³=125) regardless of n.
6721        let make = |n: usize| -> usize {
6722            let mut rows = Vec::with_capacity(n);
6723            for i in 0..n {
6724                let f = i as f64 / n as f64;
6725                rows.push(vec![f.sin(), f, (2.0 * f).cos(), (3.0 * f) % 1.0]);
6726            }
6727            let ds = continuous_dataset(&["y", "x1", "x2", "x3"], rows);
6728            let parsed = parse_formula("y ~ te(x1, x2, x3)").expect("parse 3-D tensor");
6729            let col_map = ds.column_map();
6730            let mut notes = Vec::new();
6731            let terms = build_termspec(
6732                &parsed.terms,
6733                &ds,
6734                &col_map,
6735                &mut notes,
6736                &ResourcePolicy::default_library(),
6737            )
6738            .expect("build 3-D tensor termspec");
6739            let SmoothBasisSpec::TensorBSpline { spec, .. } = &terms.smooth_terms[0].basis else {
6740                panic!("expected tensor smooth");
6741            };
6742            spec.marginalspecs
6743                .iter()
6744                .map(|m| match m.knotspec {
6745                    BSplineKnotSpec::Generate {
6746                        num_internal_knots, ..
6747                    } => num_internal_knots + m.degree + 1,
6748                    BSplineKnotSpec::Automatic {
6749                        num_internal_knots: Some(num_internal_knots),
6750                        ..
6751                    } => num_internal_knots + m.degree + 1,
6752                    // The mgcv-default `cr` margin (#1074) reports its basis size
6753                    // as the number of value-knots placed.
6754                    BSplineKnotSpec::NaturalCubicRegression { ref knots } => knots.len(),
6755                    _ => panic!("unexpected tensor margin knotspec"),
6756                })
6757                .product()
6758        };
6759
6760        // n=30 (the issue's data): was 7³=343, must now be modest.
6761        assert!(
6762            make(60) <= 216,
6763            "3-D te at small n must stay near the mgcv te default, got {}",
6764            make(60)
6765        );
6766        // Larger n must NOT grow the product toward n³ (was 20³=8000).
6767        assert!(
6768            make(2000) <= 216,
6769            "3-D te at large n must not blow ∏k toward the data size, got {}",
6770            make(2000)
6771        );
6772    }
6773
6774    #[test]
6775    fn parse_bspline_boundary_conditions_and_side_selector() {
6776        // Non-zero anchors are rejected at parse time; the diagnostic must
6777        // name the side and value, which doubles as a check that the
6778        // `side=left` filter routes the global `anchor=` value to the
6779        // left endpoint (not the right).
6780        let mut opts = BTreeMap::new();
6781        opts.insert("boundary_conditions".to_string(), "anchored".to_string());
6782        opts.insert("side".to_string(), "left".to_string());
6783        opts.insert("anchor".to_string(), "2.5".to_string());
6784        let err = parse_bspline_boundary_conditions(&opts)
6785            .expect_err("non-zero left anchor must be rejected")
6786            .to_string();
6787        assert!(
6788            err.contains("left") && err.contains("2.5"),
6789            "rejection should name the affected side and value: {err}"
6790        );
6791
6792        // Side-specific aliases (`start_bc`/`end_bc`) plus the side-specific
6793        // anchor key (`right_anchor`) must funnel the value onto the right
6794        // endpoint — verified through the rejection diagnostic.
6795        let mut opts = BTreeMap::new();
6796        opts.insert("start_bc".to_string(), "clamped".to_string());
6797        opts.insert("end_bc".to_string(), "zero".to_string());
6798        opts.insert("right_anchor".to_string(), "-1.0".to_string());
6799        let err = parse_bspline_boundary_conditions(&opts)
6800            .expect_err("non-zero right anchor must be rejected")
6801            .to_string();
6802        assert!(
6803            err.contains("right") && err.contains("-1"),
6804            "rejection should name the affected side and value: {err}"
6805        );
6806
6807        // With anchors at zero the basis builder accepts the configuration,
6808        // so the same alias plumbing yields a clean `Anchored { value: 0.0 }`
6809        // on the right and `Clamped` on the left.
6810        let mut opts = BTreeMap::new();
6811        opts.insert("start_bc".to_string(), "clamped".to_string());
6812        opts.insert("end_bc".to_string(), "zero".to_string());
6813        let parsed = parse_bspline_boundary_conditions(&opts).expect("boundary conditions");
6814        assert!(matches!(
6815            parsed.left,
6816            BSplineEndpointBoundaryCondition::Clamped
6817        ));
6818        assert!(matches!(
6819            parsed.right,
6820            BSplineEndpointBoundaryCondition::Anchored { value } if value.abs() < 1e-12
6821        ));
6822    }
6823
6824    #[test]
6825    fn categorical_by_numeric_interaction_expands_treatment_coded_cells() {
6826        // `y ~ x:g` is an INTERACTION-ONLY numeric-by-factor model: there is no
6827        // `x` main effect, so the marginal parent that would identify a dropped
6828        // reference level is ABSENT. The expansion must therefore be marginality-
6829        // aware (gam#1158) and DUMMY-code `g` — keep ALL levels — yielding the
6830        // "common intercept, separate slopes" design (one x-slope column per
6831        // group). Treatment-coding here (dropping the reference level) would pin
6832        // the reference group's slope to zero, a rank-deficient fit; that wrong
6833        // behaviour is what this test now guards against. (The treatment-coded
6834        // path is exercised when the `x` parent is present — see
6835        // `categorical_by_numeric_interaction_keeps_treatment_coding_with_parent`.)
6836        let ds = factor_dataset();
6837        // `g` is categorical with two levels (encoded 0.0 → "a", 1.0 → "b").
6838        let parsed = parse_formula("y ~ x:g").expect("parse `y ~ x:g`");
6839        let col_map = ds.column_map();
6840        let mut notes = Vec::new();
6841        let terms = build_termspec(
6842            &parsed.terms,
6843            &ds,
6844            &col_map,
6845            &mut notes,
6846            &ResourcePolicy::default_library(),
6847        )
6848        .expect("factor-aware `x:g` interaction must build, not error");
6849
6850        assert_eq!(
6851            terms.linear_terms.len(),
6852            2,
6853            "interaction-only `x:g` keeps ALL factor levels (full dummy coding): one slope column per group"
6854        );
6855
6856        let x_col = *col_map.get("x").expect("x column");
6857        let g_col = *col_map.get("g").expect("g column");
6858
6859        // Both level gates must appear exactly once across the two cell columns,
6860        // and each cell carries `x` as a product factor (not a raw column for g).
6861        let mut seen_bits = std::collections::HashSet::new();
6862        for term in &terms.linear_terms {
6863            assert!(
6864                term.is_interaction(),
6865                "the categorical-by-numeric cell is a Wilkinson-Rogers interaction"
6866            );
6867            assert_eq!(term.feature_cols, vec![x_col]);
6868            assert_eq!(term.categorical_levels.len(), 1);
6869            let (gate_col, gate_bits) = term.categorical_levels[0];
6870            assert_eq!(gate_col, g_col);
6871            assert!(seen_bits.insert(gate_bits), "each level appears once");
6872
6873            // Realize and check it equals `1[g == gate_bits] * x` row by row.
6874            let column = term
6875                .realized_design_column(ds.values.view())
6876                .expect("realize cell column");
6877            let n = ds.values.nrows();
6878            assert_eq!(column.len(), n);
6879            for row in 0..n {
6880                let x = ds.values[[row, x_col]];
6881                let g = ds.values[[row, g_col]];
6882                let expected = if g.to_bits() == gate_bits { x } else { 0.0 };
6883                assert!(
6884                    (column[row] - expected).abs() < 1e-12,
6885                    "row {row}: g={g}, x={x}, expected {expected}, got {}",
6886                    column[row]
6887                );
6888            }
6889        }
6890        // Both the reference level "a" (0.0) and the non-reference "b" (1.0) are
6891        // kept — the reference level is NOT dropped in the interaction-only form.
6892        assert!(seen_bits.contains(&0.0_f64.to_bits()));
6893        assert!(seen_bits.contains(&1.0_f64.to_bits()));
6894    }
6895
6896    #[test]
6897    fn categorical_by_numeric_interaction_keeps_treatment_coding_with_parent() {
6898        // With the `x` main effect PRESENT (`y ~ x + x:g`), the marginal parent
6899        // that identifies a dropped reference level exists, so `x:g` keeps its
6900        // historical treatment coding: the reference level "a" is dropped and
6901        // only the non-reference slope-deviation column for "b" is emitted. This
6902        // guards that the marginality-aware fix (gam#1158) does NOT regress the
6903        // parent-present form, which must stay column-space-identical to mgcv's
6904        // `x + x:g`.
6905        let ds = factor_dataset();
6906        let parsed = parse_formula("y ~ x + x:g").expect("parse `y ~ x + x:g`");
6907        let col_map = ds.column_map();
6908        let mut notes = Vec::new();
6909        let terms = build_termspec(
6910            &parsed.terms,
6911            &ds,
6912            &col_map,
6913            &mut notes,
6914            &ResourcePolicy::default_library(),
6915        )
6916        .expect("`x + x:g` must build");
6917
6918        // One main-effect `x` column plus one treatment-coded interaction cell.
6919        let x_col = *col_map.get("x").expect("x column");
6920        let g_col = *col_map.get("g").expect("g column");
6921        let interaction_cells: Vec<_> = terms
6922            .linear_terms
6923            .iter()
6924            .filter(|t| t.is_interaction())
6925            .collect();
6926        assert_eq!(
6927            interaction_cells.len(),
6928            1,
6929            "with `x` present, `x:g` is treatment-coded → one cell (reference dropped)"
6930        );
6931        let term = interaction_cells[0];
6932        assert_eq!(term.feature_cols, vec![x_col]);
6933        assert_eq!(term.categorical_levels.len(), 1);
6934        let (gate_col, gate_bits) = term.categorical_levels[0];
6935        assert_eq!(gate_col, g_col);
6936        // The dropped reference is "a" (0.0); the kept gate is "b" (1.0).
6937        assert_eq!(gate_bits, 1.0_f64.to_bits());
6938    }
6939
6940    #[test]
6941    fn categorical_by_categorical_interaction_expands_full_cross_cells() {
6942        // `y ~ f:g` is an INTERACTION-ONLY factor-by-factor model: neither `f`
6943        // nor `g` appears as a main effect, so neither marginal parent is
6944        // present and BOTH factors must be dummy-coded (gam#1159). The correct
6945        // design is the SATURATED cell-means model: the full cross of ALL levels
6946        // (3 * 2 = 6 cells) minus ONE reference cell (the lexicographically-first
6947        // level of every factor, here f0:g0) absorbed by the intercept — rank
6948        // 6-1 = 5 cell columns + intercept, column-space-identical to `f*g`.
6949        // Treatment-coding both factors (the old behaviour) kept only
6950        // (3-1)*(2-1) = 2 cells and collapsed the rest onto the intercept, a
6951        // rank-deficient fit; that is the bug this test now guards against.
6952        let n = 30usize;
6953        let mut rows = Vec::with_capacity(n);
6954        for i in 0..n {
6955            let y = (i as f64).sin();
6956            let f = (i % 3) as f64; // 3 levels: 0,1,2
6957            let g = (i % 2) as f64; // 2 levels: 0,1
6958            rows.push(vec![y, f, g]);
6959        }
6960        let values = Array2::from_shape_vec(
6961            (n, 3),
6962            rows.into_iter().flat_map(|row| row.into_iter()).collect(),
6963        )
6964        .expect("rectangular cross-factor data");
6965        let ds = Dataset {
6966            headers: vec!["y".into(), "f".into(), "g".into()],
6967            values,
6968            schema: DataSchema {
6969                columns: vec![
6970                    SchemaColumn {
6971                        name: "y".into(),
6972                        kind: ColumnKindTag::Continuous,
6973                        levels: vec![],
6974                    },
6975                    SchemaColumn {
6976                        name: "f".into(),
6977                        kind: ColumnKindTag::Categorical,
6978                        levels: vec!["f0".into(), "f1".into(), "f2".into()],
6979                    },
6980                    SchemaColumn {
6981                        name: "g".into(),
6982                        kind: ColumnKindTag::Categorical,
6983                        levels: vec!["g0".into(), "g1".into()],
6984                    },
6985                ],
6986            },
6987            column_kinds: vec![
6988                ColumnKindTag::Continuous,
6989                ColumnKindTag::Categorical,
6990                ColumnKindTag::Categorical,
6991            ],
6992        };
6993
6994        let parsed = parse_formula("y ~ f:g").expect("parse `y ~ f:g`");
6995        let col_map = ds.column_map();
6996        let mut notes = Vec::new();
6997        let terms = build_termspec(
6998            &parsed.terms,
6999            &ds,
7000            &col_map,
7001            &mut notes,
7002            &ResourcePolicy::default_library(),
7003        )
7004        .expect("factor-by-factor `f:g` interaction must build, not error");
7005
7006        assert_eq!(
7007            terms.linear_terms.len(),
7008            5,
7009            "saturated 3*2 = 6 cross cells minus one reference cell (f0:g0) = 5"
7010        );
7011
7012        let f_col = *col_map.get("f").expect("f column");
7013        let g_col = *col_map.get("g").expect("g column");
7014        // The dropped reference cell pairs each factor's lexicographically-first
7015        // level: f0 (0.0) and g0 (0.0). It must NOT appear among the emitted
7016        // cells; every OTHER cross cell must.
7017        let f0 = 0.0_f64.to_bits();
7018        let g0 = 0.0_f64.to_bits();
7019        let mut emitted = std::collections::HashSet::new();
7020        for term in &terms.linear_terms {
7021            // No numeric operand: the realized column is a pure cell indicator.
7022            assert!(term.feature_cols.is_empty());
7023            assert_eq!(term.categorical_levels.len(), 2);
7024            let mut gates = std::collections::HashMap::new();
7025            for &(col, bits) in &term.categorical_levels {
7026                gates.insert(col, bits);
7027            }
7028            let f_bits = *gates.get(&f_col).expect("f gate present");
7029            let g_bits = *gates.get(&g_col).expect("g gate present");
7030            // The reference cell f0:g0 must have been dropped.
7031            assert!(
7032                !(f_bits == f0 && g_bits == g0),
7033                "the reference cell f0:g0 must be absorbed by the intercept, not emitted"
7034            );
7035            emitted.insert((f_bits, g_bits));
7036
7037            let column = term
7038                .realized_design_column(ds.values.view())
7039                .expect("realize cross cell");
7040            for row in 0..n {
7041                let f = ds.values[[row, f_col]];
7042                let g = ds.values[[row, g_col]];
7043                let expected = if f.to_bits() == f_bits && g.to_bits() == g_bits {
7044                    1.0
7045                } else {
7046                    0.0
7047                };
7048                assert!(
7049                    (column[row] - expected).abs() < 1e-12,
7050                    "row {row}: expected {expected}, got {}",
7051                    column[row]
7052                );
7053            }
7054            assert!(
7055                column.iter().any(|&v| v == 1.0),
7056                "each cross cell must be observed in the data"
7057            );
7058        }
7059        // Every non-reference cross cell is present exactly once: all 6 cells
7060        // except f0:g0.
7061        let f_levels = [0.0_f64.to_bits(), 1.0_f64.to_bits(), 2.0_f64.to_bits()];
7062        let g_levels = [0.0_f64.to_bits(), 1.0_f64.to_bits()];
7063        for &fb in &f_levels {
7064            for &gb in &g_levels {
7065                if fb == f0 && gb == g0 {
7066                    continue;
7067                }
7068                assert!(
7069                    emitted.contains(&(fb, gb)),
7070                    "saturated cross cell must be present"
7071                );
7072            }
7073        }
7074    }
7075}