Skip to main content

gam_terms/inference/
formula_dsl.rs

1use std::collections::BTreeMap;
2
3use pest::Parser;
4use pest::iterators::Pair;
5use pest_derive::Parser;
6
7use crate::smooth::BoundedCoefficientPriorSpec;
8use gam_problem::types::{
9    InverseLink, LikelihoodSpec, LinkComponent, LinkFunction, StandardLink, WigglePenaltyConfig,
10};
11
12#[derive(Parser)]
13#[grammar_inline = r#"
14WHITESPACE = _{ " " | "\t" | NEWLINE }
15
16top_function_call = { SOI ~ function_call ~ EOI }
17top_expr = { SOI ~ expr ~ EOI }
18formula = { SOI ~ expr ~ "~" ~ rhs ~ EOI }
19rhs = { term ~ ("+" ~ term)* }
20term = { expr }
21
22expr = { sum }
23sum = { product ~ (add_op ~ product)* }
24add_op = { "+" | "-" }
25product = { interact ~ (mul_op ~ interact)* }
26mul_op = { "*" | "/" }
27interact = { power ~ (interact_op ~ power)* }
28interact_op = { ":" }
29power = { unary ~ (pow_op ~ unary)* }
30pow_op = { "^" }
31unary = { unary_op* ~ primary }
32unary_op = _{ "+" | "-" }
33
34primary = { function_call | list_lit | tuple_lit | ident | number | string_lit | "(" ~ expr ~ ")" }
35list_lit = @{ "[" ~ (!"]" ~ ANY)* ~ "]" }
36tuple_lit = @{ "(" ~ (!("," | ")") ~ ANY)+ ~ "," ~ (!")" ~ ANY)* ~ ")" }
37function_call = { ident ~ "(" ~ arg_list? ~ ")" }
38arg_list = { arg ~ ("," ~ arg)* }
39arg = { named_arg | expr }
40named_arg = { ident ~ "=" ~ expr }
41
42ident = @{ ident_start ~ ident_continue* }
43ident_start = _{ ASCII_ALPHA | "_" }
44ident_continue = _{ ASCII_ALPHANUMERIC | "_" | "." }
45
46number = @{
47    "-"?
48    ~ (ASCII_DIGIT+ ~ ("." ~ ASCII_DIGIT*)? | "." ~ ASCII_DIGIT+)
49    ~ (("e" | "E") ~ ("+" | "-")? ~ ASCII_DIGIT+)?
50}
51
52string_lit = @{ "\"" ~ (!"\"" ~ ANY)* ~ "\"" | "'" ~ (!"'" ~ ANY)* ~ "'" }
53"#]
54struct FormulaParser;
55
56#[derive(Clone, Debug, PartialEq, Eq)]
57pub struct FormulaDslParse {
58    pub response_expr: String,
59    pub rhs_terms: Vec<String>,
60}
61
62#[derive(Clone, Debug, PartialEq, Eq)]
63pub enum CallArgSpec {
64    Positional(String),
65    Named { key: String, value: String },
66}
67
68#[derive(Clone, Debug, PartialEq, Eq)]
69pub struct FunctionCallSpec {
70    pub name: String,
71    pub args: Vec<CallArgSpec>,
72}
73
74/// Typed error surface for the formula DSL parser.
75///
76/// Every variant carries a free-form `reason: String` payload; `Display`
77/// emits exactly that payload, so converting a `FormulaDslError` into
78/// `String` (via the `From` impl below) is byte-equivalent to the pre-
79/// refactor `Err(format!(...))` / `Err("...".to_string())` strings that
80/// the same call sites produced. Public entry points keep their existing
81/// `Result<_, String>` signatures — CLI input handling stays unchanged —
82/// and typed errors flow across the boundary via `From<FormulaDslError>
83/// for String`.
84#[derive(Clone, Debug, PartialEq, Eq)]
85pub enum FormulaDslError {
86    /// Pest grammar failure, unbalanced delimiters, empty terms, or
87    /// missing required parse fragments — i.e. the formula text is
88    /// not a well-formed DSL string.
89    ParseError { reason: String },
90    /// A referenced symbol (link name, blended-link component, term
91    /// function name, top-level RHS identifier) is not part of the
92    /// supported vocabulary.
93    UnknownIdentifier { reason: String },
94    /// A named option's value is unparseable, out of range, or not a
95    /// finite number / valid integer.
96    InvalidArgument { reason: String },
97    /// A combination of terms or options is disallowed (duplicate
98    /// terms, multiple linkwiggle/link/survmodel, mutually exclusive
99    /// option groups in bounded(), wiggle-incompatible links, etc.).
100    IncompatibleTerm { reason: String },
101    /// A required configuration option is missing or empty (e.g.
102    /// `link()` without `type=`, `survmodel()` with no options,
103    /// `bounded()` without a required argument).
104    MalformedConfig { reason: String },
105}
106
107gam_linalg::impl_reason_error_boilerplate! {
108    FormulaDslError {
109        ParseError,
110        UnknownIdentifier,
111        InvalidArgument,
112        IncompatibleTerm,
113        MalformedConfig,
114    }
115}
116
117/// Inbound conversion from `String` is used by `?` cascades inside `parse_formula`
118/// and friends so that internal parser helpers still returning `Result<_, String>`
119/// can flow through without each call site needing an explicit `.map_err(...)`.
120/// We route into `ParseError` because by construction every internal helper that
121/// still produces a raw `String` is itself a parse/term-resolution stage.
122impl From<String> for FormulaDslError {
123    fn from(reason: String) -> Self {
124        FormulaDslError::ParseError { reason }
125    }
126}
127
128pub fn parse_formula_dsl(formula: &str) -> Result<FormulaDslParse, String> {
129    validate_balanced_delimiters(formula, "invalid formula syntax")?;
130    let mut parsed =
131        FormulaParser::parse(Rule::formula, formula).map_err(|e| FormulaDslError::ParseError {
132            reason: format!("invalid formula syntax: {e}"),
133        })?;
134    let formula_pair = parsed.next().ok_or_else(|| FormulaDslError::ParseError {
135        reason: "invalid formula syntax: empty parse".to_string(),
136    })?;
137
138    let mut response_expr: Option<String> = None;
139    let mut rhs_terms: Option<Vec<String>> = None;
140
141    for part in formula_pair.into_inner() {
142        match part.as_rule() {
143            Rule::expr if response_expr.is_none() => {
144                response_expr = Some(part.as_str().trim().to_string());
145            }
146            Rule::rhs => {
147                rhs_terms = Some(extract_rhs_terms(part)?);
148            }
149            _ => {}
150        }
151    }
152
153    let response_expr = response_expr.ok_or_else(|| FormulaDslError::ParseError {
154        reason: "invalid formula: missing response expression".to_string(),
155    })?;
156    let rhs_terms = rhs_terms.ok_or_else(|| FormulaDslError::ParseError {
157        reason: "invalid formula: missing RHS terms".to_string(),
158    })?;
159    if rhs_terms.is_empty() {
160        return Err(FormulaDslError::ParseError {
161            reason: "formula has no usable terms".to_string(),
162        }
163        .into());
164    }
165
166    Ok(FormulaDslParse {
167        response_expr,
168        rhs_terms,
169    })
170}
171
172fn delimiter_balance_error(prefix: &str) -> String {
173    format!("{prefix}: unbalanced parentheses or quotes")
174}
175
176// Pest reports malformed delimiters as a generic parse failure. We validate the
177// raw text first so callers get a stable, specific error class for unmatched
178// parentheses/quotes instead of whichever grammar branch happened to fail last.
179fn validate_balanced_delimiters(input: &str, prefix: &str) -> Result<(), String> {
180    let mut stack = Vec::<char>::new();
181    let mut in_single = false;
182    let mut in_double = false;
183
184    for ch in input.chars() {
185        match ch {
186            '\'' if !in_double => in_single = !in_single,
187            '"' if !in_single => in_double = !in_double,
188            '(' | '[' | '{' if !in_single && !in_double => stack.push(ch),
189            ')' | ']' | '}' if !in_single && !in_double => {
190                let expected = match ch {
191                    ')' => '(',
192                    ']' => '[',
193                    // The outer match arm guarantees ch is one of ')', ']', '}'.
194                    _ => '{',
195                };
196                if stack.pop() != Some(expected) {
197                    return Err(FormulaDslError::ParseError {
198                        reason: delimiter_balance_error(prefix),
199                    }
200                    .into());
201                }
202            }
203            _ => {}
204        }
205    }
206
207    if in_single || in_double || !stack.is_empty() {
208        return Err(FormulaDslError::ParseError {
209            reason: delimiter_balance_error(prefix),
210        }
211        .into());
212    }
213    Ok(())
214}
215
216fn extract_rhs_terms(rhs: Pair<'_, Rule>) -> Result<Vec<String>, String> {
217    let mut out = Vec::new();
218    let mut depth = 0_i32;
219    let mut in_single = false;
220    let mut in_double = false;
221    let mut start = 0_usize;
222    // Last non-whitespace character seen at depth 0 outside quotes. A top-level
223    // `+` only separates terms when it follows a completed operand; when it
224    // follows a binary operator (`:`, `*`, `/`, `^`) or another sign — or opens
225    // the RHS — it is a UNARY sign, not a separator. Splitting on it there would
226    // hand the grammar a truncated fragment like `x:` (from `x:+z`) and surface
227    // a confusing "invalid term syntax in `x:`" instead of the dedicated unary
228    // diagnostic the grammar raises when the whole `x:+z` term reaches it. This
229    // never suppresses a split in a valid formula: no valid Wilkinson-Rogers RHS
230    // places `+` immediately after an operator. `-` is already never treated as
231    // a separator here (it is rejected downstream as a binary term operator), so
232    // the unary `-` path already flows through intact — this restores the same
233    // intact flow for unary `+`.
234    let mut last_significant: Option<char> = None;
235    let text = rhs.as_str();
236    let bytes = text.as_bytes();
237    for (idx, &b) in bytes.iter().enumerate() {
238        let ch = b as char;
239        match ch {
240            '\'' if !in_double => in_single = !in_single,
241            '"' if !in_single => in_double = !in_double,
242            '(' | '[' | '{' if !in_single && !in_double => depth += 1,
243            ')' | ']' | '}' if !in_single && !in_double && depth > 0 => depth -= 1,
244            '+' if !in_single
245                && !in_double
246                && depth == 0
247                && !matches!(
248                    last_significant,
249                    None | Some(':' | '*' | '/' | '^' | '+' | '-')
250                ) =>
251            {
252                let term = text[start..idx].trim();
253                if term.is_empty() {
254                    return Err(FormulaDslError::ParseError {
255                        reason: "formula RHS contains an empty term".to_string(),
256                    }
257                    .into());
258                }
259                out.push(term.to_string());
260                start = idx + 1;
261            }
262            _ => {}
263        }
264        if !ch.is_ascii_whitespace() {
265            last_significant = Some(ch);
266        }
267    }
268    if in_single || in_double || depth != 0 {
269        return Err(FormulaDslError::ParseError {
270            reason: "formula RHS has unbalanced quotes or parentheses".to_string(),
271        }
272        .into());
273    }
274    let tail = text[start..].trim();
275    if tail.is_empty() {
276        return Err(FormulaDslError::ParseError {
277            reason: "formula RHS contains an empty term".to_string(),
278        }
279        .into());
280    }
281    out.push(tail.to_string());
282    Ok(out)
283}
284
285/// Wilkinson-Rogers operator-family expansion.
286///
287/// A raw RHS term (already split on top-level `+`) may use the documented
288/// formula operators `:`, `*`, `/`, `^`, and parenthesization. This function
289/// expands the term into a normalized list of `WrAtomList`s, where each
290/// `WrAtomList` is one resulting model term — either a single atom (linear
291/// main effect / function-call term) or multiple atoms (interaction).
292///
293/// Wilkinson-Rogers semantics implemented here:
294/// * `a` produces `{a}` — one main effect.
295/// * `a:b` produces `{a:b}` — one interaction.
296/// * `a*b` produces `{a, b, a:b}` — crossing (expanded).
297/// * `a/b` produces `{a, a:b}` — nesting.
298/// * `(a + b + ... )^n` produces every non-empty subset of `{a, b, ...}` of
299///   size at most `n`, each subset being one interaction.
300/// * `+` unions two term sets; `-` is rejected.
301///
302/// Atoms inside the AST may be bare identifiers or function calls. Function
303/// calls are opaque — they pass through as-is and cannot participate in an
304/// `:` interaction with other atoms (smooths/factors require dedicated
305/// constructors like `te()`).
306type WrAtomList = Vec<String>;
307
308fn expand_wr_term(raw: &str) -> Result<Vec<WrAtomList>, String> {
309    let mut parsed = FormulaParser::parse(Rule::top_expr, raw).map_err(|e| {
310        FormulaDslError::ParseError {
311            reason: format!("invalid term syntax in `{raw}`: {e}"),
312        }
313        .to_string()
314    })?;
315    let top = parsed.next().ok_or_else(|| {
316        FormulaDslError::ParseError {
317            reason: format!("invalid term syntax in `{raw}`: empty parse"),
318        }
319        .to_string()
320    })?;
321    let expr = top
322        .into_inner()
323        .find(|p| p.as_rule() == Rule::expr)
324        .ok_or_else(|| {
325            FormulaDslError::ParseError {
326                reason: format!("invalid term syntax in `{raw}`: missing expr"),
327            }
328            .to_string()
329        })?;
330    let interactions = expand_expr(expr, raw)?;
331    let normalized: Vec<WrAtomList> = interactions
332        .into_iter()
333        .map(normalize_interaction)
334        .collect();
335    // Deduplicate while preserving order: a*b yields {a, b, a:b}; a*b + a
336    // would otherwise list `a` twice.
337    let mut seen = std::collections::BTreeSet::<Vec<String>>::new();
338    let mut out = Vec::<WrAtomList>::new();
339    for term in normalized {
340        let key = term.clone();
341        if seen.insert(key) {
342            out.push(term);
343        }
344    }
345    Ok(out)
346}
347
348fn normalize_interaction(mut atoms: WrAtomList) -> WrAtomList {
349    atoms.sort();
350    atoms.dedup();
351    atoms
352}
353
354fn expand_expr(pair: Pair<'_, Rule>, raw: &str) -> Result<Vec<WrAtomList>, String> {
355    match pair.as_rule() {
356        Rule::expr => {
357            let inner = pair.into_inner().next().ok_or_else(|| {
358                FormulaDslError::ParseError {
359                    reason: format!("invalid term syntax in `{raw}`: empty expr"),
360                }
361                .to_string()
362            })?;
363            expand_expr(inner, raw)
364        }
365        Rule::sum => {
366            let mut iter = pair.into_inner();
367            let first = iter.next().ok_or_else(|| {
368                FormulaDslError::ParseError {
369                    reason: format!("invalid term syntax in `{raw}`: empty sum"),
370                }
371                .to_string()
372            })?;
373            let mut acc = expand_expr(first, raw)?;
374            while let Some(op) = iter.next() {
375                if op.as_rule() != Rule::add_op {
376                    return Err(FormulaDslError::ParseError {
377                        reason: format!(
378                            "invalid term syntax in `{raw}`: expected add operator, got `{:?}`",
379                            op.as_rule()
380                        ),
381                    }
382                    .into());
383                }
384                let op_str = op.as_str().trim();
385                let operand = iter.next().ok_or_else(|| {
386                    FormulaDslError::ParseError {
387                        reason: format!("invalid term syntax in `{raw}`: dangling `{op_str}`"),
388                    }
389                    .to_string()
390                })?;
391                if op_str == "-" {
392                    return Err(FormulaDslError::IncompatibleTerm {
393                        reason: format!(
394                            "binary `-` is not supported inside a formula term in `{raw}` \
395                             (use multiple `+` terms or drop the unwanted predictor explicitly)"
396                        ),
397                    }
398                    .into());
399                }
400                let mut rhs = expand_expr(operand, raw)?;
401                acc.append(&mut rhs);
402            }
403            Ok(acc)
404        }
405        Rule::product => {
406            let mut iter = pair.into_inner();
407            let first = iter.next().ok_or_else(|| {
408                FormulaDslError::ParseError {
409                    reason: format!("invalid term syntax in `{raw}`: empty product"),
410                }
411                .to_string()
412            })?;
413            let mut acc = expand_expr(first, raw)?;
414            while let Some(op) = iter.next() {
415                if op.as_rule() != Rule::mul_op {
416                    return Err(FormulaDslError::ParseError {
417                        reason: format!(
418                            "invalid term syntax in `{raw}`: expected `*` or `/`, got `{:?}`",
419                            op.as_rule()
420                        ),
421                    }
422                    .into());
423                }
424                let op_str = op.as_str().trim();
425                let operand = iter.next().ok_or_else(|| {
426                    FormulaDslError::ParseError {
427                        reason: format!("invalid term syntax in `{raw}`: dangling `{op_str}`"),
428                    }
429                    .to_string()
430                })?;
431                let rhs = expand_expr(operand, raw)?;
432                acc = match op_str {
433                    "*" => wr_cross(acc, rhs),
434                    "/" => wr_nest(acc, rhs),
435                    other => {
436                        return Err(FormulaDslError::ParseError {
437                            reason: format!(
438                                "invalid term syntax in `{raw}`: unrecognized mul operator `{other}`"
439                            ),
440                        }
441                        .into());
442                    }
443                };
444            }
445            Ok(acc)
446        }
447        Rule::interact => {
448            let mut iter = pair.into_inner();
449            let first = iter.next().ok_or_else(|| {
450                FormulaDslError::ParseError {
451                    reason: format!("invalid term syntax in `{raw}`: empty interact"),
452                }
453                .to_string()
454            })?;
455            let mut acc = expand_expr(first, raw)?;
456            while let Some(op) = iter.next() {
457                if op.as_rule() != Rule::interact_op {
458                    return Err(FormulaDslError::ParseError {
459                        reason: format!(
460                            "invalid term syntax in `{raw}`: expected `:`, got `{:?}`",
461                            op.as_rule()
462                        ),
463                    }
464                    .into());
465                }
466                let operand = iter.next().ok_or_else(|| {
467                    FormulaDslError::ParseError {
468                        reason: format!("invalid term syntax in `{raw}`: dangling `:`"),
469                    }
470                    .to_string()
471                })?;
472                let rhs = expand_expr(operand, raw)?;
473                acc = wr_interact(acc, rhs, raw)?;
474            }
475            Ok(acc)
476        }
477        Rule::power => {
478            let mut iter = pair.into_inner();
479            let first = iter.next().ok_or_else(|| {
480                FormulaDslError::ParseError {
481                    reason: format!("invalid term syntax in `{raw}`: empty power"),
482                }
483                .to_string()
484            })?;
485            let base = expand_expr(first, raw)?;
486            let Some(op) = iter.next() else {
487                return Ok(base);
488            };
489            if op.as_rule() != Rule::pow_op {
490                return Err(FormulaDslError::ParseError {
491                    reason: format!(
492                        "invalid term syntax in `{raw}`: expected `^`, got `{:?}`",
493                        op.as_rule()
494                    ),
495                }
496                .into());
497            }
498            let exponent_pair = iter.next().ok_or_else(|| {
499                FormulaDslError::ParseError {
500                    reason: format!("invalid term syntax in `{raw}`: dangling `^`"),
501                }
502                .to_string()
503            })?;
504            let exp_text = exponent_pair.as_str().trim();
505            let n: usize = exp_text.parse().map_err(|_| {
506                FormulaDslError::ParseError {
507                    reason: format!(
508                        "invalid term syntax in `{raw}`: `^` exponent must be a positive integer, got `{exp_text}`"
509                    ),
510                }
511                .to_string()
512            })?;
513            if n == 0 {
514                return Err(FormulaDslError::ParseError {
515                    reason: format!(
516                        "invalid term syntax in `{raw}`: `^0` is not a meaningful formula expansion"
517                    ),
518                }
519                .into());
520            }
521            if let Some(extra_op) = iter.next() {
522                let extra = extra_op.as_str().trim();
523                return Err(FormulaDslError::ParseError {
524                    reason: format!(
525                        "invalid term syntax in `{raw}`: chained `^` operators are not supported; \
526                         use one positive integer exponent, got another `{extra}`"
527                    ),
528                }
529                .into());
530            }
531            Ok(wr_power(base, n))
532        }
533        Rule::unary => {
534            let unary_start = pair.as_span().start();
535            for inner in pair.into_inner() {
536                if inner.as_rule() == Rule::primary {
537                    let prefix_len = inner.as_span().start().saturating_sub(unary_start);
538                    if prefix_len > 0 {
539                        let prefix = &raw[unary_start..inner.as_span().start()];
540                        if prefix.chars().any(|ch| matches!(ch, '+' | '-')) {
541                            return Err(FormulaDslError::IncompatibleTerm {
542                                reason: format!(
543                                    "unary `+`/`-` is not supported inside a formula term in `{raw}`"
544                                ),
545                            }
546                            .into());
547                        }
548                    }
549                    return expand_expr(inner, raw);
550                }
551            }
552            Err(FormulaDslError::ParseError {
553                reason: format!("invalid term syntax in `{raw}`: empty unary"),
554            }
555            .into())
556        }
557        Rule::primary => {
558            let span = pair.as_str().trim().to_string();
559            let inner = pair.into_inner().next();
560            match inner {
561                Some(child) if child.as_rule() == Rule::expr => expand_expr(child, raw),
562                Some(child) => Ok(vec![vec![child.as_str().trim().to_string()]]),
563                None => Ok(vec![vec![span]]),
564            }
565        }
566        _ => Err(FormulaDslError::ParseError {
567            reason: format!(
568                "invalid term syntax in `{raw}`: unexpected node `{:?}`",
569                pair.as_rule()
570            ),
571        }
572        .into()),
573    }
574}
575
576fn wr_cross(left: Vec<WrAtomList>, right: Vec<WrAtomList>) -> Vec<WrAtomList> {
577    // a*b = a + b + a:b
578    let mut out = Vec::with_capacity(left.len() + right.len() + left.len() * right.len());
579    out.extend(left.iter().cloned());
580    out.extend(right.iter().cloned());
581    for l in &left {
582        for r in &right {
583            let mut merged: WrAtomList = l.iter().cloned().chain(r.iter().cloned()).collect();
584            merged.sort();
585            merged.dedup();
586            out.push(merged);
587        }
588    }
589    out
590}
591
592fn wr_nest(left: Vec<WrAtomList>, right: Vec<WrAtomList>) -> Vec<WrAtomList> {
593    // a/b = a + a:b
594    let mut out = Vec::with_capacity(left.len() + left.len() * right.len());
595    out.extend(left.iter().cloned());
596    for l in &left {
597        for r in &right {
598            let mut merged: WrAtomList = l.iter().cloned().chain(r.iter().cloned()).collect();
599            merged.sort();
600            merged.dedup();
601            out.push(merged);
602        }
603    }
604    out
605}
606
607fn wr_interact(
608    left: Vec<WrAtomList>,
609    right: Vec<WrAtomList>,
610    raw: &str,
611) -> Result<Vec<WrAtomList>, String> {
612    // a:b — Cartesian merge of every (l, r) pair.
613    let mut out = Vec::with_capacity(left.len() * right.len());
614    for l in &left {
615        for r in &right {
616            let combined: WrAtomList = l.iter().cloned().chain(r.iter().cloned()).collect();
617            // Reject interactions whose atoms contain function-call syntax.
618            // These would build design columns that are not simple products
619            // (factors, smooths) and need a dedicated constructor.
620            for atom in &combined {
621                if atom.contains('(') {
622                    return Err(FormulaDslError::IncompatibleTerm {
623                        reason: format!(
624                            "interaction operator `:` with function-call atom is not supported in `{raw}`. \
625                             Use te(...) for smooth interactions or group()/factor() with a separate \
626                             interaction strategy for categorical effects."
627                        ),
628                    }
629                    .into());
630                }
631            }
632            let mut merged = combined;
633            merged.sort();
634            merged.dedup();
635            out.push(merged);
636        }
637    }
638    Ok(out)
639}
640
641fn wr_power(base: Vec<WrAtomList>, n: usize) -> Vec<WrAtomList> {
642    // (e)^n produces every non-empty subset of `base` of size ≤ n,
643    // each subset being one interaction. The standard WR semantics
644    // treat the base as a *set* of atoms — interactions inside `base`
645    // are kept as-is and not re-crossed.
646    if base.is_empty() {
647        return Vec::new();
648    }
649    let m = base.len();
650    let mut out = Vec::<WrAtomList>::new();
651    let max_size = n.min(m);
652    for size in 1..=max_size {
653        // Enumerate combinations of `size` elements out of m.
654        let mut indices: Vec<usize> = (0..size).collect();
655        loop {
656            let mut merged = WrAtomList::new();
657            for &i in &indices {
658                merged.extend(base[i].iter().cloned());
659            }
660            merged.sort();
661            merged.dedup();
662            out.push(merged);
663            // Next combination
664            let mut k = size;
665            while k > 0 {
666                k -= 1;
667                if indices[k] != k + m - size {
668                    indices[k] += 1;
669                    for j in (k + 1)..size {
670                        indices[j] = indices[j - 1] + 1;
671                    }
672                    break;
673                }
674                if k == 0 {
675                    k = usize::MAX;
676                    break;
677                }
678            }
679            if k == usize::MAX {
680                break;
681            }
682        }
683    }
684    out
685}
686
687fn is_exact_ident(raw: &str) -> bool {
688    let mut chars = raw.chars();
689    let Some(first) = chars.next() else {
690        return false;
691    };
692    if !first.is_ascii_alphabetic() && first != '_' {
693        return false;
694    }
695    chars.all(|ch| ch.is_ascii_alphanumeric() || ch == '_' || ch == '.')
696}
697
698pub fn parse_function_call(input: &str) -> Result<FunctionCallSpec, String> {
699    validate_balanced_delimiters(input, "invalid function call syntax")?;
700    let mut parsed = FormulaParser::parse(Rule::top_function_call, input).map_err(|e| {
701        FormulaDslError::ParseError {
702            reason: format!("invalid function call syntax: {e}"),
703        }
704    })?;
705    let top = parsed.next().ok_or_else(|| FormulaDslError::ParseError {
706        reason: "invalid function call syntax: empty parse".to_string(),
707    })?;
708    let call = top
709        .into_inner()
710        .find(|p| p.as_rule() == Rule::function_call)
711        .ok_or_else(|| FormulaDslError::ParseError {
712            reason: "invalid function call syntax: missing call".to_string(),
713        })?;
714    parse_call_pair(call)
715}
716
717fn parse_call_pair(call: Pair<'_, Rule>) -> Result<FunctionCallSpec, String> {
718    let mut name: Option<String> = None;
719    let mut args = Vec::<CallArgSpec>::new();
720    for part in call.into_inner() {
721        match part.as_rule() {
722            Rule::ident => {
723                if name.is_none() {
724                    name = Some(part.as_str().trim().to_string());
725                }
726            }
727            Rule::arg_list => {
728                for a in part.into_inner() {
729                    if a.as_rule() != Rule::arg {
730                        continue;
731                    }
732                    let mut a_inner = a.into_inner();
733                    let Some(first) = a_inner.next() else {
734                        continue;
735                    };
736                    match first.as_rule() {
737                        Rule::named_arg => {
738                            let mut ni = first.into_inner();
739                            let key = ni
740                                .next()
741                                .ok_or_else(|| FormulaDslError::ParseError {
742                                    reason: "invalid named argument key".to_string(),
743                                })?
744                                .as_str()
745                                .trim()
746                                .to_ascii_lowercase();
747                            let value = ni
748                                .next()
749                                .ok_or_else(|| FormulaDslError::ParseError {
750                                    reason: "invalid named argument value".to_string(),
751                                })?
752                                .as_str()
753                                .trim()
754                                .to_string();
755                            args.push(CallArgSpec::Named { key, value });
756                        }
757                        Rule::expr => {
758                            args.push(CallArgSpec::Positional(first.as_str().trim().to_string()));
759                        }
760                        _ => {}
761                    }
762                }
763            }
764            _ => {}
765        }
766    }
767    let name = name.ok_or_else(|| FormulaDslError::ParseError {
768        reason: "invalid function call: missing name".to_string(),
769    })?;
770    Ok(FunctionCallSpec { name, args })
771}
772
773#[cfg(test)]
774mod tests {
775    use super::{
776        CallArgSpec, ParsedTerm, parse_formula, parse_formula_dsl, parse_function_call,
777        parse_linkwiggle_formulaspec, parsed_term_column_names, parsed_terms_reference_column,
778        validate_marginal_slope_z_column_exclusion,
779    };
780    use std::collections::{BTreeMap, BTreeSet};
781
782    #[test]
783    fn parsed_term_column_names_includes_by_smooth_grouping_variable() {
784        // The model's input contract must include a smooth's `by=` column, not
785        // just its positional variable. `s(x, by=g)` consumes both `x` and `g`
786        // (`term_builder` reads `options["by"]`); dropping `g` from the
787        // required/consumable set would either omit a genuine predictor at fit
788        // (CLI projected load) or — worse — make predict silently project the
789        // `g` column away (#840 regression). A bare main effect and an
790        // interaction round out the variant coverage.
791        let parsed =
792            parse_formula("y ~ s(x, by=g) + z + a:b").expect("formula with a by= smooth parses");
793        let mut cols = BTreeSet::<String>::new();
794        parsed_term_column_names(&parsed.terms, &mut cols);
795        for expected in ["x", "g", "z", "a", "b"] {
796            assert!(
797                cols.contains(expected),
798                "parsed_term_column_names dropped '{expected}'; got {cols:?}"
799            );
800        }
801        // The response is never a *term* column (it is handled separately).
802        assert!(
803            !cols.contains("y"),
804            "response leaked into term columns: {cols:?}"
805        );
806    }
807
808    #[test]
809    fn linkwiggle_parser_does_not_bake_in_cubic_only_restriction() {
810        // Regression for #384: the cubic-only constraint belongs to the
811        // score-warp / link-deviation `DeviationRuntime`, NOT to this shared
812        // parser. `parse_linkwiggle_formulaspec` also feeds `timewiggle` and
813        // the location-scale survival path, whose general monotone I-spline
814        // value basis honors any `degree >= 2`. So the parser must accept
815        // non-cubic degrees and carry them through verbatim; the cubic gate is
816        // applied downstream only where the cubic-only runtime is built. A
817        // prior fix wrongly forced `degree == 3` here, breaking timewiggle and
818        // location-scale callers — this pins that the parser stays general.
819        for deg in [2usize, 4, 5, 10] {
820            let mut options = BTreeMap::new();
821            options.insert("degree".to_string(), deg.to_string());
822            options.insert("internal_knots".to_string(), "3".to_string());
823            let raw = format!("timewiggle(degree={deg}, internal_knots=3)");
824            let spec = parse_linkwiggle_formulaspec(&options, &raw)
825                .expect("non-cubic wiggle degree must parse at the shared layer");
826            assert_eq!(
827                spec.degree, deg,
828                "parser must carry the requested degree through verbatim"
829            );
830        }
831
832        // The only universal lower bound the shared parser enforces is that a
833        // polynomial degree is positive.
834        let mut zero = BTreeMap::new();
835        zero.insert("degree".to_string(), "0".to_string());
836        zero.insert("internal_knots".to_string(), "3".to_string());
837        let err = parse_linkwiggle_formulaspec(&zero, "linkwiggle(degree=0, internal_knots=3)")
838            .expect_err("degree=0 must be rejected");
839        assert!(
840            err.contains("degree >= 1"),
841            "error should state the positive-degree lower bound, got: {err}"
842        );
843    }
844
845    #[test]
846    fn parses_nested_formula_terms() {
847        let parsed =
848            parse_formula_dsl("log(y) ~ x1 + s(log(x2 + 1), bs=\"tps\", k=10) + te(x3, x4)")
849                .expect("parse");
850        assert_eq!(parsed.response_expr, "log(y)");
851        assert_eq!(parsed.rhs_terms.len(), 3);
852        assert_eq!(parsed.rhs_terms[0], "x1");
853        assert_eq!(parsed.rhs_terms[1], "s(log(x2 + 1), bs=\"tps\", k=10)");
854        assert_eq!(parsed.rhs_terms[2], "te(x3, x4)");
855    }
856
857    #[test]
858    fn parses_cyclic_formula_aliases() {
859        let parsed = parse_formula("y ~ cyclic(theta, period_start=0, period_end=6.283)")
860            .expect("parse cyclic formula");
861        match &parsed.terms[0] {
862            super::ParsedTerm::Smooth { vars, options, .. } => {
863                assert_eq!(vars, &vec!["theta".to_string()]);
864                assert_eq!(options.get("type").map(String::as_str), Some("cyclic"));
865                assert_eq!(options.get("period_start").map(String::as_str), Some("0"));
866            }
867            other => panic!("expected cyclic smooth term, got {other:?}"),
868        }
869    }
870
871    #[test]
872    fn sphere_aliases_all_dispatch_to_intrinsic_s2_basis() {
873        // Regression for #383: `s2(lat, lon)` must build the same intrinsic
874        // S² (sphere) basis as `sphere()`/`sos()`/`spherical()`. Previously the
875        // `s2` arm returned a Smooth without `type=sphere`, so it silently fell
876        // back to a generic Euclidean 2-D smooth and diverged in the
877        // spatial-kappa optimizer. All four aliases must be byte-for-byte
878        // equivalent in their dispatch (vars + `type=sphere`).
879        for alias in ["sphere", "sos", "spherical", "s2"] {
880            let parsed = parse_formula(&format!("y ~ {alias}(lat, lon)"))
881                .unwrap_or_else(|e| panic!("parse {alias}: {e}"));
882            match &parsed.terms[0] {
883                super::ParsedTerm::Smooth { vars, options, .. } => {
884                    assert_eq!(
885                        vars,
886                        &vec!["lat".to_string(), "lon".to_string()],
887                        "{alias} should keep (lat, lon) as its variables"
888                    );
889                    assert_eq!(
890                        options.get("type").map(String::as_str),
891                        Some("sphere"),
892                        "{alias} must dispatch to the intrinsic sphere basis (type=sphere)"
893                    );
894                }
895                other => panic!("expected sphere smooth term for {alias}, got {other:?}"),
896            }
897        }
898    }
899
900    #[test]
901    fn parses_function_callwithnamed_and_positional_args() {
902        let call = parse_function_call("s(log(x + 1), type=\"duchon\", centers=12)").expect("call");
903        assert_eq!(call.name, "s");
904        assert_eq!(call.args.len(), 3);
905        assert_eq!(
906            call.args[0],
907            CallArgSpec::Positional("log(x + 1)".to_string())
908        );
909        assert_eq!(
910            call.args[1],
911            CallArgSpec::Named {
912                key: "type".to_string(),
913                value: "\"duchon\"".to_string()
914            }
915        );
916    }
917
918    #[test]
919    fn parses_tensor_boundary_list_options() {
920        let call = parse_function_call(
921            "te(day_of_week, hour, boundary=['periodic', 'periodic'], period=[7, 24])",
922        )
923        .expect("call");
924        assert_eq!(call.name, "te");
925        assert_eq!(call.args.len(), 4);
926        assert_eq!(
927            call.args[2],
928            CallArgSpec::Named {
929                key: "boundary".to_string(),
930                value: "['periodic', 'periodic']".to_string(),
931            }
932        );
933    }
934
935    #[test]
936    fn parse_formula_dsl_reports_unbalanced_parentheses() {
937        let err = parse_formula_dsl("y ~ s(x, k=10").expect_err("expected parse failure");
938        assert!(err.contains("unbalanced parentheses"));
939    }
940
941    #[test]
942    fn parse_function_call_reports_unbalanced_parentheses() {
943        let err = parse_function_call("s(x, k=10").expect_err("expected parse failure");
944        assert!(err.contains("unbalanced parentheses"));
945    }
946
947    #[test]
948    fn parse_formula_accepts_tuple_smooth_options() {
949        let parsed = parse_formula("z ~ te(x, y, k=(20, 20))")
950            .expect("tuple-valued smooth option should parse");
951        assert_eq!(parsed.terms.len(), 1);
952
953        let dsl = parse_formula_dsl("z ~ te(x, y, k=(20, 20))")
954            .expect("tuple-valued smooth option should parse in the DSL layer");
955        assert_eq!(dsl.rhs_terms, vec!["te(x, y, k=(20, 20))"]);
956
957        let call = parse_function_call("te(x, y, k=(20, 20))")
958            .expect("tuple-valued smooth option should parse as a function call");
959        assert_eq!(
960            call.args[2],
961            CallArgSpec::Named {
962                key: "k".to_string(),
963                value: "(20, 20)".to_string(),
964            }
965        );
966    }
967
968    #[test]
969    fn parse_formula_rejects_unsupported_top_level_rhs_expressions() {
970        // Binary `-`, unary `-`, bare parens, and `-1` intercept removal are
971        // not supported as top-level RHS expressions; they must surface
972        // through the bare-identifier check in `parse_term` with the same
973        // diagnostic. WR operators `:`, `*`, `/`, and `^` are intentionally
974        // supported by `expand_wr_term` and are exercised by
975        // `parse_formula_supports_wr_slash_nesting` /
976        // `parse_formula_supports_wr_star_crossing`; they must NOT appear in
977        // this list.
978        for formula in ["y ~ x - z", "y ~ -x", "y ~ (x)", "y ~ x - 1"] {
979            let err = parse_formula(formula).expect_err("expected formula parse failure");
980            assert!(err.to_string().contains("unsupported top-level RHS term"));
981        }
982    }
983
984    /// Wilkinson-Rogers `a/b` (nesting) is documented and implemented by
985    /// `expand_wr_term` to yield `{a, a:b}`. This test pins that contract end
986    /// to end so a regression in either the grammar's `mul_op = { "*" | "/" }`
987    /// branch or in `expand_expr`'s `Rule::product` handling surfaces here
988    /// instead of slipping into a silent re-interpretation as a rejection.
989    #[test]
990    fn parse_formula_supports_wr_slash_nesting() {
991        let parsed = parse_formula("y ~ x / z").expect("`/` is supported as WR nesting");
992        assert_eq!(parsed.response, "y");
993        assert_eq!(parsed.terms.len(), 2);
994        let names: Vec<String> = parsed
995            .terms
996            .iter()
997            .map(|t| match t {
998                ParsedTerm::Linear { name, .. } => format!("Linear({name})"),
999                ParsedTerm::Interaction { vars } => format!("Interaction({})", vars.join(":")),
1000                other => format!("Other({other:?})"),
1001            })
1002            .collect();
1003        assert_eq!(
1004            names,
1005            vec!["Linear(x)".to_string(), "Interaction(x:z)".to_string()]
1006        );
1007    }
1008
1009    /// Wilkinson-Rogers `a*b` (crossing) is documented and implemented by
1010    /// `expand_wr_term` to yield `{a, b, a:b}`. Pin the contract so a future
1011    /// refactor of `mul_op` / `Rule::product` handling cannot silently change
1012    /// the set of model terms.
1013    #[test]
1014    fn parse_formula_supports_wr_star_crossing() {
1015        let parsed = parse_formula("y ~ x * z").expect("`*` is supported as WR crossing");
1016        assert_eq!(parsed.response, "y");
1017        assert_eq!(parsed.terms.len(), 3);
1018        let names: Vec<String> = parsed
1019            .terms
1020            .iter()
1021            .map(|t| match t {
1022                ParsedTerm::Linear { name, .. } => format!("Linear({name})"),
1023                ParsedTerm::Interaction { vars } => format!("Interaction({})", vars.join(":")),
1024                other => format!("Other({other:?})"),
1025            })
1026            .collect();
1027        assert_eq!(
1028            names,
1029            vec![
1030                "Linear(x)".to_string(),
1031                "Linear(z)".to_string(),
1032                "Interaction(x:z)".to_string(),
1033            ]
1034        );
1035    }
1036
1037    #[test]
1038    fn parse_formula_rejects_unary_signs_inside_wr_expansion() {
1039        for formula in ["y ~ x:-z", "y ~ a*-b", "y ~ x/-z", "y ~ x:+z"] {
1040            let err = parse_formula(formula)
1041                .expect_err("WR expansion must not silently drop unary signs");
1042            let msg = err.to_string();
1043            assert!(
1044                msg.contains("unary `+`/`-` is not supported"),
1045                "unexpected error for {formula}: {msg}"
1046            );
1047        }
1048    }
1049
1050    #[test]
1051    fn parse_formula_supports_wr_power_crossing() {
1052        let parsed = parse_formula("y ~ (x + z)^2").expect("`^` is supported as WR power");
1053        assert_eq!(parsed.response, "y");
1054        assert_eq!(parsed.terms.len(), 3);
1055        let names: Vec<String> = parsed
1056            .terms
1057            .iter()
1058            .map(|t| match t {
1059                ParsedTerm::Linear { name, .. } => format!("Linear({name})"),
1060                ParsedTerm::Interaction { vars } => format!("Interaction({})", vars.join(":")),
1061                other => format!("Other({other:?})"),
1062            })
1063            .collect();
1064        assert_eq!(
1065            names,
1066            vec![
1067                "Linear(x)".to_string(),
1068                "Linear(z)".to_string(),
1069                "Interaction(x:z)".to_string(),
1070            ]
1071        );
1072    }
1073
1074    #[test]
1075    fn parse_formula_rejects_chained_wr_power() {
1076        let err = parse_formula("y ~ (x + z)^2^3")
1077            .expect_err("chained WR powers must not silently drop later exponents");
1078        let msg = err.to_string();
1079        assert!(
1080            msg.contains("chained `^` operators are not supported"),
1081            "error should explain that chained powers are rejected, got: {msg}"
1082        );
1083    }
1084
1085    #[test]
1086    fn parsed_terms_reference_column_sees_the_by_smooth_variable() {
1087        // Regression for #807: the by= grouping variable lives in
1088        // options["by"], not the smooth's positional vars. The reference
1089        // predicate must still recognise it, both so the CLI loads the column
1090        // and so the marginal-slope z-column exclusion check (which reuses this
1091        // predicate) cannot be fooled into aliasing a reserved z onto a by=.
1092        let parsed = parse_formula("y ~ s(x, by=g)").expect("parse by-smooth");
1093        assert!(
1094            parsed_terms_reference_column(&parsed.terms, "g"),
1095            "s(x, by=g) references column g via options[\"by\"]"
1096        );
1097        assert!(parsed_terms_reference_column(&parsed.terms, "x"));
1098        assert!(!parsed_terms_reference_column(&parsed.terms, "absent"));
1099    }
1100
1101    #[test]
1102    fn marginal_slope_z_column_validator_detects_linear_and_smooth_reuse() {
1103        let main = parse_formula("y ~ x + z").expect("parse main");
1104        let logslope = parse_formula("y ~ s(z, type=duchon, centers=6)").expect("parse logslope");
1105
1106        assert!(parsed_terms_reference_column(&main.terms, "z"));
1107        assert!(parsed_terms_reference_column(&logslope.terms, "z"));
1108
1109        let err = validate_marginal_slope_z_column_exclusion(
1110            &main,
1111            &parse_formula("y ~ 1").expect("parse clean logslope"),
1112            "z",
1113            "bernoulli marginal-slope",
1114            "--logslope-formula",
1115        )
1116        .expect_err("main formula should be rejected");
1117        assert!(err.contains("cannot also appear in the main formula"));
1118
1119        let err = validate_marginal_slope_z_column_exclusion(
1120            &parse_formula("y ~ x").expect("parse clean main"),
1121            &logslope,
1122            "z",
1123            "bernoulli marginal-slope",
1124            "--logslope-formula",
1125        )
1126        .expect_err("logslope formula should be rejected");
1127        assert!(err.contains("cannot also appear in --logslope-formula"));
1128    }
1129
1130    #[test]
1131    fn logslope_surface_declarations_are_additive() {
1132        let parsed = parse_formula("y ~ s(pc1) + logslope(z2, s(pc2)) + logslope(z3, x3)")
1133            .expect("parse additive logslope surfaces");
1134        assert_eq!(parsed.terms.len(), 1);
1135        assert_eq!(parsed.logslope_surfaces.len(), 2);
1136        assert_eq!(parsed.logslope_surfaces[0].z_column, "z2");
1137        assert_eq!(parsed.logslope_surfaces[0].terms.len(), 1);
1138        assert_eq!(parsed.logslope_surfaces[1].z_column, "z3");
1139        assert_eq!(parsed.logslope_surfaces[1].terms.len(), 1);
1140    }
1141
1142    #[test]
1143    fn marginal_slope_z_column_validator_reserves_all_surface_z_columns() {
1144        let main = parse_formula("y ~ x").expect("parse main");
1145        let logslope = parse_formula("y ~ s(pc1) + logslope(z2, s(z3)) + logslope(z3, x)")
1146            .expect("parse logslope surfaces");
1147        let err = validate_marginal_slope_z_column_exclusion(
1148            &main,
1149            &logslope,
1150            "z1",
1151            "bernoulli marginal-slope",
1152            "--logslope-formula",
1153        )
1154        .expect_err("surface formula should reject another reserved z coordinate");
1155        assert!(err.contains("reserves z column 'z3'"));
1156    }
1157}
1158
1159// ---------------------------------------------------------------------------
1160// Higher-level formula parsing: ParsedFormula, ParsedTerm, and friends
1161// ---------------------------------------------------------------------------
1162
1163#[derive(Clone, Debug)]
1164pub struct LinkWiggleFormulaSpec {
1165    pub degree: usize,
1166    pub num_internal_knots: usize,
1167    pub penalty_orders: Vec<usize>,
1168    pub double_penalty: bool,
1169}
1170
1171pub fn default_linkwiggle_formulaspec() -> LinkWiggleFormulaSpec {
1172    let cfg = WigglePenaltyConfig::cubic_triple_operator_default();
1173    LinkWiggleFormulaSpec {
1174        degree: cfg.degree,
1175        num_internal_knots: cfg.num_internal_knots,
1176        penalty_orders: cfg.penalty_orders,
1177        double_penalty: cfg.double_penalty,
1178    }
1179}
1180
1181#[derive(Clone, Debug)]
1182pub struct LinkFormulaSpec {
1183    pub link: String,
1184    pub mixture_rho: Option<String>,
1185    pub sas_init: Option<String>,
1186    pub beta_logistic_init: Option<String>,
1187}
1188
1189#[derive(Clone, Debug)]
1190pub struct SurvivalFormulaSpec {
1191    pub spec: Option<String>,
1192    pub survival_distribution: Option<String>,
1193}
1194
1195#[derive(Clone, Debug)]
1196pub struct ParsedFormula {
1197    pub response: String,
1198    pub terms: Vec<ParsedTerm>,
1199    pub logslope_surfaces: Vec<LogSlopeSurfaceSpec>,
1200    pub linkwiggle: Option<LinkWiggleFormulaSpec>,
1201    pub timewiggle: Option<LinkWiggleFormulaSpec>,
1202    pub linkspec: Option<LinkFormulaSpec>,
1203    pub survivalspec: Option<SurvivalFormulaSpec>,
1204}
1205
1206#[derive(Clone, Debug)]
1207pub struct LogSlopeSurfaceSpec {
1208    pub z_column: String,
1209    pub terms: Vec<ParsedTerm>,
1210}
1211
1212pub fn marginal_slope_logslope_surfaces(
1213    logslope_formula: &ParsedFormula,
1214    default_z_column: &str,
1215) -> Result<Vec<LogSlopeSurfaceSpec>, String> {
1216    let mut surfaces = Vec::new();
1217    if !logslope_formula.terms.is_empty() {
1218        surfaces.push(LogSlopeSurfaceSpec {
1219            z_column: default_z_column.to_string(),
1220            terms: logslope_formula.terms.clone(),
1221        });
1222    }
1223    surfaces.extend(logslope_formula.logslope_surfaces.clone());
1224    if surfaces.is_empty() {
1225        surfaces.push(LogSlopeSurfaceSpec {
1226            z_column: default_z_column.to_string(),
1227            terms: Vec::new(),
1228        });
1229    }
1230    let mut seen = std::collections::BTreeSet::<String>::new();
1231    for surface in &surfaces {
1232        if !seen.insert(surface.z_column.clone()) {
1233            return Err(FormulaDslError::IncompatibleTerm {
1234                reason: format!(
1235                    "logslope formula declares z column '{}' more than once; each z coordinate needs exactly one log-slope surface",
1236                    surface.z_column
1237                ),
1238            }
1239            .into());
1240        }
1241    }
1242    Ok(surfaces)
1243}
1244
1245#[derive(Clone, Debug)]
1246pub enum ParsedTerm {
1247    Linear {
1248        name: String,
1249        explicit: bool,
1250        coefficient_min: Option<f64>,
1251        coefficient_max: Option<f64>,
1252    },
1253    BoundedLinear {
1254        name: String,
1255        min: f64,
1256        max: f64,
1257        prior: BoundedCoefficientPriorSpec,
1258    },
1259    RandomEffect {
1260        name: String,
1261    },
1262    Smooth {
1263        label: String,
1264        vars: Vec<String>,
1265        kind: SmoothKind,
1266        options: BTreeMap<String, String>,
1267    },
1268    LinkWiggle {
1269        options: BTreeMap<String, String>,
1270    },
1271    TimeWiggle {
1272        options: BTreeMap<String, String>,
1273    },
1274    LinkConfig {
1275        options: BTreeMap<String, String>,
1276    },
1277    SurvivalConfig {
1278        options: BTreeMap<String, String>,
1279    },
1280    LogSlopeSurface {
1281        z_column: String,
1282        terms: Vec<ParsedTerm>,
1283    },
1284    /// Wilkinson-Rogers interaction term `a:b[:c...]`.
1285    ///
1286    /// `vars` is a sorted, deduplicated list of base column names. Each element
1287    /// must be a bare identifier — interactions with function-call atoms
1288    /// (smooths, factors, etc.) are rejected upstream because their design
1289    /// columns are not simple products. The design column is the elementwise
1290    /// product of the referenced numeric columns.
1291    Interaction {
1292        vars: Vec<String>,
1293    },
1294}
1295
1296/// Collect the names of every data column the parsed terms consume.
1297///
1298/// This is the canonical formula→columns walk shared by the fit-time and
1299/// predict-time required-column computations (the CLI and PyFFI surfaces both
1300/// route through it). It includes a smooth's positional `vars` *and* its `by=`
1301/// grouping/scaling column (`s(x, by=g)`), which `term_builder` reads from
1302/// `options["by"]` but which is not among the positional variables — omitting
1303/// it would drop a genuine predictor from the model's input contract.
1304pub fn parsed_term_column_names(
1305    terms: &[ParsedTerm],
1306    out: &mut std::collections::BTreeSet<String>,
1307) {
1308    for term in terms {
1309        match term {
1310            ParsedTerm::Linear { name, .. }
1311            | ParsedTerm::BoundedLinear { name, .. }
1312            | ParsedTerm::RandomEffect { name } => {
1313                out.insert(name.clone());
1314            }
1315            ParsedTerm::Smooth { vars, options, .. } => {
1316                out.extend(vars.iter().cloned());
1317                if let Some(by) = options.get("by") {
1318                    out.insert(by.clone());
1319                }
1320            }
1321            ParsedTerm::Interaction { vars } => {
1322                out.extend(vars.iter().cloned());
1323            }
1324            ParsedTerm::LinkWiggle { .. }
1325            | ParsedTerm::TimeWiggle { .. }
1326            | ParsedTerm::LinkConfig { .. }
1327            | ParsedTerm::SurvivalConfig { .. } => {}
1328            ParsedTerm::LogSlopeSurface { z_column, terms } => {
1329                out.insert(z_column.clone());
1330                parsed_term_column_names(terms, out);
1331            }
1332        }
1333    }
1334}
1335
1336pub fn parsed_terms_reference_column(terms: &[ParsedTerm], column_name: &str) -> bool {
1337    terms.iter().any(|term| match term {
1338        ParsedTerm::Linear { name, .. }
1339        | ParsedTerm::BoundedLinear { name, .. }
1340        | ParsedTerm::RandomEffect { name } => name == column_name,
1341        ParsedTerm::Smooth { vars, options, .. } => {
1342            vars.iter().any(|var| var == column_name)
1343                || options.get("by").is_some_and(|by| by == column_name)
1344        }
1345        ParsedTerm::Interaction { vars } => vars.iter().any(|var| var == column_name),
1346        ParsedTerm::LinkWiggle { .. }
1347        | ParsedTerm::TimeWiggle { .. }
1348        | ParsedTerm::LinkConfig { .. }
1349        | ParsedTerm::SurvivalConfig { .. } => false,
1350        ParsedTerm::LogSlopeSurface { z_column, terms } => {
1351            z_column == column_name || parsed_terms_reference_column(terms, column_name)
1352        }
1353    })
1354}
1355
1356pub fn validate_marginal_slope_z_column_exclusion(
1357    main_formula: &ParsedFormula,
1358    logslope_formula: &ParsedFormula,
1359    z_column: &str,
1360    context: &str,
1361    logslope_label: &str,
1362) -> Result<(), String> {
1363    let surfaces = marginal_slope_logslope_surfaces(logslope_formula, z_column)?;
1364    for z_column in surfaces.iter().map(|surface| surface.z_column.as_str()) {
1365        if parsed_terms_reference_column(&main_formula.terms, z_column) {
1366            return Err(FormulaDslError::IncompatibleTerm {
1367                reason: format!(
1368                    "{context} reserves z column '{z_column}' as the auxiliary latent score; it cannot also appear in the main formula"
1369                ),
1370            }
1371            .into());
1372        }
1373    }
1374    for reserved in surfaces.iter().map(|surface| surface.z_column.as_str()) {
1375        if parsed_terms_reference_column(&logslope_formula.terms, reserved) {
1376            return Err(FormulaDslError::IncompatibleTerm {
1377                reason: format!(
1378                    "{context} reserves z column '{reserved}' as the auxiliary latent score; it cannot also appear in {logslope_label}"
1379                ),
1380            }
1381            .into());
1382        }
1383        for surface in &surfaces {
1384            if parsed_terms_reference_column(&surface.terms, reserved) {
1385                return Err(FormulaDslError::IncompatibleTerm {
1386                    reason: format!(
1387                        "{context} reserves z column '{reserved}' as an auxiliary latent score; it cannot also appear in {logslope_label}"
1388                    ),
1389                }
1390                .into());
1391            }
1392        }
1393    }
1394    Ok(())
1395}
1396
1397#[derive(Clone, Copy, Debug)]
1398pub enum SmoothKind {
1399    S,
1400    Te,
1401    /// Tensor smooth (`t2(...)`) using mgcv's separable penalty decomposition:
1402    /// the tensor coefficient space is split into marginal penalized/null-space
1403    /// tensor subspaces, with one smoothing parameter per non-null subspace.
1404    T2,
1405    /// Tensor *interaction* smooth (`ti(...)`): a tensor-product smooth whose
1406    /// marginal main effects are excluded, so the term captures only the pure
1407    /// interaction between its variables. Materializes through the same tensor
1408    /// path as [`SmoothKind::Te`] but with per-margin sum-to-zero
1409    /// identifiability (`TensorBSplineIdentifiability::MarginalSumToZero`).
1410    Ti,
1411}
1412
1413#[derive(Clone, Copy, Debug)]
1414pub enum LinkMode {
1415    Strict,
1416    Flexible,
1417}
1418
1419#[derive(Clone, Debug)]
1420pub struct LinkChoice {
1421    pub mode: LinkMode,
1422    pub link: LinkFunction,
1423    pub mixture_components: Option<Vec<LinkComponent>>,
1424}
1425
1426// ---------------------------------------------------------------------------
1427// Link wiggle / link choice helpers
1428// ---------------------------------------------------------------------------
1429
1430pub fn effectivelinkwiggle_formulaspec(
1431    formula_linkwiggle: Option<&LinkWiggleFormulaSpec>,
1432    link_choice: Option<&LinkChoice>,
1433) -> Option<LinkWiggleFormulaSpec> {
1434    formula_linkwiggle.cloned().or_else(|| {
1435        link_choice.and_then(|choice| {
1436            if matches!(choice.mode, LinkMode::Flexible) {
1437                Some(default_linkwiggle_formulaspec())
1438            } else {
1439                None
1440            }
1441        })
1442    })
1443}
1444
1445pub const fn linkname_supports_joint_wiggle(link: LinkFunction) -> bool {
1446    !matches!(link, LinkFunction::Sas | LinkFunction::BetaLogistic)
1447}
1448
1449pub const fn linkchoice_supports_joint_wiggle(choice: &LinkChoice) -> bool {
1450    match &choice.mixture_components {
1451        None => linkname_supports_joint_wiggle(choice.link),
1452        Some(_) => false,
1453    }
1454}
1455
1456pub fn require_linkchoice_supports_joint_wiggle(
1457    choice: &LinkChoice,
1458    context: &str,
1459) -> Result<(), String> {
1460    if linkchoice_supports_joint_wiggle(choice) {
1461        Ok(())
1462    } else {
1463        Err(joint_wiggle_unsupported_link_message(context))
1464    }
1465}
1466
1467pub const fn likelihood_spec_supports_joint_wiggle(likelihood: &LikelihoodSpec) -> bool {
1468    inverse_link_supports_joint_wiggle(&likelihood.link)
1469}
1470
1471pub fn require_likelihood_spec_supports_joint_wiggle(
1472    likelihood: &LikelihoodSpec,
1473    context: &str,
1474) -> Result<(), String> {
1475    if likelihood_spec_supports_joint_wiggle(likelihood) {
1476        Ok(())
1477    } else {
1478        Err(joint_wiggle_unsupported_link_message(context))
1479    }
1480}
1481
1482pub const fn inverse_link_supports_joint_wiggle(link: &InverseLink) -> bool {
1483    matches!(
1484        link,
1485        InverseLink::Standard(StandardLink::Identity)
1486            | InverseLink::Standard(StandardLink::Log)
1487            | InverseLink::Standard(StandardLink::Logit)
1488            | InverseLink::Standard(StandardLink::Probit)
1489            | InverseLink::Standard(StandardLink::CLogLog)
1490    )
1491}
1492
1493pub fn require_inverse_link_supports_joint_wiggle(
1494    link: &InverseLink,
1495    context: &str,
1496) -> Result<(), String> {
1497    if inverse_link_supports_joint_wiggle(link) {
1498        Ok(())
1499    } else {
1500        Err(joint_wiggle_unsupported_link_message(context))
1501    }
1502}
1503
1504pub const fn binomial_inverse_link_supports_joint_wiggle(link: &InverseLink) -> bool {
1505    matches!(
1506        link,
1507        InverseLink::Standard(StandardLink::Logit)
1508            | InverseLink::Standard(StandardLink::Probit)
1509            | InverseLink::Standard(StandardLink::CLogLog)
1510    )
1511}
1512
1513pub fn require_binomial_inverse_link_supports_joint_wiggle(
1514    link: &InverseLink,
1515    context: &str,
1516) -> Result<(), String> {
1517    if binomial_inverse_link_supports_joint_wiggle(link) {
1518        Ok(())
1519    } else {
1520        Err(FormulaDslError::IncompatibleTerm {
1521            reason: format!(
1522                "{context} does not support identity, log, latent-cloglog, SAS, BetaLogistic, or Mixture links; wiggle is only available for jointly fitted logit/probit/cloglog links"
1523            ),
1524        }
1525        .into())
1526    }
1527}
1528
1529pub fn joint_wiggle_unsupported_link_message(context: &str) -> String {
1530    format!(
1531        "{context} does not support latent-cloglog, SAS, BetaLogistic, or Mixture links; wiggle is only available for jointly fitted standard links"
1532    )
1533}
1534
1535// ---------------------------------------------------------------------------
1536// Option-map helpers (shared by formula parsing and term construction)
1537// ---------------------------------------------------------------------------
1538
1539pub fn option_usize(map: &BTreeMap<String, String>, key: &str) -> Option<usize> {
1540    map.get(key).and_then(|v| v.parse::<usize>().ok())
1541}
1542
1543/// Local sibling of `term_builder::validate_known_options` used by the
1544/// parser-side `linear / bounded / constrain / nonnegative / nonpositive`
1545/// branches (which build their `ParsedTerm` here and never enter
1546/// `term_builder::build_smooth_basis`). Without this, typos like
1547/// `bounded(x, min=0, max=1, foo=bar)` silently succeed because the
1548/// `foo` key was just never read.
1549fn validate_known_term_options(
1550    term_name: &str,
1551    options: &BTreeMap<String, String>,
1552    known: &[&str],
1553    raw: &str,
1554) -> Result<(), String> {
1555    let known_set: std::collections::BTreeSet<&&str> = known.iter().collect();
1556    for key in options.keys() {
1557        if !known_set.contains(&key.as_str()) {
1558            let known_sorted = {
1559                let mut v = known.to_vec();
1560                v.sort_unstable();
1561                v.join(", ")
1562            };
1563            let known_hint = if known.is_empty() {
1564                "no options".to_string()
1565            } else {
1566                format!("[{known_sorted}]")
1567            };
1568            return Err(FormulaDslError::InvalidArgument {
1569                reason: format!(
1570                    "{term_name}() does not accept option `{key}` (in `{raw}`); known options: {known_hint}"
1571                ),
1572            }
1573            .into());
1574        }
1575    }
1576    Ok(())
1577}
1578
1579pub fn option_usize_any(map: &BTreeMap<String, String>, keys: &[&str]) -> Option<usize> {
1580    for key in keys {
1581        if let Some(v) = option_usize(map, key) {
1582            return Some(v);
1583        }
1584    }
1585    None
1586}
1587
1588/// Strict integer option: returns `Ok(None)` if not present, `Ok(Some(n))` if
1589/// it parses as a non-negative integer, and `Err(msg)` if the user supplied a
1590/// value that isn't a valid usize (negative, decimal, garbage). Without this
1591/// the lenient `option_usize` silently drops invalid values and reverts to
1592/// the default — `k=-1` and `k=1.5` were both accepted as "k not specified"
1593/// instead of being flagged as user mistakes.
1594pub fn option_usize_strict(
1595    map: &BTreeMap<String, String>,
1596    key: &str,
1597) -> Result<Option<usize>, String> {
1598    match map.get(key) {
1599        None => Ok(None),
1600        Some(raw) => raw.parse::<usize>().map(Some).map_err(|err| {
1601            FormulaDslError::InvalidArgument {
1602                reason: format!(
1603                    "option `{key}={raw}` is not a non-negative integer; \
1604                     expected a whole number >= 0: {err}"
1605                ),
1606            }
1607            .into()
1608        }),
1609    }
1610}
1611
1612/// Strict variant of `option_usize_any` that errors on the first present-but-
1613/// unparseable key rather than silently falling through.
1614pub fn option_usize_any_strict(
1615    map: &BTreeMap<String, String>,
1616    keys: &[&str],
1617) -> Result<Option<usize>, String> {
1618    for key in keys {
1619        if let Some(v) = option_usize_strict(map, key)? {
1620            return Ok(Some(v));
1621        }
1622    }
1623    Ok(None)
1624}
1625
1626pub fn option_f64(map: &BTreeMap<String, String>, key: &str) -> Option<f64> {
1627    map.get(key).and_then(|v| v.parse::<f64>().ok())
1628}
1629
1630/// Strict float option: `Ok(None)` if absent, `Ok(Some(n))` if parses as a
1631/// finite f64, `Err` if the user passed an unparseable value (rather than
1632/// silently dropping it like the lenient `option_f64`).
1633pub fn option_f64_strict(map: &BTreeMap<String, String>, key: &str) -> Result<Option<f64>, String> {
1634    match map.get(key) {
1635        None => Ok(None),
1636        Some(raw) => match raw.parse::<f64>() {
1637            Ok(v) if v.is_finite() => Ok(Some(v)),
1638            Ok(v) => Err(FormulaDslError::InvalidArgument {
1639                reason: format!("option `{key}={raw}` parses as {v} which is not a finite number"),
1640            }
1641            .into()),
1642            Err(err) => Err(FormulaDslError::InvalidArgument {
1643                reason: format!(
1644                    "option `{key}={raw}` is not a valid number; expected a finite decimal: {err}"
1645                ),
1646            }
1647            .into()),
1648        },
1649    }
1650}
1651
1652pub fn option_bool(map: &BTreeMap<String, String>, key: &str) -> Option<bool> {
1653    map.get(key)
1654        .and_then(|v| match v.trim().to_ascii_lowercase().as_str() {
1655            "true" | "1" | "yes" | "y" => Some(true),
1656            "false" | "0" | "no" | "n" => Some(false),
1657            _ => None,
1658        })
1659}
1660
1661/// Strict boolean option: `Ok(None)` if absent, `Ok(Some(b))` for a recognized
1662/// truthy/falsy token, and `Err(msg)` for a present-but-unparseable value. The
1663/// lenient `option_bool` maps an unrecognized value to `None`, which callers
1664/// then silently treat as "not specified" — masking user typos like
1665/// `double_penalty=ture`.
1666pub fn option_bool_strict(
1667    map: &BTreeMap<String, String>,
1668    key: &str,
1669) -> Result<Option<bool>, String> {
1670    match map.get(key) {
1671        None => Ok(None),
1672        Some(raw) => match raw.trim().to_ascii_lowercase().as_str() {
1673            "true" | "1" | "yes" | "y" => Ok(Some(true)),
1674            "false" | "0" | "no" | "n" => Ok(Some(false)),
1675            _ => Err(FormulaDslError::InvalidArgument {
1676                reason: format!(
1677                    "option `{key}={raw}` is not a boolean; \
1678                     expected one of true/false/yes/no/1/0"
1679                ),
1680            }
1681            .into()),
1682        },
1683    }
1684}
1685
1686pub fn strip_quotes(v: &str) -> &str {
1687    let b = v.as_bytes();
1688    if b.len() >= 2
1689        && ((b[0] == b'\'' && b[b.len() - 1] == b'\'') || (b[0] == b'"' && b[b.len() - 1] == b'"'))
1690    {
1691        &v[1..v.len() - 1]
1692    } else {
1693        v
1694    }
1695}
1696
1697// ---------------------------------------------------------------------------
1698// Sub-parsers for formula option blocks
1699// ---------------------------------------------------------------------------
1700
1701fn parse_linear_constraint_bounds(
1702    options: &BTreeMap<String, String>,
1703    raw: &str,
1704) -> Result<(Option<f64>, Option<f64>), String> {
1705    let min = parse_optional_f64_option_alias(options, &["min", "lower"], raw, "linear")?;
1706    let max = parse_optional_f64_option_alias(options, &["max", "upper"], raw, "linear")?;
1707    if let (Some(min), Some(max)) = (min, max)
1708        && (!min.is_finite() || !max.is_finite() || min > max)
1709    {
1710        return Err(FormulaDslError::InvalidArgument {
1711            reason: format!(
1712                "linear coefficient constraints require finite min <= max, got min={min}, max={max}: {raw}"
1713            ),
1714        }
1715        .into());
1716    }
1717    Ok((min, max))
1718}
1719
1720fn parse_required_f64_option(
1721    options: &BTreeMap<String, String>,
1722    key: &str,
1723    raw: &str,
1724) -> Result<f64, String> {
1725    let value = options
1726        .get(key)
1727        .ok_or_else(|| FormulaDslError::MalformedConfig {
1728            reason: format!("bounded() is missing required '{key}' argument: {raw}"),
1729        })?;
1730    value.parse::<f64>().map_err(|err| {
1731        FormulaDslError::InvalidArgument {
1732            reason: format!(
1733                "bounded() argument '{key}' must be a finite number, got '{}': {err}: {raw}",
1734                value
1735            ),
1736        }
1737        .into()
1738    })
1739}
1740
1741fn parse_optional_f64_option(
1742    options: &BTreeMap<String, String>,
1743    key: &str,
1744    raw: &str,
1745) -> Result<Option<f64>, String> {
1746    match options.get(key) {
1747        Some(value) => value.parse::<f64>().map(Some).map_err(|err| {
1748            FormulaDslError::InvalidArgument {
1749                reason: format!(
1750                    "bounded() argument '{key}' must be a finite number, got '{}': {err}: {raw}",
1751                    value
1752                ),
1753            }
1754            .into()
1755        }),
1756        None => Ok(None),
1757    }
1758}
1759
1760fn parse_optional_f64_option_alias(
1761    options: &BTreeMap<String, String>,
1762    keys: &[&str],
1763    raw: &str,
1764    fn_label: &str,
1765) -> Result<Option<f64>, String> {
1766    let mut found: Option<(&str, f64)> = None;
1767    for key in keys {
1768        if let Some(value) = options.get(*key) {
1769            let parsed = value
1770                .parse::<f64>()
1771                .map_err(|err| FormulaDslError::InvalidArgument {
1772                    reason: format!(
1773                        "{fn_label}() argument '{key}' must be a finite number, got '{}': {err}: {raw}",
1774                        value
1775                    ),
1776                })?;
1777            if found.is_some() {
1778                return Err(FormulaDslError::IncompatibleTerm {
1779                    reason: format!(
1780                        "{fn_label}() cannot specify both '{}' and '{}': {raw}",
1781                        found.expect("present").0,
1782                        key
1783                    ),
1784                }
1785                .into());
1786            }
1787            found = Some((key, parsed));
1788        }
1789    }
1790    Ok(found.map(|(_, v)| v))
1791}
1792
1793fn parse_linkwiggle_penalty_orders(raw: Option<&str>) -> Result<Vec<usize>, String> {
1794    let Some(raw) = raw.map(str::trim) else {
1795        return Ok(WigglePenaltyConfig::cubic_triple_operator_default().penalty_orders);
1796    };
1797    if raw.is_empty() {
1798        return Ok(WigglePenaltyConfig::cubic_triple_operator_default().penalty_orders);
1799    }
1800    let mut out = Vec::<usize>::new();
1801    for token in raw.split(',') {
1802        let t = token.trim().to_ascii_lowercase();
1803        if t.is_empty() {
1804            continue;
1805        }
1806        match t.as_str() {
1807            "all" => {
1808                out.extend([1, 2, 3]);
1809            }
1810            "slope" | "1" => out.push(1),
1811            "curvature" | "2" => out.push(2),
1812            "curvature-change" | "curvature_change" | "3" => out.push(3),
1813            _ => {
1814                return Err(FormulaDslError::InvalidArgument {
1815                    reason: format!(
1816                        "invalid linkwiggle penalty_order '{t}'; use all|slope|curvature|curvature-change or 1/2/3"
1817                    ),
1818                }
1819                .into());
1820            }
1821        }
1822    }
1823    if out.is_empty() {
1824        out.extend(WigglePenaltyConfig::cubic_triple_operator_default().penalty_orders);
1825    }
1826    out.sort_unstable();
1827    out.dedup();
1828    Ok(out)
1829}
1830
1831pub fn parse_linkwiggle_formulaspec(
1832    options: &BTreeMap<String, String>,
1833    raw: &str,
1834) -> Result<LinkWiggleFormulaSpec, String> {
1835    let allowed = [
1836        "degree",
1837        "internal_knots",
1838        "penalty_order",
1839        "double_penalty",
1840    ];
1841    let unknown = options
1842        .keys()
1843        .filter(|key| !allowed.contains(&key.as_str()))
1844        .cloned()
1845        .collect::<Vec<_>>();
1846    let term_name = raw.split('(').next().unwrap_or("linkwiggle");
1847    if !unknown.is_empty() {
1848        return Err(FormulaDslError::InvalidArgument {
1849            reason: format!(
1850                "{}() does not support option(s) {}: {raw}",
1851                term_name,
1852                unknown.join(", ")
1853            ),
1854        }
1855        .into());
1856    }
1857    let defaults = WigglePenaltyConfig::cubic_triple_operator_default();
1858    // Strict parsing: a present-but-unparseable value (`degree=abc`, `=-3`,
1859    // `=6.5`) must be rejected, not silently dropped and replaced by the
1860    // default as the lossy `option_usize`/`option_bool` readers would do.
1861    //
1862    // This parser is shared by *all* wiggle grammars: `linkwiggle` and
1863    // `timewiggle` (see `parse_formula`), the standard-model flexible-link
1864    // wiggle, and the marginal-slope score-warp / link-deviation routing.
1865    // The general monotone I-spline value basis (`monotone_wiggle_*` in
1866    // `families::gamlss`, used by `timewiggle` and the location-scale survival
1867    // path) honors arbitrary `degree >= 2`, while only the cubic-only
1868    // score-warp / link-deviation `DeviationRuntime` is restricted to 3.
1869    // A consumer-specific limit therefore must NOT be baked into this shared
1870    // parser — it is enforced at the routing layer that feeds the cubic-only
1871    // runtime (`deviation_block_config_from_formula_linkwiggle`). Here we only
1872    // enforce the universal lower bound that a polynomial degree is positive.
1873    let degree = option_usize_strict(options, "degree")?.unwrap_or(defaults.degree);
1874    if degree < 1 {
1875        return Err(FormulaDslError::InvalidArgument {
1876            reason: format!("{term_name}() requires degree >= 1: {raw}"),
1877        }
1878        .into());
1879    }
1880    let num_internal_knots =
1881        option_usize_strict(options, "internal_knots")?.unwrap_or(defaults.num_internal_knots);
1882    if num_internal_knots == 0 {
1883        return Err(FormulaDslError::InvalidArgument {
1884            reason: format!("{term_name}() requires internal_knots > 0: {raw}"),
1885        }
1886        .into());
1887    }
1888    let penalty_orders =
1889        parse_linkwiggle_penalty_orders(options.get("penalty_order").map(String::as_str))?;
1890    let double_penalty =
1891        option_bool_strict(options, "double_penalty")?.unwrap_or(defaults.double_penalty);
1892    Ok(LinkWiggleFormulaSpec {
1893        degree,
1894        num_internal_knots,
1895        penalty_orders,
1896        double_penalty,
1897    })
1898}
1899
1900fn parse_link_formulaspec(
1901    options: &BTreeMap<String, String>,
1902    raw: &str,
1903) -> Result<LinkFormulaSpec, String> {
1904    let link = options
1905        .get("type")
1906        .map(|s| s.trim().to_string())
1907        .ok_or_else(|| FormulaDslError::MalformedConfig {
1908            reason: format!("link() requires type=<link-name>: {raw}"),
1909        })?;
1910    if link.is_empty() {
1911        return Err(FormulaDslError::MalformedConfig {
1912            reason: format!("link() requires a non-empty type: {raw}"),
1913        }
1914        .into());
1915    }
1916    let mixture_rho = options.get("rho").map(|s| s.trim().to_string());
1917    let sas_init = options.get("sas_init").map(|s| s.trim().to_string());
1918    let beta_logistic_init = options
1919        .get("beta_logistic_init")
1920        .map(|s| s.trim().to_string());
1921    Ok(LinkFormulaSpec {
1922        link,
1923        mixture_rho,
1924        sas_init,
1925        beta_logistic_init,
1926    })
1927}
1928
1929fn parse_survival_formulaspec(
1930    options: &BTreeMap<String, String>,
1931    raw: &str,
1932) -> Result<SurvivalFormulaSpec, String> {
1933    if options.is_empty() {
1934        return Err(FormulaDslError::MalformedConfig {
1935            reason: format!(
1936                "survmodel() requires at least one named option (e.g., spec=..., distribution=...): {raw}"
1937            ),
1938        }
1939        .into());
1940    }
1941    Ok(SurvivalFormulaSpec {
1942        spec: options.get("spec").map(|s| s.trim().to_string()),
1943        survival_distribution: options.get("distribution").map(|s| s.trim().to_string()),
1944    })
1945}
1946
1947fn parse_bounded_priorspec(
1948    options: &BTreeMap<String, String>,
1949    min: f64,
1950    max: f64,
1951    raw: &str,
1952) -> Result<BoundedCoefficientPriorSpec, String> {
1953    let prior_mode = options.get("prior").map(|s| s.to_ascii_lowercase());
1954    let pull = options.get("pull").map(|s| s.to_ascii_lowercase());
1955    let target = parse_optional_f64_option(options, "target", raw)?;
1956    let strength = parse_optional_f64_option(options, "strength", raw)?;
1957
1958    let target_mode = target.is_some() || strength.is_some();
1959    if prior_mode.is_some() && pull.is_some() {
1960        return Err(FormulaDslError::IncompatibleTerm {
1961            reason: format!("bounded() cannot combine prior=... with pull=...: {raw}"),
1962        }
1963        .into());
1964    }
1965    if prior_mode.is_some() && target_mode {
1966        return Err(FormulaDslError::IncompatibleTerm {
1967            reason: format!("bounded() cannot combine prior=... with target/strength: {raw}"),
1968        }
1969        .into());
1970    }
1971    if pull.is_some() && target_mode {
1972        return Err(FormulaDslError::IncompatibleTerm {
1973            reason: format!("bounded() cannot combine pull=... with target/strength: {raw}"),
1974        }
1975        .into());
1976    }
1977
1978    if let Some(priorname) = prior_mode {
1979        return match priorname.as_str() {
1980            "none" => Ok(BoundedCoefficientPriorSpec::None),
1981            "uniform" | "log-jacobian" | "log_jacobian" | "jacobian" => {
1982                Ok(BoundedCoefficientPriorSpec::Uniform)
1983            }
1984            "center" => Ok(BoundedCoefficientPriorSpec::Beta { a: 2.0, b: 2.0 }),
1985            _ => Err(FormulaDslError::InvalidArgument {
1986                reason: format!(
1987                    "bounded() prior must currently be one of none|uniform|log-jacobian|center, got '{}': {raw}",
1988                    priorname
1989                ),
1990            }
1991            .into()),
1992        };
1993    }
1994
1995    if let Some(pull_mode) = pull {
1996        return match pull_mode.as_str() {
1997            "uniform" | "log-jacobian" | "log_jacobian" | "jacobian" => {
1998                Ok(BoundedCoefficientPriorSpec::Uniform)
1999            }
2000            "center" => Ok(BoundedCoefficientPriorSpec::Beta { a: 2.0, b: 2.0 }),
2001            _ => Err(FormulaDslError::InvalidArgument {
2002                reason: format!(
2003                    "bounded() pull must currently be 'uniform'/'log-jacobian' or 'center', got '{}': {raw}",
2004                    pull_mode
2005                ),
2006            }
2007            .into()),
2008        };
2009    }
2010
2011    if target_mode {
2012        let targetvalue = target.ok_or_else(|| FormulaDslError::MalformedConfig {
2013            reason: format!("bounded() target is required with strength: {raw}"),
2014        })?;
2015        let strengthvalue = strength.ok_or_else(|| FormulaDslError::MalformedConfig {
2016            reason: format!("bounded() strength is required with target: {raw}"),
2017        })?;
2018        if !(min < targetvalue && targetvalue < max) {
2019            return Err(FormulaDslError::InvalidArgument {
2020                reason: format!("bounded() target must lie strictly inside ({min}, {max}): {raw}"),
2021            }
2022            .into());
2023        }
2024        if !strengthvalue.is_finite() || strengthvalue <= 0.0 {
2025            return Err(FormulaDslError::InvalidArgument {
2026                reason: format!("bounded() strength must be finite and > 0: {raw}"),
2027            }
2028            .into());
2029        }
2030        let z = (targetvalue - min) / (max - min);
2031        let a = 1.0 + strengthvalue * z;
2032        let b = 1.0 + strengthvalue * (1.0 - z);
2033        return Ok(BoundedCoefficientPriorSpec::Beta { a, b });
2034    }
2035
2036    Ok(BoundedCoefficientPriorSpec::None)
2037}
2038
2039// ---------------------------------------------------------------------------
2040// Top-level formula and term parsers
2041// ---------------------------------------------------------------------------
2042
2043pub fn formula_rhs_text(formula: &str) -> Result<String, String> {
2044    let parsed = parse_formula_dsl(formula)?;
2045    if parsed.rhs_terms.is_empty() {
2046        return Err(FormulaDslError::ParseError {
2047            reason: "formula right-hand side cannot be empty".to_string(),
2048        }
2049        .into());
2050    }
2051    Ok(parsed.rhs_terms.join(" + "))
2052}
2053
2054/// Parsed Surv(...) response specification.
2055///
2056/// `entry` is `None` for the 2-arg right-censored shorthand
2057/// `Surv(time, event)`, which matches the R survival/mgcv default: every
2058/// subject has entry time zero. Callers materialize a zero entry column
2059/// when this is `None`.
2060pub fn parse_surv_response(
2061    lhs: &str,
2062) -> Result<Option<(Option<String>, String, String)>, FormulaDslError> {
2063    let trimmed = lhs.trim();
2064    let call = match parse_function_call(trimmed) {
2065        Ok(call) => call,
2066        Err(_) => return Ok(None),
2067    };
2068    if !call.name.eq_ignore_ascii_case("surv") {
2069        return Ok(None);
2070    }
2071    let vars = call
2072        .args
2073        .iter()
2074        .filter_map(|arg| match arg {
2075            CallArgSpec::Positional(v) => Some(v.trim().to_string()),
2076            CallArgSpec::Named { .. } => None,
2077        })
2078        .filter(|s| !s.is_empty())
2079        .collect::<Vec<_>>();
2080    match vars.len() {
2081        // Right-censored shorthand: Surv(time, event) ≡ Surv(0, time, event)
2082        // with a synthetic zero entry column. This matches R's
2083        // `survival::Surv(time, event)` default for left-truncation-free data.
2084        2 => Ok(Some((None, vars[0].clone(), vars[1].clone()))),
2085        3 => Ok(Some((
2086            Some(vars[0].clone()),
2087            vars[1].clone(),
2088            vars[2].clone(),
2089        ))),
2090        n => Err(FormulaDslError::InvalidArgument {
2091            reason: format!(
2092                "Surv(...) expects either Surv(time, event) (right-censored) or \
2093                 Surv(entry, exit, event) (left-truncated); got {n} columns"
2094            ),
2095        }),
2096    }
2097}
2098
2099/// Parsed `SurvInterval(L, R, event)` interval-censored response.
2100///
2101/// Returns `Some((left_col, right_col, event_col))` when the left-hand side is a
2102/// `SurvInterval(...)` call, `None` otherwise (so a plain `Surv(...)` or a bare
2103/// column response falls through to the other response parsers).
2104///
2105/// Interval censoring observes only a bracket `T ∈ (L, R]` — the exact event
2106/// time is never seen — and its row contribution is the survival-mass difference
2107/// `log[S(L) − S(R)]`, distinct from both the exact-event point density and the
2108/// single-sided right-censored survival. A *dedicated call name* (rather than
2109/// overloading the 3-argument `Surv(entry, exit, event)` delayed-entry form,
2110/// which is also 3-argument and semantically incompatible) is the unambiguous
2111/// DSL spelling: it mirrors flexsurv's `Surv(L, R, type="interval2")` intent
2112/// without colliding with the existing left-truncation grammar.
2113pub fn parse_surv_interval_response(
2114    lhs: &str,
2115) -> Result<Option<(String, String, String)>, FormulaDslError> {
2116    let trimmed = lhs.trim();
2117    let call = match parse_function_call(trimmed) {
2118        Ok(call) => call,
2119        Err(_) => return Ok(None),
2120    };
2121    if !call.name.eq_ignore_ascii_case("survinterval") {
2122        return Ok(None);
2123    }
2124    let vars = call
2125        .args
2126        .iter()
2127        .filter_map(|arg| match arg {
2128            CallArgSpec::Positional(v) => Some(v.trim().to_string()),
2129            CallArgSpec::Named { .. } => None,
2130        })
2131        .filter(|s| !s.is_empty())
2132        .collect::<Vec<_>>();
2133    match vars.len() {
2134        3 => Ok(Some((vars[0].clone(), vars[1].clone(), vars[2].clone()))),
2135        n => Err(FormulaDslError::InvalidArgument {
2136            reason: format!(
2137                "SurvInterval(...) expects SurvInterval(L, R, event) (interval-censored, the \
2138                 observed bracket T ∈ (L, R]); got {n} columns"
2139            ),
2140        }),
2141    }
2142}
2143
2144fn top_level_formula_separator(input: &str) -> Result<Option<usize>, String> {
2145    let mut depth = 0_i32;
2146    let mut in_single = false;
2147    let mut in_double = false;
2148
2149    for (idx, ch) in input.char_indices() {
2150        match ch {
2151            '\'' if !in_double => in_single = !in_single,
2152            '"' if !in_single => in_double = !in_double,
2153            '(' | '[' | '{' if !in_single && !in_double => depth += 1,
2154            ')' | ']' | '}' if !in_single && !in_double && depth > 0 => depth -= 1,
2155            '~' if !in_single && !in_double && depth == 0 => return Ok(Some(idx)),
2156            _ => {}
2157        }
2158    }
2159
2160    if in_single || in_double || depth != 0 {
2161        return Err(FormulaDslError::ParseError {
2162            reason: "invalid auxiliary formula syntax: unbalanced parentheses or quotes"
2163                .to_string(),
2164        }
2165        .into());
2166    }
2167    Ok(None)
2168}
2169
2170pub fn parse_matching_auxiliary_formula(
2171    formula: &str,
2172    response: &str,
2173    flag_name: &str,
2174) -> Result<(String, ParsedFormula), FormulaDslError> {
2175    let rhs = formula.trim();
2176    if top_level_formula_separator(rhs)?.is_some() {
2177        return Err(FormulaDslError::InvalidArgument {
2178            reason: format!(
2179                "{flag_name} expects only the terms after '~', not a full 'response ~ terms' formula; use {flag_name} 's(x)' instead of {flag_name} 'y ~ s(x)' (or pass '1' for an intercept-only noise model)"
2180            ),
2181        });
2182    }
2183    let parsed_formula = parse_formula(&format!("{response} ~ {rhs}"))?;
2184    Ok((rhs.to_string(), parsed_formula))
2185}
2186
2187pub fn validate_auxiliary_formula_controls(
2188    parsed_formula: &ParsedFormula,
2189    flag_name: &str,
2190) -> Result<(), String> {
2191    if parsed_formula.linkwiggle.is_some() {
2192        return Err(FormulaDslError::IncompatibleTerm {
2193            reason: format!(
2194                "linkwiggle(...) is only supported in the main formula, not {flag_name}"
2195            ),
2196        }
2197        .into());
2198    }
2199    if parsed_formula.timewiggle.is_some() {
2200        return Err(FormulaDslError::IncompatibleTerm {
2201            reason: format!(
2202                "timewiggle(...) is only supported in the main survival formula, not {flag_name}"
2203            ),
2204        }
2205        .into());
2206    }
2207    if parsed_formula.linkspec.is_some() {
2208        return Err(FormulaDslError::IncompatibleTerm {
2209            reason: format!("link(...) is only supported in the main formula, not {flag_name}"),
2210        }
2211        .into());
2212    }
2213    if parsed_formula.survivalspec.is_some() {
2214        return Err(FormulaDslError::IncompatibleTerm {
2215            reason: format!(
2216                "survmodel(...) is only supported in the main survival formula, not {flag_name}"
2217            ),
2218        }
2219        .into());
2220    }
2221    if !parsed_formula.logslope_surfaces.is_empty() && flag_name != "--logslope-formula" {
2222        return Err(FormulaDslError::IncompatibleTerm {
2223            reason: format!(
2224                "logslope(...) is only supported in --logslope-formula, not {flag_name}"
2225            ),
2226        }
2227        .into());
2228    }
2229    Ok::<(), _>(())
2230}
2231
2232pub fn parse_formula(formula: &str) -> Result<ParsedFormula, FormulaDslError> {
2233    let parsed_dsl =
2234        parse_formula_dsl(formula).map_err(|reason| FormulaDslError::ParseError { reason })?;
2235    let lhs = parsed_dsl.response_expr.trim();
2236    if lhs.is_empty() {
2237        return Err(FormulaDslError::ParseError {
2238            reason: "formula response (left-hand side) cannot be empty".to_string(),
2239        });
2240    }
2241    let mut terms = Vec::<ParsedTerm>::new();
2242    let mut linkwiggle: Option<LinkWiggleFormulaSpec> = None;
2243    let mut timewiggle: Option<LinkWiggleFormulaSpec> = None;
2244    let mut linkspec: Option<LinkFormulaSpec> = None;
2245    let mut survivalspec: Option<SurvivalFormulaSpec> = None;
2246    let mut logslope_surfaces = Vec::<LogSlopeSurfaceSpec>::new();
2247    // Track seen-term-keys so we can reject exact duplicates like
2248    // `y ~ smooth(x) + smooth(x)` upfront — without this the duplicate
2249    // produces a rank-deficient design and the user has no idea why their
2250    // fit is over-parameterized.
2251    let mut seen_term_keys: std::collections::BTreeSet<String> = std::collections::BTreeSet::new();
2252    let mut expanded_terms = Vec::<String>::new();
2253    for raw in parsed_dsl.rhs_terms {
2254        let trimmed = raw.trim();
2255        if trimmed.is_empty() {
2256            expanded_terms.push(String::new());
2257            continue;
2258        }
2259        // Single function-call terms (smooths, group(), etc.) are opaque to
2260        // WR expansion; pass them through verbatim so parse_term sees the
2261        // exact source string. Bare identifiers and any term mentioning
2262        // `:`, `*`, `/`, `^` are routed through the AST-driven WR expander.
2263        let is_call = parse_function_call(trimmed).is_ok();
2264        let needs_expansion = !is_call
2265            && trimmed
2266                .chars()
2267                .scan(0i32, |depth, ch| {
2268                    let d_before = *depth;
2269                    match ch {
2270                        '(' | '[' | '{' => *depth += 1,
2271                        ')' | ']' | '}' if *depth > 0 => *depth -= 1,
2272                        _ => {}
2273                    }
2274                    Some((d_before, ch))
2275                })
2276                .any(|(d, ch)| d == 0 && matches!(ch, ':' | '*' | '/' | '^'));
2277        if needs_expansion {
2278            for atoms in
2279                expand_wr_term(trimmed).map_err(|reason| FormulaDslError::ParseError { reason })?
2280            {
2281                if atoms.is_empty() {
2282                    continue;
2283                }
2284                expanded_terms.push(atoms.join(":"));
2285            }
2286        } else {
2287            expanded_terms.push(trimmed.to_string());
2288        }
2289    }
2290
2291    for raw in expanded_terms {
2292        let t = raw.trim();
2293        if t.is_empty() || t == "1" {
2294            continue;
2295        }
2296        if t == "0" || t == "-1" {
2297            return Err(FormulaDslError::IncompatibleTerm {
2298                reason: "formula terms '0'/'-1' (intercept removal) are not supported yet"
2299                    .to_string(),
2300            });
2301        }
2302        // Normalize whitespace so `smooth(x)` and `smooth( x )` match,
2303        // but preserve whitespace inside string literals so that
2304        // `bs="a b"` and `bs="ab"` do not collide.
2305        let key: String = {
2306            let mut acc = String::with_capacity(t.len());
2307            let mut in_single = false;
2308            let mut in_double = false;
2309            for ch in t.chars() {
2310                match ch {
2311                    '\'' if !in_double => {
2312                        in_single = !in_single;
2313                        acc.push(ch);
2314                    }
2315                    '"' if !in_single => {
2316                        in_double = !in_double;
2317                        acc.push(ch);
2318                    }
2319                    c if c.is_whitespace() && !in_single && !in_double => {}
2320                    _ => acc.push(ch),
2321                }
2322            }
2323            acc
2324        };
2325        if !seen_term_keys.insert(key.clone()) {
2326            return Err(FormulaDslError::IncompatibleTerm {
2327                reason: format!(
2328                    "formula `{formula}` lists term `{t}` more than once. \
2329                     Duplicate terms produce a rank-deficient design; \
2330                     keep one copy or differentiate them (e.g. distinct k=, bs= options)."
2331                ),
2332            });
2333        }
2334        match parse_term(t)? {
2335            ParsedTerm::LinkWiggle { options } => {
2336                if linkwiggle.is_some() {
2337                    return Err(FormulaDslError::IncompatibleTerm {
2338                        reason: "formula can include at most one linkwiggle(...) term".to_string(),
2339                    });
2340                }
2341                linkwiggle = Some(parse_linkwiggle_formulaspec(&options, t)?);
2342            }
2343            ParsedTerm::TimeWiggle { options } => {
2344                if timewiggle.is_some() {
2345                    return Err(FormulaDslError::IncompatibleTerm {
2346                        reason: "formula can include at most one timewiggle(...) term".to_string(),
2347                    });
2348                }
2349                timewiggle = Some(parse_linkwiggle_formulaspec(&options, t)?);
2350            }
2351            ParsedTerm::LinkConfig { options } => {
2352                if linkspec.is_some() {
2353                    return Err(FormulaDslError::IncompatibleTerm {
2354                        reason: "formula can include at most one link(...) term".to_string(),
2355                    });
2356                }
2357                linkspec = Some(parse_link_formulaspec(&options, t)?);
2358            }
2359            ParsedTerm::SurvivalConfig { options } => {
2360                if survivalspec.is_some() {
2361                    return Err(FormulaDslError::IncompatibleTerm {
2362                        reason: "formula can include at most one survmodel(...) term".to_string(),
2363                    });
2364                }
2365                survivalspec = Some(parse_survival_formulaspec(&options, t)?);
2366            }
2367            ParsedTerm::LogSlopeSurface { z_column, terms } => {
2368                logslope_surfaces.push(LogSlopeSurfaceSpec { z_column, terms });
2369            }
2370            other => terms.push(other),
2371        }
2372    }
2373    // Reject self-referential formulas like `y ~ smooth(y)` or `y ~ y`: the
2374    // response is its own predictor, which is a trivial identity fit and
2375    // almost certainly a user mistake. Only flag the simple-identifier case
2376    // (so Surv(entry, exit, event) ~ smooth(time) is left alone — the
2377    // response is the Surv triple, not the bare "time" column).
2378    if lhs.chars().all(|c| c.is_alphanumeric() || c == '_')
2379        && !lhs.is_empty()
2380        && parsed_terms_reference_column(&terms, lhs)
2381    {
2382        return Err(FormulaDslError::IncompatibleTerm {
2383            reason: format!(
2384                "formula `{formula}` uses response column `{lhs}` as its own predictor. \
2385                 This fits y as a function of itself and is almost certainly a typo. \
2386                 Drop the term that mentions `{lhs}` from the right-hand side."
2387            ),
2388        });
2389    }
2390    Ok(ParsedFormula {
2391        response: lhs.to_string(),
2392        terms,
2393        logslope_surfaces,
2394        linkwiggle,
2395        timewiggle,
2396        linkspec,
2397        survivalspec,
2398    })
2399}
2400
2401pub fn parse_term(raw: &str) -> Result<ParsedTerm, String> {
2402    fn split_call_args(call: &FunctionCallSpec) -> (Vec<String>, BTreeMap<String, String>) {
2403        let mut vars = Vec::<String>::new();
2404        let mut options = BTreeMap::<String, String>::new();
2405        for arg in &call.args {
2406            match arg {
2407                CallArgSpec::Positional(v) => vars.push(v.trim().to_string()),
2408                CallArgSpec::Named { key, value } => {
2409                    options.insert(key.to_ascii_lowercase(), strip_quotes(value).to_string());
2410                }
2411            }
2412        }
2413        (vars, options)
2414    }
2415
2416    // Wilkinson-Rogers `:` interaction term. The expander in `parse_formula`
2417    // produces `a:b[:c...]` for these; parse_term is also reached directly
2418    // from tests, so handle the syntax here as well.
2419    if raw.contains(':')
2420        && !raw.contains('(')
2421        && raw.split(':').all(|piece| is_exact_ident(piece.trim()))
2422    {
2423        let vars: Vec<String> = raw
2424            .split(':')
2425            .map(|piece| piece.trim().to_string())
2426            .collect();
2427        if vars.len() >= 2 {
2428            let mut sorted = vars.clone();
2429            sorted.sort();
2430            sorted.dedup();
2431            if sorted.len() != vars.len() {
2432                return Err(FormulaDslError::IncompatibleTerm {
2433                    reason: format!(
2434                        "interaction term `{raw}` references the same variable more than once"
2435                    ),
2436                }
2437                .into());
2438            }
2439            return Ok(ParsedTerm::Interaction { vars: sorted });
2440        }
2441    }
2442
2443    let call = parse_function_call(raw).ok();
2444    if let Some(call) = call {
2445        let name = call.name.to_ascii_lowercase();
2446        let (vars, mut options) = split_call_args(&call);
2447        match name.as_str() {
2448            "constrain" | "constraint" | "box" => {
2449                if vars.len() != 1 {
2450                    return Err(FormulaDslError::InvalidArgument {
2451                        reason: format!(
2452                            "constrain()/constraint()/box() expects exactly one variable: {raw}"
2453                        ),
2454                    }
2455                    .into());
2456                }
2457                validate_known_term_options(
2458                    "constrain",
2459                    &options,
2460                    &["min", "lower", "max", "upper"],
2461                    raw,
2462                )?;
2463                let (coefficient_min, coefficient_max) =
2464                    parse_linear_constraint_bounds(&options, raw)?;
2465                if coefficient_min.is_none() && coefficient_max.is_none() {
2466                    return Err(FormulaDslError::MalformedConfig {
2467                        reason: format!(
2468                            "constrain()/constraint()/box() requires at least one of min/lower/max/upper: {raw}"
2469                        ),
2470                    }
2471                    .into());
2472                }
2473                return Ok(ParsedTerm::Linear {
2474                    name: vars[0].clone(),
2475                    explicit: true,
2476                    coefficient_min,
2477                    coefficient_max,
2478                });
2479            }
2480            "nonnegative" | "nonnegative_coef" => {
2481                if vars.len() != 1 {
2482                    return Err(FormulaDslError::InvalidArgument {
2483                        reason: format!("nonnegative() expects exactly one variable: {raw}"),
2484                    }
2485                    .into());
2486                }
2487                validate_known_term_options("nonnegative", &options, &[], raw)?;
2488                return Ok(ParsedTerm::Linear {
2489                    name: vars[0].clone(),
2490                    explicit: true,
2491                    coefficient_min: Some(0.0),
2492                    coefficient_max: None,
2493                });
2494            }
2495            "nonpositive" | "nonpositive_coef" => {
2496                if vars.len() != 1 {
2497                    return Err(FormulaDslError::InvalidArgument {
2498                        reason: format!("nonpositive() expects exactly one variable: {raw}"),
2499                    }
2500                    .into());
2501                }
2502                validate_known_term_options("nonpositive", &options, &[], raw)?;
2503                return Ok(ParsedTerm::Linear {
2504                    name: vars[0].clone(),
2505                    explicit: true,
2506                    coefficient_min: None,
2507                    coefficient_max: Some(0.0),
2508                });
2509            }
2510            "bounded" => {
2511                if vars.len() != 1 {
2512                    return Err(FormulaDslError::InvalidArgument {
2513                        reason: format!("bounded() expects exactly one variable: {raw}"),
2514                    }
2515                    .into());
2516                }
2517                validate_known_term_options(
2518                    "bounded",
2519                    &options,
2520                    &["min", "max", "prior", "pull", "target", "strength"],
2521                    raw,
2522                )?;
2523                let min = parse_required_f64_option(&options, "min", raw)?;
2524                let max = parse_required_f64_option(&options, "max", raw)?;
2525                if !min.is_finite() || !max.is_finite() || min >= max {
2526                    return Err(FormulaDslError::InvalidArgument {
2527                        reason: format!(
2528                            "bounded() requires finite min < max, got min={min}, max={max}: {raw}"
2529                        ),
2530                    }
2531                    .into());
2532                }
2533                let prior = parse_bounded_priorspec(&options, min, max, raw)?;
2534                return Ok(ParsedTerm::BoundedLinear {
2535                    name: vars[0].clone(),
2536                    min,
2537                    max,
2538                    prior,
2539                });
2540            }
2541            "group" | "re" | "factor" => {
2542                if vars.len() != 1 {
2543                    return Err(FormulaDslError::InvalidArgument {
2544                        reason: format!(
2545                            "{name}() expects exactly one variable, got '{}': {raw}",
2546                            vars.join(",")
2547                        ),
2548                    }
2549                    .into());
2550                }
2551                return Ok(ParsedTerm::RandomEffect {
2552                    name: vars[0].clone(),
2553                });
2554            }
2555            "tensor" | "interaction" | "te" => {
2556                if vars.len() < 2 {
2557                    return Err(FormulaDslError::InvalidArgument {
2558                        reason: format!(
2559                            "tensor()/interaction()/te() requires at least two variables: {raw}"
2560                        ),
2561                    }
2562                    .into());
2563                }
2564                return Ok(ParsedTerm::Smooth {
2565                    label: raw.to_string(),
2566                    vars,
2567                    kind: SmoothKind::Te,
2568                    options,
2569                });
2570            }
2571            "t2" => {
2572                if vars.len() < 2 {
2573                    return Err(FormulaDslError::InvalidArgument {
2574                        reason: format!("t2() requires at least two variables: {raw}"),
2575                    }
2576                    .into());
2577                }
2578                return Ok(ParsedTerm::Smooth {
2579                    label: raw.to_string(),
2580                    vars,
2581                    kind: SmoothKind::T2,
2582                    options,
2583                });
2584            }
2585            "ti" => {
2586                // Tensor interaction smooth (mgcv `ti`): structurally a
2587                // tensor-product smooth, but the marginal main effects are
2588                // excluded so only the pure interaction is modeled. Shares the
2589                // tensor materialization path with `te`; the distinct
2590                // `SmoothKind::Ti` drives per-margin sum-to-zero
2591                // identifiability in the term builder.
2592                if vars.len() < 2 {
2593                    return Err(FormulaDslError::InvalidArgument {
2594                        reason: format!("ti() requires at least two variables: {raw}"),
2595                    }
2596                    .into());
2597                }
2598                return Ok(ParsedTerm::Smooth {
2599                    label: raw.to_string(),
2600                    vars,
2601                    kind: SmoothKind::Ti,
2602                    options,
2603                });
2604            }
2605            "fs" | "sz" => {
2606                if vars.len() != 2 {
2607                    return Err(format!("{}() expects exactly two variables: {raw}", name));
2608                }
2609                options.insert("bs".to_string(), name.clone());
2610                return Ok(ParsedTerm::Smooth {
2611                    label: raw.to_string(),
2612                    vars,
2613                    kind: SmoothKind::S,
2614                    options,
2615                });
2616            }
2617            "thinplate" | "thin_plate" | "tps" => {
2618                if vars.len() < 2 {
2619                    return Err(FormulaDslError::InvalidArgument {
2620                        reason: format!(
2621                            "thinplate()/thin_plate()/tps() requires at least two variables: {raw}"
2622                        ),
2623                    }
2624                    .into());
2625                }
2626                options.insert("type".to_string(), "tps".to_string());
2627                return Ok(ParsedTerm::Smooth {
2628                    label: raw.to_string(),
2629                    vars,
2630                    kind: SmoothKind::S,
2631                    options,
2632                });
2633            }
2634            "smooth" | "s" | "cyclic" | "periodic" | "cc" | "cp" => {
2635                if vars.is_empty() {
2636                    return Err(FormulaDslError::InvalidArgument {
2637                        reason: format!("smooth()/s() requires at least one variable: {raw}"),
2638                    }
2639                    .into());
2640                }
2641                // mgcv idiom: `s(g, bs='re')` with a single variable is a
2642                // random intercept on the factor `g`. Route it to the
2643                // dedicated random-effect machinery (which expects a single
2644                // categorical column) rather than to the factor-smooth path
2645                // (which requires a numeric companion).
2646                let bs_is_re = options
2647                    .get("bs")
2648                    .or_else(|| options.get("type"))
2649                    .map(|v| {
2650                        v.trim()
2651                            .trim_matches(|c| c == '\'' || c == '"')
2652                            .to_ascii_lowercase()
2653                    })
2654                    .as_deref()
2655                    == Some("re");
2656                if bs_is_re && vars.len() == 1 {
2657                    return Ok(ParsedTerm::RandomEffect {
2658                        name: vars[0].clone(),
2659                    });
2660                }
2661                if matches!(name.as_str(), "cyclic" | "periodic" | "cc" | "cp") {
2662                    options.insert("type".to_string(), "cyclic".to_string());
2663                }
2664                if matches!(name.as_str(), "fs" | "sz") {
2665                    options.insert("bs".to_string(), name.clone());
2666                }
2667                return Ok(ParsedTerm::Smooth {
2668                    label: raw.to_string(),
2669                    vars,
2670                    kind: SmoothKind::S,
2671                    options,
2672                });
2673            }
2674            "sphere" | "sos" | "spherical" | "s2" => {
2675                // `s2()` is an alias for the intrinsic S² (sphere) smooth, just
2676                // like `sphere()`/`sos()`/`spherical()`. All four share the
2677                // Wahba/harmonic sphere basis, so they must dispatch through
2678                // the identical `type=sphere` route; otherwise `s2()` would
2679                // silently fall back to a generic Euclidean 2-D smooth over
2680                // (lat, lon) and diverge in the spatial-kappa optimizer.
2681                if vars.len() != 2 {
2682                    return Err(FormulaDslError::InvalidArgument {
2683                        reason: format!(
2684                            "{name}() expects exactly two variables: latitude and longitude; got {} in {raw}",
2685                            vars.len()
2686                        ),
2687                    }
2688                    .into());
2689                }
2690                options.insert("type".to_string(), "sphere".to_string());
2691                return Ok(ParsedTerm::Smooth {
2692                    label: raw.to_string(),
2693                    vars,
2694                    kind: SmoothKind::S,
2695                    options,
2696                });
2697            }
2698            "mjs" | "measurejet" | "measure_jet" | "web" => {
2699                // Measure-jet spline smooth (`basis::measure_jet_smooth` docs)
2700                // for responses varying along an unknown low-dimensional set
2701                // inside a higher-dimensional ambient space. All aliases
2702                // dispatch through the identical `type=measurejet` route,
2703                // mirroring the sphere/curvature alias rule.
2704                if vars.is_empty() {
2705                    return Err(FormulaDslError::InvalidArgument {
2706                        reason: format!("{name}() requires at least one variable: {raw}"),
2707                    }
2708                    .into());
2709                }
2710                options.insert("type".to_string(), "measurejet".to_string());
2711                return Ok(ParsedTerm::Smooth {
2712                    label: raw.to_string(),
2713                    vars,
2714                    kind: SmoothKind::S,
2715                    options,
2716                });
2717            }
2718            "curv" | "curvature" | "constant_curvature" | "mkappa" => {
2719                // Constant-curvature (M_κ) geodesic-kernel smooth (#944): the
2720                // κ-generic sibling of sphere()/s2(), interpolating
2721                // S^d → ℝ^d → H^d through `kappa=` (default 0 = flat). All
2722                // four aliases must dispatch through the identical
2723                // `type=curvature` route, mirroring the sphere alias rule.
2724                if vars.is_empty() {
2725                    return Err(FormulaDslError::InvalidArgument {
2726                        reason: format!("{name}() requires at least one variable: {raw}"),
2727                    }
2728                    .into());
2729                }
2730                options.insert("type".to_string(), "curvature".to_string());
2731                return Ok(ParsedTerm::Smooth {
2732                    label: raw.to_string(),
2733                    vars,
2734                    kind: SmoothKind::S,
2735                    options,
2736                });
2737            }
2738            "matern" => {
2739                if vars.is_empty() {
2740                    return Err(FormulaDslError::InvalidArgument {
2741                        reason: format!("matern() requires at least one variable: {raw}"),
2742                    }
2743                    .into());
2744                }
2745                options.insert("type".to_string(), "matern".to_string());
2746                return Ok(ParsedTerm::Smooth {
2747                    label: raw.to_string(),
2748                    vars,
2749                    kind: SmoothKind::S,
2750                    options,
2751                });
2752            }
2753            "duchon" => {
2754                if vars.is_empty() {
2755                    return Err(FormulaDslError::InvalidArgument {
2756                        reason: format!("duchon() requires at least one variable: {raw}"),
2757                    }
2758                    .into());
2759                }
2760                if option_bool(&options, "cyclic").unwrap_or(false)
2761                    || option_bool(&options, "periodic").unwrap_or(false)
2762                {
2763                    options.insert("cyclic".to_string(), "true".to_string());
2764                }
2765                options.insert("type".to_string(), "duchon".to_string());
2766                return Ok(ParsedTerm::Smooth {
2767                    label: raw.to_string(),
2768                    vars,
2769                    kind: SmoothKind::S,
2770                    options,
2771                });
2772            }
2773            "pca" => {
2774                if vars.is_empty() {
2775                    return Err(FormulaDslError::InvalidArgument {
2776                        reason: format!("pca() requires at least one variable: {raw}"),
2777                    }
2778                    .into());
2779                }
2780                options.insert("type".to_string(), "pca".to_string());
2781                return Ok(ParsedTerm::Smooth {
2782                    label: raw.to_string(),
2783                    vars,
2784                    kind: SmoothKind::S,
2785                    options,
2786                });
2787            }
2788            "linkwiggle" => {
2789                if !vars.is_empty() {
2790                    return Err(FormulaDslError::InvalidArgument {
2791                        reason: format!(
2792                            "linkwiggle() takes named options only; positional args are not supported: {raw}"
2793                        ),
2794                    }
2795                    .into());
2796                }
2797                return Ok(ParsedTerm::LinkWiggle { options });
2798            }
2799            "timewiggle" => {
2800                if !vars.is_empty() {
2801                    return Err(FormulaDslError::InvalidArgument {
2802                        reason: format!(
2803                            "timewiggle() takes named options only; positional args are not supported: {raw}"
2804                        ),
2805                    }
2806                    .into());
2807                }
2808                return Ok(ParsedTerm::TimeWiggle { options });
2809            }
2810            "link" => {
2811                if !vars.is_empty() {
2812                    return Err(FormulaDslError::InvalidArgument {
2813                        reason: format!(
2814                            "link() takes named options only; positional args are not supported: {raw}"
2815                        ),
2816                    }
2817                    .into());
2818                }
2819                return Ok(ParsedTerm::LinkConfig { options });
2820            }
2821            "survmodel" => {
2822                if !vars.is_empty() {
2823                    return Err(FormulaDslError::InvalidArgument {
2824                        reason: format!(
2825                            "survmodel() takes named options only; positional args are not supported: {raw}"
2826                        ),
2827                    }
2828                    .into());
2829                }
2830                return Ok(ParsedTerm::SurvivalConfig { options });
2831            }
2832            "logslope" | "log_slope" | "log_slope_surface" => {
2833                validate_known_term_options("logslope", &options, &[], raw)?;
2834                if vars.len() < 2 {
2835                    return Err(FormulaDslError::InvalidArgument {
2836                        reason: format!(
2837                            "logslope() expects a z column followed by one or more RHS terms; add one logslope(z, ...) declaration per vector-z coordinate: {raw}"
2838                        ),
2839                    }
2840                    .into());
2841                }
2842                let z_column = vars[0].trim();
2843                if !is_exact_ident(z_column) {
2844                    return Err(FormulaDslError::InvalidArgument {
2845                        reason: format!(
2846                            "logslope() z column must be a bare column name, got `{z_column}` in {raw}"
2847                        ),
2848                    }
2849                    .into());
2850                }
2851                let rhs = vars[1..].join(" + ");
2852                let parsed = parse_formula(&format!("__logslope__ ~ {rhs}"))?;
2853                if !parsed.logslope_surfaces.is_empty() {
2854                    return Err(FormulaDslError::IncompatibleTerm {
2855                        reason: format!(
2856                            "logslope() declarations cannot be nested inside another logslope(): {raw}"
2857                        ),
2858                    }
2859                    .into());
2860                }
2861                validate_auxiliary_formula_controls(&parsed, "logslope()")?;
2862                return Ok(ParsedTerm::LogSlopeSurface {
2863                    z_column: z_column.to_string(),
2864                    terms: parsed.terms,
2865                });
2866            }
2867            "linear" => {
2868                if vars.len() != 1 {
2869                    return Err(FormulaDslError::InvalidArgument {
2870                        reason: format!("linear() expects exactly one variable: {raw}"),
2871                    }
2872                    .into());
2873                }
2874                validate_known_term_options(
2875                    "linear",
2876                    &options,
2877                    &["min", "lower", "max", "upper"],
2878                    raw,
2879                )?;
2880                let (coefficient_min, coefficient_max) =
2881                    parse_linear_constraint_bounds(&options, raw)?;
2882                return Ok(ParsedTerm::Linear {
2883                    name: vars[0].clone(),
2884                    explicit: true,
2885                    coefficient_min,
2886                    coefficient_max,
2887                });
2888            }
2889            _ => {
2890                return Err(format!(
2891                    "unknown term function `{name}` in '{raw}'. Supported: bounded(), linear(), constrain()/constraint()/box(), nonnegative(), nonpositive(), smooth()/s(), cyclic()/periodic()/cc()/cp(), thinplate()/thin_plate()/tps(), tensor()/interaction()/te(), t2(), ti(), fs(), sz(), group()/re()/factor(), sphere()/sos()/spherical(), s2(), matern(), duchon(), pca(), logslope()/log_slope(), linkwiggle(), timewiggle(), link(), survmodel()"
2892                ));
2893            }
2894        }
2895    }
2896
2897    let ident = raw.trim();
2898    if !is_exact_ident(ident) {
2899        return Err(FormulaDslError::UnknownIdentifier {
2900            reason: format!("unsupported top-level RHS term: {raw}"),
2901        }
2902        .into());
2903    }
2904
2905    Ok(ParsedTerm::Linear {
2906        name: ident.to_string(),
2907        explicit: false,
2908        coefficient_min: None,
2909        coefficient_max: None,
2910    })
2911}
2912
2913// ---------------------------------------------------------------------------
2914// Link choice parsing
2915// ---------------------------------------------------------------------------
2916
2917pub fn parse_link_choice(
2918    raw: Option<&str>,
2919    flexible_flag: bool,
2920) -> Result<Option<LinkChoice>, FormulaDslError> {
2921    if raw.is_none() && !flexible_flag {
2922        return Ok(None);
2923    }
2924    let Some(v) = raw else {
2925        return Ok(Some(LinkChoice {
2926            mode: LinkMode::Flexible,
2927            link: LinkFunction::Probit,
2928            mixture_components: None,
2929        }));
2930    };
2931    let t = v.trim().to_ascii_lowercase();
2932    if let Some(inner) = t
2933        .strip_prefix("flexible(")
2934        .and_then(|s| s.strip_suffix(')'))
2935    {
2936        if let Some(components_inner) = inner
2937            .strip_prefix("blended(")
2938            .and_then(|s| s.strip_suffix(')'))
2939            .or_else(|| {
2940                inner
2941                    .strip_prefix("mixture(")
2942                    .and_then(|s| s.strip_suffix(')'))
2943            })
2944        {
2945            parse_link_component_list(components_inner)?;
2946            return Err(FormulaDslError::IncompatibleTerm {
2947                reason:
2948                    "flexible(...) does not support blended(...)/mixture(...) links; wiggle is only supported for jointly fit standard links"
2949                        .to_string(),
2950            });
2951        }
2952        let link = parse_linkname(inner)?;
2953        if !linkname_supports_joint_wiggle(link) {
2954            return Err(FormulaDslError::IncompatibleTerm {
2955                reason:
2956                    "flexible(...) does not support sas/beta-logistic links; wiggle is only supported for jointly fit standard links"
2957                        .to_string(),
2958            });
2959        }
2960        return Ok(Some(LinkChoice {
2961            mode: LinkMode::Flexible,
2962            link,
2963            mixture_components: None,
2964        }));
2965    }
2966    if let Some(inner) = t
2967        .strip_prefix("blended(")
2968        .and_then(|s| s.strip_suffix(')'))
2969        .or_else(|| t.strip_prefix("mixture(").and_then(|s| s.strip_suffix(')')))
2970    {
2971        if flexible_flag {
2972            return Err(FormulaDslError::IncompatibleTerm {
2973                reason:
2974                    "--flexible-link cannot be combined with --link blended(...)/mixture(...); blended inverse links are not flexible-link mode"
2975                        .to_string(),
2976            });
2977        }
2978        let components = parse_link_component_list(inner)?;
2979        return Ok(Some(LinkChoice {
2980            mode: LinkMode::Strict,
2981            link: LinkFunction::Logit,
2982            mixture_components: Some(components),
2983        }));
2984    }
2985
2986    let link = parse_linkname(&t)?;
2987    if flexible_flag && !linkname_supports_joint_wiggle(link) {
2988        return Err(FormulaDslError::IncompatibleTerm {
2989            reason:
2990                "--flexible-link does not support sas/beta-logistic links; wiggle is only supported for jointly fit standard links"
2991                    .to_string(),
2992        });
2993    }
2994    Ok(Some(LinkChoice {
2995        mode: if flexible_flag {
2996            LinkMode::Flexible
2997        } else {
2998            LinkMode::Strict
2999        },
3000        link,
3001        mixture_components: None,
3002    }))
3003}
3004
3005pub fn parse_linkname(v: &str) -> Result<LinkFunction, FormulaDslError> {
3006    match v.trim() {
3007        "identity" => Ok(LinkFunction::Identity),
3008        "log" => Ok(LinkFunction::Log),
3009        "logit" | "binomial-logit" => Ok(LinkFunction::Logit),
3010        "probit" | "binomial-probit" => Ok(LinkFunction::Probit),
3011        "cloglog" | "binomial-cloglog" => Ok(LinkFunction::CLogLog),
3012        "sas" => Ok(LinkFunction::Sas),
3013        "beta-logistic" => Ok(LinkFunction::BetaLogistic),
3014        other => Err(FormulaDslError::UnknownIdentifier {
3015            reason: format!(
3016                "unsupported link type '{other}'; \
3017                 use one of identity|log|logit|probit|cloglog|binomial-logit|binomial-probit|binomial-cloglog|sas|beta-logistic|blended(...)/mixture(...) or flexible(...). \
3018                 Both `--link <type>` (CLI flag) and `link(type=<type>)` (formula term) accept the same set."
3019            ),
3020        }),
3021    }
3022}
3023
3024pub fn parse_link_component(v: &str) -> Result<LinkComponent, String> {
3025    match v.trim() {
3026        "logit" => Ok(LinkComponent::Logit),
3027        "probit" => Ok(LinkComponent::Probit),
3028        "cloglog" => Ok(LinkComponent::CLogLog),
3029        "loglog" => Ok(LinkComponent::LogLog),
3030        "cauchit" => Ok(LinkComponent::Cauchit),
3031        other => Err(FormulaDslError::UnknownIdentifier {
3032            reason: format!(
3033                "unsupported blended-link component '{other}'; use probit|logit|cloglog|loglog|cauchit"
3034            ),
3035        }
3036        .into()),
3037    }
3038}
3039
3040pub fn parse_link_component_list(v: &str) -> Result<Vec<LinkComponent>, String> {
3041    let mut out = Vec::new();
3042    for part in v.split(',') {
3043        let trimmed = part.trim();
3044        if trimmed.is_empty() {
3045            continue;
3046        }
3047        let comp = parse_link_component(trimmed)?;
3048        if out.contains(&comp) {
3049            return Err(FormulaDslError::IncompatibleTerm {
3050                reason: "blended(...) cannot contain duplicate components".to_string(),
3051            }
3052            .into());
3053        }
3054        out.push(comp);
3055    }
3056    if out.len() < 2 {
3057        return Err(FormulaDslError::InvalidArgument {
3058            reason: "blended(...) requires at least two components".to_string(),
3059        }
3060        .into());
3061    }
3062    Ok(out)
3063}