Skip to main content

gam_terms/inference/
formula_dsl.rs

1use std::collections::BTreeMap;
2
3use pest::Parser;
4use pest::iterators::Pair;
5use pest_derive::Parser;
6
7use crate::smooth::BoundedCoefficientPriorSpec;
8use gam_problem::types::{
9    InverseLink, LikelihoodSpec, LinkComponent, LinkFunction, StandardLink, WigglePenaltyConfig,
10};
11
12#[derive(Parser)]
13#[grammar_inline = r#"
14WHITESPACE = _{ " " | "\t" | NEWLINE }
15
16top_function_call = { SOI ~ function_call ~ EOI }
17top_expr = { SOI ~ expr ~ EOI }
18formula = { SOI ~ expr ~ "~" ~ rhs ~ EOI }
19rhs = { term ~ ("+" ~ term)* }
20term = { expr }
21
22expr = { sum }
23sum = { product ~ (add_op ~ product)* }
24add_op = { "+" | "-" }
25product = { interact ~ (mul_op ~ interact)* }
26mul_op = { "*" | "/" }
27interact = { power ~ (interact_op ~ power)* }
28interact_op = { ":" }
29power = { unary ~ (pow_op ~ unary)* }
30pow_op = { "^" }
31unary = { unary_op* ~ primary }
32unary_op = _{ "+" | "-" }
33
34primary = { function_call | list_lit | tuple_lit | ident | number | string_lit | "(" ~ expr ~ ")" }
35list_lit = @{ "[" ~ (!"]" ~ ANY)* ~ "]" }
36tuple_lit = @{ "(" ~ (!("," | ")") ~ ANY)+ ~ "," ~ (!")" ~ ANY)* ~ ")" }
37function_call = { ident ~ "(" ~ arg_list? ~ ")" }
38arg_list = { arg ~ ("," ~ arg)* }
39arg = { named_arg | expr }
40named_arg = { ident ~ "=" ~ expr }
41
42ident = @{ ident_start ~ ident_continue* }
43ident_start = _{ ASCII_ALPHA | "_" }
44ident_continue = _{ ASCII_ALPHANUMERIC | "_" | "." }
45
46number = @{
47    "-"?
48    ~ (ASCII_DIGIT+ ~ ("." ~ ASCII_DIGIT*)? | "." ~ ASCII_DIGIT+)
49    ~ (("e" | "E") ~ ("+" | "-")? ~ ASCII_DIGIT+)?
50}
51
52string_lit = @{ "\"" ~ (!"\"" ~ ANY)* ~ "\"" | "'" ~ (!"'" ~ ANY)* ~ "'" }
53"#]
54struct FormulaParser;
55
56#[derive(Clone, Debug, PartialEq, Eq)]
57pub struct FormulaDslParse {
58    pub response_expr: String,
59    pub rhs_terms: Vec<String>,
60}
61
62#[derive(Clone, Debug, PartialEq, Eq)]
63pub enum CallArgSpec {
64    Positional(String),
65    Named { key: String, value: String },
66}
67
68#[derive(Clone, Debug, PartialEq, Eq)]
69pub struct FunctionCallSpec {
70    pub name: String,
71    pub args: Vec<CallArgSpec>,
72}
73
74/// Typed error surface for the formula DSL parser.
75///
76/// Every variant carries a free-form `reason: String` payload; `Display`
77/// emits exactly that payload, so converting a `FormulaDslError` into
78/// `String` (via the `From` impl below) is byte-equivalent to the pre-
79/// refactor `Err(format!(...))` / `Err("...".to_string())` strings that
80/// the same call sites produced. Public entry points keep their existing
81/// `Result<_, String>` signatures — CLI input handling stays unchanged —
82/// and typed errors flow across the boundary via `From<FormulaDslError>
83/// for String`.
84#[derive(Clone, Debug, PartialEq, Eq)]
85pub enum FormulaDslError {
86    /// Pest grammar failure, unbalanced delimiters, empty terms, or
87    /// missing required parse fragments — i.e. the formula text is
88    /// not a well-formed DSL string.
89    ParseError { reason: String },
90    /// A referenced symbol (link name, blended-link component, term
91    /// function name, top-level RHS identifier) is not part of the
92    /// supported vocabulary.
93    UnknownIdentifier { reason: String },
94    /// A named option's value is unparseable, out of range, or not a
95    /// finite number / valid integer.
96    InvalidArgument { reason: String },
97    /// A combination of terms or options is disallowed (duplicate
98    /// terms, multiple linkwiggle/link/survmodel, mutually exclusive
99    /// option groups in bounded(), wiggle-incompatible links, etc.).
100    IncompatibleTerm { reason: String },
101    /// A required configuration option is missing or empty (e.g.
102    /// `link()` without `type=`, `survmodel()` with no options,
103    /// `bounded()` without a required argument).
104    MalformedConfig { reason: String },
105}
106
107gam_linalg::impl_reason_error_boilerplate! {
108    FormulaDslError {
109        ParseError,
110        UnknownIdentifier,
111        InvalidArgument,
112        IncompatibleTerm,
113        MalformedConfig,
114    }
115}
116
117/// Inbound conversion from `String` is used by `?` cascades inside `parse_formula`
118/// and friends so that internal parser helpers still returning `Result<_, String>`
119/// can flow through without each call site needing an explicit `.map_err(...)`.
120/// We route into `ParseError` because by construction every internal helper that
121/// still produces a raw `String` is itself a parse/term-resolution stage.
122impl From<String> for FormulaDslError {
123    fn from(reason: String) -> Self {
124        FormulaDslError::ParseError { reason }
125    }
126}
127
128pub fn parse_formula_dsl(formula: &str) -> Result<FormulaDslParse, String> {
129    validate_balanced_delimiters(formula, "invalid formula syntax")?;
130    let mut parsed =
131        FormulaParser::parse(Rule::formula, formula).map_err(|e| FormulaDslError::ParseError {
132            reason: format!("invalid formula syntax: {e}"),
133        })?;
134    let formula_pair = parsed.next().ok_or_else(|| FormulaDslError::ParseError {
135        reason: "invalid formula syntax: empty parse".to_string(),
136    })?;
137
138    let mut response_expr: Option<String> = None;
139    let mut rhs_terms: Option<Vec<String>> = None;
140
141    for part in formula_pair.into_inner() {
142        match part.as_rule() {
143            Rule::expr if response_expr.is_none() => {
144                response_expr = Some(part.as_str().trim().to_string());
145            }
146            Rule::rhs => {
147                rhs_terms = Some(extract_rhs_terms(part)?);
148            }
149            _ => {}
150        }
151    }
152
153    let response_expr = response_expr.ok_or_else(|| FormulaDslError::ParseError {
154        reason: "invalid formula: missing response expression".to_string(),
155    })?;
156    let rhs_terms = rhs_terms.ok_or_else(|| FormulaDslError::ParseError {
157        reason: "invalid formula: missing RHS terms".to_string(),
158    })?;
159    if rhs_terms.is_empty() {
160        return Err(FormulaDslError::ParseError {
161            reason: "formula has no usable terms".to_string(),
162        }
163        .into());
164    }
165
166    Ok(FormulaDslParse {
167        response_expr,
168        rhs_terms,
169    })
170}
171
172fn delimiter_balance_error(prefix: &str) -> String {
173    format!("{prefix}: unbalanced parentheses or quotes")
174}
175
176// Pest reports malformed delimiters as a generic parse failure. We validate the
177// raw text first so callers get a stable, specific error class for unmatched
178// parentheses/quotes instead of whichever grammar branch happened to fail last.
179fn validate_balanced_delimiters(input: &str, prefix: &str) -> Result<(), String> {
180    let mut stack = Vec::<char>::new();
181    let mut in_single = false;
182    let mut in_double = false;
183
184    for ch in input.chars() {
185        match ch {
186            '\'' if !in_double => in_single = !in_single,
187            '"' if !in_single => in_double = !in_double,
188            '(' | '[' | '{' if !in_single && !in_double => stack.push(ch),
189            ')' | ']' | '}' if !in_single && !in_double => {
190                let expected = match ch {
191                    ')' => '(',
192                    ']' => '[',
193                    // The outer match arm guarantees ch is one of ')', ']', '}'.
194                    _ => '{',
195                };
196                if stack.pop() != Some(expected) {
197                    return Err(FormulaDslError::ParseError {
198                        reason: delimiter_balance_error(prefix),
199                    }
200                    .into());
201                }
202            }
203            _ => {}
204        }
205    }
206
207    if in_single || in_double || !stack.is_empty() {
208        return Err(FormulaDslError::ParseError {
209            reason: delimiter_balance_error(prefix),
210        }
211        .into());
212    }
213    Ok(())
214}
215
216fn extract_rhs_terms(rhs: Pair<'_, Rule>) -> Result<Vec<String>, String> {
217    let mut out = Vec::new();
218    let mut depth = 0_i32;
219    let mut in_single = false;
220    let mut in_double = false;
221    let mut start = 0_usize;
222    // Last non-whitespace character seen at depth 0 outside quotes. A top-level
223    // `+` only separates terms when it follows a completed operand; when it
224    // follows a binary operator (`:`, `*`, `/`, `^`) or another sign — or opens
225    // the RHS — it is a UNARY sign, not a separator. Splitting on it there would
226    // hand the grammar a truncated fragment like `x:` (from `x:+z`) and surface
227    // a confusing "invalid term syntax in `x:`" instead of the dedicated unary
228    // diagnostic the grammar raises when the whole `x:+z` term reaches it. This
229    // never suppresses a split in a valid formula: no valid Wilkinson-Rogers RHS
230    // places `+` immediately after an operator. `-` is already never treated as
231    // a separator here (it is rejected downstream as a binary term operator), so
232    // the unary `-` path already flows through intact — this restores the same
233    // intact flow for unary `+`.
234    let mut last_significant: Option<char> = None;
235    let text = rhs.as_str();
236    let bytes = text.as_bytes();
237    for (idx, &b) in bytes.iter().enumerate() {
238        let ch = b as char;
239        match ch {
240            '\'' if !in_double => in_single = !in_single,
241            '"' if !in_single => in_double = !in_double,
242            '(' | '[' | '{' if !in_single && !in_double => depth += 1,
243            ')' | ']' | '}' if !in_single && !in_double && depth > 0 => depth -= 1,
244            '+' if !in_single
245                && !in_double
246                && depth == 0
247                && !matches!(
248                    last_significant,
249                    None | Some(':' | '*' | '/' | '^' | '+' | '-')
250                ) =>
251            {
252                let term = text[start..idx].trim();
253                if term.is_empty() {
254                    return Err(FormulaDslError::ParseError {
255                        reason: "formula RHS contains an empty term".to_string(),
256                    }
257                    .into());
258                }
259                out.push(term.to_string());
260                start = idx + 1;
261            }
262            _ => {}
263        }
264        if !ch.is_ascii_whitespace() {
265            last_significant = Some(ch);
266        }
267    }
268    if in_single || in_double || depth != 0 {
269        return Err(FormulaDslError::ParseError {
270            reason: "formula RHS has unbalanced quotes or parentheses".to_string(),
271        }
272        .into());
273    }
274    let tail = text[start..].trim();
275    if tail.is_empty() {
276        return Err(FormulaDslError::ParseError {
277            reason: "formula RHS contains an empty term".to_string(),
278        }
279        .into());
280    }
281    out.push(tail.to_string());
282    Ok(out)
283}
284
285/// Wilkinson-Rogers operator-family expansion.
286///
287/// A raw RHS term (already split on top-level `+`) may use the documented
288/// formula operators `:`, `*`, `/`, `^`, and parenthesization. This function
289/// expands the term into a normalized list of `WrAtomList`s, where each
290/// `WrAtomList` is one resulting model term — either a single atom (linear
291/// main effect / function-call term) or multiple atoms (interaction).
292///
293/// Wilkinson-Rogers semantics implemented here:
294/// * `a` produces `{a}` — one main effect.
295/// * `a:b` produces `{a:b}` — one interaction.
296/// * `a*b` produces `{a, b, a:b}` — crossing (expanded).
297/// * `a/b` produces `{a, a:b}` — nesting.
298/// * `(a + b + ... )^n` produces every non-empty subset of `{a, b, ...}` of
299///   size at most `n`, each subset being one interaction.
300/// * `+` unions two term sets; `-` is rejected.
301///
302/// Atoms inside the AST may be bare identifiers or function calls. Function
303/// calls are opaque — they pass through as-is and cannot participate in an
304/// `:` interaction with other atoms (smooths/factors require dedicated
305/// constructors like `te()`).
306type WrAtomList = Vec<String>;
307
308fn expand_wr_term(raw: &str) -> Result<Vec<WrAtomList>, String> {
309    let mut parsed = FormulaParser::parse(Rule::top_expr, raw).map_err(|e| {
310        FormulaDslError::ParseError {
311            reason: format!("invalid term syntax in `{raw}`: {e}"),
312        }
313        .to_string()
314    })?;
315    let top = parsed.next().ok_or_else(|| {
316        FormulaDslError::ParseError {
317            reason: format!("invalid term syntax in `{raw}`: empty parse"),
318        }
319        .to_string()
320    })?;
321    let expr = top
322        .into_inner()
323        .find(|p| p.as_rule() == Rule::expr)
324        .ok_or_else(|| {
325            FormulaDslError::ParseError {
326                reason: format!("invalid term syntax in `{raw}`: missing expr"),
327            }
328            .to_string()
329        })?;
330    let interactions = expand_expr(expr, raw)?;
331    let normalized: Vec<WrAtomList> = interactions
332        .into_iter()
333        .map(normalize_interaction)
334        .collect();
335    // Deduplicate while preserving order: a*b yields {a, b, a:b}; a*b + a
336    // would otherwise list `a` twice.
337    let mut seen = std::collections::BTreeSet::<Vec<String>>::new();
338    let mut out = Vec::<WrAtomList>::new();
339    for term in normalized {
340        let key = term.clone();
341        if seen.insert(key) {
342            out.push(term);
343        }
344    }
345    Ok(out)
346}
347
348fn normalize_interaction(mut atoms: WrAtomList) -> WrAtomList {
349    atoms.sort();
350    atoms.dedup();
351    atoms
352}
353
354fn expand_expr(pair: Pair<'_, Rule>, raw: &str) -> Result<Vec<WrAtomList>, String> {
355    match pair.as_rule() {
356        Rule::expr => {
357            let inner = pair.into_inner().next().ok_or_else(|| {
358                FormulaDslError::ParseError {
359                    reason: format!("invalid term syntax in `{raw}`: empty expr"),
360                }
361                .to_string()
362            })?;
363            expand_expr(inner, raw)
364        }
365        Rule::sum => {
366            let mut iter = pair.into_inner();
367            let first = iter.next().ok_or_else(|| {
368                FormulaDslError::ParseError {
369                    reason: format!("invalid term syntax in `{raw}`: empty sum"),
370                }
371                .to_string()
372            })?;
373            let mut acc = expand_expr(first, raw)?;
374            while let Some(op) = iter.next() {
375                if op.as_rule() != Rule::add_op {
376                    return Err(FormulaDslError::ParseError {
377                        reason: format!(
378                            "invalid term syntax in `{raw}`: expected add operator, got `{:?}`",
379                            op.as_rule()
380                        ),
381                    }
382                    .into());
383                }
384                let op_str = op.as_str().trim();
385                let operand = iter.next().ok_or_else(|| {
386                    FormulaDslError::ParseError {
387                        reason: format!("invalid term syntax in `{raw}`: dangling `{op_str}`"),
388                    }
389                    .to_string()
390                })?;
391                if op_str == "-" {
392                    return Err(FormulaDslError::IncompatibleTerm {
393                        reason: format!(
394                            "binary `-` is not supported inside a formula term in `{raw}` \
395                             (use multiple `+` terms or drop the unwanted predictor explicitly)"
396                        ),
397                    }
398                    .into());
399                }
400                let mut rhs = expand_expr(operand, raw)?;
401                acc.append(&mut rhs);
402            }
403            Ok(acc)
404        }
405        Rule::product => {
406            let mut iter = pair.into_inner();
407            let first = iter.next().ok_or_else(|| {
408                FormulaDslError::ParseError {
409                    reason: format!("invalid term syntax in `{raw}`: empty product"),
410                }
411                .to_string()
412            })?;
413            let mut acc = expand_expr(first, raw)?;
414            while let Some(op) = iter.next() {
415                if op.as_rule() != Rule::mul_op {
416                    return Err(FormulaDslError::ParseError {
417                        reason: format!(
418                            "invalid term syntax in `{raw}`: expected `*` or `/`, got `{:?}`",
419                            op.as_rule()
420                        ),
421                    }
422                    .into());
423                }
424                let op_str = op.as_str().trim();
425                let operand = iter.next().ok_or_else(|| {
426                    FormulaDslError::ParseError {
427                        reason: format!("invalid term syntax in `{raw}`: dangling `{op_str}`"),
428                    }
429                    .to_string()
430                })?;
431                let rhs = expand_expr(operand, raw)?;
432                acc = match op_str {
433                    "*" => wr_cross(acc, rhs),
434                    "/" => wr_nest(acc, rhs),
435                    other => {
436                        return Err(FormulaDslError::ParseError {
437                            reason: format!(
438                                "invalid term syntax in `{raw}`: unrecognized mul operator `{other}`"
439                            ),
440                        }
441                        .into());
442                    }
443                };
444            }
445            Ok(acc)
446        }
447        Rule::interact => {
448            let mut iter = pair.into_inner();
449            let first = iter.next().ok_or_else(|| {
450                FormulaDslError::ParseError {
451                    reason: format!("invalid term syntax in `{raw}`: empty interact"),
452                }
453                .to_string()
454            })?;
455            let mut acc = expand_expr(first, raw)?;
456            while let Some(op) = iter.next() {
457                if op.as_rule() != Rule::interact_op {
458                    return Err(FormulaDslError::ParseError {
459                        reason: format!(
460                            "invalid term syntax in `{raw}`: expected `:`, got `{:?}`",
461                            op.as_rule()
462                        ),
463                    }
464                    .into());
465                }
466                let operand = iter.next().ok_or_else(|| {
467                    FormulaDslError::ParseError {
468                        reason: format!("invalid term syntax in `{raw}`: dangling `:`"),
469                    }
470                    .to_string()
471                })?;
472                let rhs = expand_expr(operand, raw)?;
473                acc = wr_interact(acc, rhs, raw)?;
474            }
475            Ok(acc)
476        }
477        Rule::power => {
478            let mut iter = pair.into_inner();
479            let first = iter.next().ok_or_else(|| {
480                FormulaDslError::ParseError {
481                    reason: format!("invalid term syntax in `{raw}`: empty power"),
482                }
483                .to_string()
484            })?;
485            let base = expand_expr(first, raw)?;
486            let Some(op) = iter.next() else {
487                return Ok(base);
488            };
489            if op.as_rule() != Rule::pow_op {
490                return Err(FormulaDslError::ParseError {
491                    reason: format!(
492                        "invalid term syntax in `{raw}`: expected `^`, got `{:?}`",
493                        op.as_rule()
494                    ),
495                }
496                .into());
497            }
498            let exponent_pair = iter.next().ok_or_else(|| {
499                FormulaDslError::ParseError {
500                    reason: format!("invalid term syntax in `{raw}`: dangling `^`"),
501                }
502                .to_string()
503            })?;
504            let exp_text = exponent_pair.as_str().trim();
505            let n: usize = exp_text.parse().map_err(|_| {
506                FormulaDslError::ParseError {
507                    reason: format!(
508                        "invalid term syntax in `{raw}`: `^` exponent must be a positive integer, got `{exp_text}`"
509                    ),
510                }
511                .to_string()
512            })?;
513            if n == 0 {
514                return Err(FormulaDslError::ParseError {
515                    reason: format!(
516                        "invalid term syntax in `{raw}`: `^0` is not a meaningful formula expansion"
517                    ),
518                }
519                .into());
520            }
521            if let Some(extra_op) = iter.next() {
522                let extra = extra_op.as_str().trim();
523                return Err(FormulaDslError::ParseError {
524                    reason: format!(
525                        "invalid term syntax in `{raw}`: chained `^` operators are not supported; \
526                         use one positive integer exponent, got another `{extra}`"
527                    ),
528                }
529                .into());
530            }
531            Ok(wr_power(base, n))
532        }
533        Rule::unary => {
534            let unary_start = pair.as_span().start();
535            for inner in pair.into_inner() {
536                if inner.as_rule() == Rule::primary {
537                    let prefix_len = inner.as_span().start().saturating_sub(unary_start);
538                    if prefix_len > 0 {
539                        let prefix = &raw[unary_start..inner.as_span().start()];
540                        if prefix.chars().any(|ch| matches!(ch, '+' | '-')) {
541                            return Err(FormulaDslError::IncompatibleTerm {
542                                reason: format!(
543                                    "unary `+`/`-` is not supported inside a formula term in `{raw}`"
544                                ),
545                            }
546                            .into());
547                        }
548                    }
549                    return expand_expr(inner, raw);
550                }
551            }
552            Err(FormulaDslError::ParseError {
553                reason: format!("invalid term syntax in `{raw}`: empty unary"),
554            }
555            .into())
556        }
557        Rule::primary => {
558            let span = pair.as_str().trim().to_string();
559            let inner = pair.into_inner().next();
560            match inner {
561                Some(child) if child.as_rule() == Rule::expr => expand_expr(child, raw),
562                Some(child) => Ok(vec![vec![child.as_str().trim().to_string()]]),
563                None => Ok(vec![vec![span]]),
564            }
565        }
566        _ => Err(FormulaDslError::ParseError {
567            reason: format!(
568                "invalid term syntax in `{raw}`: unexpected node `{:?}`",
569                pair.as_rule()
570            ),
571        }
572        .into()),
573    }
574}
575
576fn wr_cross(left: Vec<WrAtomList>, right: Vec<WrAtomList>) -> Vec<WrAtomList> {
577    // a*b = a + b + a:b
578    let mut out = Vec::with_capacity(left.len() + right.len() + left.len() * right.len());
579    out.extend(left.iter().cloned());
580    out.extend(right.iter().cloned());
581    for l in &left {
582        for r in &right {
583            let mut merged: WrAtomList = l.iter().cloned().chain(r.iter().cloned()).collect();
584            merged.sort();
585            merged.dedup();
586            out.push(merged);
587        }
588    }
589    out
590}
591
592fn wr_nest(left: Vec<WrAtomList>, right: Vec<WrAtomList>) -> Vec<WrAtomList> {
593    // a/b = a + a:b
594    let mut out = Vec::with_capacity(left.len() + left.len() * right.len());
595    out.extend(left.iter().cloned());
596    for l in &left {
597        for r in &right {
598            let mut merged: WrAtomList = l.iter().cloned().chain(r.iter().cloned()).collect();
599            merged.sort();
600            merged.dedup();
601            out.push(merged);
602        }
603    }
604    out
605}
606
607fn wr_interact(
608    left: Vec<WrAtomList>,
609    right: Vec<WrAtomList>,
610    raw: &str,
611) -> Result<Vec<WrAtomList>, String> {
612    // a:b — Cartesian merge of every (l, r) pair.
613    let mut out = Vec::with_capacity(left.len() * right.len());
614    for l in &left {
615        for r in &right {
616            let combined: WrAtomList = l.iter().cloned().chain(r.iter().cloned()).collect();
617            // Reject interactions whose atoms contain function-call syntax.
618            // These would build design columns that are not simple products
619            // (factors, smooths) and need a dedicated constructor.
620            for atom in &combined {
621                if atom.contains('(') {
622                    return Err(FormulaDslError::IncompatibleTerm {
623                        reason: format!(
624                            "interaction operator `:` with function-call atom is not supported in `{raw}`. \
625                             Use te(...) for smooth interactions or group()/factor() with a separate \
626                             interaction strategy for categorical effects."
627                        ),
628                    }
629                    .into());
630                }
631            }
632            let mut merged = combined;
633            merged.sort();
634            merged.dedup();
635            out.push(merged);
636        }
637    }
638    Ok(out)
639}
640
641fn wr_power(base: Vec<WrAtomList>, n: usize) -> Vec<WrAtomList> {
642    // (e)^n produces every non-empty subset of `base` of size ≤ n,
643    // each subset being one interaction. The standard WR semantics
644    // treat the base as a *set* of atoms — interactions inside `base`
645    // are kept as-is and not re-crossed.
646    if base.is_empty() {
647        return Vec::new();
648    }
649    let m = base.len();
650    let mut out = Vec::<WrAtomList>::new();
651    let max_size = n.min(m);
652    for size in 1..=max_size {
653        // Enumerate combinations of `size` elements out of m.
654        let mut indices: Vec<usize> = (0..size).collect();
655        loop {
656            let mut merged = WrAtomList::new();
657            for &i in &indices {
658                merged.extend(base[i].iter().cloned());
659            }
660            merged.sort();
661            merged.dedup();
662            out.push(merged);
663            // Next combination
664            let mut k = size;
665            while k > 0 {
666                k -= 1;
667                if indices[k] != k + m - size {
668                    indices[k] += 1;
669                    for j in (k + 1)..size {
670                        indices[j] = indices[j - 1] + 1;
671                    }
672                    break;
673                }
674                if k == 0 {
675                    k = usize::MAX;
676                    break;
677                }
678            }
679            if k == usize::MAX {
680                break;
681            }
682        }
683    }
684    out
685}
686
687fn is_exact_ident(raw: &str) -> bool {
688    let mut chars = raw.chars();
689    let Some(first) = chars.next() else {
690        return false;
691    };
692    if !first.is_ascii_alphabetic() && first != '_' {
693        return false;
694    }
695    chars.all(|ch| ch.is_ascii_alphanumeric() || ch == '_' || ch == '.')
696}
697
698pub fn parse_function_call(input: &str) -> Result<FunctionCallSpec, String> {
699    validate_balanced_delimiters(input, "invalid function call syntax")?;
700    let mut parsed = FormulaParser::parse(Rule::top_function_call, input).map_err(|e| {
701        FormulaDslError::ParseError {
702            reason: format!("invalid function call syntax: {e}"),
703        }
704    })?;
705    let top = parsed.next().ok_or_else(|| FormulaDslError::ParseError {
706        reason: "invalid function call syntax: empty parse".to_string(),
707    })?;
708    let call = top
709        .into_inner()
710        .find(|p| p.as_rule() == Rule::function_call)
711        .ok_or_else(|| FormulaDslError::ParseError {
712            reason: "invalid function call syntax: missing call".to_string(),
713        })?;
714    parse_call_pair(call)
715}
716
717fn parse_call_pair(call: Pair<'_, Rule>) -> Result<FunctionCallSpec, String> {
718    let mut name: Option<String> = None;
719    let mut args = Vec::<CallArgSpec>::new();
720    for part in call.into_inner() {
721        match part.as_rule() {
722            Rule::ident => {
723                if name.is_none() {
724                    name = Some(part.as_str().trim().to_string());
725                }
726            }
727            Rule::arg_list => {
728                for a in part.into_inner() {
729                    if a.as_rule() != Rule::arg {
730                        continue;
731                    }
732                    let mut a_inner = a.into_inner();
733                    let Some(first) = a_inner.next() else {
734                        continue;
735                    };
736                    match first.as_rule() {
737                        Rule::named_arg => {
738                            let mut ni = first.into_inner();
739                            let key = ni
740                                .next()
741                                .ok_or_else(|| FormulaDslError::ParseError {
742                                    reason: "invalid named argument key".to_string(),
743                                })?
744                                .as_str()
745                                .trim()
746                                .to_ascii_lowercase();
747                            let value = ni
748                                .next()
749                                .ok_or_else(|| FormulaDslError::ParseError {
750                                    reason: "invalid named argument value".to_string(),
751                                })?
752                                .as_str()
753                                .trim()
754                                .to_string();
755                            args.push(CallArgSpec::Named { key, value });
756                        }
757                        Rule::expr => {
758                            args.push(CallArgSpec::Positional(first.as_str().trim().to_string()));
759                        }
760                        _ => {}
761                    }
762                }
763            }
764            _ => {}
765        }
766    }
767    let name = name.ok_or_else(|| FormulaDslError::ParseError {
768        reason: "invalid function call: missing name".to_string(),
769    })?;
770    Ok(FunctionCallSpec { name, args })
771}
772
773#[cfg(test)]
774mod tests {
775    use super::{
776        CallArgSpec, ParsedTerm, parse_formula, parse_formula_dsl, parse_function_call,
777        parse_linkwiggle_formulaspec, parsed_term_column_names, parsed_terms_reference_column,
778        validate_marginal_slope_z_column_exclusion,
779    };
780    use std::collections::{BTreeMap, BTreeSet};
781
782    #[test]
783    fn parsed_term_column_names_includes_by_smooth_grouping_variable() {
784        // The model's input contract must include a smooth's `by=` column, not
785        // just its positional variable. `s(x, by=g)` consumes both `x` and `g`
786        // (`term_builder` reads `options["by"]`); dropping `g` from the
787        // required/consumable set would either omit a genuine predictor at fit
788        // (CLI projected load) or — worse — make predict silently project the
789        // `g` column away (#840 regression). A bare main effect and an
790        // interaction round out the variant coverage.
791        let parsed =
792            parse_formula("y ~ s(x, by=g) + z + a:b").expect("formula with a by= smooth parses");
793        let mut cols = BTreeSet::<String>::new();
794        parsed_term_column_names(&parsed.terms, &mut cols);
795        for expected in ["x", "g", "z", "a", "b"] {
796            assert!(
797                cols.contains(expected),
798                "parsed_term_column_names dropped '{expected}'; got {cols:?}"
799            );
800        }
801        // The response is never a *term* column (it is handled separately).
802        assert!(
803            !cols.contains("y"),
804            "response leaked into term columns: {cols:?}"
805        );
806    }
807
808    #[test]
809    fn linkwiggle_parser_does_not_bake_in_cubic_only_restriction() {
810        // Regression for #384: the cubic-only constraint belongs to the
811        // score-warp / link-deviation `DeviationRuntime`, NOT to this shared
812        // parser. `parse_linkwiggle_formulaspec` also feeds `timewiggle` and
813        // the location-scale survival path, whose general monotone I-spline
814        // value basis honors any `degree >= 2`. So the parser must accept
815        // non-cubic degrees and carry them through verbatim; the cubic gate is
816        // applied downstream only where the cubic-only runtime is built. A
817        // prior fix wrongly forced `degree == 3` here, breaking timewiggle and
818        // location-scale callers — this pins that the parser stays general.
819        for deg in [2usize, 4, 5, 10] {
820            let mut options = BTreeMap::new();
821            options.insert("degree".to_string(), deg.to_string());
822            options.insert("internal_knots".to_string(), "3".to_string());
823            let raw = format!("timewiggle(degree={deg}, internal_knots=3)");
824            let spec = parse_linkwiggle_formulaspec(&options, &raw)
825                .expect("non-cubic wiggle degree must parse at the shared layer");
826            assert_eq!(
827                spec.degree, deg,
828                "parser must carry the requested degree through verbatim"
829            );
830        }
831
832        // The only universal lower bound the shared parser enforces is that a
833        // polynomial degree is positive.
834        let mut zero = BTreeMap::new();
835        zero.insert("degree".to_string(), "0".to_string());
836        zero.insert("internal_knots".to_string(), "3".to_string());
837        let err = parse_linkwiggle_formulaspec(&zero, "linkwiggle(degree=0, internal_knots=3)")
838            .expect_err("degree=0 must be rejected");
839        assert!(
840            err.contains("degree >= 1"),
841            "error should state the positive-degree lower bound, got: {err}"
842        );
843    }
844
845    #[test]
846    fn parses_nested_formula_terms() {
847        let parsed =
848            parse_formula_dsl("log(y) ~ x1 + s(log(x2 + 1), bs=\"tps\", k=10) + te(x3, x4)")
849                .expect("parse");
850        assert_eq!(parsed.response_expr, "log(y)");
851        assert_eq!(parsed.rhs_terms.len(), 3);
852        assert_eq!(parsed.rhs_terms[0], "x1");
853        assert_eq!(parsed.rhs_terms[1], "s(log(x2 + 1), bs=\"tps\", k=10)");
854        assert_eq!(parsed.rhs_terms[2], "te(x3, x4)");
855    }
856
857    #[test]
858    fn parses_cyclic_formula_aliases() {
859        let parsed = parse_formula("y ~ cyclic(theta, period_start=0, period_end=6.283)")
860            .expect("parse cyclic formula");
861        match &parsed.terms[0] {
862            super::ParsedTerm::Smooth { vars, options, .. } => {
863                assert_eq!(vars, &vec!["theta".to_string()]);
864                assert_eq!(options.get("type").map(String::as_str), Some("cyclic"));
865                assert_eq!(options.get("period_start").map(String::as_str), Some("0"));
866            }
867            other => panic!("expected cyclic smooth term, got {other:?}"),
868        }
869    }
870
871    #[test]
872    fn sphere_aliases_all_dispatch_to_intrinsic_s2_basis() {
873        // Regression for #383: `s2(lat, lon)` must build the same intrinsic
874        // S² (sphere) basis as `sphere()`/`sos()`/`spherical()`. Previously the
875        // `s2` arm returned a Smooth without `type=sphere`, so it silently fell
876        // back to a generic Euclidean 2-D smooth and diverged in the
877        // spatial-kappa optimizer. All four aliases must be byte-for-byte
878        // equivalent in their dispatch (vars + `type=sphere`).
879        for alias in ["sphere", "sos", "spherical", "s2"] {
880            let parsed = parse_formula(&format!("y ~ {alias}(lat, lon)"))
881                .unwrap_or_else(|e| panic!("parse {alias}: {e}"));
882            match &parsed.terms[0] {
883                super::ParsedTerm::Smooth { vars, options, .. } => {
884                    assert_eq!(
885                        vars,
886                        &vec!["lat".to_string(), "lon".to_string()],
887                        "{alias} should keep (lat, lon) as its variables"
888                    );
889                    assert_eq!(
890                        options.get("type").map(String::as_str),
891                        Some("sphere"),
892                        "{alias} must dispatch to the intrinsic sphere basis (type=sphere)"
893                    );
894                }
895                other => panic!("expected sphere smooth term for {alias}, got {other:?}"),
896            }
897        }
898    }
899
900    #[test]
901    fn parses_function_callwithnamed_and_positional_args() {
902        let call = parse_function_call("s(log(x + 1), type=\"duchon\", centers=12)").expect("call");
903        assert_eq!(call.name, "s");
904        assert_eq!(call.args.len(), 3);
905        assert_eq!(
906            call.args[0],
907            CallArgSpec::Positional("log(x + 1)".to_string())
908        );
909        assert_eq!(
910            call.args[1],
911            CallArgSpec::Named {
912                key: "type".to_string(),
913                value: "\"duchon\"".to_string()
914            }
915        );
916    }
917
918    #[test]
919    fn parses_tensor_boundary_list_options() {
920        let call = parse_function_call(
921            "te(day_of_week, hour, boundary=['periodic', 'periodic'], period=[7, 24])",
922        )
923        .expect("call");
924        assert_eq!(call.name, "te");
925        assert_eq!(call.args.len(), 4);
926        assert_eq!(
927            call.args[2],
928            CallArgSpec::Named {
929                key: "boundary".to_string(),
930                value: "['periodic', 'periodic']".to_string(),
931            }
932        );
933    }
934
935    #[test]
936    fn parse_formula_dsl_reports_unbalanced_parentheses() {
937        let err = parse_formula_dsl("y ~ s(x, k=10").expect_err("expected parse failure");
938        assert!(err.contains("unbalanced parentheses"));
939    }
940
941    #[test]
942    fn parse_function_call_reports_unbalanced_parentheses() {
943        let err = parse_function_call("s(x, k=10").expect_err("expected parse failure");
944        assert!(err.contains("unbalanced parentheses"));
945    }
946
947    #[test]
948    fn parse_formula_accepts_tuple_smooth_options() {
949        let parsed = parse_formula("z ~ te(x, y, k=(20, 20))")
950            .expect("tuple-valued smooth option should parse");
951        assert_eq!(parsed.terms.len(), 1);
952
953        let dsl = parse_formula_dsl("z ~ te(x, y, k=(20, 20))")
954            .expect("tuple-valued smooth option should parse in the DSL layer");
955        assert_eq!(dsl.rhs_terms, vec!["te(x, y, k=(20, 20))"]);
956
957        let call = parse_function_call("te(x, y, k=(20, 20))")
958            .expect("tuple-valued smooth option should parse as a function call");
959        assert_eq!(
960            call.args[2],
961            CallArgSpec::Named {
962                key: "k".to_string(),
963                value: "(20, 20)".to_string(),
964            }
965        );
966    }
967
968    #[test]
969    fn parse_formula_rejects_unsupported_top_level_rhs_expressions() {
970        // Binary `-`, unary `-`, bare parens, and `-1` intercept removal are
971        // not supported as top-level RHS expressions; they must surface
972        // through the bare-identifier check in `parse_term` with the same
973        // diagnostic. WR operators `:`, `*`, `/`, and `^` are intentionally
974        // supported by `expand_wr_term` and are exercised by
975        // `parse_formula_supports_wr_slash_nesting` /
976        // `parse_formula_supports_wr_star_crossing`; they must NOT appear in
977        // this list.
978        for formula in ["y ~ x - z", "y ~ -x", "y ~ (x)", "y ~ x - 1"] {
979            let err = parse_formula(formula).expect_err("expected formula parse failure");
980            assert!(err.to_string().contains("unsupported top-level RHS term"));
981        }
982    }
983
984    /// Wilkinson-Rogers `a/b` (nesting) is documented and implemented by
985    /// `expand_wr_term` to yield `{a, a:b}`. This test pins that contract end
986    /// to end so a regression in either the grammar's `mul_op = { "*" | "/" }`
987    /// branch or in `expand_expr`'s `Rule::product` handling surfaces here
988    /// instead of slipping into a silent re-interpretation as a rejection.
989    #[test]
990    fn parse_formula_supports_wr_slash_nesting() {
991        let parsed = parse_formula("y ~ x / z").expect("`/` is supported as WR nesting");
992        assert_eq!(parsed.response, "y");
993        assert_eq!(parsed.terms.len(), 2);
994        let names: Vec<String> = parsed
995            .terms
996            .iter()
997            .map(|t| match t {
998                ParsedTerm::Linear { name, .. } => format!("Linear({name})"),
999                ParsedTerm::Interaction { vars } => format!("Interaction({})", vars.join(":")),
1000                other => format!("Other({other:?})"),
1001            })
1002            .collect();
1003        assert_eq!(
1004            names,
1005            vec!["Linear(x)".to_string(), "Interaction(x:z)".to_string()]
1006        );
1007    }
1008
1009    /// Wilkinson-Rogers `a*b` (crossing) is documented and implemented by
1010    /// `expand_wr_term` to yield `{a, b, a:b}`. Pin the contract so a future
1011    /// refactor of `mul_op` / `Rule::product` handling cannot silently change
1012    /// the set of model terms.
1013    #[test]
1014    fn parse_formula_supports_wr_star_crossing() {
1015        let parsed = parse_formula("y ~ x * z").expect("`*` is supported as WR crossing");
1016        assert_eq!(parsed.response, "y");
1017        assert_eq!(parsed.terms.len(), 3);
1018        let names: Vec<String> = parsed
1019            .terms
1020            .iter()
1021            .map(|t| match t {
1022                ParsedTerm::Linear { name, .. } => format!("Linear({name})"),
1023                ParsedTerm::Interaction { vars } => format!("Interaction({})", vars.join(":")),
1024                other => format!("Other({other:?})"),
1025            })
1026            .collect();
1027        assert_eq!(
1028            names,
1029            vec![
1030                "Linear(x)".to_string(),
1031                "Linear(z)".to_string(),
1032                "Interaction(x:z)".to_string(),
1033            ]
1034        );
1035    }
1036
1037    #[test]
1038    fn parse_formula_rejects_unary_signs_inside_wr_expansion() {
1039        for formula in ["y ~ x:-z", "y ~ a*-b", "y ~ x/-z", "y ~ x:+z"] {
1040            let err = parse_formula(formula)
1041                .expect_err("WR expansion must not silently drop unary signs");
1042            let msg = err.to_string();
1043            assert!(
1044                msg.contains("unary `+`/`-` is not supported"),
1045                "unexpected error for {formula}: {msg}"
1046            );
1047        }
1048    }
1049
1050    #[test]
1051    fn parse_formula_supports_wr_power_crossing() {
1052        let parsed = parse_formula("y ~ (x + z)^2").expect("`^` is supported as WR power");
1053        assert_eq!(parsed.response, "y");
1054        assert_eq!(parsed.terms.len(), 3);
1055        let names: Vec<String> = parsed
1056            .terms
1057            .iter()
1058            .map(|t| match t {
1059                ParsedTerm::Linear { name, .. } => format!("Linear({name})"),
1060                ParsedTerm::Interaction { vars } => format!("Interaction({})", vars.join(":")),
1061                other => format!("Other({other:?})"),
1062            })
1063            .collect();
1064        assert_eq!(
1065            names,
1066            vec![
1067                "Linear(x)".to_string(),
1068                "Linear(z)".to_string(),
1069                "Interaction(x:z)".to_string(),
1070            ]
1071        );
1072    }
1073
1074    #[test]
1075    fn parse_formula_rejects_chained_wr_power() {
1076        let err = parse_formula("y ~ (x + z)^2^3")
1077            .expect_err("chained WR powers must not silently drop later exponents");
1078        let msg = err.to_string();
1079        assert!(
1080            msg.contains("chained `^` operators are not supported"),
1081            "error should explain that chained powers are rejected, got: {msg}"
1082        );
1083    }
1084
1085    #[test]
1086    fn parsed_terms_reference_column_sees_the_by_smooth_variable() {
1087        // Regression for #807: the by= grouping variable lives in
1088        // options["by"], not the smooth's positional vars. The reference
1089        // predicate must still recognise it, both so the CLI loads the column
1090        // and so the marginal-slope z-column exclusion check (which reuses this
1091        // predicate) cannot be fooled into aliasing a reserved z onto a by=.
1092        let parsed = parse_formula("y ~ s(x, by=g)").expect("parse by-smooth");
1093        assert!(
1094            parsed_terms_reference_column(&parsed.terms, "g"),
1095            "s(x, by=g) references column g via options[\"by\"]"
1096        );
1097        assert!(parsed_terms_reference_column(&parsed.terms, "x"));
1098        assert!(!parsed_terms_reference_column(&parsed.terms, "absent"));
1099    }
1100
1101    #[test]
1102    fn marginal_slope_z_column_validator_detects_linear_and_smooth_reuse() {
1103        let main = parse_formula("y ~ x + z").expect("parse main");
1104        let logslope = parse_formula("y ~ s(z, type=duchon, centers=6)").expect("parse logslope");
1105
1106        assert!(parsed_terms_reference_column(&main.terms, "z"));
1107        assert!(parsed_terms_reference_column(&logslope.terms, "z"));
1108
1109        let err = validate_marginal_slope_z_column_exclusion(
1110            &main,
1111            &parse_formula("y ~ 1").expect("parse clean logslope"),
1112            "z",
1113            "bernoulli marginal-slope",
1114            "--logslope-formula",
1115        )
1116        .expect_err("main formula should be rejected");
1117        assert!(err.contains("cannot also appear in the main formula"));
1118
1119        let err = validate_marginal_slope_z_column_exclusion(
1120            &parse_formula("y ~ x").expect("parse clean main"),
1121            &logslope,
1122            "z",
1123            "bernoulli marginal-slope",
1124            "--logslope-formula",
1125        )
1126        .expect_err("logslope formula should be rejected");
1127        assert!(err.contains("cannot also appear in --logslope-formula"));
1128    }
1129
1130    #[test]
1131    fn logslope_surface_declarations_are_additive() {
1132        let parsed = parse_formula("y ~ s(pc1) + logslope(z2, s(pc2)) + logslope(z3, x3)")
1133            .expect("parse additive logslope surfaces");
1134        assert_eq!(parsed.terms.len(), 1);
1135        assert_eq!(parsed.logslope_surfaces.len(), 2);
1136        assert_eq!(parsed.logslope_surfaces[0].z_column, "z2");
1137        assert_eq!(parsed.logslope_surfaces[0].terms.len(), 1);
1138        assert_eq!(parsed.logslope_surfaces[1].z_column, "z3");
1139        assert_eq!(parsed.logslope_surfaces[1].terms.len(), 1);
1140    }
1141
1142    #[test]
1143    fn marginal_slope_z_column_validator_reserves_all_surface_z_columns() {
1144        let main = parse_formula("y ~ x").expect("parse main");
1145        let logslope = parse_formula("y ~ s(pc1) + logslope(z2, s(z3)) + logslope(z3, x)")
1146            .expect("parse logslope surfaces");
1147        let err = validate_marginal_slope_z_column_exclusion(
1148            &main,
1149            &logslope,
1150            "z1",
1151            "bernoulli marginal-slope",
1152            "--logslope-formula",
1153        )
1154        .expect_err("surface formula should reject another reserved z coordinate");
1155        assert!(err.contains("reserves z column 'z3'"));
1156    }
1157}
1158
1159// ---------------------------------------------------------------------------
1160// Higher-level formula parsing: ParsedFormula, ParsedTerm, and friends
1161// ---------------------------------------------------------------------------
1162
1163#[derive(Clone, Debug)]
1164pub struct LinkWiggleFormulaSpec {
1165    pub degree: usize,
1166    pub num_internal_knots: usize,
1167    pub penalty_orders: Vec<usize>,
1168    pub double_penalty: bool,
1169}
1170
1171pub fn default_linkwiggle_formulaspec() -> LinkWiggleFormulaSpec {
1172    let cfg = WigglePenaltyConfig::cubic_triple_operator_default();
1173    LinkWiggleFormulaSpec {
1174        degree: cfg.degree,
1175        num_internal_knots: cfg.num_internal_knots,
1176        penalty_orders: cfg.penalty_orders,
1177        double_penalty: cfg.double_penalty,
1178    }
1179}
1180
1181#[derive(Clone, Debug)]
1182pub struct LinkFormulaSpec {
1183    pub link: String,
1184    pub mixture_rho: Option<String>,
1185    pub sas_init: Option<String>,
1186    pub beta_logistic_init: Option<String>,
1187}
1188
1189#[derive(Clone, Debug)]
1190pub struct SurvivalFormulaSpec {
1191    pub spec: Option<String>,
1192    pub survival_distribution: Option<String>,
1193}
1194
1195#[derive(Clone, Debug)]
1196pub struct ParsedFormula {
1197    pub response: String,
1198    pub terms: Vec<ParsedTerm>,
1199    pub logslope_surfaces: Vec<LogSlopeSurfaceSpec>,
1200    pub linkwiggle: Option<LinkWiggleFormulaSpec>,
1201    pub timewiggle: Option<LinkWiggleFormulaSpec>,
1202    pub linkspec: Option<LinkFormulaSpec>,
1203    pub survivalspec: Option<SurvivalFormulaSpec>,
1204}
1205
1206#[derive(Clone, Debug)]
1207pub struct LogSlopeSurfaceSpec {
1208    pub z_column: String,
1209    pub terms: Vec<ParsedTerm>,
1210}
1211
1212pub fn marginal_slope_logslope_surfaces(
1213    logslope_formula: &ParsedFormula,
1214    default_z_column: &str,
1215) -> Result<Vec<LogSlopeSurfaceSpec>, String> {
1216    let mut surfaces = Vec::new();
1217    if !logslope_formula.terms.is_empty() {
1218        surfaces.push(LogSlopeSurfaceSpec {
1219            z_column: default_z_column.to_string(),
1220            terms: logslope_formula.terms.clone(),
1221        });
1222    }
1223    surfaces.extend(logslope_formula.logslope_surfaces.clone());
1224    if surfaces.is_empty() {
1225        surfaces.push(LogSlopeSurfaceSpec {
1226            z_column: default_z_column.to_string(),
1227            terms: Vec::new(),
1228        });
1229    }
1230    let mut seen = std::collections::BTreeSet::<String>::new();
1231    for surface in &surfaces {
1232        if !seen.insert(surface.z_column.clone()) {
1233            return Err(FormulaDslError::IncompatibleTerm {
1234                reason: format!(
1235                    "logslope formula declares z column '{}' more than once; each z coordinate needs exactly one log-slope surface",
1236                    surface.z_column
1237                ),
1238            }
1239            .into());
1240        }
1241    }
1242    Ok(surfaces)
1243}
1244
1245#[derive(Clone, Debug)]
1246pub enum ParsedTerm {
1247    Linear {
1248        name: String,
1249        explicit: bool,
1250        coefficient_min: Option<f64>,
1251        coefficient_max: Option<f64>,
1252    },
1253    BoundedLinear {
1254        name: String,
1255        min: f64,
1256        max: f64,
1257        prior: BoundedCoefficientPriorSpec,
1258    },
1259    RandomEffect {
1260        name: String,
1261    },
1262    Smooth {
1263        label: String,
1264        vars: Vec<String>,
1265        kind: SmoothKind,
1266        options: BTreeMap<String, String>,
1267    },
1268    LinkWiggle {
1269        options: BTreeMap<String, String>,
1270    },
1271    TimeWiggle {
1272        options: BTreeMap<String, String>,
1273    },
1274    LinkConfig {
1275        options: BTreeMap<String, String>,
1276    },
1277    SurvivalConfig {
1278        options: BTreeMap<String, String>,
1279    },
1280    LogSlopeSurface {
1281        z_column: String,
1282        terms: Vec<ParsedTerm>,
1283    },
1284    /// Wilkinson-Rogers interaction term `a:b[:c...]`.
1285    ///
1286    /// `vars` is a sorted, deduplicated list of base column names. Each element
1287    /// must be a bare identifier — interactions with function-call atoms
1288    /// (smooths, factors, etc.) are rejected upstream because their design
1289    /// columns are not simple products. The design column is the elementwise
1290    /// product of the referenced numeric columns.
1291    Interaction {
1292        vars: Vec<String>,
1293    },
1294}
1295
1296/// Collect the names of every data column the parsed terms consume.
1297///
1298/// This is the canonical formula→columns walk shared by the fit-time and
1299/// predict-time required-column computations (the CLI and PyFFI surfaces both
1300/// route through it). It includes a smooth's positional `vars` *and* its `by=`
1301/// grouping/scaling column (`s(x, by=g)`), which `term_builder` reads from
1302/// `options["by"]` but which is not among the positional variables — omitting
1303/// it would drop a genuine predictor from the model's input contract.
1304pub fn parsed_term_column_names(
1305    terms: &[ParsedTerm],
1306    out: &mut std::collections::BTreeSet<String>,
1307) {
1308    for term in terms {
1309        match term {
1310            ParsedTerm::Linear { name, .. }
1311            | ParsedTerm::BoundedLinear { name, .. }
1312            | ParsedTerm::RandomEffect { name } => {
1313                out.insert(name.clone());
1314            }
1315            ParsedTerm::Smooth { vars, options, .. } => {
1316                out.extend(vars.iter().cloned());
1317                if let Some(by) = options.get("by") {
1318                    out.insert(by.clone());
1319                }
1320            }
1321            ParsedTerm::Interaction { vars } => {
1322                out.extend(vars.iter().cloned());
1323            }
1324            ParsedTerm::LinkWiggle { .. }
1325            | ParsedTerm::TimeWiggle { .. }
1326            | ParsedTerm::LinkConfig { .. }
1327            | ParsedTerm::SurvivalConfig { .. } => {}
1328            ParsedTerm::LogSlopeSurface { z_column, terms } => {
1329                out.insert(z_column.clone());
1330                parsed_term_column_names(terms, out);
1331            }
1332        }
1333    }
1334}
1335
1336pub fn parsed_terms_reference_column(terms: &[ParsedTerm], column_name: &str) -> bool {
1337    terms.iter().any(|term| match term {
1338        ParsedTerm::Linear { name, .. }
1339        | ParsedTerm::BoundedLinear { name, .. }
1340        | ParsedTerm::RandomEffect { name } => name == column_name,
1341        ParsedTerm::Smooth { vars, options, .. } => {
1342            vars.iter().any(|var| var == column_name)
1343                || options.get("by").is_some_and(|by| by == column_name)
1344        }
1345        ParsedTerm::Interaction { vars } => vars.iter().any(|var| var == column_name),
1346        ParsedTerm::LinkWiggle { .. }
1347        | ParsedTerm::TimeWiggle { .. }
1348        | ParsedTerm::LinkConfig { .. }
1349        | ParsedTerm::SurvivalConfig { .. } => false,
1350        ParsedTerm::LogSlopeSurface { z_column, terms } => {
1351            z_column == column_name || parsed_terms_reference_column(terms, column_name)
1352        }
1353    })
1354}
1355
1356pub fn validate_marginal_slope_z_column_exclusion(
1357    main_formula: &ParsedFormula,
1358    logslope_formula: &ParsedFormula,
1359    z_column: &str,
1360    context: &str,
1361    logslope_label: &str,
1362) -> Result<(), String> {
1363    let surfaces = marginal_slope_logslope_surfaces(logslope_formula, z_column)?;
1364    // The CLI/configured z column is reserved even when the log-slope formula
1365    // is intercept-only (`~ 1`) and therefore contributes no surface terms.
1366    // Explicit logslope(...) declarations may reserve additional z coordinates.
1367    let mut reserved_z_columns = std::collections::BTreeSet::<&str>::new();
1368    reserved_z_columns.insert(z_column);
1369    reserved_z_columns.extend(surfaces.iter().map(|surface| surface.z_column.as_str()));
1370
1371    for reserved in &reserved_z_columns {
1372        if parsed_terms_reference_column(&main_formula.terms, reserved) {
1373            return Err(FormulaDslError::IncompatibleTerm {
1374                reason: format!(
1375                    "{context} reserves z column '{reserved}' as the auxiliary latent score; it cannot also appear in the main formula"
1376                ),
1377            }
1378            .into());
1379        }
1380    }
1381    for reserved in &reserved_z_columns {
1382        if parsed_terms_reference_column(&logslope_formula.terms, reserved) {
1383            return Err(FormulaDslError::IncompatibleTerm {
1384                reason: format!(
1385                    "{context} reserves z column '{reserved}' as the auxiliary latent score; it cannot also appear in {logslope_label}"
1386                ),
1387            }
1388            .into());
1389        }
1390        for surface in &surfaces {
1391            if parsed_terms_reference_column(&surface.terms, reserved) {
1392                return Err(FormulaDslError::IncompatibleTerm {
1393                    reason: format!(
1394                        "{context} reserves z column '{reserved}' as an auxiliary latent score; it cannot also appear in {logslope_label}"
1395                    ),
1396                }
1397                .into());
1398            }
1399        }
1400    }
1401    Ok(())
1402}
1403
1404#[derive(Clone, Copy, Debug)]
1405pub enum SmoothKind {
1406    S,
1407    Te,
1408    /// Tensor smooth (`t2(...)`) using mgcv's separable penalty decomposition:
1409    /// the tensor coefficient space is split into marginal penalized/null-space
1410    /// tensor subspaces, with one smoothing parameter per non-null subspace.
1411    T2,
1412    /// Tensor *interaction* smooth (`ti(...)`): a tensor-product smooth whose
1413    /// marginal main effects are excluded, so the term captures only the pure
1414    /// interaction between its variables. Materializes through the same tensor
1415    /// path as [`SmoothKind::Te`] but with per-margin sum-to-zero
1416    /// identifiability (`TensorBSplineIdentifiability::MarginalSumToZero`).
1417    Ti,
1418}
1419
1420#[derive(Clone, Copy, Debug)]
1421pub enum LinkMode {
1422    Strict,
1423    Flexible,
1424}
1425
1426#[derive(Clone, Debug)]
1427pub struct LinkChoice {
1428    pub mode: LinkMode,
1429    pub link: LinkFunction,
1430    pub mixture_components: Option<Vec<LinkComponent>>,
1431}
1432
1433// ---------------------------------------------------------------------------
1434// Link wiggle / link choice helpers
1435// ---------------------------------------------------------------------------
1436
1437pub fn effectivelinkwiggle_formulaspec(
1438    formula_linkwiggle: Option<&LinkWiggleFormulaSpec>,
1439    link_choice: Option<&LinkChoice>,
1440) -> Option<LinkWiggleFormulaSpec> {
1441    formula_linkwiggle.cloned().or_else(|| {
1442        link_choice.and_then(|choice| {
1443            if matches!(choice.mode, LinkMode::Flexible) {
1444                Some(default_linkwiggle_formulaspec())
1445            } else {
1446                None
1447            }
1448        })
1449    })
1450}
1451
1452pub const fn linkname_supports_joint_wiggle(link: LinkFunction) -> bool {
1453    !matches!(link, LinkFunction::Sas | LinkFunction::BetaLogistic)
1454}
1455
1456pub const fn linkchoice_supports_joint_wiggle(choice: &LinkChoice) -> bool {
1457    match &choice.mixture_components {
1458        None => linkname_supports_joint_wiggle(choice.link),
1459        Some(_) => false,
1460    }
1461}
1462
1463pub fn require_linkchoice_supports_joint_wiggle(
1464    choice: &LinkChoice,
1465    context: &str,
1466) -> Result<(), String> {
1467    if linkchoice_supports_joint_wiggle(choice) {
1468        Ok(())
1469    } else {
1470        Err(joint_wiggle_unsupported_link_message(context))
1471    }
1472}
1473
1474pub const fn likelihood_spec_supports_joint_wiggle(likelihood: &LikelihoodSpec) -> bool {
1475    inverse_link_supports_joint_wiggle(&likelihood.link)
1476}
1477
1478pub fn require_likelihood_spec_supports_joint_wiggle(
1479    likelihood: &LikelihoodSpec,
1480    context: &str,
1481) -> Result<(), String> {
1482    if likelihood_spec_supports_joint_wiggle(likelihood) {
1483        Ok(())
1484    } else {
1485        Err(joint_wiggle_unsupported_link_message(context))
1486    }
1487}
1488
1489pub const fn inverse_link_supports_joint_wiggle(link: &InverseLink) -> bool {
1490    matches!(
1491        link,
1492        InverseLink::Standard(StandardLink::Identity)
1493            | InverseLink::Standard(StandardLink::Log)
1494            | InverseLink::Standard(StandardLink::Logit)
1495            | InverseLink::Standard(StandardLink::Probit)
1496            | InverseLink::Standard(StandardLink::CLogLog)
1497    )
1498}
1499
1500pub fn require_inverse_link_supports_joint_wiggle(
1501    link: &InverseLink,
1502    context: &str,
1503) -> Result<(), String> {
1504    if inverse_link_supports_joint_wiggle(link) {
1505        Ok(())
1506    } else {
1507        Err(joint_wiggle_unsupported_link_message(context))
1508    }
1509}
1510
1511pub const fn binomial_inverse_link_supports_joint_wiggle(link: &InverseLink) -> bool {
1512    matches!(
1513        link,
1514        InverseLink::Standard(StandardLink::Logit)
1515            | InverseLink::Standard(StandardLink::Probit)
1516            | InverseLink::Standard(StandardLink::CLogLog)
1517    )
1518}
1519
1520pub fn require_binomial_inverse_link_supports_joint_wiggle(
1521    link: &InverseLink,
1522    context: &str,
1523) -> Result<(), String> {
1524    if binomial_inverse_link_supports_joint_wiggle(link) {
1525        Ok(())
1526    } else {
1527        Err(FormulaDslError::IncompatibleTerm {
1528            reason: format!(
1529                "{context} does not support identity, log, latent-cloglog, SAS, BetaLogistic, or Mixture links; wiggle is only available for jointly fitted logit/probit/cloglog links"
1530            ),
1531        }
1532        .into())
1533    }
1534}
1535
1536pub fn joint_wiggle_unsupported_link_message(context: &str) -> String {
1537    format!(
1538        "{context} does not support latent-cloglog, SAS, BetaLogistic, or Mixture links; wiggle is only available for jointly fitted standard links"
1539    )
1540}
1541
1542// ---------------------------------------------------------------------------
1543// Option-map helpers (shared by formula parsing and term construction)
1544// ---------------------------------------------------------------------------
1545
1546pub fn option_usize(map: &BTreeMap<String, String>, key: &str) -> Option<usize> {
1547    map.get(key).and_then(|v| v.parse::<usize>().ok())
1548}
1549
1550/// Local sibling of `term_builder::validate_known_options` used by the
1551/// parser-side `linear / bounded / constrain / nonnegative / nonpositive`
1552/// branches (which build their `ParsedTerm` here and never enter
1553/// `term_builder::build_smooth_basis`). Without this, typos like
1554/// `bounded(x, min=0, max=1, foo=bar)` silently succeed because the
1555/// `foo` key was just never read.
1556fn validate_known_term_options(
1557    term_name: &str,
1558    options: &BTreeMap<String, String>,
1559    known: &[&str],
1560    raw: &str,
1561) -> Result<(), String> {
1562    let known_set: std::collections::BTreeSet<&&str> = known.iter().collect();
1563    for key in options.keys() {
1564        if !known_set.contains(&key.as_str()) {
1565            let known_sorted = {
1566                let mut v = known.to_vec();
1567                v.sort_unstable();
1568                v.join(", ")
1569            };
1570            let known_hint = if known.is_empty() {
1571                "no options".to_string()
1572            } else {
1573                format!("[{known_sorted}]")
1574            };
1575            return Err(FormulaDslError::InvalidArgument {
1576                reason: format!(
1577                    "{term_name}() does not accept option `{key}` (in `{raw}`); known options: {known_hint}"
1578                ),
1579            }
1580            .into());
1581        }
1582    }
1583    Ok(())
1584}
1585
1586pub fn option_usize_any(map: &BTreeMap<String, String>, keys: &[&str]) -> Option<usize> {
1587    for key in keys {
1588        if let Some(v) = option_usize(map, key) {
1589            return Some(v);
1590        }
1591    }
1592    None
1593}
1594
1595/// Strict integer option: returns `Ok(None)` if not present, `Ok(Some(n))` if
1596/// it parses as a non-negative integer, and `Err(msg)` if the user supplied a
1597/// value that isn't a valid usize (negative, decimal, garbage). Without this
1598/// the lenient `option_usize` silently drops invalid values and reverts to
1599/// the default — `k=-1` and `k=1.5` were both accepted as "k not specified"
1600/// instead of being flagged as user mistakes.
1601pub fn option_usize_strict(
1602    map: &BTreeMap<String, String>,
1603    key: &str,
1604) -> Result<Option<usize>, String> {
1605    match map.get(key) {
1606        None => Ok(None),
1607        Some(raw) => raw.parse::<usize>().map(Some).map_err(|err| {
1608            FormulaDslError::InvalidArgument {
1609                reason: format!(
1610                    "option `{key}={raw}` is not a non-negative integer; \
1611                     expected a whole number >= 0: {err}"
1612                ),
1613            }
1614            .into()
1615        }),
1616    }
1617}
1618
1619/// Strict variant of `option_usize_any` that errors on the first present-but-
1620/// unparseable key rather than silently falling through.
1621pub fn option_usize_any_strict(
1622    map: &BTreeMap<String, String>,
1623    keys: &[&str],
1624) -> Result<Option<usize>, String> {
1625    for key in keys {
1626        if let Some(v) = option_usize_strict(map, key)? {
1627            return Ok(Some(v));
1628        }
1629    }
1630    Ok(None)
1631}
1632
1633pub fn option_f64(map: &BTreeMap<String, String>, key: &str) -> Option<f64> {
1634    map.get(key).and_then(|v| v.parse::<f64>().ok())
1635}
1636
1637/// Strict float option: `Ok(None)` if absent, `Ok(Some(n))` if parses as a
1638/// finite f64, `Err` if the user passed an unparseable value (rather than
1639/// silently dropping it like the lenient `option_f64`).
1640pub fn option_f64_strict(map: &BTreeMap<String, String>, key: &str) -> Result<Option<f64>, String> {
1641    match map.get(key) {
1642        None => Ok(None),
1643        Some(raw) => match raw.parse::<f64>() {
1644            Ok(v) if v.is_finite() => Ok(Some(v)),
1645            Ok(v) => Err(FormulaDslError::InvalidArgument {
1646                reason: format!("option `{key}={raw}` parses as {v} which is not a finite number"),
1647            }
1648            .into()),
1649            Err(err) => Err(FormulaDslError::InvalidArgument {
1650                reason: format!(
1651                    "option `{key}={raw}` is not a valid number; expected a finite decimal: {err}"
1652                ),
1653            }
1654            .into()),
1655        },
1656    }
1657}
1658
1659pub fn option_bool(map: &BTreeMap<String, String>, key: &str) -> Option<bool> {
1660    map.get(key)
1661        .and_then(|v| match v.trim().to_ascii_lowercase().as_str() {
1662            "true" | "1" | "yes" | "y" => Some(true),
1663            "false" | "0" | "no" | "n" => Some(false),
1664            _ => None,
1665        })
1666}
1667
1668/// Strict boolean option: `Ok(None)` if absent, `Ok(Some(b))` for a recognized
1669/// truthy/falsy token, and `Err(msg)` for a present-but-unparseable value. The
1670/// lenient `option_bool` maps an unrecognized value to `None`, which callers
1671/// then silently treat as "not specified" — masking user typos like
1672/// `double_penalty=ture`.
1673pub fn option_bool_strict(
1674    map: &BTreeMap<String, String>,
1675    key: &str,
1676) -> Result<Option<bool>, String> {
1677    match map.get(key) {
1678        None => Ok(None),
1679        Some(raw) => match raw.trim().to_ascii_lowercase().as_str() {
1680            "true" | "1" | "yes" | "y" => Ok(Some(true)),
1681            "false" | "0" | "no" | "n" => Ok(Some(false)),
1682            _ => Err(FormulaDslError::InvalidArgument {
1683                reason: format!(
1684                    "option `{key}={raw}` is not a boolean; \
1685                     expected one of true/false/yes/no/1/0"
1686                ),
1687            }
1688            .into()),
1689        },
1690    }
1691}
1692
1693pub fn strip_quotes(v: &str) -> &str {
1694    let b = v.as_bytes();
1695    if b.len() >= 2
1696        && ((b[0] == b'\'' && b[b.len() - 1] == b'\'') || (b[0] == b'"' && b[b.len() - 1] == b'"'))
1697    {
1698        &v[1..v.len() - 1]
1699    } else {
1700        v
1701    }
1702}
1703
1704// ---------------------------------------------------------------------------
1705// Sub-parsers for formula option blocks
1706// ---------------------------------------------------------------------------
1707
1708fn parse_linear_constraint_bounds(
1709    options: &BTreeMap<String, String>,
1710    raw: &str,
1711) -> Result<(Option<f64>, Option<f64>), String> {
1712    let min = parse_optional_f64_option_alias(options, &["min", "lower"], raw, "linear")?;
1713    let max = parse_optional_f64_option_alias(options, &["max", "upper"], raw, "linear")?;
1714    if let (Some(min), Some(max)) = (min, max)
1715        && (!min.is_finite() || !max.is_finite() || min > max)
1716    {
1717        return Err(FormulaDslError::InvalidArgument {
1718            reason: format!(
1719                "linear coefficient constraints require finite min <= max, got min={min}, max={max}: {raw}"
1720            ),
1721        }
1722        .into());
1723    }
1724    Ok((min, max))
1725}
1726
1727fn parse_required_f64_option(
1728    options: &BTreeMap<String, String>,
1729    key: &str,
1730    raw: &str,
1731) -> Result<f64, String> {
1732    let value = options
1733        .get(key)
1734        .ok_or_else(|| FormulaDslError::MalformedConfig {
1735            reason: format!("bounded() is missing required '{key}' argument: {raw}"),
1736        })?;
1737    value.parse::<f64>().map_err(|err| {
1738        FormulaDslError::InvalidArgument {
1739            reason: format!(
1740                "bounded() argument '{key}' must be a finite number, got '{}': {err}: {raw}",
1741                value
1742            ),
1743        }
1744        .into()
1745    })
1746}
1747
1748fn parse_optional_f64_option(
1749    options: &BTreeMap<String, String>,
1750    key: &str,
1751    raw: &str,
1752) -> Result<Option<f64>, String> {
1753    match options.get(key) {
1754        Some(value) => value.parse::<f64>().map(Some).map_err(|err| {
1755            FormulaDslError::InvalidArgument {
1756                reason: format!(
1757                    "bounded() argument '{key}' must be a finite number, got '{}': {err}: {raw}",
1758                    value
1759                ),
1760            }
1761            .into()
1762        }),
1763        None => Ok(None),
1764    }
1765}
1766
1767fn parse_optional_f64_option_alias(
1768    options: &BTreeMap<String, String>,
1769    keys: &[&str],
1770    raw: &str,
1771    fn_label: &str,
1772) -> Result<Option<f64>, String> {
1773    let mut found: Option<(&str, f64)> = None;
1774    for key in keys {
1775        if let Some(value) = options.get(*key) {
1776            let parsed = value
1777                .parse::<f64>()
1778                .map_err(|err| FormulaDslError::InvalidArgument {
1779                    reason: format!(
1780                        "{fn_label}() argument '{key}' must be a finite number, got '{}': {err}: {raw}",
1781                        value
1782                    ),
1783                })?;
1784            if found.is_some() {
1785                return Err(FormulaDslError::IncompatibleTerm {
1786                    reason: format!(
1787                        "{fn_label}() cannot specify both '{}' and '{}': {raw}",
1788                        found.expect("present").0,
1789                        key
1790                    ),
1791                }
1792                .into());
1793            }
1794            found = Some((key, parsed));
1795        }
1796    }
1797    Ok(found.map(|(_, v)| v))
1798}
1799
1800fn parse_linkwiggle_penalty_orders(raw: Option<&str>) -> Result<Vec<usize>, String> {
1801    let Some(raw) = raw.map(str::trim) else {
1802        return Ok(WigglePenaltyConfig::cubic_triple_operator_default().penalty_orders);
1803    };
1804    if raw.is_empty() {
1805        return Ok(WigglePenaltyConfig::cubic_triple_operator_default().penalty_orders);
1806    }
1807    let mut out = Vec::<usize>::new();
1808    for token in raw.split(',') {
1809        let t = token.trim().to_ascii_lowercase();
1810        if t.is_empty() {
1811            continue;
1812        }
1813        match t.as_str() {
1814            "all" => {
1815                out.extend([1, 2, 3]);
1816            }
1817            "slope" | "1" => out.push(1),
1818            "curvature" | "2" => out.push(2),
1819            "curvature-change" | "curvature_change" | "3" => out.push(3),
1820            _ => {
1821                return Err(FormulaDslError::InvalidArgument {
1822                    reason: format!(
1823                        "invalid linkwiggle penalty_order '{t}'; use all|slope|curvature|curvature-change or 1/2/3"
1824                    ),
1825                }
1826                .into());
1827            }
1828        }
1829    }
1830    if out.is_empty() {
1831        out.extend(WigglePenaltyConfig::cubic_triple_operator_default().penalty_orders);
1832    }
1833    out.sort_unstable();
1834    out.dedup();
1835    Ok(out)
1836}
1837
1838pub fn parse_linkwiggle_formulaspec(
1839    options: &BTreeMap<String, String>,
1840    raw: &str,
1841) -> Result<LinkWiggleFormulaSpec, String> {
1842    let allowed = [
1843        "degree",
1844        "internal_knots",
1845        "penalty_order",
1846        "double_penalty",
1847    ];
1848    let unknown = options
1849        .keys()
1850        .filter(|key| !allowed.contains(&key.as_str()))
1851        .cloned()
1852        .collect::<Vec<_>>();
1853    let term_name = raw.split('(').next().unwrap_or("linkwiggle");
1854    if !unknown.is_empty() {
1855        return Err(FormulaDslError::InvalidArgument {
1856            reason: format!(
1857                "{}() does not support option(s) {}: {raw}",
1858                term_name,
1859                unknown.join(", ")
1860            ),
1861        }
1862        .into());
1863    }
1864    let defaults = WigglePenaltyConfig::cubic_triple_operator_default();
1865    // Strict parsing: a present-but-unparseable value (`degree=abc`, `=-3`,
1866    // `=6.5`) must be rejected, not silently dropped and replaced by the
1867    // default as the lossy `option_usize`/`option_bool` readers would do.
1868    //
1869    // This parser is shared by *all* wiggle grammars: `linkwiggle` and
1870    // `timewiggle` (see `parse_formula`), the standard-model flexible-link
1871    // wiggle, and the marginal-slope score-warp / link-deviation routing.
1872    // The general monotone I-spline value basis (`monotone_wiggle_*` in
1873    // `families::gamlss`, used by `timewiggle` and the location-scale survival
1874    // path) honors arbitrary `degree >= 2`, while only the cubic-only
1875    // score-warp / link-deviation `DeviationRuntime` is restricted to 3.
1876    // A consumer-specific limit therefore must NOT be baked into this shared
1877    // parser — it is enforced at the routing layer that feeds the cubic-only
1878    // runtime (`deviation_block_config_from_formula_linkwiggle`). Here we only
1879    // enforce the universal lower bound that a polynomial degree is positive.
1880    let degree = option_usize_strict(options, "degree")?.unwrap_or(defaults.degree);
1881    if degree < 1 {
1882        return Err(FormulaDslError::InvalidArgument {
1883            reason: format!("{term_name}() requires degree >= 1: {raw}"),
1884        }
1885        .into());
1886    }
1887    let num_internal_knots =
1888        option_usize_strict(options, "internal_knots")?.unwrap_or(defaults.num_internal_knots);
1889    if num_internal_knots == 0 {
1890        return Err(FormulaDslError::InvalidArgument {
1891            reason: format!("{term_name}() requires internal_knots > 0: {raw}"),
1892        }
1893        .into());
1894    }
1895    let penalty_orders =
1896        parse_linkwiggle_penalty_orders(options.get("penalty_order").map(String::as_str))?;
1897    let double_penalty =
1898        option_bool_strict(options, "double_penalty")?.unwrap_or(defaults.double_penalty);
1899    Ok(LinkWiggleFormulaSpec {
1900        degree,
1901        num_internal_knots,
1902        penalty_orders,
1903        double_penalty,
1904    })
1905}
1906
1907fn parse_link_formulaspec(
1908    options: &BTreeMap<String, String>,
1909    raw: &str,
1910) -> Result<LinkFormulaSpec, String> {
1911    let link = options
1912        .get("type")
1913        .map(|s| s.trim().to_string())
1914        .ok_or_else(|| FormulaDslError::MalformedConfig {
1915            reason: format!("link() requires type=<link-name>: {raw}"),
1916        })?;
1917    if link.is_empty() {
1918        return Err(FormulaDslError::MalformedConfig {
1919            reason: format!("link() requires a non-empty type: {raw}"),
1920        }
1921        .into());
1922    }
1923    let mixture_rho = options.get("rho").map(|s| s.trim().to_string());
1924    let sas_init = options.get("sas_init").map(|s| s.trim().to_string());
1925    let beta_logistic_init = options
1926        .get("beta_logistic_init")
1927        .map(|s| s.trim().to_string());
1928    Ok(LinkFormulaSpec {
1929        link,
1930        mixture_rho,
1931        sas_init,
1932        beta_logistic_init,
1933    })
1934}
1935
1936fn parse_survival_formulaspec(
1937    options: &BTreeMap<String, String>,
1938    raw: &str,
1939) -> Result<SurvivalFormulaSpec, String> {
1940    if options.is_empty() {
1941        return Err(FormulaDslError::MalformedConfig {
1942            reason: format!(
1943                "survmodel() requires at least one named option (e.g., spec=..., distribution=...): {raw}"
1944            ),
1945        }
1946        .into());
1947    }
1948    Ok(SurvivalFormulaSpec {
1949        spec: options.get("spec").map(|s| s.trim().to_string()),
1950        survival_distribution: options.get("distribution").map(|s| s.trim().to_string()),
1951    })
1952}
1953
1954fn parse_bounded_priorspec(
1955    options: &BTreeMap<String, String>,
1956    min: f64,
1957    max: f64,
1958    raw: &str,
1959) -> Result<BoundedCoefficientPriorSpec, String> {
1960    let prior_mode = options.get("prior").map(|s| s.to_ascii_lowercase());
1961    let pull = options.get("pull").map(|s| s.to_ascii_lowercase());
1962    let target = parse_optional_f64_option(options, "target", raw)?;
1963    let strength = parse_optional_f64_option(options, "strength", raw)?;
1964
1965    let target_mode = target.is_some() || strength.is_some();
1966    if prior_mode.is_some() && pull.is_some() {
1967        return Err(FormulaDslError::IncompatibleTerm {
1968            reason: format!("bounded() cannot combine prior=... with pull=...: {raw}"),
1969        }
1970        .into());
1971    }
1972    if prior_mode.is_some() && target_mode {
1973        return Err(FormulaDslError::IncompatibleTerm {
1974            reason: format!("bounded() cannot combine prior=... with target/strength: {raw}"),
1975        }
1976        .into());
1977    }
1978    if pull.is_some() && target_mode {
1979        return Err(FormulaDslError::IncompatibleTerm {
1980            reason: format!("bounded() cannot combine pull=... with target/strength: {raw}"),
1981        }
1982        .into());
1983    }
1984
1985    if let Some(priorname) = prior_mode {
1986        return match priorname.as_str() {
1987            "none" => Ok(BoundedCoefficientPriorSpec::None),
1988            "uniform" | "log-jacobian" | "log_jacobian" | "jacobian" => {
1989                Ok(BoundedCoefficientPriorSpec::Uniform)
1990            }
1991            "center" => Ok(BoundedCoefficientPriorSpec::Beta { a: 2.0, b: 2.0 }),
1992            _ => Err(FormulaDslError::InvalidArgument {
1993                reason: format!(
1994                    "bounded() prior must currently be one of none|uniform|log-jacobian|center, got '{}': {raw}",
1995                    priorname
1996                ),
1997            }
1998            .into()),
1999        };
2000    }
2001
2002    if let Some(pull_mode) = pull {
2003        return match pull_mode.as_str() {
2004            "uniform" | "log-jacobian" | "log_jacobian" | "jacobian" => {
2005                Ok(BoundedCoefficientPriorSpec::Uniform)
2006            }
2007            "center" => Ok(BoundedCoefficientPriorSpec::Beta { a: 2.0, b: 2.0 }),
2008            _ => Err(FormulaDslError::InvalidArgument {
2009                reason: format!(
2010                    "bounded() pull must currently be 'uniform'/'log-jacobian' or 'center', got '{}': {raw}",
2011                    pull_mode
2012                ),
2013            }
2014            .into()),
2015        };
2016    }
2017
2018    if target_mode {
2019        let targetvalue = target.ok_or_else(|| FormulaDslError::MalformedConfig {
2020            reason: format!("bounded() target is required with strength: {raw}"),
2021        })?;
2022        let strengthvalue = strength.ok_or_else(|| FormulaDslError::MalformedConfig {
2023            reason: format!("bounded() strength is required with target: {raw}"),
2024        })?;
2025        if !(min < targetvalue && targetvalue < max) {
2026            return Err(FormulaDslError::InvalidArgument {
2027                reason: format!("bounded() target must lie strictly inside ({min}, {max}): {raw}"),
2028            }
2029            .into());
2030        }
2031        if !strengthvalue.is_finite() || strengthvalue <= 0.0 {
2032            return Err(FormulaDslError::InvalidArgument {
2033                reason: format!("bounded() strength must be finite and > 0: {raw}"),
2034            }
2035            .into());
2036        }
2037        let z = (targetvalue - min) / (max - min);
2038        let a = 1.0 + strengthvalue * z;
2039        let b = 1.0 + strengthvalue * (1.0 - z);
2040        return Ok(BoundedCoefficientPriorSpec::Beta { a, b });
2041    }
2042
2043    Ok(BoundedCoefficientPriorSpec::None)
2044}
2045
2046// ---------------------------------------------------------------------------
2047// Top-level formula and term parsers
2048// ---------------------------------------------------------------------------
2049
2050pub fn formula_rhs_text(formula: &str) -> Result<String, String> {
2051    let parsed = parse_formula_dsl(formula)?;
2052    if parsed.rhs_terms.is_empty() {
2053        return Err(FormulaDslError::ParseError {
2054            reason: "formula right-hand side cannot be empty".to_string(),
2055        }
2056        .into());
2057    }
2058    Ok(parsed.rhs_terms.join(" + "))
2059}
2060
2061/// Parsed Surv(...) response specification.
2062///
2063/// `entry` is `None` for the 2-arg right-censored shorthand
2064/// `Surv(time, event)`, which matches the R survival/mgcv default: every
2065/// subject has entry time zero. Callers materialize a zero entry column
2066/// when this is `None`.
2067pub fn parse_surv_response(
2068    lhs: &str,
2069) -> Result<Option<(Option<String>, String, String)>, FormulaDslError> {
2070    let trimmed = lhs.trim();
2071    let call = match parse_function_call(trimmed) {
2072        Ok(call) => call,
2073        Err(_) => return Ok(None),
2074    };
2075    if !call.name.eq_ignore_ascii_case("surv") {
2076        return Ok(None);
2077    }
2078    let vars = call
2079        .args
2080        .iter()
2081        .filter_map(|arg| match arg {
2082            CallArgSpec::Positional(v) => Some(v.trim().to_string()),
2083            CallArgSpec::Named { .. } => None,
2084        })
2085        .filter(|s| !s.is_empty())
2086        .collect::<Vec<_>>();
2087    match vars.len() {
2088        // Right-censored shorthand: Surv(time, event) ≡ Surv(0, time, event)
2089        // with a synthetic zero entry column. This matches R's
2090        // `survival::Surv(time, event)` default for left-truncation-free data.
2091        2 => Ok(Some((None, vars[0].clone(), vars[1].clone()))),
2092        3 => Ok(Some((
2093            Some(vars[0].clone()),
2094            vars[1].clone(),
2095            vars[2].clone(),
2096        ))),
2097        n => Err(FormulaDslError::InvalidArgument {
2098            reason: format!(
2099                "Surv(...) expects either Surv(time, event) (right-censored) or \
2100                 Surv(entry, exit, event) (left-truncated); got {n} columns"
2101            ),
2102        }),
2103    }
2104}
2105
2106/// Parsed `SurvInterval(L, R, event)` interval-censored response.
2107///
2108/// Returns `Some((left_col, right_col, event_col))` when the left-hand side is a
2109/// `SurvInterval(...)` call, `None` otherwise (so a plain `Surv(...)` or a bare
2110/// column response falls through to the other response parsers).
2111///
2112/// Interval censoring observes only a bracket `T ∈ (L, R]` — the exact event
2113/// time is never seen — and its row contribution is the survival-mass difference
2114/// `log[S(L) − S(R)]`, distinct from both the exact-event point density and the
2115/// single-sided right-censored survival. A *dedicated call name* (rather than
2116/// overloading the 3-argument `Surv(entry, exit, event)` delayed-entry form,
2117/// which is also 3-argument and semantically incompatible) is the unambiguous
2118/// DSL spelling: it mirrors flexsurv's `Surv(L, R, type="interval2")` intent
2119/// without colliding with the existing left-truncation grammar.
2120pub fn parse_surv_interval_response(
2121    lhs: &str,
2122) -> Result<Option<(String, String, String)>, FormulaDslError> {
2123    let trimmed = lhs.trim();
2124    let call = match parse_function_call(trimmed) {
2125        Ok(call) => call,
2126        Err(_) => return Ok(None),
2127    };
2128    if !call.name.eq_ignore_ascii_case("survinterval") {
2129        return Ok(None);
2130    }
2131    let vars = call
2132        .args
2133        .iter()
2134        .filter_map(|arg| match arg {
2135            CallArgSpec::Positional(v) => Some(v.trim().to_string()),
2136            CallArgSpec::Named { .. } => None,
2137        })
2138        .filter(|s| !s.is_empty())
2139        .collect::<Vec<_>>();
2140    match vars.len() {
2141        3 => Ok(Some((vars[0].clone(), vars[1].clone(), vars[2].clone()))),
2142        n => Err(FormulaDslError::InvalidArgument {
2143            reason: format!(
2144                "SurvInterval(...) expects SurvInterval(L, R, event) (interval-censored, the \
2145                 observed bracket T ∈ (L, R]); got {n} columns"
2146            ),
2147        }),
2148    }
2149}
2150
2151fn top_level_formula_separator(input: &str) -> Result<Option<usize>, String> {
2152    let mut depth = 0_i32;
2153    let mut in_single = false;
2154    let mut in_double = false;
2155
2156    for (idx, ch) in input.char_indices() {
2157        match ch {
2158            '\'' if !in_double => in_single = !in_single,
2159            '"' if !in_single => in_double = !in_double,
2160            '(' | '[' | '{' if !in_single && !in_double => depth += 1,
2161            ')' | ']' | '}' if !in_single && !in_double && depth > 0 => depth -= 1,
2162            '~' if !in_single && !in_double && depth == 0 => return Ok(Some(idx)),
2163            _ => {}
2164        }
2165    }
2166
2167    if in_single || in_double || depth != 0 {
2168        return Err(FormulaDslError::ParseError {
2169            reason: "invalid auxiliary formula syntax: unbalanced parentheses or quotes"
2170                .to_string(),
2171        }
2172        .into());
2173    }
2174    Ok(None)
2175}
2176
2177pub fn parse_matching_auxiliary_formula(
2178    formula: &str,
2179    response: &str,
2180    flag_name: &str,
2181) -> Result<(String, ParsedFormula), FormulaDslError> {
2182    let rhs = formula.trim();
2183    if top_level_formula_separator(rhs)?.is_some() {
2184        return Err(FormulaDslError::InvalidArgument {
2185            reason: format!(
2186                "{flag_name} expects only the terms after '~', not a full 'response ~ terms' formula; use {flag_name} 's(x)' instead of {flag_name} 'y ~ s(x)' (or pass '1' for an intercept-only noise model)"
2187            ),
2188        });
2189    }
2190    let parsed_formula = parse_formula(&format!("{response} ~ {rhs}"))?;
2191    Ok((rhs.to_string(), parsed_formula))
2192}
2193
2194pub fn validate_auxiliary_formula_controls(
2195    parsed_formula: &ParsedFormula,
2196    flag_name: &str,
2197) -> Result<(), String> {
2198    if parsed_formula.linkwiggle.is_some() {
2199        return Err(FormulaDslError::IncompatibleTerm {
2200            reason: format!(
2201                "linkwiggle(...) is only supported in the main formula, not {flag_name}"
2202            ),
2203        }
2204        .into());
2205    }
2206    if parsed_formula.timewiggle.is_some() {
2207        return Err(FormulaDslError::IncompatibleTerm {
2208            reason: format!(
2209                "timewiggle(...) is only supported in the main survival formula, not {flag_name}"
2210            ),
2211        }
2212        .into());
2213    }
2214    if parsed_formula.linkspec.is_some() {
2215        return Err(FormulaDslError::IncompatibleTerm {
2216            reason: format!("link(...) is only supported in the main formula, not {flag_name}"),
2217        }
2218        .into());
2219    }
2220    if parsed_formula.survivalspec.is_some() {
2221        return Err(FormulaDslError::IncompatibleTerm {
2222            reason: format!(
2223                "survmodel(...) is only supported in the main survival formula, not {flag_name}"
2224            ),
2225        }
2226        .into());
2227    }
2228    if !parsed_formula.logslope_surfaces.is_empty() && flag_name != "--logslope-formula" {
2229        return Err(FormulaDslError::IncompatibleTerm {
2230            reason: format!(
2231                "logslope(...) is only supported in --logslope-formula, not {flag_name}"
2232            ),
2233        }
2234        .into());
2235    }
2236    Ok::<(), _>(())
2237}
2238
2239pub fn parse_formula(formula: &str) -> Result<ParsedFormula, FormulaDslError> {
2240    let parsed_dsl =
2241        parse_formula_dsl(formula).map_err(|reason| FormulaDslError::ParseError { reason })?;
2242    let lhs = parsed_dsl.response_expr.trim();
2243    if lhs.is_empty() {
2244        return Err(FormulaDslError::ParseError {
2245            reason: "formula response (left-hand side) cannot be empty".to_string(),
2246        });
2247    }
2248    let mut terms = Vec::<ParsedTerm>::new();
2249    let mut linkwiggle: Option<LinkWiggleFormulaSpec> = None;
2250    let mut timewiggle: Option<LinkWiggleFormulaSpec> = None;
2251    let mut linkspec: Option<LinkFormulaSpec> = None;
2252    let mut survivalspec: Option<SurvivalFormulaSpec> = None;
2253    let mut logslope_surfaces = Vec::<LogSlopeSurfaceSpec>::new();
2254    // Track seen-term-keys so we can reject exact duplicates like
2255    // `y ~ smooth(x) + smooth(x)` upfront — without this the duplicate
2256    // produces a rank-deficient design and the user has no idea why their
2257    // fit is over-parameterized.
2258    let mut seen_term_keys: std::collections::BTreeSet<String> = std::collections::BTreeSet::new();
2259    let mut expanded_terms = Vec::<String>::new();
2260    for raw in parsed_dsl.rhs_terms {
2261        let trimmed = raw.trim();
2262        if trimmed.is_empty() {
2263            expanded_terms.push(String::new());
2264            continue;
2265        }
2266        // Single function-call terms (smooths, group(), etc.) are opaque to
2267        // WR expansion; pass them through verbatim so parse_term sees the
2268        // exact source string. Bare identifiers and any term mentioning
2269        // `:`, `*`, `/`, `^` are routed through the AST-driven WR expander.
2270        let is_call = parse_function_call(trimmed).is_ok();
2271        let needs_expansion = !is_call
2272            && trimmed
2273                .chars()
2274                .scan(0i32, |depth, ch| {
2275                    let d_before = *depth;
2276                    match ch {
2277                        '(' | '[' | '{' => *depth += 1,
2278                        ')' | ']' | '}' if *depth > 0 => *depth -= 1,
2279                        _ => {}
2280                    }
2281                    Some((d_before, ch))
2282                })
2283                .any(|(d, ch)| d == 0 && matches!(ch, ':' | '*' | '/' | '^'));
2284        if needs_expansion {
2285            for atoms in
2286                expand_wr_term(trimmed).map_err(|reason| FormulaDslError::ParseError { reason })?
2287            {
2288                if atoms.is_empty() {
2289                    continue;
2290                }
2291                expanded_terms.push(atoms.join(":"));
2292            }
2293        } else {
2294            expanded_terms.push(trimmed.to_string());
2295        }
2296    }
2297
2298    for raw in expanded_terms {
2299        let t = raw.trim();
2300        if t.is_empty() || t == "1" {
2301            continue;
2302        }
2303        if t == "0" || t == "-1" {
2304            return Err(FormulaDslError::IncompatibleTerm {
2305                reason: "formula terms '0'/'-1' (intercept removal) are not supported yet"
2306                    .to_string(),
2307            });
2308        }
2309        // Normalize whitespace so `smooth(x)` and `smooth( x )` match,
2310        // but preserve whitespace inside string literals so that
2311        // `bs="a b"` and `bs="ab"` do not collide.
2312        let key: String = {
2313            let mut acc = String::with_capacity(t.len());
2314            let mut in_single = false;
2315            let mut in_double = false;
2316            for ch in t.chars() {
2317                match ch {
2318                    '\'' if !in_double => {
2319                        in_single = !in_single;
2320                        acc.push(ch);
2321                    }
2322                    '"' if !in_single => {
2323                        in_double = !in_double;
2324                        acc.push(ch);
2325                    }
2326                    c if c.is_whitespace() && !in_single && !in_double => {}
2327                    _ => acc.push(ch),
2328                }
2329            }
2330            acc
2331        };
2332        if !seen_term_keys.insert(key.clone()) {
2333            return Err(FormulaDslError::IncompatibleTerm {
2334                reason: format!(
2335                    "formula `{formula}` lists term `{t}` more than once. \
2336                     Duplicate terms produce a rank-deficient design; \
2337                     keep one copy or differentiate them (e.g. distinct k=, bs= options)."
2338                ),
2339            });
2340        }
2341        match parse_term(t)? {
2342            ParsedTerm::LinkWiggle { options } => {
2343                if linkwiggle.is_some() {
2344                    return Err(FormulaDslError::IncompatibleTerm {
2345                        reason: "formula can include at most one linkwiggle(...) term".to_string(),
2346                    });
2347                }
2348                linkwiggle = Some(parse_linkwiggle_formulaspec(&options, t)?);
2349            }
2350            ParsedTerm::TimeWiggle { options } => {
2351                if timewiggle.is_some() {
2352                    return Err(FormulaDslError::IncompatibleTerm {
2353                        reason: "formula can include at most one timewiggle(...) term".to_string(),
2354                    });
2355                }
2356                timewiggle = Some(parse_linkwiggle_formulaspec(&options, t)?);
2357            }
2358            ParsedTerm::LinkConfig { options } => {
2359                if linkspec.is_some() {
2360                    return Err(FormulaDslError::IncompatibleTerm {
2361                        reason: "formula can include at most one link(...) term".to_string(),
2362                    });
2363                }
2364                linkspec = Some(parse_link_formulaspec(&options, t)?);
2365            }
2366            ParsedTerm::SurvivalConfig { options } => {
2367                if survivalspec.is_some() {
2368                    return Err(FormulaDslError::IncompatibleTerm {
2369                        reason: "formula can include at most one survmodel(...) term".to_string(),
2370                    });
2371                }
2372                survivalspec = Some(parse_survival_formulaspec(&options, t)?);
2373            }
2374            ParsedTerm::LogSlopeSurface { z_column, terms } => {
2375                logslope_surfaces.push(LogSlopeSurfaceSpec { z_column, terms });
2376            }
2377            other => terms.push(other),
2378        }
2379    }
2380    // Reject self-referential formulas like `y ~ smooth(y)` or `y ~ y`: the
2381    // response is its own predictor, which is a trivial identity fit and
2382    // almost certainly a user mistake. Only flag the simple-identifier case
2383    // (so Surv(entry, exit, event) ~ smooth(time) is left alone — the
2384    // response is the Surv triple, not the bare "time" column).
2385    if lhs.chars().all(|c| c.is_alphanumeric() || c == '_')
2386        && !lhs.is_empty()
2387        && parsed_terms_reference_column(&terms, lhs)
2388    {
2389        return Err(FormulaDslError::IncompatibleTerm {
2390            reason: format!(
2391                "formula `{formula}` uses response column `{lhs}` as its own predictor. \
2392                 This fits y as a function of itself and is almost certainly a typo. \
2393                 Drop the term that mentions `{lhs}` from the right-hand side."
2394            ),
2395        });
2396    }
2397    Ok(ParsedFormula {
2398        response: lhs.to_string(),
2399        terms,
2400        logslope_surfaces,
2401        linkwiggle,
2402        timewiggle,
2403        linkspec,
2404        survivalspec,
2405    })
2406}
2407
2408pub fn parse_term(raw: &str) -> Result<ParsedTerm, String> {
2409    fn split_call_args(call: &FunctionCallSpec) -> (Vec<String>, BTreeMap<String, String>) {
2410        let mut vars = Vec::<String>::new();
2411        let mut options = BTreeMap::<String, String>::new();
2412        for arg in &call.args {
2413            match arg {
2414                CallArgSpec::Positional(v) => vars.push(v.trim().to_string()),
2415                CallArgSpec::Named { key, value } => {
2416                    options.insert(key.to_ascii_lowercase(), strip_quotes(value).to_string());
2417                }
2418            }
2419        }
2420        (vars, options)
2421    }
2422
2423    // Wilkinson-Rogers `:` interaction term. The expander in `parse_formula`
2424    // produces `a:b[:c...]` for these; parse_term is also reached directly
2425    // from tests, so handle the syntax here as well.
2426    if raw.contains(':')
2427        && !raw.contains('(')
2428        && raw.split(':').all(|piece| is_exact_ident(piece.trim()))
2429    {
2430        let vars: Vec<String> = raw
2431            .split(':')
2432            .map(|piece| piece.trim().to_string())
2433            .collect();
2434        if vars.len() >= 2 {
2435            let mut sorted = vars.clone();
2436            sorted.sort();
2437            sorted.dedup();
2438            if sorted.len() != vars.len() {
2439                return Err(FormulaDslError::IncompatibleTerm {
2440                    reason: format!(
2441                        "interaction term `{raw}` references the same variable more than once"
2442                    ),
2443                }
2444                .into());
2445            }
2446            return Ok(ParsedTerm::Interaction { vars: sorted });
2447        }
2448    }
2449
2450    let call = parse_function_call(raw).ok();
2451    if let Some(call) = call {
2452        let name = call.name.to_ascii_lowercase();
2453        let (vars, mut options) = split_call_args(&call);
2454        match name.as_str() {
2455            "constrain" | "constraint" | "box" => {
2456                if vars.len() != 1 {
2457                    return Err(FormulaDslError::InvalidArgument {
2458                        reason: format!(
2459                            "constrain()/constraint()/box() expects exactly one variable: {raw}"
2460                        ),
2461                    }
2462                    .into());
2463                }
2464                validate_known_term_options(
2465                    "constrain",
2466                    &options,
2467                    &["min", "lower", "max", "upper"],
2468                    raw,
2469                )?;
2470                let (coefficient_min, coefficient_max) =
2471                    parse_linear_constraint_bounds(&options, raw)?;
2472                if coefficient_min.is_none() && coefficient_max.is_none() {
2473                    return Err(FormulaDslError::MalformedConfig {
2474                        reason: format!(
2475                            "constrain()/constraint()/box() requires at least one of min/lower/max/upper: {raw}"
2476                        ),
2477                    }
2478                    .into());
2479                }
2480                return Ok(ParsedTerm::Linear {
2481                    name: vars[0].clone(),
2482                    explicit: true,
2483                    coefficient_min,
2484                    coefficient_max,
2485                });
2486            }
2487            "nonnegative" | "nonnegative_coef" => {
2488                if vars.len() != 1 {
2489                    return Err(FormulaDslError::InvalidArgument {
2490                        reason: format!("nonnegative() expects exactly one variable: {raw}"),
2491                    }
2492                    .into());
2493                }
2494                validate_known_term_options("nonnegative", &options, &[], raw)?;
2495                return Ok(ParsedTerm::Linear {
2496                    name: vars[0].clone(),
2497                    explicit: true,
2498                    coefficient_min: Some(0.0),
2499                    coefficient_max: None,
2500                });
2501            }
2502            "nonpositive" | "nonpositive_coef" => {
2503                if vars.len() != 1 {
2504                    return Err(FormulaDslError::InvalidArgument {
2505                        reason: format!("nonpositive() expects exactly one variable: {raw}"),
2506                    }
2507                    .into());
2508                }
2509                validate_known_term_options("nonpositive", &options, &[], raw)?;
2510                return Ok(ParsedTerm::Linear {
2511                    name: vars[0].clone(),
2512                    explicit: true,
2513                    coefficient_min: None,
2514                    coefficient_max: Some(0.0),
2515                });
2516            }
2517            "bounded" => {
2518                if vars.len() != 1 {
2519                    return Err(FormulaDslError::InvalidArgument {
2520                        reason: format!("bounded() expects exactly one variable: {raw}"),
2521                    }
2522                    .into());
2523                }
2524                validate_known_term_options(
2525                    "bounded",
2526                    &options,
2527                    &["min", "max", "prior", "pull", "target", "strength"],
2528                    raw,
2529                )?;
2530                let min = parse_required_f64_option(&options, "min", raw)?;
2531                let max = parse_required_f64_option(&options, "max", raw)?;
2532                if !min.is_finite() || !max.is_finite() || min >= max {
2533                    return Err(FormulaDslError::InvalidArgument {
2534                        reason: format!(
2535                            "bounded() requires finite min < max, got min={min}, max={max}: {raw}"
2536                        ),
2537                    }
2538                    .into());
2539                }
2540                let prior = parse_bounded_priorspec(&options, min, max, raw)?;
2541                return Ok(ParsedTerm::BoundedLinear {
2542                    name: vars[0].clone(),
2543                    min,
2544                    max,
2545                    prior,
2546                });
2547            }
2548            "group" | "re" | "factor" => {
2549                if vars.len() != 1 {
2550                    return Err(FormulaDslError::InvalidArgument {
2551                        reason: format!(
2552                            "{name}() expects exactly one variable, got '{}': {raw}",
2553                            vars.join(",")
2554                        ),
2555                    }
2556                    .into());
2557                }
2558                return Ok(ParsedTerm::RandomEffect {
2559                    name: vars[0].clone(),
2560                });
2561            }
2562            "tensor" | "interaction" | "te" => {
2563                if vars.len() < 2 {
2564                    return Err(FormulaDslError::InvalidArgument {
2565                        reason: format!(
2566                            "tensor()/interaction()/te() requires at least two variables: {raw}"
2567                        ),
2568                    }
2569                    .into());
2570                }
2571                return Ok(ParsedTerm::Smooth {
2572                    label: raw.to_string(),
2573                    vars,
2574                    kind: SmoothKind::Te,
2575                    options,
2576                });
2577            }
2578            "t2" => {
2579                if vars.len() < 2 {
2580                    return Err(FormulaDslError::InvalidArgument {
2581                        reason: format!("t2() requires at least two variables: {raw}"),
2582                    }
2583                    .into());
2584                }
2585                return Ok(ParsedTerm::Smooth {
2586                    label: raw.to_string(),
2587                    vars,
2588                    kind: SmoothKind::T2,
2589                    options,
2590                });
2591            }
2592            "ti" => {
2593                // Tensor interaction smooth (mgcv `ti`): structurally a
2594                // tensor-product smooth, but the marginal main effects are
2595                // excluded so only the pure interaction is modeled. Shares the
2596                // tensor materialization path with `te`; the distinct
2597                // `SmoothKind::Ti` drives per-margin sum-to-zero
2598                // identifiability in the term builder.
2599                if vars.len() < 2 {
2600                    return Err(FormulaDslError::InvalidArgument {
2601                        reason: format!("ti() requires at least two variables: {raw}"),
2602                    }
2603                    .into());
2604                }
2605                return Ok(ParsedTerm::Smooth {
2606                    label: raw.to_string(),
2607                    vars,
2608                    kind: SmoothKind::Ti,
2609                    options,
2610                });
2611            }
2612            "fs" | "sz" => {
2613                if vars.len() != 2 {
2614                    return Err(format!("{}() expects exactly two variables: {raw}", name));
2615                }
2616                options.insert("bs".to_string(), name.clone());
2617                return Ok(ParsedTerm::Smooth {
2618                    label: raw.to_string(),
2619                    vars,
2620                    kind: SmoothKind::S,
2621                    options,
2622                });
2623            }
2624            "thinplate" | "thin_plate" | "tps" => {
2625                if vars.len() < 2 {
2626                    return Err(FormulaDslError::InvalidArgument {
2627                        reason: format!(
2628                            "thinplate()/thin_plate()/tps() requires at least two variables: {raw}"
2629                        ),
2630                    }
2631                    .into());
2632                }
2633                options.insert("type".to_string(), "tps".to_string());
2634                return Ok(ParsedTerm::Smooth {
2635                    label: raw.to_string(),
2636                    vars,
2637                    kind: SmoothKind::S,
2638                    options,
2639                });
2640            }
2641            "smooth" | "s" | "cyclic" | "periodic" | "cc" | "cp" => {
2642                if vars.is_empty() {
2643                    return Err(FormulaDslError::InvalidArgument {
2644                        reason: format!("smooth()/s() requires at least one variable: {raw}"),
2645                    }
2646                    .into());
2647                }
2648                // mgcv idiom: `s(g, bs='re')` with a single variable is a
2649                // random intercept on the factor `g`. Route it to the
2650                // dedicated random-effect machinery (which expects a single
2651                // categorical column) rather than to the factor-smooth path
2652                // (which requires a numeric companion).
2653                let bs_is_re = options
2654                    .get("bs")
2655                    .or_else(|| options.get("type"))
2656                    .map(|v| {
2657                        v.trim()
2658                            .trim_matches(|c| c == '\'' || c == '"')
2659                            .to_ascii_lowercase()
2660                    })
2661                    .as_deref()
2662                    == Some("re");
2663                if bs_is_re && vars.len() == 1 {
2664                    return Ok(ParsedTerm::RandomEffect {
2665                        name: vars[0].clone(),
2666                    });
2667                }
2668                if matches!(name.as_str(), "cyclic" | "periodic" | "cc" | "cp") {
2669                    options.insert("type".to_string(), "cyclic".to_string());
2670                }
2671                if matches!(name.as_str(), "fs" | "sz") {
2672                    options.insert("bs".to_string(), name.clone());
2673                }
2674                return Ok(ParsedTerm::Smooth {
2675                    label: raw.to_string(),
2676                    vars,
2677                    kind: SmoothKind::S,
2678                    options,
2679                });
2680            }
2681            "sphere" | "sos" | "spherical" | "s2" => {
2682                // `s2()` is an alias for the intrinsic S² (sphere) smooth, just
2683                // like `sphere()`/`sos()`/`spherical()`. All four share the
2684                // Wahba/harmonic sphere basis, so they must dispatch through
2685                // the identical `type=sphere` route; otherwise `s2()` would
2686                // silently fall back to a generic Euclidean 2-D smooth over
2687                // (lat, lon) and diverge in the spatial-kappa optimizer.
2688                if vars.len() != 2 {
2689                    return Err(FormulaDslError::InvalidArgument {
2690                        reason: format!(
2691                            "{name}() expects exactly two variables: latitude and longitude; got {} in {raw}",
2692                            vars.len()
2693                        ),
2694                    }
2695                    .into());
2696                }
2697                options.insert("type".to_string(), "sphere".to_string());
2698                return Ok(ParsedTerm::Smooth {
2699                    label: raw.to_string(),
2700                    vars,
2701                    kind: SmoothKind::S,
2702                    options,
2703                });
2704            }
2705            "mjs" | "measurejet" | "measure_jet" | "web" => {
2706                // Measure-jet spline smooth (`basis::measure_jet_smooth` docs)
2707                // for responses varying along an unknown low-dimensional set
2708                // inside a higher-dimensional ambient space. All aliases
2709                // dispatch through the identical `type=measurejet` route,
2710                // mirroring the sphere/curvature alias rule.
2711                if vars.is_empty() {
2712                    return Err(FormulaDslError::InvalidArgument {
2713                        reason: format!("{name}() requires at least one variable: {raw}"),
2714                    }
2715                    .into());
2716                }
2717                options.insert("type".to_string(), "measurejet".to_string());
2718                return Ok(ParsedTerm::Smooth {
2719                    label: raw.to_string(),
2720                    vars,
2721                    kind: SmoothKind::S,
2722                    options,
2723                });
2724            }
2725            "curv" | "curvature" | "constant_curvature" | "mkappa" => {
2726                // Constant-curvature (M_κ) geodesic-kernel smooth (#944): the
2727                // κ-generic sibling of sphere()/s2(), interpolating
2728                // S^d → ℝ^d → H^d through `kappa=` (default 0 = flat). All
2729                // four aliases must dispatch through the identical
2730                // `type=curvature` route, mirroring the sphere alias rule.
2731                if vars.is_empty() {
2732                    return Err(FormulaDslError::InvalidArgument {
2733                        reason: format!("{name}() requires at least one variable: {raw}"),
2734                    }
2735                    .into());
2736                }
2737                options.insert("type".to_string(), "curvature".to_string());
2738                return Ok(ParsedTerm::Smooth {
2739                    label: raw.to_string(),
2740                    vars,
2741                    kind: SmoothKind::S,
2742                    options,
2743                });
2744            }
2745            "matern" => {
2746                if vars.is_empty() {
2747                    return Err(FormulaDslError::InvalidArgument {
2748                        reason: format!("matern() requires at least one variable: {raw}"),
2749                    }
2750                    .into());
2751                }
2752                options.insert("type".to_string(), "matern".to_string());
2753                return Ok(ParsedTerm::Smooth {
2754                    label: raw.to_string(),
2755                    vars,
2756                    kind: SmoothKind::S,
2757                    options,
2758                });
2759            }
2760            "duchon" => {
2761                if vars.is_empty() {
2762                    return Err(FormulaDslError::InvalidArgument {
2763                        reason: format!("duchon() requires at least one variable: {raw}"),
2764                    }
2765                    .into());
2766                }
2767                if option_bool(&options, "cyclic").unwrap_or(false)
2768                    || option_bool(&options, "periodic").unwrap_or(false)
2769                {
2770                    options.insert("cyclic".to_string(), "true".to_string());
2771                }
2772                options.insert("type".to_string(), "duchon".to_string());
2773                return Ok(ParsedTerm::Smooth {
2774                    label: raw.to_string(),
2775                    vars,
2776                    kind: SmoothKind::S,
2777                    options,
2778                });
2779            }
2780            "pca" => {
2781                if vars.is_empty() {
2782                    return Err(FormulaDslError::InvalidArgument {
2783                        reason: format!("pca() requires at least one variable: {raw}"),
2784                    }
2785                    .into());
2786                }
2787                options.insert("type".to_string(), "pca".to_string());
2788                return Ok(ParsedTerm::Smooth {
2789                    label: raw.to_string(),
2790                    vars,
2791                    kind: SmoothKind::S,
2792                    options,
2793                });
2794            }
2795            "linkwiggle" => {
2796                if !vars.is_empty() {
2797                    return Err(FormulaDslError::InvalidArgument {
2798                        reason: format!(
2799                            "linkwiggle() takes named options only; positional args are not supported: {raw}"
2800                        ),
2801                    }
2802                    .into());
2803                }
2804                return Ok(ParsedTerm::LinkWiggle { options });
2805            }
2806            "timewiggle" => {
2807                if !vars.is_empty() {
2808                    return Err(FormulaDslError::InvalidArgument {
2809                        reason: format!(
2810                            "timewiggle() takes named options only; positional args are not supported: {raw}"
2811                        ),
2812                    }
2813                    .into());
2814                }
2815                return Ok(ParsedTerm::TimeWiggle { options });
2816            }
2817            "link" => {
2818                if !vars.is_empty() {
2819                    return Err(FormulaDslError::InvalidArgument {
2820                        reason: format!(
2821                            "link() takes named options only; positional args are not supported: {raw}"
2822                        ),
2823                    }
2824                    .into());
2825                }
2826                return Ok(ParsedTerm::LinkConfig { options });
2827            }
2828            "survmodel" => {
2829                if !vars.is_empty() {
2830                    return Err(FormulaDslError::InvalidArgument {
2831                        reason: format!(
2832                            "survmodel() takes named options only; positional args are not supported: {raw}"
2833                        ),
2834                    }
2835                    .into());
2836                }
2837                return Ok(ParsedTerm::SurvivalConfig { options });
2838            }
2839            "logslope" | "log_slope" | "log_slope_surface" => {
2840                validate_known_term_options("logslope", &options, &[], raw)?;
2841                if vars.len() < 2 {
2842                    return Err(FormulaDslError::InvalidArgument {
2843                        reason: format!(
2844                            "logslope() expects a z column followed by one or more RHS terms; add one logslope(z, ...) declaration per vector-z coordinate: {raw}"
2845                        ),
2846                    }
2847                    .into());
2848                }
2849                let z_column = vars[0].trim();
2850                if !is_exact_ident(z_column) {
2851                    return Err(FormulaDslError::InvalidArgument {
2852                        reason: format!(
2853                            "logslope() z column must be a bare column name, got `{z_column}` in {raw}"
2854                        ),
2855                    }
2856                    .into());
2857                }
2858                let rhs = vars[1..].join(" + ");
2859                let parsed = parse_formula(&format!("__logslope__ ~ {rhs}"))?;
2860                if !parsed.logslope_surfaces.is_empty() {
2861                    return Err(FormulaDslError::IncompatibleTerm {
2862                        reason: format!(
2863                            "logslope() declarations cannot be nested inside another logslope(): {raw}"
2864                        ),
2865                    }
2866                    .into());
2867                }
2868                validate_auxiliary_formula_controls(&parsed, "logslope()")?;
2869                return Ok(ParsedTerm::LogSlopeSurface {
2870                    z_column: z_column.to_string(),
2871                    terms: parsed.terms,
2872                });
2873            }
2874            "linear" => {
2875                if vars.len() != 1 {
2876                    return Err(FormulaDslError::InvalidArgument {
2877                        reason: format!("linear() expects exactly one variable: {raw}"),
2878                    }
2879                    .into());
2880                }
2881                validate_known_term_options(
2882                    "linear",
2883                    &options,
2884                    &["min", "lower", "max", "upper"],
2885                    raw,
2886                )?;
2887                let (coefficient_min, coefficient_max) =
2888                    parse_linear_constraint_bounds(&options, raw)?;
2889                return Ok(ParsedTerm::Linear {
2890                    name: vars[0].clone(),
2891                    explicit: true,
2892                    coefficient_min,
2893                    coefficient_max,
2894                });
2895            }
2896            _ => {
2897                return Err(format!(
2898                    "unknown term function `{name}` in '{raw}'. Supported: bounded(), linear(), constrain()/constraint()/box(), nonnegative(), nonpositive(), smooth()/s(), cyclic()/periodic()/cc()/cp(), thinplate()/thin_plate()/tps(), tensor()/interaction()/te(), t2(), ti(), fs(), sz(), group()/re()/factor(), sphere()/sos()/spherical(), s2(), matern(), duchon(), pca(), logslope()/log_slope(), linkwiggle(), timewiggle(), link(), survmodel()"
2899                ));
2900            }
2901        }
2902    }
2903
2904    let ident = raw.trim();
2905    if !is_exact_ident(ident) {
2906        return Err(FormulaDslError::UnknownIdentifier {
2907            reason: format!("unsupported top-level RHS term: {raw}"),
2908        }
2909        .into());
2910    }
2911
2912    Ok(ParsedTerm::Linear {
2913        name: ident.to_string(),
2914        explicit: false,
2915        coefficient_min: None,
2916        coefficient_max: None,
2917    })
2918}
2919
2920// ---------------------------------------------------------------------------
2921// Link choice parsing
2922// ---------------------------------------------------------------------------
2923
2924pub fn parse_link_choice(
2925    raw: Option<&str>,
2926    flexible_flag: bool,
2927) -> Result<Option<LinkChoice>, FormulaDslError> {
2928    if raw.is_none() && !flexible_flag {
2929        return Ok(None);
2930    }
2931    let Some(v) = raw else {
2932        return Ok(Some(LinkChoice {
2933            mode: LinkMode::Flexible,
2934            link: LinkFunction::Probit,
2935            mixture_components: None,
2936        }));
2937    };
2938    let t = v.trim().to_ascii_lowercase();
2939    if let Some(inner) = t
2940        .strip_prefix("flexible(")
2941        .and_then(|s| s.strip_suffix(')'))
2942    {
2943        if let Some(components_inner) = inner
2944            .strip_prefix("blended(")
2945            .and_then(|s| s.strip_suffix(')'))
2946            .or_else(|| {
2947                inner
2948                    .strip_prefix("mixture(")
2949                    .and_then(|s| s.strip_suffix(')'))
2950            })
2951        {
2952            parse_link_component_list(components_inner)?;
2953            return Err(FormulaDslError::IncompatibleTerm {
2954                reason:
2955                    "flexible(...) does not support blended(...)/mixture(...) links; wiggle is only supported for jointly fit standard links"
2956                        .to_string(),
2957            });
2958        }
2959        let link = parse_linkname(inner)?;
2960        if !linkname_supports_joint_wiggle(link) {
2961            return Err(FormulaDslError::IncompatibleTerm {
2962                reason:
2963                    "flexible(...) does not support sas/beta-logistic links; wiggle is only supported for jointly fit standard links"
2964                        .to_string(),
2965            });
2966        }
2967        return Ok(Some(LinkChoice {
2968            mode: LinkMode::Flexible,
2969            link,
2970            mixture_components: None,
2971        }));
2972    }
2973    if let Some(inner) = t
2974        .strip_prefix("blended(")
2975        .and_then(|s| s.strip_suffix(')'))
2976        .or_else(|| t.strip_prefix("mixture(").and_then(|s| s.strip_suffix(')')))
2977    {
2978        if flexible_flag {
2979            return Err(FormulaDslError::IncompatibleTerm {
2980                reason:
2981                    "--flexible-link cannot be combined with --link blended(...)/mixture(...); blended inverse links are not flexible-link mode"
2982                        .to_string(),
2983            });
2984        }
2985        let components = parse_link_component_list(inner)?;
2986        return Ok(Some(LinkChoice {
2987            mode: LinkMode::Strict,
2988            link: LinkFunction::Logit,
2989            mixture_components: Some(components),
2990        }));
2991    }
2992
2993    let link = parse_linkname(&t)?;
2994    if flexible_flag && !linkname_supports_joint_wiggle(link) {
2995        return Err(FormulaDslError::IncompatibleTerm {
2996            reason:
2997                "--flexible-link does not support sas/beta-logistic links; wiggle is only supported for jointly fit standard links"
2998                    .to_string(),
2999        });
3000    }
3001    Ok(Some(LinkChoice {
3002        mode: if flexible_flag {
3003            LinkMode::Flexible
3004        } else {
3005            LinkMode::Strict
3006        },
3007        link,
3008        mixture_components: None,
3009    }))
3010}
3011
3012pub fn parse_linkname(v: &str) -> Result<LinkFunction, FormulaDslError> {
3013    match v.trim() {
3014        "identity" => Ok(LinkFunction::Identity),
3015        "log" => Ok(LinkFunction::Log),
3016        "logit" | "binomial-logit" => Ok(LinkFunction::Logit),
3017        "probit" | "binomial-probit" => Ok(LinkFunction::Probit),
3018        "cloglog" | "binomial-cloglog" => Ok(LinkFunction::CLogLog),
3019        "sas" => Ok(LinkFunction::Sas),
3020        "beta-logistic" => Ok(LinkFunction::BetaLogistic),
3021        other => Err(FormulaDslError::UnknownIdentifier {
3022            reason: format!(
3023                "unsupported link type '{other}'; \
3024                 use one of identity|log|logit|probit|cloglog|binomial-logit|binomial-probit|binomial-cloglog|sas|beta-logistic|blended(...)/mixture(...) or flexible(...). \
3025                 Both `--link <type>` (CLI flag) and `link(type=<type>)` (formula term) accept the same set."
3026            ),
3027        }),
3028    }
3029}
3030
3031pub fn parse_link_component(v: &str) -> Result<LinkComponent, String> {
3032    match v.trim() {
3033        "logit" => Ok(LinkComponent::Logit),
3034        "probit" => Ok(LinkComponent::Probit),
3035        "cloglog" => Ok(LinkComponent::CLogLog),
3036        "loglog" => Ok(LinkComponent::LogLog),
3037        "cauchit" => Ok(LinkComponent::Cauchit),
3038        other => Err(FormulaDslError::UnknownIdentifier {
3039            reason: format!(
3040                "unsupported blended-link component '{other}'; use probit|logit|cloglog|loglog|cauchit"
3041            ),
3042        }
3043        .into()),
3044    }
3045}
3046
3047pub fn parse_link_component_list(v: &str) -> Result<Vec<LinkComponent>, String> {
3048    let mut out = Vec::new();
3049    for part in v.split(',') {
3050        let trimmed = part.trim();
3051        if trimmed.is_empty() {
3052            continue;
3053        }
3054        let comp = parse_link_component(trimmed)?;
3055        if out.contains(&comp) {
3056            return Err(FormulaDslError::IncompatibleTerm {
3057                reason: "blended(...) cannot contain duplicate components".to_string(),
3058            }
3059            .into());
3060        }
3061        out.push(comp);
3062    }
3063    if out.len() < 2 {
3064        return Err(FormulaDslError::InvalidArgument {
3065            reason: "blended(...) requires at least two components".to_string(),
3066        }
3067        .into());
3068    }
3069    Ok(out)
3070}