badness 0.0.1

An LSP, formatter, and linter for LaTeX
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
//! The formatter entry points and the CST → [`Ir`] lowering.
//!
//! Implemented rules:
//! - **Whitespace normalization**: trailing whitespace is trimmed, runs of 2+
//!   blank lines collapse to a single blank line, and the document ends with
//!   exactly one newline.
//! - **Environment indentation**: the body of `\begin{…} … \end{…}` is indented
//!   one step, nesting recursively, with `\begin`/`\end` flush. All indentation
//!   is computed by the printer, never preserved from input — so reformatting
//!   re-indents idempotently.
//! - **Group/argument indentation**: the body of a *multi-line* brace group
//!   `{…}` or optional-argument group `[…]` is indented one step, the same way
//!   (delimiters flush, body indented). Single-line groups are left inline;
//!   existing line breaks are respected.
//! - **Prose-argument reflow** (under [`WrapMode::Reflow`]): an argument the
//!   signature DB marks `prose` (a `\footnote`/`\caption` body, a sectioning
//!   title) is reflowed to the line width like a paragraph — joined when it fits,
//!   wrapped when it does not (see [`lower_command`] / [`lower_prose_group`]).
//!   Non-prose groups (`\newcommand` body, `\label`) are left as authored.
//!
//! Everything else is emitted verbatim: paragraph structure, intra-line spacing,
//! and protected regions (`\verb`, verbatim bodies, comments) are preserved.
//!
//! The mechanism flows entirely through the Wadler [`Ir`]: each maximal run of
//! `WHITESPACE`/`NEWLINE` trivia is replaced by a single break primitive
//! ([`Ir::hard_line`] for one newline, [`Ir::empty_line`] for a blank line),
//! whose printer (`super::printer`) defers indentation and so drops trailing
//! whitespace for free, and [`Ir::indent`] raises the indent inside environment
//! and multi-line group bodies.
//!
//! The lowering (`lower_node`) is the LaTeX-specific part that replaces arity's
//! R `ir_expr_node` dispatch; the surrounding `format`/`format_with_style`
//! framework mirrors arity's `src/formatter/core.rs`.

use std::iter::Peekable;

use crate::ast::{command_name, environment_name};
use crate::parser::parse;
use crate::semantic::{ArgKind, ArgSpec, Signatures, scan_definitions};
use crate::syntax::{SyntaxElement, SyntaxKind, SyntaxNode, SyntaxToken};

use super::context::FormatContext;
use super::ir::Ir;
use super::printer::Printer;
use super::style::{FormatStyle, WrapMode};

/// Why a document could not be formatted. The formatter only operates on a clean
/// parse: anything the parser flagged, or any `ERROR` token, is refused rather
/// than silently reshaped.
///
/// Both variants are produced in this module: `ParseErrors` by
/// [`format_with_style`] and `UnsupportedConstruct` by the `ERROR`-token check
/// that [`format_node`] runs before lowering.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum FormatError {
    /// The input parsed with `count` syntax error(s); the formatter only
    /// supports input the parser accepts without diagnostics.
    ParseErrors { count: usize },
    /// The CST contains an `ERROR` token the lowering does not handle. `kind`
    /// is the offending token's kind and `snippet` is that token's text.
    UnsupportedConstruct { kind: SyntaxKind, snippet: String },
}

impl std::fmt::Display for FormatError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::ParseErrors { count } => write!(
                f,
                "input contains {count} parser diagnostic(s); formatter only supports parseable input"
            ),
            Self::UnsupportedConstruct { kind, snippet } => {
                write!(
                    f,
                    "unsupported construct for formatter: {kind:?} near {snippet:?}"
                )
            }
        }
    }
}

// Marker impl with no overridden methods: `FormatError` wraps no underlying
// error, so the trait's defaults apply.
impl std::error::Error for FormatError {}

/// Format `input` using [`FormatStyle::default`]; a thin convenience wrapper
/// around [`format_with_style`], whose errors it returns unchanged.
pub fn format(input: &str) -> Result<String, FormatError> {
    let style = FormatStyle::default();
    format_with_style(input, style)
}

/// Parse `input` and format it under `style`.
///
/// # Errors
///
/// Returns [`FormatError::ParseErrors`] (carrying the number of parser errors)
/// when the parse is not clean; otherwise forwards whatever [`format_node`]
/// returns. Note: badness's [`crate::parser::Parse`] carries `errors` +
/// `syntax()` (arity uses `diagnostics` + `cst`).
pub fn format_with_style(input: &str, style: FormatStyle) -> Result<String, FormatError> {
    let parsed = parse(input);
    match parsed.errors.len() {
        // Clean parse: hand the tree to the reparse-free entry point.
        0 => format_node(&parsed.syntax(), style),
        count => Err(FormatError::ParseErrors { count }),
    }
}

/// Format an already-parsed CST `root` under `style`, without re-running the
/// parser. This is the entry the language server uses with its salsa-cached
/// tree (`db.parsed_tree`). The caller owns the `ParseErrors` guard: this
/// function assumes the parse was clean and only enforces the `ERROR`-token
/// invariant ([`validate_supported_tokens`]). [`format_with_style`] is the
/// parse-then-format convenience wrapper.
///
/// # Errors
///
/// Returns [`FormatError::UnsupportedConstruct`] if the tree holds an `ERROR`
/// token.
pub fn format_node(root: &SyntaxNode, style: FormatStyle) -> Result<String, FormatError> {
    validate_supported_tokens(root)?;

    let mut out = format_root(root, FormatContext::new(style));
    // Normalize the tail of the document: strip trailing blank lines and
    // trailing spaces/tabs at EOF, then end with exactly one newline. Only these
    // four ASCII characters are stripped, so other trailing characters (e.g. a
    // non-breaking space) are kept. Output that is empty after trimming stays
    // empty.
    let kept = out
        .trim_end_matches(|c: char| matches!(c, ' ' | '\t' | '\n' | '\r'))
        .len();
    out.truncate(kept);
    if kept > 0 {
        out.push('\n');
    }
    Ok(out)
}

/// Refuse any `ERROR` token. A clean parse should contain none, but the parser
/// can emit them on recovery; the formatter never reshapes around them.
fn validate_supported_tokens(root: &SyntaxNode) -> Result<(), FormatError> {
    for element in root.descendants_with_tokens() {
        let Some(token) = element.into_token() else {
            continue;
        };
        if token.kind() == SyntaxKind::ERROR {
            return Err(FormatError::UnsupportedConstruct {
                kind: token.kind(),
                snippet: token.text().to_string(),
            });
        }
    }
    Ok(())
}

/// Lower `root` to IR and print it with the style held by `ctx`.
fn format_root(root: &SyntaxNode, ctx: FormatContext) -> String {
    // The document's own `\newcommand`/`\newenvironment`/xparse definitions are
    // scanned once up front, so a locally-defined environment's arity resolves
    // during lowering (not only the built-in DB's). `definitions` must outlive
    // the `Signatures` overlay that borrows it.
    let definitions = scan_definitions(root);
    let signatures = Signatures::new(&definitions);
    let wrap = ctx.style().wrap;

    let document = lower_node(root, LowerCtx { wrap, signatures });
    Printer::new(ctx.style()).print(&document)
}

/// The state threaded through every lowering call: the active [`WrapMode`] plus the
/// per-document [`Signatures`] overlay (scanned definitions over the built-in DB)
/// that [`lower_begin`] consults for environment arity. `Copy`, so it passes by
/// value like the bare `wrap` mode it replaced.
///
/// Built once per document in [`format_root`].
#[derive(Clone, Copy)]
struct LowerCtx<'a> {
    /// Wrap mode taken from the active style; `WrapMode::Reflow` enables the
    /// paragraph and prose-argument reflow paths in [`lower_node`].
    wrap: WrapMode,
    /// Signature lookup used for environment arity ([`lower_begin`]) and for
    /// finding prose arguments of commands ([`command_has_prose_arg`]).
    signatures: Signatures<'a>,
}

/// Lower one CST node to IR, dispatching on its kind. The first matching case
/// wins:
///
/// - a [`SyntaxKind::PARAGRAPH`] under [`WrapMode::Reflow`] is wrapped to the
///   line width ([`lower_paragraph_reflow`]);
/// - a [`SyntaxKind::ENVIRONMENT`] without a verbatim body has its body indented
///   ([`lower_environment`]);
/// - a [`SyntaxKind::COMMAND`] with a prose argument, under
///   [`WrapMode::Reflow`], goes through [`lower_command`];
/// - a `GROUP` / `OPTIONAL` that directly contains a newline is indented via
///   [`lower_bracketed`];
/// - everything else lowers generically through [`lower_element_stream`].
///
/// `cx` (wrap mode + signature overlay) is passed down unchanged so it reaches
/// every nested paragraph, including those inside environment and group bodies.
fn lower_node(node: &SyntaxNode, cx: LowerCtx<'_>) -> Ir {
    let reflowing = cx.wrap == WrapMode::Reflow;
    match node.kind() {
        SyntaxKind::PARAGRAPH if reflowing => lower_paragraph_reflow(node, cx),
        SyntaxKind::ENVIRONMENT if !has_verbatim_body(node) => lower_environment(node, cx),
        SyntaxKind::COMMAND if reflowing && command_has_prose_arg(node, cx) => {
            lower_command(node, cx)
        }
        SyntaxKind::GROUP if spans_multiple_lines(node) => {
            lower_bracketed(node, SyntaxKind::L_BRACE, SyntaxKind::R_BRACE, cx)
        }
        SyntaxKind::OPTIONAL if spans_multiple_lines(node) => {
            lower_bracketed(node, SyntaxKind::L_BRACKET, SyntaxKind::R_BRACKET, cx)
        }
        _ => Ir::concat(lower_element_stream(node.children_with_tokens(), cx)),
    }
}

/// Lower a [`SyntaxKind::PARAGRAPH`] under [`WrapMode::Reflow`] by handing its
/// direct children to [`reflow_elements`], which greedily wraps the prose to the
/// line width.
///
/// In short: adjacent non-whitespace elements glue into one unbreakable atom (so
/// `Hello,` and `\emph{x}` never split), and inter-word whitespace — or a lone
/// newline, since a paragraph holds no blank lines — is a break opportunity. An
/// explicit `\\` line break (a [`SyntaxKind::LINE_BREAK`] node, which carries its
/// `*` / `[len]`), a `%` comment, and a nested block whose IR carries a forced
/// break each end the current line; the resulting lines are joined by
/// [`Ir::hard_line`].
fn lower_paragraph_reflow(node: &SyntaxNode, cx: LowerCtx<'_>) -> Ir {
    let children = node.children_with_tokens();
    reflow_elements(children, cx)
}

/// Greedily reflow a stream of inline elements to the line width, the shared core
/// of paragraph reflow ([`lower_paragraph_reflow`]) and prose-argument reflow
/// ([`lower_prose_group`]). Maximal runs of *adjacent* non-whitespace elements glue
/// into one unbreakable *atom* (so `Hello,` and `\emph{x}` never split); inter-word
/// whitespace or a lone newline is a break opportunity. A run of atoms lowers to an
/// [`Ir::fill`], which the printer wraps word-by-word.
///
/// Three things end a fill line rather than flow into it: an explicit `\\` line
/// break (a [`SyntaxKind::LINE_BREAK`] node), a `%` comment (which must terminate
/// its line), and a nested *block* (an environment or multi-line group whose IR
/// carries a forced break). Each commits the run-so-far as a fill, then a fresh run
/// continues after, the lines joined by [`Ir::hard_line`].
///
/// Unlike a `PARAGRAPH` (which holds no blank lines by construction), an argument
/// *group* body may contain blank-line paragraph breaks; a blank-line trivia run
/// ends the current line and separates the next with an [`Ir::empty_line`].
fn reflow_elements(elements: impl Iterator<Item = SyntaxElement>, cx: LowerCtx<'_>) -> Ir {
    // Glued pieces of the atom in progress.
    let mut atom: Vec<Ir> = Vec::new();
    // Atoms of the current fill run (the current logical line).
    let mut run: Vec<Ir> = Vec::new();
    // Completed lines (fills and blocks), interleaved with `seps` at the end.
    let mut lines: Vec<Ir> = Vec::new();
    // The separator *preceding* each committed line (`seps[0]` is unused). A blank
    // line in the source promotes the next separator to an [`Ir::empty_line`].
    let mut seps: Vec<Ir> = Vec::new();
    // The separator to record before the next committed line. Default: one break.
    let mut pending_sep: Ir = Ir::hard_line();

    /// Commit the atom in progress (if any) as one atom of the current run.
    fn flush_atom(atom: &mut Vec<Ir>, run: &mut Vec<Ir>) {
        if !atom.is_empty() {
            run.push(Ir::concat(atom.drain(..)));
        }
    }
    /// Commit `content` as the next logical line, recording the separator before
    /// it and resetting `pending_sep` to a single break.
    fn push_segment(content: Ir, lines: &mut Vec<Ir>, seps: &mut Vec<Ir>, pending_sep: &mut Ir) {
        seps.push(std::mem::replace(pending_sep, Ir::hard_line()));
        lines.push(content);
    }
    /// End the current logical line: flush the atom and, when non-empty, commit the
    /// run as a fill segment.
    fn end_line(
        atom: &mut Vec<Ir>,
        run: &mut Vec<Ir>,
        lines: &mut Vec<Ir>,
        seps: &mut Vec<Ir>,
        pending_sep: &mut Ir,
    ) {
        flush_atom(atom, run);
        if !run.is_empty() {
            push_segment(Ir::fill(run.drain(..)), lines, seps, pending_sep);
        }
    }

    let mut iter = elements.peekable();
    while let Some(element) = iter.next() {
        match element {
            // Whitespace / newline run: an atom boundary. A blank line additionally
            // ends the line and promotes the next separator to a blank line.
            SyntaxElement::Token(token) if is_collapsible_trivia(token.kind()) => {
                let (newlines, _) = consume_trivia_run(&token, &mut iter);
                if newlines >= 2 {
                    end_line(&mut atom, &mut run, &mut lines, &mut seps, &mut pending_sep);
                    pending_sep = Ir::empty_line();
                } else {
                    flush_atom(&mut atom, &mut run);
                }
            }
            // A comment rides the end of the current line, then forces a break.
            SyntaxElement::Token(token) if token.kind() == SyntaxKind::COMMENT => {
                atom.push(Ir::verbatim(token.text()));
                end_line(&mut atom, &mut run, &mut lines, &mut seps, &mut pending_sep);
            }
            // A token that carries its own newline — a `\`-at-end-of-line control
            // symbol, kept verbatim for losslessness — ends the line: emit the
            // part before the break as a flat atom and let the line break supply
            // the newline, so the result reparses to the same token (idempotent)
            // instead of leaving an unbreakable multi-line atom inside the fill.
            SyntaxElement::Token(token) if token.text().contains('\n') => {
                let before = token.text().split_once('\n').map(|(b, _)| b).unwrap_or("");
                if !before.is_empty() {
                    atom.push(Ir::verbatim(before));
                }
                end_line(&mut atom, &mut run, &mut lines, &mut seps, &mut pending_sep);
            }
            // Any other token (WORD, `~`, `&`, `#`, `^`, `_`, brackets, `\verb`,
            // a bare control symbol) glues onto the current atom.
            SyntaxElement::Token(token) => atom.push(Ir::verbatim(token.text())),
            // An explicit `\\` line break (with its `*` / `[len]`, grouped by the
            // parser into one node) rides the end of the current line, then breaks.
            SyntaxElement::Node(child) if child.kind() == SyntaxKind::LINE_BREAK => {
                atom.push(lower_node(&child, cx));
                end_line(&mut atom, &mut run, &mut lines, &mut seps, &mut pending_sep);
            }
            SyntaxElement::Node(child) => {
                let ir = lower_node(&child, cx);
                if ir.contains_forced_break() {
                    // A block amid prose: end the current line, then place the
                    // block on its own line(s); a fresh run continues after.
                    end_line(&mut atom, &mut run, &mut lines, &mut seps, &mut pending_sep);
                    push_segment(ir, &mut lines, &mut seps, &mut pending_sep);
                } else {
                    atom.push(ir);
                }
            }
        }
    }
    end_line(&mut atom, &mut run, &mut lines, &mut seps, &mut pending_sep);

    // Interleave the recorded separators between committed lines.
    let mut result: Vec<Ir> = Vec::with_capacity(lines.len().saturating_mul(2));
    for (i, line) in lines.into_iter().enumerate() {
        if i > 0 {
            result.push(seps[i].clone());
        }
        result.push(line);
    }
    Ir::concat(result)
}

/// Lower a stream of elements generically, producing one IR piece per node,
/// per non-trivia token, and per maximal trivia run:
///
/// - child nodes recurse through [`lower_node`];
/// - a maximal run of `WHITESPACE`/`NEWLINE` tokens is consumed as a unit and
///   turned into a single piece by [`classify_trivia`];
/// - every other token (including comments and the protected `\verb`/verbatim
///   tokens) is emitted verbatim.
///
/// Comments are not collapsible trivia, so a comment splits a trivia run and the
/// runs on either side are classified independently.
fn lower_element_stream(
    elements: impl Iterator<Item = SyntaxElement>,
    cx: LowerCtx<'_>,
) -> Vec<Ir> {
    let mut lowered = Vec::new();
    // Not a `for` loop: `consume_trivia_run` advances `stream` itself.
    let mut stream = elements.peekable();
    while let Some(element) = stream.next() {
        let piece = match element {
            SyntaxElement::Node(child) => lower_node(&child, cx),
            SyntaxElement::Token(token) => {
                if is_collapsible_trivia(token.kind()) {
                    let (newlines, trailing_ws) = consume_trivia_run(&token, &mut stream);
                    classify_trivia(newlines, trailing_ws)
                } else {
                    Ir::verbatim(token.text())
                }
            }
        };
        lowered.push(piece);
    }
    lowered
}

/// Lower a `\begin{…} … \end{…}` environment with its body indented one step.
///
/// The `BEGIN` child is lowered via [`lower_begin`], the `END` child via
/// [`lower_node`], and every other child forms the body. The body is lowered
/// generically, stripped of its own leading and trailing breaks, and then
/// wrapped as `hard_line + body` inside [`Ir::indent`], followed by a
/// [`Ir::hard_line`] outside the indent so `\end` sits flush with `\begin`.
/// Because the printer owns all indentation and the body's edge breaks are
/// trimmed first, re-indentation is idempotent. An empty body yields `\begin`
/// and `\end` on consecutive lines.
///
/// Verbatim-like environments never reach here (their opaque `VERBATIM_BODY`
/// token would be corrupted by reflow); [`lower_node`] routes them to the
/// generic path, which emits the body verbatim.
fn lower_environment(node: &SyntaxNode, cx: LowerCtx<'_>) -> Ir {
    let mut begin = Ir::Nil;
    let mut end = Ir::Nil;
    let mut body_elements: Vec<SyntaxElement> = Vec::new();

    for element in node.children_with_tokens() {
        // Peel off the framing nodes; everything else belongs to the body.
        if let SyntaxElement::Node(child) = &element {
            match child.kind() {
                SyntaxKind::BEGIN => {
                    begin = lower_begin(child, cx);
                    continue;
                }
                SyntaxKind::END => {
                    end = lower_node(child, cx);
                    continue;
                }
                _ => {}
            }
        }
        body_elements.push(element);
    }

    let lowered = Ir::concat(lower_element_stream(body_elements.into_iter(), cx));
    match trim_trailing_break(trim_leading_break(lowered)) {
        // Empty body: keep `\begin` and `\end` on their own lines.
        Ir::Nil => Ir::concat([begin, Ir::hard_line(), end]),
        body => Ir::concat([
            begin,
            Ir::indent(Ir::concat([Ir::hard_line(), body])),
            Ir::hard_line(),
            end,
        ]),
    }
}

/// Lower a `\begin{name}` node so that the environment's *declared* argument
/// groups stay on the header line, even if the source put a line break before
/// them. For example `\begin{tabular}\n{cc}` renders as `\begin{tabular}{cc}`.
///
/// The arity is the number of arguments in the environment's signature, looked
/// up through `cx.signatures` (the per-document overlay, so a document's own
/// `\newenvironment{thm}[1]…` is honored like a built-in `tabular`). Children
/// are walked in order: `GROUP`/`OPTIONAL` nodes count towards the arity, other
/// nodes and non-trivia tokens are kept, and whitespace/newline tokens are
/// dropped. Once `arity` groups have been taken, the remaining children —
/// which the greedy parser may have over-attached — are lowered generically by
/// [`lower_element_stream`].
///
/// The whole node takes the generic [`lower_node`] path instead when the
/// environment is unknown or takes no arguments, or when the header has a
/// comment token as a direct child (gluing across a `%` comment would let it
/// swallow the next line).
fn lower_begin(begin: &SyntaxNode, cx: LowerCtx<'_>) -> Ir {
    let signature = environment_name(begin).and_then(|name| cx.signatures.environment(&name));
    let arity = match signature {
        Some(sig) => sig.args.len(),
        None => 0,
    };
    let has_comment = begin.children_with_tokens().any(|element| match element {
        SyntaxElement::Token(token) => token.kind() == SyntaxKind::COMMENT,
        SyntaxElement::Node(_) => false,
    });
    if has_comment || arity == 0 {
        return lower_node(begin, cx);
    }

    let mut head: Vec<Ir> = Vec::new();
    let mut children = begin.children_with_tokens();
    // Number of declared argument groups still to be glued onto the header.
    let mut remaining = arity;
    while remaining > 0 {
        let Some(element) = children.next() else {
            break;
        };
        match element {
            SyntaxElement::Node(child) => {
                // Argument groups count towards the arity; any other node (the
                // `\begin` control word, the `{name}` group) just stays on the
                // line.
                if matches!(child.kind(), SyntaxKind::GROUP | SyntaxKind::OPTIONAL) {
                    remaining -= 1;
                }
                head.push(lower_node(&child, cx));
            }
            SyntaxElement::Token(token) => {
                // Header breaks/whitespace are dropped so the arguments glue to
                // `\begin{name}`; other tokens are kept verbatim.
                if !is_collapsible_trivia(token.kind()) {
                    head.push(Ir::verbatim(token.text()));
                }
            }
        }
    }

    // Whatever follows the last declared argument lowers generically.
    head.extend(lower_element_stream(children, cx));
    Ir::concat(head)
}

/// Lower a multi-line delimited group — a brace group `{…}` (`open`/`close` =
/// `L_BRACE`/`R_BRACE`) or an optional-argument group `[…]`
/// (`L_BRACKET`/`R_BRACKET`) — with its body indented one step. The shape
/// mirrors [`lower_environment`], but the frame is a pair of delimiter tokens
/// rather than `BEGIN`/`END` nodes. [`lower_node`] only calls this for groups
/// that satisfy [`spans_multiple_lines`]; single-line groups stay inline on the
/// generic path.
///
/// Only direct child tokens are considered as delimiters. The first token of
/// kind `open` becomes the opener; a later `open` token (an `OPTIONAL` body may
/// contain a stray `[`, since TeX does not nest `[`) is treated as body content.
/// Any direct token of kind `close` is taken as the closer. A body that is empty
/// after trimming its edge breaks collapses to the bare delimiters.
fn lower_bracketed(node: &SyntaxNode, open: SyntaxKind, close: SyntaxKind, cx: LowerCtx<'_>) -> Ir {
    let mut open_ir = Ir::Nil;
    let mut close_ir = Ir::Nil;
    let mut body_elements: Vec<SyntaxElement> = Vec::new();

    for element in node.children_with_tokens() {
        if let SyntaxElement::Token(token) = &element {
            let kind = token.kind();
            // Capture the opener only once (`open_ir` still `Nil`).
            if kind == open && matches!(open_ir, Ir::Nil) {
                open_ir = Ir::verbatim(token.text());
                continue;
            }
            if kind == close {
                close_ir = Ir::verbatim(token.text());
                continue;
            }
        }
        body_elements.push(element);
    }

    let lowered = Ir::concat(lower_element_stream(body_elements.into_iter(), cx));
    match trim_trailing_break(trim_leading_break(lowered)) {
        // Empty multi-line body collapses to the bare delimiters, e.g. `{\n}` → `{}`.
        Ir::Nil => Ir::concat([open_ir, close_ir]),
        body => Ir::concat([
            open_ir,
            Ir::indent(Ir::concat([Ir::hard_line(), body])),
            Ir::hard_line(),
            close_ir,
        ]),
    }
}

/// Report whether the signature found for `command` (via `cx.signatures`) marks
/// at least one argument as prose. Returns `false` when the command has no name
/// or no known signature. [`lower_node`] uses this as the guard for the
/// [`lower_command`] path, so commands without a prose argument keep lowering
/// generically.
fn command_has_prose_arg(command: &SyntaxNode, cx: LowerCtx<'_>) -> bool {
    let Some(sig) = command_name(command).and_then(|name| cx.signatures.command(&name)) else {
        return false;
    };
    sig.args.iter().any(|spec| spec.prose)
}

/// Lower a `COMMAND` whose signature marks an argument as prose
/// ([`command_has_prose_arg`] gates this path in [`lower_node`]).
///
/// Each attached `{…}` / `[…]` group is matched to a signature slot with
/// [`match_arg_slot`], which is delimiter-aware: an omitted optional does not
/// shift positions, so `\section{Title}` binds the `{title}` slot rather than a
/// leading `[short]`. A group that lands on a prose slot is reflowed through
/// [`lower_prose_group`]. Everything else — groups on non-prose slots, groups
/// with no matching slot, other child nodes, trivia runs, and plain tokens — is
/// lowered the same way [`lower_element_stream`] would.
fn lower_command(node: &SyntaxNode, cx: LowerCtx<'_>) -> Ir {
    let Some(sig) = command_name(node).and_then(|name| cx.signatures.command(&name)) else {
        // Defensive: the guard already proved a prose signature exists.
        return Ir::concat(lower_element_stream(node.children_with_tokens(), cx));
    };

    let mut out: Vec<Ir> = Vec::new();
    // Index of the next signature slot still available for matching.
    let mut slot = 0usize;
    // Not a `for` loop: `consume_trivia_run` advances `children` itself.
    let mut children = node.children_with_tokens().peekable();
    while let Some(element) = children.next() {
        let piece = match element {
            SyntaxElement::Node(child) => {
                let kind = child.kind();
                if matches!(kind, SyntaxKind::GROUP | SyntaxKind::OPTIONAL) {
                    let is_bracket = kind == SyntaxKind::OPTIONAL;
                    match match_arg_slot(&sig.args, &mut slot, is_bracket) {
                        Some(spec) if spec.prose => {
                            let (open, close) = if is_bracket {
                                (SyntaxKind::L_BRACKET, SyntaxKind::R_BRACKET)
                            } else {
                                (SyntaxKind::L_BRACE, SyntaxKind::R_BRACE)
                            };
                            lower_prose_group(&child, open, close, cx)
                        }
                        _ => lower_node(&child, cx),
                    }
                } else {
                    lower_node(&child, cx)
                }
            }
            SyntaxElement::Token(token) => {
                if is_collapsible_trivia(token.kind()) {
                    let (newlines, trailing_ws) = consume_trivia_run(&token, &mut children);
                    classify_trivia(newlines, trailing_ws)
                } else {
                    Ir::verbatim(token.text())
                }
            }
        };
        out.push(piece);
    }
    Ir::concat(out)
}

/// Match the next attached argument group to a signature slot, starting at
/// `*slot`. `is_bracket` says whether the group is a `[…]` group (otherwise it
/// is a brace group).
///
/// - If the slot at `*slot` has the same delimiter kind as the group, `slot`
///   advances past it and its [`ArgSpec`] is returned.
/// - If the group is a brace group and the slot is a bracket slot, the slot is a
///   declared optional the document omitted: it is skipped (`slot` advances) and
///   matching continues, so a mandatory prose slot still binds.
/// - If the group is a `[…]` and the slot is a brace slot, `None` is returned
///   with `slot` unchanged, leaving the slot for a later brace group.
///
/// Returns `None` as well when the slots run out. Note that for a brace group
/// this can happen after trailing bracket slots were skipped, in which case
/// `slot` ends at `args.len()`.
fn match_arg_slot(args: &[ArgSpec], slot: &mut usize, is_bracket: bool) -> Option<ArgSpec> {
    while let Some(&spec) = args.get(*slot) {
        let wants_bracket = matches!(spec.kind, ArgKind::Bracket);
        match (wants_bracket, is_bracket) {
            // Delimiter kinds agree: this group fills the slot.
            (true, true) | (false, false) => {
                *slot += 1;
                return Some(spec);
            }
            // A declared optional the document omitted: skip it, keep matching.
            (true, false) => *slot += 1,
            // A required `{…}` slot but the group is a `[…]`: not this slot.
            // Leave it for a later brace group and treat this group as non-prose.
            (false, true) => return None,
        }
    }
    None
}

/// Lower a prose argument group. Delimiters are split off the same way as in
/// [`lower_bracketed`] (first direct `open` token, any direct `close` token), but
/// the body is reflowed to the line width by [`reflow_elements`] and the result
/// is wrapped in a soft [`Ir::group`] built from [`Ir::soft_line`]s: it stays on
/// one line when it fits (`\footnote{short}`), and otherwise the delimiters go
/// onto their own lines with the body indented and word-wrapped. A body that
/// reflows to nothing collapses to the bare delimiters.
fn lower_prose_group(
    node: &SyntaxNode,
    open: SyntaxKind,
    close: SyntaxKind,
    cx: LowerCtx<'_>,
) -> Ir {
    let mut open_ir = Ir::Nil;
    let mut close_ir = Ir::Nil;
    let mut body_elements: Vec<SyntaxElement> = Vec::new();

    for element in node.children_with_tokens() {
        if let SyntaxElement::Token(token) = &element {
            let kind = token.kind();
            // Only the first `open` token is the opener; later ones are body.
            if kind == open && matches!(open_ir, Ir::Nil) {
                open_ir = Ir::verbatim(token.text());
                continue;
            }
            if kind == close {
                close_ir = Ir::verbatim(token.text());
                continue;
            }
        }
        body_elements.push(element);
    }

    match reflow_elements(body_elements.into_iter(), cx) {
        Ir::Nil => Ir::concat([open_ir, close_ir]),
        body => Ir::group(Ir::concat([
            open_ir,
            Ir::indent(Ir::concat([Ir::soft_line(), body])),
            Ir::soft_line(),
            close_ir,
        ])),
    }
}

/// True if `node` directly contains a `NEWLINE` token — i.e. the group itself
/// spans multiple physical lines. Newlines inside a *nested* group/environment
/// belong to that child node, not to `node`, so this attributes line-spanning to
/// the group that physically owns the break — which keeps re-indentation stable.
fn spans_multiple_lines(node: &SyntaxNode) -> bool {
    node.children_with_tokens()
        .filter_map(|e| e.into_token())
        .any(|t| t.kind() == SyntaxKind::NEWLINE)
}

/// True if `node` directly contains a `VERBATIM_BODY` token — i.e. it is a
/// verbatim-like environment whose body must be emitted byte-for-byte.
fn has_verbatim_body(node: &SyntaxNode) -> bool {
    node.children_with_tokens()
        .filter_map(|e| e.into_token())
        .any(|t| t.kind() == SyntaxKind::VERBATIM_BODY)
}

/// Trivia the formatter is allowed to rewrite: whitespace and newlines only.
/// Comments are kept verbatim, so they are deliberately *not* collapsible.
fn is_collapsible_trivia(kind: SyntaxKind) -> bool {
    kind == SyntaxKind::WHITESPACE || kind == SyntaxKind::NEWLINE
}

/// Swallow the longest run of collapsible trivia that starts at `first`,
/// advancing `iter` past every token that belongs to the run.
///
/// Returns `(newlines, trailing_ws)`: how many newline tokens the run contains,
/// and the whitespace that follows the *last* newline (the run's leading
/// indentation — whitespace sitting before a newline is trailing whitespace and
/// is discarded). When the run has no newline at all, the entire run is
/// whitespace and comes back as `trailing_ws`.
fn consume_trivia_run(
    first: &SyntaxToken,
    iter: &mut Peekable<impl Iterator<Item = SyntaxElement>>,
) -> (usize, String) {
    let mut newlines = 0;
    let mut trailing_ws = String::new();
    absorb(first, &mut newlines, &mut trailing_ws);

    // `next_if` only advances when the upcoming element is collapsible trivia,
    // so the first non-trivia element is left in place for the caller.
    while let Some(element) = iter.next_if(|el| {
        matches!(el, SyntaxElement::Token(tok) if is_collapsible_trivia(tok.kind()))
    }) {
        match element {
            SyntaxElement::Token(token) => absorb(&token, &mut newlines, &mut trailing_ws),
            _ => unreachable!("peeked a collapsible trivia token"),
        }
    }

    (newlines, trailing_ws)
}

/// Fold one trivia token into the running tallies: a `NEWLINE` bumps `newlines`
/// and discards the whitespace gathered so far; any other token has its text
/// appended to `trailing_ws`.
fn absorb(tok: &SyntaxToken, newlines: &mut usize, trailing_ws: &mut String) {
    if tok.kind() != SyntaxKind::NEWLINE {
        trailing_ws.push_str(tok.text());
        return;
    }
    // Whatever whitespace preceded this newline was trailing on the old line.
    trailing_ws.clear();
    *newlines += 1;
}

/// Turn a trivia run into exactly one IR primitive:
///
/// * no newline — the inline whitespace (a genuine inter-word space), verbatim;
/// * one newline — a [`Ir::hard_line`];
/// * two or more — a single [`Ir::empty_line`] (one blank line).
///
/// Whitespace after the last newline is *indentation*; the printer owns and
/// recreates that, so it is dropped here — keeping it would double-indent on
/// reformat.
fn classify_trivia(newlines: usize, trailing_ws: String) -> Ir {
    if newlines == 0 {
        return Ir::verbatim(trailing_ws);
    }
    if newlines == 1 {
        Ir::hard_line()
    } else {
        Ir::empty_line()
    }
}

/// Whether `ir` is a break the indenter supplies on its own and which is
/// therefore trimmed from a body edge: a forced line break, [`Ir::Nil`], or a
/// verbatim chunk made solely of spaces/tabs (indentation). A verbatim chunk
/// flagged `force_break`, or one holding any non-blank character, is never
/// trimmable, so protected content survives.
fn is_trimmable_break(ir: &Ir) -> bool {
    if let Ir::Verbatim { text, force_break } = ir {
        return !force_break && text.chars().all(|c| matches!(c, ' ' | '\t'));
    }
    matches!(ir, Ir::HardLine | Ir::EmptyLine | Ir::Nil)
}

/// Drop leading break/indentation IR from `ir`, recursing into a leading
/// `Concat` (the body's first break is often buried inside the first paragraph).
///
/// Returns [`Ir::Nil`] when `ir` itself is trimmable. For a `Concat`, leading
/// items are trimmed recursively until one leaves something behind; that
/// trimmed item and every item after it are kept in order. Any other IR is
/// returned unchanged.
fn trim_leading_break(ir: Ir) -> Ir {
    if is_trimmable_break(&ir) {
        return Ir::Nil;
    }
    match ir {
        Ir::Concat(items) => {
            // Walk the items once from the front instead of repeatedly calling
            // `Vec::remove(0)` / `insert(0, …)`, each of which shifts the whole
            // vector and makes a long run of leading breaks quadratic.
            let mut rest = items.iter().cloned();
            let mut kept: Vec<Ir> = Vec::new();
            for item in rest.by_ref() {
                let head = trim_leading_break(item);
                if !matches!(head, Ir::Nil) {
                    // First item with surviving content: trimming stops here.
                    kept.push(head);
                    break;
                }
            }
            // Everything after the first surviving item is kept untouched.
            kept.extend(rest);
            Ir::concat(kept)
        }
        other => other,
    }
}

/// Strip trailing break/indentation IR from `ir`, descending into a trailing
/// `Concat` (the mirror image of [`trim_leading_break`]).
///
/// Yields [`Ir::Nil`] when `ir` itself is trimmable. For a `Concat`, items are
/// popped from the back and trimmed recursively until one leaves something
/// behind, which is pushed back in place. Any other IR passes through unchanged.
fn trim_trailing_break(ir: Ir) -> Ir {
    if is_trimmable_break(&ir) {
        return Ir::Nil;
    }
    let items = match ir {
        Ir::Concat(items) => items,
        other => return other,
    };

    let mut kept: Vec<Ir> = items.iter().cloned().collect();
    while let Some(last) = kept.pop() {
        match trim_trailing_break(last) {
            // Entirely trimmable tail item: discard it and inspect the next one.
            Ir::Nil => continue,
            tail => {
                kept.push(tail);
                break;
            }
        }
    }
    Ir::concat(kept)
}