Skip to main content

panproto_parse/emit_pretty/
layout.rs

1#![allow(
2    clippy::module_name_repetitions,
3    clippy::too_many_lines,
4    clippy::too_many_arguments,
5    clippy::map_unwrap_or,
6    clippy::option_if_let_else,
7    clippy::elidable_lifetime_names,
8    clippy::items_after_statements,
9    clippy::needless_pass_by_value,
10    clippy::single_match_else,
11    clippy::manual_let_else,
12    clippy::match_same_arms,
13    clippy::missing_const_for_fn,
14    clippy::single_char_pattern,
15    clippy::naive_bytecount,
16    clippy::expect_used,
17    clippy::redundant_pub_crate,
18    clippy::used_underscore_binding,
19    clippy::redundant_field_names,
20    clippy::struct_field_names,
21    clippy::redundant_else,
22    clippy::similar_names
23)]
24
25//! `emit_pretty::layout` (Phase A decomposition).
26
27use super::{Grammar, TokenRole, is_word_like};
28
29// ═══════════════════════════════════════════════════════════════════
30
31/// Whitespace and indentation policy applied during emission.
32///
33/// The default policy inserts a single space between adjacent tokens,
34/// a newline after `;` / `}` / `{`, and tracks indent on `{` / `}`
35/// boundaries. Per-language overrides (idiomatic indent width,
36/// trailing-comma rules, blank-line conventions) can ride alongside
37/// this struct in a follow-up branch; today's defaults aim only for
38/// syntactic validity.
39#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
40pub struct FormatPolicy {
41    /// Number of spaces per indent level.
42    pub indent_width: usize,
43    /// Separator inserted between adjacent terminals that the lexer
44    /// would otherwise glue together (word ↔ word, operator ↔ operator).
45    /// Default is a single space.
46    pub separator: String,
47    /// Newline byte sequence emitted after `line_break_after` tokens
48    /// and at end-of-output. Default is `"\n"`.
49    pub newline: String,
50    /// Tokens after which the walker breaks to a new line.
51    pub line_break_after: Vec<String>,
52    /// Tokens that increase indent on emission.
53    pub indent_open: Vec<String>,
54    /// Tokens that decrease indent on emission.
55    pub indent_close: Vec<String>,
56}
57
58impl Default for FormatPolicy {
59    fn default() -> Self {
60        Self {
61            indent_width: 2,
62            separator: " ".to_owned(),
63            newline: "\n".to_owned(),
64            line_break_after: vec![";".into(), "{".into(), "}".into()],
65            indent_open: vec!["{".into()],
66            indent_close: vec!["}".into()],
67        }
68    }
69}
70
71// ═══════════════════════════════════════════════════════════════════
72// Token list output with Spacing algebra
73// ═══════════════════════════════════════════════════════════════════
74//
75// Emit produces a free monoid over `Token`. Layout (spaces, newlines,
76// indentation) is a homomorphism `Vec<Token> -> Vec<u8>` parameterised
77// by `FormatPolicy`. Separating the structural output from the layout
78// decision means each phase has one job: emit walks the grammar and
79// pushes tokens; layout is a single fold, locally driven by adjacent
80// pairs and a depth counter. Snapshot/restore is just `tokens.len()`.
81
82#[derive(Clone)]
83pub(crate) enum Token {
84    /// A user-visible terminal contributed by the grammar, annotated
85    /// with its structural role for spacing decisions.
86    Lit(String, TokenRole),
87    /// `indent_open` marker emitted when a `Lit` matched the policy's
88    /// open list. Carried as a separate token so layout can decide to
89    /// break + indent without re-scanning.
90    IndentOpen,
91    /// `indent_close` marker emitted before a closer-`Lit`.
92    IndentClose,
93    /// "Break a line here if not already at line start" — used after
94    /// statements/declarations and after open braces.
95    LineBreak,
96    /// Force a space before the next Lit even if the role-pair table
97    /// says tight. Pushed between consecutive content-producing SEQ
98    /// members (e.g. between `command_name` and `argument`) to ensure
99    /// sibling-vertex tokens are separated.
100    ForceSpace,
101    /// Suppress the next inter-Lit separator. Pushed by the REPEAT
102    /// walker when an iteration's "separator slot" (a CHOICE-with-BLANK
103    /// or OPTIONAL at SEQ position 0) emitted zero content tokens, so
104    /// the categorical reading is "no source-level separator existed
105    /// between these two sibling iterations of the body".
106    NoSpace,
107    /// Guard emitted right after a greedy unbounded negated-class
108    /// terminal (`[^...]+`, e.g. HTML's unquoted `attribute_value`). The
109    /// carried string is the negated set's inner content. If the NEXT
110    /// `Lit` begins with a character that set ADMITS, the terminal would
111    /// swallow that character on re-parse (`Ok` + `/>` lexes as the value
112    /// `Ok/>`, turning a `self_closing_tag` into a `start_tag`), so the
113    /// layout fold forces a separator. Transparent otherwise.
114    AbsorberGuard(String),
115    /// Exact source bytes replayed from the layout complement
116    /// (`reconstruct_subtree_bytes`): a whole vertex subtree whose
117    /// `interstitial-N` / `literal-value` fibre tiled its byte span exactly.
118    /// The fold writes these bytes verbatim and inserts NO role-derived
119    /// separator on either side — the replayed text already carries its own
120    /// leading and trailing whitespace, so the byte-faithful path bypasses the
121    /// role table entirely. The carried bytes may contain newlines; they are
122    /// written through without disturbing the indent counter (the replay is
123    /// self-contained, including its own indentation).
124    Verbatim(String),
125}
126
127pub(crate) struct Output<'a> {
128    pub(crate) tokens: Vec<Token>,
129    pub(crate) policy: &'a FormatPolicy,
130    pub(crate) grammar: &'a Grammar,
131    pub(crate) current_rule: Option<String>,
132    pub(crate) cassette: Option<&'a dyn crate::languages::cassettes::GrammarCassette>,
133}
134
/// Rollback point for an `Output`: since the token list is append-only,
/// remembering its length is enough to undo everything pushed afterwards.
#[derive(Debug, Clone)]
pub(crate) struct OutputSnapshot {
    /// Length of the token list at the moment the snapshot was taken.
    pub(crate) tokens_len: usize,
}
139
140impl<'a> Output<'a> {
141    pub(crate) fn new(
142        policy: &'a FormatPolicy,
143        grammar: &'a Grammar,
144        cassette: Option<&'a dyn crate::languages::cassettes::GrammarCassette>,
145    ) -> Self {
146        Self {
147            tokens: Vec::new(),
148            policy,
149            grammar,
150            current_rule: None,
151            cassette,
152        }
153    }
154
155    pub(crate) fn token(&mut self, value: &str) {
156        self.token_with_role(value, None);
157    }
158
159    /// Emit a verbatim string-region leaf with NO layout side effects:
160    /// the literal is pushed with the `Terminal` role but the
161    /// `line_break_after` / `indent_open` machinery is bypassed. Tight
162    /// string content (`kind_is_tight_content`, `string_content_kinds`,
163    /// `external_content_kinds`) and the interpolation braces of a string
164    /// (`$"…{x}…"`) are part of one lexical span where a literal `{`, `}`
165    /// or `;` inside the captured text is data, not a block opener or a
166    /// statement terminator: routing them through `token_with_role` would
167    /// insert a newline / indent that the re-parse cannot absorb (the
168    /// scanner only re-lexes the interpolation when the brace abuts its
169    /// neighbours). The caller is responsible for any surrounding
170    /// [`no_space`](Self::no_space) markers.
171    pub(crate) fn tight_token(&mut self, value: &str) {
172        if value.is_empty() {
173            return;
174        }
175        // Verbatim string-region content is glued to its delimiters and is
176        // *data*, not syntax: a literal `;`/`#`/`//` inside the captured text
177        // must not be re-interpreted as a line-comment opener (which would
178        // append a newline in the layout fold). The `Immediate` role is
179        // unconditionally tight on both sides and is excluded from the
180        // line-comment-prefix newline, so it is the correct role for content.
181        self.tokens
182            .push(Token::Lit(value.to_owned(), TokenRole::Immediate));
183    }
184
185    pub(crate) fn token_with_role(&mut self, value: &str, explicit_role: Option<TokenRole>) {
186        if value.is_empty() {
187            return;
188        }
189
190        if value == "\n" || value == "\r\n" || value == "\r" {
191            self.tokens.push(Token::LineBreak);
192            return;
193        }
194
195        let trimmed = value.trim_end_matches(['\n', '\r']);
196        let trailing_newlines = value.len() - trimmed.len();
197        if trailing_newlines > 0 && !trimmed.is_empty() {
198            let role = explicit_role.unwrap_or(TokenRole::Terminal);
199            if role == TokenRole::BracketClose
200                && self.policy.indent_close.iter().any(|t| t == trimmed)
201            {
202                self.tokens.push(Token::IndentClose);
203            }
204            self.tokens.push(Token::Lit(trimmed.to_owned(), role));
205            if role == TokenRole::BracketOpen {
206                if let Some(ref rule) = self.current_rule {
207                    if self
208                        .grammar
209                        .indent_triggers
210                        .contains(&(rule.clone(), trimmed.to_owned()))
211                    {
212                        self.tokens.push(Token::IndentOpen);
213                    }
214                }
215            }
216            self.tokens.push(Token::LineBreak);
217            return;
218        }
219
220        let mut role = explicit_role.unwrap_or_else(|| self.lookup_role(value));
221        // A cassette may declare a token lexically tight in a rule (a
222        // scanner fact `grammar.json` omits, e.g. bash `VAR=1`): emit it
223        // with the always-tight Connector role (which the layout pass
224        // honours over the sibling-separation ForceSpace).
225        if let (Some(rule), Some(cassette)) = (self.current_rule.as_ref(), self.cassette) {
226            if cassette.operator_is_tight(rule, value) {
227                role = TokenRole::Connector;
228            }
229        }
230
231        if role == TokenRole::BracketClose && self.policy.indent_close.iter().any(|t| t == value) {
232            self.tokens.push(Token::IndentClose);
233        }
234
235        self.tokens.push(Token::Lit(value.to_owned(), role));
236
237        if role == TokenRole::BracketOpen {
238            let grammar_indent = self.current_rule.as_ref().is_some_and(|rule| {
239                self.grammar
240                    .indent_triggers
241                    .contains(&(rule.clone(), value.to_owned()))
242            });
243            if grammar_indent {
244                self.tokens.push(Token::IndentOpen);
245                self.tokens.push(Token::LineBreak);
246            }
247        }
248        // Line-break after tokens like `;` (statement terminator).
249        // Skip for BracketOpen/BracketClose tokens that are NOT
250        // indent-triggering (e.g. `{` in interpolation should not
251        // trigger a line break).
252        let is_non_indent_bracket = self.current_rule.is_some()
253            && (role == TokenRole::BracketOpen || role == TokenRole::BracketClose)
254            && !self.current_rule.as_ref().is_some_and(|rule| {
255                self.grammar
256                    .indent_triggers
257                    .contains(&(rule.clone(), value.to_owned()))
258            });
259        if !is_non_indent_bracket && self.policy.line_break_after.iter().any(|t| t == value) {
260            self.tokens.push(Token::LineBreak);
261        }
262    }
263
264    pub(crate) fn lookup_role(&self, value: &str) -> TokenRole {
265        if let Some(role) = self.explicit_role(value) {
266            return role;
267        }
268        if is_word_like(value) {
269            TokenRole::Keyword
270        } else {
271            TokenRole::Operator
272        }
273    }
274
275    /// The role classified for `value` in the current rule, if any.
276    /// `None` when the rule's grammar-derived `token_roles` map has no
277    /// entry, leaving the caller to choose a structural default.
278    pub(crate) fn explicit_role(&self, value: &str) -> Option<TokenRole> {
279        self.current_rule
280            .as_ref()
281            .and_then(|rule| self.grammar.token_roles.get(rule))
282            .and_then(|role_map| role_map.get(value).copied())
283    }
284
285    /// Emit a bracket-open token that triggers indentation. This is the
286    /// inline-classification counterpart to the `indent_triggers` check
287    /// in `token_with_role`: the SEQ walker computes indent-triggering
288    /// from the SEQ structure directly rather than from a precomputed map.
289    pub(crate) fn token_with_indent_open(&mut self, value: &str, role: TokenRole) {
290        if value.is_empty() {
291            return;
292        }
293        if role == TokenRole::BracketClose && self.policy.indent_close.iter().any(|t| t == value) {
294            self.tokens.push(Token::IndentClose);
295        }
296        self.tokens.push(Token::Lit(value.to_owned(), role));
297        if role == TokenRole::BracketOpen {
298            self.tokens.push(Token::IndentOpen);
299            self.tokens.push(Token::LineBreak);
300        }
301    }
302
303    pub(crate) fn newline(&mut self) {
304        self.tokens.push(Token::LineBreak);
305    }
306
307    /// Push exact replayed source bytes (see [`Token::Verbatim`]). The bytes
308    /// are written through the layout fold with no role-derived spacing on
309    /// either edge: the layout complement already encodes the verbatim
310    /// inter-token whitespace, so the byte-faithful replay path bypasses the
311    /// role table for this span.
312    pub(crate) fn verbatim(&mut self, bytes: &str) {
313        if bytes.is_empty() {
314            return;
315        }
316        self.tokens.push(Token::Verbatim(bytes.to_owned()));
317    }
318
319    /// Open an indent scope: subsequent `LineBreak`s render at the
320    /// new depth until a matching `indent_close` pops it. Used by the
321    /// external-token fallback to render indent-based grammars'
322    /// `_indent` scanner outputs.
323    pub(crate) fn indent_open(&mut self) {
324        self.tokens.push(Token::IndentOpen);
325        self.tokens.push(Token::LineBreak);
326    }
327
328    /// Close one indent scope opened by `indent_open`.
329    pub(crate) fn indent_close(&mut self) {
330        self.tokens.push(Token::IndentClose);
331    }
332
333    pub(crate) fn snapshot(&self) -> OutputSnapshot {
334        OutputSnapshot {
335            tokens_len: self.tokens.len(),
336        }
337    }
338
339    pub(crate) fn restore(&mut self, snap: OutputSnapshot) {
340        self.tokens.truncate(snap.tokens_len);
341    }
342
343    /// True iff at least one `Token::Lit` was pushed since `snap`.
344    /// Control-only emissions (`LineBreak`, `IndentOpen` / `IndentClose`,
345    /// `NoSpace`) do not count as content. Used by the REPEAT walker
346    /// to detect that a "separator slot" CHOICE picked its BLANK
347    /// alternative, so the next iteration's content can be marked
348    /// tight against the previous iteration's content.
349    pub(crate) fn lit_emitted_since(&self, snap: OutputSnapshot) -> bool {
350        self.tokens[snap.tokens_len..]
351            .iter()
352            .any(|t| matches!(t, Token::Lit(_, _) | Token::Verbatim(_)))
353    }
354
355    /// Push a marker that suppresses the next inter-Lit separator the
356    /// layout pass would otherwise insert. Used to encode "no source-
357    /// level separator was emitted between these two Lits" without
358    /// having to make per-grammar adjacency decisions in the layout.
359    pub(crate) fn no_space(&mut self) {
360        self.tokens.push(Token::NoSpace);
361    }
362
363    /// Push a marker that forces a separator (space) between the
364    /// surrounding Lits. Used for an external scanner token that is
365    /// required inter-token whitespace (dockerfile `_non_newline_whitespace`
366    /// between path arguments), which carries no text of its own but
367    /// must keep the neighbours apart.
368    pub(crate) fn force_space(&mut self) {
369        self.tokens.push(Token::ForceSpace);
370    }
371
372    pub(crate) fn finish(self) -> Vec<u8> {
373        layout(
374            &self.tokens,
375            self.policy,
376            &self.grammar.line_comment_prefixes,
377            &self.grammar.trailing_break_markers,
378            self.grammar.trailing_break_on_whitespace,
379            self.grammar.top_level_text_admits_newline,
380        )
381    }
382}
383
384/// Fold a token list into bytes. The algebra:
385/// * adjacent `Lit`s get a single space iff `needs_space_between(a, b)`,
386/// * `IndentOpen` / `IndentClose` adjust a depth counter,
387/// * `LineBreak` writes `\n` if not already at line start, then the
388///   next `Lit` writes `indent * indent_width` spaces of indent.
389pub(crate) fn layout(
390    tokens: &[Token],
391    policy: &FormatPolicy,
392    line_comment_prefixes: &[String],
393    trailing_break_markers: &[String],
394    trailing_break_on_whitespace: bool,
395    top_level_text_admits_newline: bool,
396) -> Vec<u8> {
397    let mut bytes = Vec::new();
398    let mut indent: usize = 0;
399    let mut at_line_start = true;
400    let mut last_role: Option<TokenRole> = None;
401    let mut last_text: String = String::new();
402    let mut suppress_next_separator = false;
403    let mut force_next_separator = false;
404    // The negated-class content of a greedy terminal that just emitted; if
405    // the next Lit's first char is admitted by it, force a separator.
406    let mut pending_absorber: Option<String> = None;
407    // True iff the most recently emitted content token was an exact-replay
408    // `Verbatim` blob. The byte-faithful replay path reproduces the source's
409    // trailing bytes verbatim (the trailing interstitial is part of the
410    // reconstructed span), so the final line-terminating newline below must not
411    // be appended after a verbatim tail: the source may legitimately have ended
412    // without a trailing newline, and a spurious `\n` can flip a
413    // newline-sensitive scanner's parse (scala `class A\n()\n()\n{}` — the
414    // trailing `\n` inserts an automatic semicolon that re-binds the empty
415    // `class_parameters`/`template_body` as top-level `unit`/`block`). Canonical
416    // (forget_layout) schemas emit no `Verbatim` tokens, so this never relaxes
417    // the conventional terminating newline on the reformatting path.
418    let mut last_content_was_verbatim = false;
419    let newline = policy.newline.as_bytes();
420    let separator = policy.separator.as_bytes();
421
422    for (tok_idx, tok) in tokens.iter().enumerate() {
423        if std::env::var("DBG_LAYOUT").is_ok() {
424            match tok {
425                Token::Lit(v, r) => eprintln!(
426                    "  TOK: Lit({v:?}, {r:?}) at_line_start={at_line_start} last_role={last_role:?}"
427                ),
428                Token::IndentOpen => eprintln!("  TOK: IndentOpen"),
429                Token::IndentClose => eprintln!("  TOK: IndentClose"),
430                Token::LineBreak => eprintln!("  TOK: LineBreak"),
431                Token::NoSpace => eprintln!("  TOK: NoSpace"),
432                Token::ForceSpace => eprintln!("  TOK: ForceSpace"),
433                Token::AbsorberGuard(s) => eprintln!("  TOK: AbsorberGuard({s:?})"),
434                Token::Verbatim(s) => eprintln!("  TOK: Verbatim({s:?})"),
435            }
436        }
437        match tok {
438            Token::IndentOpen => indent += 1,
439            Token::IndentClose => {
440                indent = indent.saturating_sub(1);
441                pending_absorber = None;
442                if !at_line_start {
443                    bytes.extend_from_slice(newline);
444                    at_line_start = true;
445                }
446            }
447            Token::LineBreak => {
448                pending_absorber = None;
449                if !at_line_start {
450                    bytes.extend_from_slice(newline);
451                    at_line_start = true;
452                }
453            }
454            Token::NoSpace => {
455                suppress_next_separator = true;
456            }
457            Token::ForceSpace => {
458                force_next_separator = true;
459            }
460            Token::AbsorberGuard(negated) => {
461                pending_absorber = Some(negated.clone());
462            }
463            Token::Verbatim(bytes_str) => {
464                // Exact replayed source: written through with NO role-derived
465                // separator on either edge. The complement already encodes the
466                // verbatim whitespace, so the byte-faithful path must not let
467                // the role table inject or suppress a space here. Any pending
468                // absorber/force/suppress markers are discharged without effect.
469                pending_absorber = None;
470                suppress_next_separator = false;
471                force_next_separator = false;
472                // Indentation only applies to the FIRST line of the blob if we
473                // were at a fresh line start; the blob carries its own internal
474                // indentation thereafter.
475                if at_line_start && !bytes_str.is_empty() {
476                    bytes.extend(std::iter::repeat_n(b' ', indent * policy.indent_width));
477                }
478                bytes.extend_from_slice(bytes_str.as_bytes());
479                // The trailing byte determines the line state for whatever
480                // follows; the role chain is reset so the next `Lit` does not
481                // role-space against a stale predecessor.
482                at_line_start = bytes_str.ends_with(['\n', '\r']);
483                last_role = None;
484                last_text.clear();
485                last_content_was_verbatim = true;
486            }
487            Token::Lit(value, role) => {
488                // A greedy negated-class terminal just emitted: if it would
489                // lexically swallow this Lit's first char on re-parse, the
490                // boundary needs a separator regardless of the role pair.
491                if let Some(negated) = pending_absorber.take() {
492                    if value
493                        .chars()
494                        .next()
495                        .is_some_and(|c| negated_class_admits(&negated, c))
496                    {
497                        force_next_separator = true;
498                    }
499                }
500                // Block-opening bracket: BracketOpen followed by IndentOpen.
501                // After a Terminal/BracketClose, this should be spaced
502                // (`}\n` not `0{`).
503                let is_block_open = *role == TokenRole::BracketOpen
504                    && tokens
505                        .get(tok_idx + 1)
506                        .is_some_and(|t| matches!(t, Token::IndentOpen));
507                if at_line_start {
508                    bytes.extend(std::iter::repeat_n(b' ', indent * policy.indent_width));
509                } else if let Some(prev_role) = last_role {
510                    // The role-spacer inserts at most ONE separator at a token
511                    // boundary, but a content leaf can carry the boundary
512                    // whitespace inside its own captured text: a marker token
513                    // whose `literal-value` ends in a space (djot
514                    // `block_quote_marker` = `"> "`, the ATX/list markers of
515                    // lightweight-markup grammars) already supplies the gap to
516                    // the following content, and a token whose text begins with
517                    // a space supplies it to the preceding one. Adding a
518                    // role-derived space on top would double it, and the doubled
519                    // space is re-absorbed into the marker's text on re-parse, so
520                    // it accretes one space per emit (`# Heading` -> `#  Heading`
521                    // -> `#   Heading` ...): the canonical fixed point is lost.
522                    // When the boundary already carries whitespace from either
523                    // side, the separator is redundant; suppress it. This is
524                    // derived purely from the emitted token text, not any
525                    // per-language table, and applies uniformly: a genuine
526                    // no-whitespace marker (Org's `* Heading`, whose literal is
527                    // bare `*`) is unaffected, since neither side carries the
528                    // space.
529                    let boundary_has_whitespace =
530                        last_text.ends_with([' ', '\t']) || value.starts_with([' ', '\t']);
531                    // An explicit NoSpace (suppress) is authoritative: it
532                    // records that the source had no separator at this
533                    // boundary (an empty REPEAT separator slot, an
534                    // IMMEDIATE_TOKEN). It overrides the sibling-separation
535                    // ForceSpace heuristic — otherwise beamed notes
536                    // (`CDEF`) re-space to `C D E F`.
537                    let want_space = !suppress_next_separator
538                        && !boundary_has_whitespace
539                        && (force_next_separator
540                            || needs_space_by_role(prev_role, &last_text, *role, value)
541                            || (is_block_open
542                                && matches!(
543                                    prev_role,
544                                    TokenRole::Terminal | TokenRole::BracketClose
545                                )));
546                    if want_space {
547                        bytes.extend_from_slice(separator);
548                    }
549                }
550                suppress_next_separator = false;
551                force_next_separator = false;
552                bytes.extend_from_slice(value.as_bytes());
553                at_line_start = false;
554                last_content_was_verbatim = false;
555                last_role = Some(*role);
556                last_text.clear();
557                last_text.push_str(value);
558                // A verbatim string-region content leaf (`Immediate` role) is
559                // data, not syntax: a `;`/`#`/`//` inside captured string text
560                // must not open a line comment.
561                if *role != TokenRole::Immediate
562                    && line_comment_prefixes
563                        .iter()
564                        .any(|p| value.starts_with(p.as_str()))
565                {
566                    bytes.extend_from_slice(newline);
567                    at_line_start = true;
568                    last_role = None;
569                }
570            }
571        }
572    }
573
574    // Append the customary end-of-output newline only when no suppressor
575    // fires: not already at line start, not directly after an exact-replay
576    // verbatim tail (scala), not on a top-level free-text repeat that admits a
577    // bare newline (liquid `{% endcomment %}` must not gain a trailing
578    // `template_content`), and not after a hard-line-break marker
579    // (markdown_inline). Each suppressor guards against the appended newline
580    // manufacturing a phantom node on re-parse.
581    if !at_line_start
582        && !last_content_was_verbatim
583        && !top_level_text_admits_newline
584        && !ends_with_trailing_break_marker(
585            &bytes,
586            trailing_break_markers,
587            trailing_break_on_whitespace,
588        )
589    {
590        bytes.extend_from_slice(newline);
591    }
592    bytes
593}
594
/// Whether `bytes` ends with a hard-line-break marker — a bare break
/// literal (the `\` of `markdown_inline`'s `hard_line_break`) or, when the
/// grammar's break idiom admits it (`on_whitespace`), a trailing space or
/// tab. Appending the customary end-of-output newline after such a tail
/// would manufacture a phantom line-break node on re-parse, so the caller
/// suppresses it.
///
/// Empty marker strings are ignored: every buffer trivially "ends with"
/// the empty string, so honouring one would suppress the final newline for
/// all output.
fn ends_with_trailing_break_marker(bytes: &[u8], markers: &[String], on_whitespace: bool) -> bool {
    if on_whitespace && matches!(bytes.last(), Some(b' ' | b'\t')) {
        return true;
    }
    markers
        .iter()
        .any(|marker| !marker.is_empty() && bytes.ends_with(marker.as_bytes()))
}
609
/// True when the negated character class `[^<negated>]` ADMITS `c` — i.e.
/// `c` is not one of the excluded characters. `negated` is the inner text
/// of the class (the part after `[^`, before `]`). A greedy `[^...]+`
/// terminal continues to consume any admitted character, so an admitted
/// leading char on the following token would be swallowed on re-parse.
///
/// Recognised members:
/// * literal characters,
/// * control escapes `\t`, `\n`, `\r`, `\f`, `\v`, `\0`,
/// * shorthand sets `\s`, `\d`, `\w` and their complements `\S`, `\D`, `\W`
///   (`\d` / `\w` are ASCII-only; `\s` uses `char::is_whitespace`),
/// * any other `\x` escape, read as the literal `x` (so `\\`, `\-`, `\]`),
/// * ranges `lo-hi` between two single-character members with `lo <= hi`.
///
/// A `-` that cannot form a range (first or last in the class, next to a
/// shorthand set, or with reversed endpoints) is a literal `-`. A trailing
/// lone backslash is ignored. Numeric escapes (`\xHH`, `\uHHHH`) are not
/// decoded.
fn negated_class_admits(negated: &str, c: char) -> bool {
    let mut chars = negated.chars().peekable();
    while let Some(member) = next_negated_class_member(&mut chars) {
        let excluded = match member {
            NegatedClassMember::Whitespace { complement } => c.is_whitespace() != complement,
            NegatedClassMember::Digit { complement } => c.is_ascii_digit() != complement,
            NegatedClassMember::Word { complement } => {
                (c.is_ascii_alphanumeric() || c == '_') != complement
            }
            NegatedClassMember::Literal(lo) => {
                if chars.peek() == Some(&'-') {
                    // Tentatively read `-hi` on a copy; commit only if it
                    // really is a well-formed range.
                    let mut lookahead = chars.clone();
                    lookahead.next();
                    match next_negated_class_member(&mut lookahead) {
                        Some(NegatedClassMember::Literal(hi)) if lo <= hi => {
                            chars = lookahead;
                            (lo..=hi).contains(&c)
                        }
                        _ => lo == c,
                    }
                } else {
                    lo == c
                }
            }
        };
        if excluded {
            return false;
        }
    }
    true
}

/// One parsed member of a character class body.
enum NegatedClassMember {
    /// A single character, written literally or via an escape.
    Literal(char),
    /// `\s` (or `\S` when `complement` is set).
    Whitespace { complement: bool },
    /// `\d` (or `\D` when `complement` is set).
    Digit { complement: bool },
    /// `\w` (or `\W` when `complement` is set).
    Word { complement: bool },
}

/// Read the next class member from `chars`, decoding one backslash escape.
/// Returns `None` at end of input or on a trailing lone backslash.
fn next_negated_class_member(
    chars: &mut std::iter::Peekable<std::str::Chars<'_>>,
) -> Option<NegatedClassMember> {
    let ch = chars.next()?;
    if ch != '\\' {
        return Some(NegatedClassMember::Literal(ch));
    }
    Some(match chars.next()? {
        's' => NegatedClassMember::Whitespace { complement: false },
        'S' => NegatedClassMember::Whitespace { complement: true },
        'd' => NegatedClassMember::Digit { complement: false },
        'D' => NegatedClassMember::Digit { complement: true },
        'w' => NegatedClassMember::Word { complement: false },
        'W' => NegatedClassMember::Word { complement: true },
        't' => NegatedClassMember::Literal('\t'),
        'n' => NegatedClassMember::Literal('\n'),
        'r' => NegatedClassMember::Literal('\r'),
        'f' => NegatedClassMember::Literal('\x0C'),
        'v' => NegatedClassMember::Literal('\x0B'),
        '0' => NegatedClassMember::Literal('\0'),
        other => NegatedClassMember::Literal(other),
    })
}
637
638/// Effective spacing role: word-like bracket tokens (`function`, `end`,
639/// `begin`, `done`, etc.) are structurally brackets (for indentation)
640/// but space like keywords (they need whitespace on both sides).
641pub(crate) fn effective_spacing_role(role: TokenRole, text: &str) -> TokenRole {
642    match role {
643        TokenRole::BracketOpen | TokenRole::BracketClose if is_word_like(text) => {
644            TokenRole::Keyword
645        }
646        other => other,
647    }
648}
649
650/// Role-pair spacing table. Determines whether a space separator
651/// should be inserted between two adjacent tokens based on their
652/// structural roles and word-likeness.
653pub(crate) fn needs_space_by_role(
654    last: TokenRole,
655    last_text: &str,
656    next: TokenRole,
657    next_text: &str,
658) -> bool {
659    let last = effective_spacing_role(last, last_text);
660    let next = effective_spacing_role(next, next_text);
661    match (last, next) {
662        // Immediate (IMMEDIATE_TOKEN) tokens are lexically glued to
663        // their neighbours on both sides (`0.5`, not `0 . 5`).
664        (TokenRole::Immediate, _) | (_, TokenRole::Immediate) => false,
665        // Brackets: tight on the inside
666        (TokenRole::BracketOpen, _) | (_, TokenRole::BracketClose) => false,
667        // Separators: tight before, space after
668        (_, TokenRole::Separator) => false,
669        (TokenRole::Separator, _) => true,
670        // Connectors: always tight (., ::, ->, etc.)
671        (TokenRole::Connector, _) | (_, TokenRole::Connector) => false,
672        // Terminal followed by bracket-open: tight (f() not f ())
673        (TokenRole::Terminal, TokenRole::BracketOpen) => false,
674        // Close followed by open: tight
675        (TokenRole::BracketClose, TokenRole::BracketOpen) => false,
676        // Keywords always spaced
677        (TokenRole::Keyword, _) | (_, TokenRole::Keyword) => true,
678        // Terminals and operators: space between
679        (TokenRole::Terminal, TokenRole::Terminal) => true,
680        (TokenRole::Terminal, TokenRole::Operator) | (TokenRole::Operator, TokenRole::Terminal) => {
681            true
682        }
683        (TokenRole::Operator, TokenRole::Operator) => true,
684        // Close followed by non-bracket: space
685        (TokenRole::BracketClose, _) => true,
686        // Operator before open: space
687        (TokenRole::Operator, TokenRole::BracketOpen) => true,
688    }
689}