panproto_parse/emit_pretty/layout.rs
1#![allow(
2 clippy::module_name_repetitions,
3 clippy::too_many_lines,
4 clippy::too_many_arguments,
5 clippy::map_unwrap_or,
6 clippy::option_if_let_else,
7 clippy::elidable_lifetime_names,
8 clippy::items_after_statements,
9 clippy::needless_pass_by_value,
10 clippy::single_match_else,
11 clippy::manual_let_else,
12 clippy::match_same_arms,
13 clippy::missing_const_for_fn,
14 clippy::single_char_pattern,
15 clippy::naive_bytecount,
16 clippy::expect_used,
17 clippy::redundant_pub_crate,
18 clippy::used_underscore_binding,
19 clippy::redundant_field_names,
20 clippy::struct_field_names,
21 clippy::redundant_else,
22 clippy::similar_names
23)]
24
25//! `emit_pretty::layout` (Phase A decomposition).
26
27use super::{Grammar, TokenRole, is_word_like};
28
29// ═══════════════════════════════════════════════════════════════════
30
31/// Whitespace and indentation policy applied during emission.
32///
33/// The default policy inserts a single space between adjacent tokens,
34/// a newline after `;` / `}` / `{`, and tracks indent on `{` / `}`
35/// boundaries. Per-language overrides (idiomatic indent width,
36/// trailing-comma rules, blank-line conventions) can ride alongside
37/// this struct in a follow-up branch; today's defaults aim only for
38/// syntactic validity.
39#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
40pub struct FormatPolicy {
41 /// Number of spaces per indent level.
42 pub indent_width: usize,
43 /// Separator inserted between adjacent terminals that the lexer
44 /// would otherwise glue together (word ↔ word, operator ↔ operator).
45 /// Default is a single space.
46 pub separator: String,
47 /// Newline byte sequence emitted after `line_break_after` tokens
48 /// and at end-of-output. Default is `"\n"`.
49 pub newline: String,
50 /// Tokens after which the walker breaks to a new line.
51 pub line_break_after: Vec<String>,
52 /// Tokens that increase indent on emission.
53 pub indent_open: Vec<String>,
54 /// Tokens that decrease indent on emission.
55 pub indent_close: Vec<String>,
56}
57
58impl Default for FormatPolicy {
59 fn default() -> Self {
60 Self {
61 indent_width: 2,
62 separator: " ".to_owned(),
63 newline: "\n".to_owned(),
64 line_break_after: vec![";".into(), "{".into(), "}".into()],
65 indent_open: vec!["{".into()],
66 indent_close: vec!["}".into()],
67 }
68 }
69}
70
71// ═══════════════════════════════════════════════════════════════════
72// Token list output with Spacing algebra
73// ═══════════════════════════════════════════════════════════════════
74//
75// Emit produces a free monoid over `Token`. Layout (spaces, newlines,
76// indentation) is a homomorphism `Vec<Token> -> Vec<u8>` parameterised
77// by `FormatPolicy`. Separating the structural output from the layout
78// decision means each phase has one job: emit walks the grammar and
79// pushes tokens; layout is a single fold, locally driven by adjacent
80// pairs and a depth counter. Snapshot/restore is just `tokens.len()`.
81
82#[derive(Clone)]
83pub(crate) enum Token {
84 /// A user-visible terminal contributed by the grammar, annotated
85 /// with its structural role for spacing decisions.
86 Lit(String, TokenRole),
87 /// `indent_open` marker emitted when a `Lit` matched the policy's
88 /// open list. Carried as a separate token so layout can decide to
89 /// break + indent without re-scanning.
90 IndentOpen,
91 /// `indent_close` marker emitted before a closer-`Lit`.
92 IndentClose,
93 /// "Break a line here if not already at line start" — used after
94 /// statements/declarations and after open braces.
95 LineBreak,
96 /// Force a space before the next Lit even if the role-pair table
97 /// says tight. Pushed between consecutive content-producing SEQ
98 /// members (e.g. between `command_name` and `argument`) to ensure
99 /// sibling-vertex tokens are separated.
100 ForceSpace,
101 /// Suppress the next inter-Lit separator. Pushed by the REPEAT
102 /// walker when an iteration's "separator slot" (a CHOICE-with-BLANK
103 /// or OPTIONAL at SEQ position 0) emitted zero content tokens, so
104 /// the categorical reading is "no source-level separator existed
105 /// between these two sibling iterations of the body".
106 NoSpace,
107 /// Guard emitted right after a greedy unbounded negated-class
108 /// terminal (`[^...]+`, e.g. HTML's unquoted `attribute_value`). The
109 /// carried string is the negated set's inner content. If the NEXT
110 /// `Lit` begins with a character that set ADMITS, the terminal would
111 /// swallow that character on re-parse (`Ok` + `/>` lexes as the value
112 /// `Ok/>`, turning a `self_closing_tag` into a `start_tag`), so the
113 /// layout fold forces a separator. Transparent otherwise.
114 AbsorberGuard(String),
115 /// Exact source bytes replayed from the layout complement
116 /// (`reconstruct_subtree_bytes`): a whole vertex subtree whose
117 /// `interstitial-N` / `literal-value` fibre tiled its byte span exactly.
118 /// The fold writes these bytes verbatim and inserts NO role-derived
119 /// separator on either side — the replayed text already carries its own
120 /// leading and trailing whitespace, so the byte-faithful path bypasses the
121 /// role table entirely. The carried bytes may contain newlines; they are
122 /// written through without disturbing the indent counter (the replay is
123 /// self-contained, including its own indentation).
124 Verbatim(String),
125}
126
127pub(crate) struct Output<'a> {
128 pub(crate) tokens: Vec<Token>,
129 pub(crate) policy: &'a FormatPolicy,
130 pub(crate) grammar: &'a Grammar,
131 pub(crate) current_rule: Option<String>,
132 pub(crate) cassette: Option<&'a dyn crate::languages::cassettes::GrammarCassette>,
133}
134
/// Restore point for an [`Output`]: just the token count at capture time.
#[derive(Clone)]
pub(crate) struct OutputSnapshot {
    /// Length of the token stream when the snapshot was taken.
    pub(crate) tokens_len: usize,
}
139
140impl<'a> Output<'a> {
141 pub(crate) fn new(
142 policy: &'a FormatPolicy,
143 grammar: &'a Grammar,
144 cassette: Option<&'a dyn crate::languages::cassettes::GrammarCassette>,
145 ) -> Self {
146 Self {
147 tokens: Vec::new(),
148 policy,
149 grammar,
150 current_rule: None,
151 cassette,
152 }
153 }
154
155 pub(crate) fn token(&mut self, value: &str) {
156 self.token_with_role(value, None);
157 }
158
159 /// Emit a verbatim string-region leaf with NO layout side effects:
160 /// the literal is pushed with the `Terminal` role but the
161 /// `line_break_after` / `indent_open` machinery is bypassed. Tight
162 /// string content (`kind_is_tight_content`, `string_content_kinds`,
163 /// `external_content_kinds`) and the interpolation braces of a string
164 /// (`$"…{x}…"`) are part of one lexical span where a literal `{`, `}`
165 /// or `;` inside the captured text is data, not a block opener or a
166 /// statement terminator: routing them through `token_with_role` would
167 /// insert a newline / indent that the re-parse cannot absorb (the
168 /// scanner only re-lexes the interpolation when the brace abuts its
169 /// neighbours). The caller is responsible for any surrounding
170 /// [`no_space`](Self::no_space) markers.
171 pub(crate) fn tight_token(&mut self, value: &str) {
172 if value.is_empty() {
173 return;
174 }
175 // Verbatim string-region content is glued to its delimiters and is
176 // *data*, not syntax: a literal `;`/`#`/`//` inside the captured text
177 // must not be re-interpreted as a line-comment opener (which would
178 // append a newline in the layout fold). The `Immediate` role is
179 // unconditionally tight on both sides and is excluded from the
180 // line-comment-prefix newline, so it is the correct role for content.
181 self.tokens
182 .push(Token::Lit(value.to_owned(), TokenRole::Immediate));
183 }
184
185 pub(crate) fn token_with_role(&mut self, value: &str, explicit_role: Option<TokenRole>) {
186 if value.is_empty() {
187 return;
188 }
189
190 if value == "\n" || value == "\r\n" || value == "\r" {
191 self.tokens.push(Token::LineBreak);
192 return;
193 }
194
195 let trimmed = value.trim_end_matches(['\n', '\r']);
196 let trailing_newlines = value.len() - trimmed.len();
197 if trailing_newlines > 0 && !trimmed.is_empty() {
198 let role = explicit_role.unwrap_or(TokenRole::Terminal);
199 if role == TokenRole::BracketClose
200 && self.policy.indent_close.iter().any(|t| t == trimmed)
201 {
202 self.tokens.push(Token::IndentClose);
203 }
204 self.tokens.push(Token::Lit(trimmed.to_owned(), role));
205 if role == TokenRole::BracketOpen {
206 if let Some(ref rule) = self.current_rule {
207 if self
208 .grammar
209 .indent_triggers
210 .contains(&(rule.clone(), trimmed.to_owned()))
211 {
212 self.tokens.push(Token::IndentOpen);
213 }
214 }
215 }
216 self.tokens.push(Token::LineBreak);
217 return;
218 }
219
220 let mut role = explicit_role.unwrap_or_else(|| self.lookup_role(value));
221 // A cassette may declare a token lexically tight in a rule (a
222 // scanner fact `grammar.json` omits, e.g. bash `VAR=1`): emit it
223 // with the always-tight Connector role (which the layout pass
224 // honours over the sibling-separation ForceSpace).
225 if let (Some(rule), Some(cassette)) = (self.current_rule.as_ref(), self.cassette) {
226 if cassette.operator_is_tight(rule, value) {
227 role = TokenRole::Connector;
228 }
229 }
230
231 if role == TokenRole::BracketClose && self.policy.indent_close.iter().any(|t| t == value) {
232 self.tokens.push(Token::IndentClose);
233 }
234
235 self.tokens.push(Token::Lit(value.to_owned(), role));
236
237 if role == TokenRole::BracketOpen {
238 let grammar_indent = self.current_rule.as_ref().is_some_and(|rule| {
239 self.grammar
240 .indent_triggers
241 .contains(&(rule.clone(), value.to_owned()))
242 });
243 if grammar_indent {
244 self.tokens.push(Token::IndentOpen);
245 self.tokens.push(Token::LineBreak);
246 }
247 }
248 // Line-break after tokens like `;` (statement terminator).
249 // Skip for BracketOpen/BracketClose tokens that are NOT
250 // indent-triggering (e.g. `{` in interpolation should not
251 // trigger a line break).
252 let is_non_indent_bracket = self.current_rule.is_some()
253 && (role == TokenRole::BracketOpen || role == TokenRole::BracketClose)
254 && !self.current_rule.as_ref().is_some_and(|rule| {
255 self.grammar
256 .indent_triggers
257 .contains(&(rule.clone(), value.to_owned()))
258 });
259 if !is_non_indent_bracket && self.policy.line_break_after.iter().any(|t| t == value) {
260 self.tokens.push(Token::LineBreak);
261 }
262 }
263
264 pub(crate) fn lookup_role(&self, value: &str) -> TokenRole {
265 if let Some(role) = self.explicit_role(value) {
266 return role;
267 }
268 if is_word_like(value) {
269 TokenRole::Keyword
270 } else {
271 TokenRole::Operator
272 }
273 }
274
275 /// The role classified for `value` in the current rule, if any.
276 /// `None` when the rule's grammar-derived `token_roles` map has no
277 /// entry, leaving the caller to choose a structural default.
278 pub(crate) fn explicit_role(&self, value: &str) -> Option<TokenRole> {
279 self.current_rule
280 .as_ref()
281 .and_then(|rule| self.grammar.token_roles.get(rule))
282 .and_then(|role_map| role_map.get(value).copied())
283 }
284
285 /// Emit a bracket-open token that triggers indentation. This is the
286 /// inline-classification counterpart to the `indent_triggers` check
287 /// in `token_with_role`: the SEQ walker computes indent-triggering
288 /// from the SEQ structure directly rather than from a precomputed map.
289 pub(crate) fn token_with_indent_open(&mut self, value: &str, role: TokenRole) {
290 if value.is_empty() {
291 return;
292 }
293 if role == TokenRole::BracketClose && self.policy.indent_close.iter().any(|t| t == value) {
294 self.tokens.push(Token::IndentClose);
295 }
296 self.tokens.push(Token::Lit(value.to_owned(), role));
297 if role == TokenRole::BracketOpen {
298 self.tokens.push(Token::IndentOpen);
299 self.tokens.push(Token::LineBreak);
300 }
301 }
302
303 pub(crate) fn newline(&mut self) {
304 self.tokens.push(Token::LineBreak);
305 }
306
307 /// Push exact replayed source bytes (see [`Token::Verbatim`]). The bytes
308 /// are written through the layout fold with no role-derived spacing on
309 /// either edge: the layout complement already encodes the verbatim
310 /// inter-token whitespace, so the byte-faithful replay path bypasses the
311 /// role table for this span.
312 pub(crate) fn verbatim(&mut self, bytes: &str) {
313 if bytes.is_empty() {
314 return;
315 }
316 self.tokens.push(Token::Verbatim(bytes.to_owned()));
317 }
318
319 /// Open an indent scope: subsequent `LineBreak`s render at the
320 /// new depth until a matching `indent_close` pops it. Used by the
321 /// external-token fallback to render indent-based grammars'
322 /// `_indent` scanner outputs.
323 pub(crate) fn indent_open(&mut self) {
324 self.tokens.push(Token::IndentOpen);
325 self.tokens.push(Token::LineBreak);
326 }
327
328 /// Close one indent scope opened by `indent_open`.
329 pub(crate) fn indent_close(&mut self) {
330 self.tokens.push(Token::IndentClose);
331 }
332
333 pub(crate) fn snapshot(&self) -> OutputSnapshot {
334 OutputSnapshot {
335 tokens_len: self.tokens.len(),
336 }
337 }
338
339 pub(crate) fn restore(&mut self, snap: OutputSnapshot) {
340 self.tokens.truncate(snap.tokens_len);
341 }
342
343 /// True iff at least one `Token::Lit` was pushed since `snap`.
344 /// Control-only emissions (`LineBreak`, `IndentOpen` / `IndentClose`,
345 /// `NoSpace`) do not count as content. Used by the REPEAT walker
346 /// to detect that a "separator slot" CHOICE picked its BLANK
347 /// alternative, so the next iteration's content can be marked
348 /// tight against the previous iteration's content.
349 pub(crate) fn lit_emitted_since(&self, snap: OutputSnapshot) -> bool {
350 self.tokens[snap.tokens_len..]
351 .iter()
352 .any(|t| matches!(t, Token::Lit(_, _) | Token::Verbatim(_)))
353 }
354
355 /// Push a marker that suppresses the next inter-Lit separator the
356 /// layout pass would otherwise insert. Used to encode "no source-
357 /// level separator was emitted between these two Lits" without
358 /// having to make per-grammar adjacency decisions in the layout.
359 pub(crate) fn no_space(&mut self) {
360 self.tokens.push(Token::NoSpace);
361 }
362
363 /// Push a marker that forces a separator (space) between the
364 /// surrounding Lits. Used for an external scanner token that is
365 /// required inter-token whitespace (dockerfile `_non_newline_whitespace`
366 /// between path arguments), which carries no text of its own but
367 /// must keep the neighbours apart.
368 pub(crate) fn force_space(&mut self) {
369 self.tokens.push(Token::ForceSpace);
370 }
371
372 pub(crate) fn finish(self) -> Vec<u8> {
373 layout(
374 &self.tokens,
375 self.policy,
376 &self.grammar.line_comment_prefixes,
377 &self.grammar.trailing_break_markers,
378 self.grammar.trailing_break_on_whitespace,
379 self.grammar.top_level_text_admits_newline,
380 )
381 }
382}
383
384/// Fold a token list into bytes. The algebra:
385/// * adjacent `Lit`s get a single space iff `needs_space_between(a, b)`,
386/// * `IndentOpen` / `IndentClose` adjust a depth counter,
387/// * `LineBreak` writes `\n` if not already at line start, then the
388/// next `Lit` writes `indent * indent_width` spaces of indent.
389pub(crate) fn layout(
390 tokens: &[Token],
391 policy: &FormatPolicy,
392 line_comment_prefixes: &[String],
393 trailing_break_markers: &[String],
394 trailing_break_on_whitespace: bool,
395 top_level_text_admits_newline: bool,
396) -> Vec<u8> {
397 let mut bytes = Vec::new();
398 let mut indent: usize = 0;
399 let mut at_line_start = true;
400 let mut last_role: Option<TokenRole> = None;
401 let mut last_text: String = String::new();
402 let mut suppress_next_separator = false;
403 let mut force_next_separator = false;
404 // The negated-class content of a greedy terminal that just emitted; if
405 // the next Lit's first char is admitted by it, force a separator.
406 let mut pending_absorber: Option<String> = None;
407 // True iff the most recently emitted content token was an exact-replay
408 // `Verbatim` blob. The byte-faithful replay path reproduces the source's
409 // trailing bytes verbatim (the trailing interstitial is part of the
410 // reconstructed span), so the final line-terminating newline below must not
411 // be appended after a verbatim tail: the source may legitimately have ended
412 // without a trailing newline, and a spurious `\n` can flip a
413 // newline-sensitive scanner's parse (scala `class A\n()\n()\n{}` — the
414 // trailing `\n` inserts an automatic semicolon that re-binds the empty
415 // `class_parameters`/`template_body` as top-level `unit`/`block`). Canonical
416 // (forget_layout) schemas emit no `Verbatim` tokens, so this never relaxes
417 // the conventional terminating newline on the reformatting path.
418 let mut last_content_was_verbatim = false;
419 let newline = policy.newline.as_bytes();
420 let separator = policy.separator.as_bytes();
421
422 for (tok_idx, tok) in tokens.iter().enumerate() {
423 if std::env::var("DBG_LAYOUT").is_ok() {
424 match tok {
425 Token::Lit(v, r) => eprintln!(
426 " TOK: Lit({v:?}, {r:?}) at_line_start={at_line_start} last_role={last_role:?}"
427 ),
428 Token::IndentOpen => eprintln!(" TOK: IndentOpen"),
429 Token::IndentClose => eprintln!(" TOK: IndentClose"),
430 Token::LineBreak => eprintln!(" TOK: LineBreak"),
431 Token::NoSpace => eprintln!(" TOK: NoSpace"),
432 Token::ForceSpace => eprintln!(" TOK: ForceSpace"),
433 Token::AbsorberGuard(s) => eprintln!(" TOK: AbsorberGuard({s:?})"),
434 Token::Verbatim(s) => eprintln!(" TOK: Verbatim({s:?})"),
435 }
436 }
437 match tok {
438 Token::IndentOpen => indent += 1,
439 Token::IndentClose => {
440 indent = indent.saturating_sub(1);
441 pending_absorber = None;
442 if !at_line_start {
443 bytes.extend_from_slice(newline);
444 at_line_start = true;
445 }
446 }
447 Token::LineBreak => {
448 pending_absorber = None;
449 if !at_line_start {
450 bytes.extend_from_slice(newline);
451 at_line_start = true;
452 }
453 }
454 Token::NoSpace => {
455 suppress_next_separator = true;
456 }
457 Token::ForceSpace => {
458 force_next_separator = true;
459 }
460 Token::AbsorberGuard(negated) => {
461 pending_absorber = Some(negated.clone());
462 }
463 Token::Verbatim(bytes_str) => {
464 // Exact replayed source: written through with NO role-derived
465 // separator on either edge. The complement already encodes the
466 // verbatim whitespace, so the byte-faithful path must not let
467 // the role table inject or suppress a space here. Any pending
468 // absorber/force/suppress markers are discharged without effect.
469 pending_absorber = None;
470 suppress_next_separator = false;
471 force_next_separator = false;
472 // Indentation only applies to the FIRST line of the blob if we
473 // were at a fresh line start; the blob carries its own internal
474 // indentation thereafter.
475 if at_line_start && !bytes_str.is_empty() {
476 bytes.extend(std::iter::repeat_n(b' ', indent * policy.indent_width));
477 }
478 bytes.extend_from_slice(bytes_str.as_bytes());
479 // The trailing byte determines the line state for whatever
480 // follows; the role chain is reset so the next `Lit` does not
481 // role-space against a stale predecessor.
482 at_line_start = bytes_str.ends_with(['\n', '\r']);
483 last_role = None;
484 last_text.clear();
485 last_content_was_verbatim = true;
486 }
487 Token::Lit(value, role) => {
488 // A greedy negated-class terminal just emitted: if it would
489 // lexically swallow this Lit's first char on re-parse, the
490 // boundary needs a separator regardless of the role pair.
491 if let Some(negated) = pending_absorber.take() {
492 if value
493 .chars()
494 .next()
495 .is_some_and(|c| negated_class_admits(&negated, c))
496 {
497 force_next_separator = true;
498 }
499 }
500 // Block-opening bracket: BracketOpen followed by IndentOpen.
501 // After a Terminal/BracketClose, this should be spaced
502 // (`}\n` not `0{`).
503 let is_block_open = *role == TokenRole::BracketOpen
504 && tokens
505 .get(tok_idx + 1)
506 .is_some_and(|t| matches!(t, Token::IndentOpen));
507 if at_line_start {
508 bytes.extend(std::iter::repeat_n(b' ', indent * policy.indent_width));
509 } else if let Some(prev_role) = last_role {
510 // The role-spacer inserts at most ONE separator at a token
511 // boundary, but a content leaf can carry the boundary
512 // whitespace inside its own captured text: a marker token
513 // whose `literal-value` ends in a space (djot
514 // `block_quote_marker` = `"> "`, the ATX/list markers of
515 // lightweight-markup grammars) already supplies the gap to
516 // the following content, and a token whose text begins with
517 // a space supplies it to the preceding one. Adding a
518 // role-derived space on top would double it, and the doubled
519 // space is re-absorbed into the marker's text on re-parse, so
520 // it accretes one space per emit (`# Heading` -> `# Heading`
521 // -> `# Heading` ...): the canonical fixed point is lost.
522 // When the boundary already carries whitespace from either
523 // side, the separator is redundant; suppress it. This is
524 // derived purely from the emitted token text, not any
525 // per-language table, and applies uniformly: a genuine
526 // no-whitespace marker (Org's `* Heading`, whose literal is
527 // bare `*`) is unaffected, since neither side carries the
528 // space.
529 let boundary_has_whitespace =
530 last_text.ends_with([' ', '\t']) || value.starts_with([' ', '\t']);
531 // An explicit NoSpace (suppress) is authoritative: it
532 // records that the source had no separator at this
533 // boundary (an empty REPEAT separator slot, an
534 // IMMEDIATE_TOKEN). It overrides the sibling-separation
535 // ForceSpace heuristic — otherwise beamed notes
536 // (`CDEF`) re-space to `C D E F`.
537 let want_space = !suppress_next_separator
538 && !boundary_has_whitespace
539 && (force_next_separator
540 || needs_space_by_role(prev_role, &last_text, *role, value)
541 || (is_block_open
542 && matches!(
543 prev_role,
544 TokenRole::Terminal | TokenRole::BracketClose
545 )));
546 if want_space {
547 bytes.extend_from_slice(separator);
548 }
549 }
550 suppress_next_separator = false;
551 force_next_separator = false;
552 bytes.extend_from_slice(value.as_bytes());
553 at_line_start = false;
554 last_content_was_verbatim = false;
555 last_role = Some(*role);
556 last_text.clear();
557 last_text.push_str(value);
558 // A verbatim string-region content leaf (`Immediate` role) is
559 // data, not syntax: a `;`/`#`/`//` inside captured string text
560 // must not open a line comment.
561 if *role != TokenRole::Immediate
562 && line_comment_prefixes
563 .iter()
564 .any(|p| value.starts_with(p.as_str()))
565 {
566 bytes.extend_from_slice(newline);
567 at_line_start = true;
568 last_role = None;
569 }
570 }
571 }
572 }
573
574 // Append the customary end-of-output newline only when no suppressor
575 // fires: not already at line start, not directly after an exact-replay
576 // verbatim tail (scala), not on a top-level free-text repeat that admits a
577 // bare newline (liquid `{% endcomment %}` must not gain a trailing
578 // `template_content`), and not after a hard-line-break marker
579 // (markdown_inline). Each suppressor guards against the appended newline
580 // manufacturing a phantom node on re-parse.
581 if !at_line_start
582 && !last_content_was_verbatim
583 && !top_level_text_admits_newline
584 && !ends_with_trailing_break_marker(
585 &bytes,
586 trailing_break_markers,
587 trailing_break_on_whitespace,
588 )
589 {
590 bytes.extend_from_slice(newline);
591 }
592 bytes
593}
594
/// Reports whether `bytes` finishes with a hard-line-break marker.
///
/// That is either one of the literal `markers` (such as the `\` of
/// `markdown_inline`'s `hard_line_break`) or, when `on_whitespace` is
/// set, a trailing space or tab. The caller uses a `true` result to skip
/// the end-of-output newline, which after such a tail would create a
/// phantom line-break node on re-parse.
fn ends_with_trailing_break_marker(bytes: &[u8], markers: &[String], on_whitespace: bool) -> bool {
    let ends_in_blank = on_whitespace && matches!(bytes.last(), Some(b' ' | b'\t'));
    ends_in_blank
        || markers
            .iter()
            .any(|marker| bytes.ends_with(marker.as_bytes()))
}
609
/// True when the negated character class `[^<negated>]` ADMITS `c` — i.e.
/// `c` is not one of the excluded characters.
///
/// `negated` is the inner text of the class (the part after `[^`, before
/// `]`). Recognised members:
/// * literal characters,
/// * ranges `a-z` whose endpoints are single (plain or escaped)
///   characters; a `-` that is first, last, or next to a class escape is
///   a literal dash,
/// * class escapes `\s` / `\S` (whitespace), `\d` / `\D` (ASCII digits),
///   `\w` / `\W` (ASCII letters, digits and `_`),
/// * control escapes `\t`, `\n`, `\r`, `\f`, `\v`,
/// * any other `\x`, which denotes the literal `x` (`\\`, `\]`, `\-`, …).
///
/// A dangling backslash at the end is ignored. A greedy `[^...]+`
/// terminal keeps consuming any admitted character, so an admitted
/// leading char on the following token would be swallowed on re-parse.
fn negated_class_admits(negated: &str, c: char) -> bool {
    /// Result of testing `c` against the class escape `\esc`, or `None`
    /// when `esc` does not name a character class.
    fn class_escape_matches(esc: char, c: char) -> Option<bool> {
        let is_word = c.is_ascii_alphanumeric() || c == '_';
        match esc {
            's' => Some(c.is_whitespace()),
            'S' => Some(!c.is_whitespace()),
            'd' => Some(c.is_ascii_digit()),
            'D' => Some(!c.is_ascii_digit()),
            'w' => Some(is_word),
            'W' => Some(!is_word),
            _ => None,
        }
    }

    /// The single character denoted by the non-class escape `\esc`.
    fn escaped_literal(esc: char) -> char {
        match esc {
            't' => '\t',
            'n' => '\n',
            'r' => '\r',
            'f' => '\u{000C}',
            'v' => '\u{000B}',
            other => other,
        }
    }

    /// Read one single-character member usable as the upper end of a
    /// range; `None` at end of input or when the member is a class escape.
    fn read_range_end(chars: &mut std::str::Chars<'_>) -> Option<char> {
        match chars.next()? {
            '\\' => {
                let esc = chars.next()?;
                if matches!(esc, 's' | 'S' | 'd' | 'D' | 'w' | 'W') {
                    None
                } else {
                    Some(escaped_literal(esc))
                }
            }
            plain => Some(plain),
        }
    }

    let mut chars = negated.chars();
    while let Some(ch) = chars.next() {
        let low = if ch == '\\' {
            let Some(esc) = chars.next() else {
                // Dangling backslash: excludes nothing.
                break;
            };
            if let Some(excluded) = class_escape_matches(esc, c) {
                if excluded {
                    return false;
                }
                continue;
            }
            escaped_literal(esc)
        } else {
            ch
        };

        // `low-high`: commit to the range only when a valid upper endpoint
        // follows the dash; otherwise the dash is left in place and is
        // handled as a literal on the next iteration.
        let mut ahead = chars.clone();
        if ahead.next() == Some('-') {
            if let Some(high) = read_range_end(&mut ahead) {
                chars = ahead;
                if (low..=high).contains(&c) {
                    return false;
                }
                continue;
            }
        }

        if low == c {
            return false;
        }
    }
    true
}
637
638/// Effective spacing role: word-like bracket tokens (`function`, `end`,
639/// `begin`, `done`, etc.) are structurally brackets (for indentation)
640/// but space like keywords (they need whitespace on both sides).
641pub(crate) fn effective_spacing_role(role: TokenRole, text: &str) -> TokenRole {
642 match role {
643 TokenRole::BracketOpen | TokenRole::BracketClose if is_word_like(text) => {
644 TokenRole::Keyword
645 }
646 other => other,
647 }
648}
649
650/// Role-pair spacing table. Determines whether a space separator
651/// should be inserted between two adjacent tokens based on their
652/// structural roles and word-likeness.
653pub(crate) fn needs_space_by_role(
654 last: TokenRole,
655 last_text: &str,
656 next: TokenRole,
657 next_text: &str,
658) -> bool {
659 let last = effective_spacing_role(last, last_text);
660 let next = effective_spacing_role(next, next_text);
661 match (last, next) {
662 // Immediate (IMMEDIATE_TOKEN) tokens are lexically glued to
663 // their neighbours on both sides (`0.5`, not `0 . 5`).
664 (TokenRole::Immediate, _) | (_, TokenRole::Immediate) => false,
665 // Brackets: tight on the inside
666 (TokenRole::BracketOpen, _) | (_, TokenRole::BracketClose) => false,
667 // Separators: tight before, space after
668 (_, TokenRole::Separator) => false,
669 (TokenRole::Separator, _) => true,
670 // Connectors: always tight (., ::, ->, etc.)
671 (TokenRole::Connector, _) | (_, TokenRole::Connector) => false,
672 // Terminal followed by bracket-open: tight (f() not f ())
673 (TokenRole::Terminal, TokenRole::BracketOpen) => false,
674 // Close followed by open: tight
675 (TokenRole::BracketClose, TokenRole::BracketOpen) => false,
676 // Keywords always spaced
677 (TokenRole::Keyword, _) | (_, TokenRole::Keyword) => true,
678 // Terminals and operators: space between
679 (TokenRole::Terminal, TokenRole::Terminal) => true,
680 (TokenRole::Terminal, TokenRole::Operator) | (TokenRole::Operator, TokenRole::Terminal) => {
681 true
682 }
683 (TokenRole::Operator, TokenRole::Operator) => true,
684 // Close followed by non-bracket: space
685 (TokenRole::BracketClose, _) => true,
686 // Operator before open: space
687 (TokenRole::Operator, TokenRole::BracketOpen) => true,
688 }
689}