Skip to main content

rumdl_lib/utils/
quarto_chunks.rs

1//! Parser for Quarto / RMarkdown executable code chunk metadata.
2//!
3//! Two label sources, both supported:
4//! 1. Inline info string: ` ```{r, label="setup", echo=FALSE} `
5//! 2. Hashpipe chunk options inside the block body: `#| label: setup`
6//!
7//! The inline form supports three shapes:
8//! - Bare label as the first positional argument: `{r setup}` or `{r several words}`
9//!   (multiple bare words before any `key=value` are treated as a whitespace-
10//!   separated label; this is also how the linter detects spaces in labels).
11//! - Explicit `label=value`: `{r, label=setup}` or `{r, label="my label"}`.
12//! - Mixed forms like `{r setup, echo=FALSE}`.
13//!
14//! The grammar reflects how knitr/Quarto themselves parse chunk headers. We do
15//! not aim for full knitr fidelity; the goal is to recognise the patterns that
16//! drive the two lint rules using this helper (MD078, MD079).
17
/// Where a parsed chunk label came from.
///
/// Mirrors panache's `ChunkLabelSource`. Keeping the origin lets rules tell
/// spaces in an inline-positional label (the strongest hint of a typo) apart
/// from spaces that sit inside a quoted string.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ChunkLabelSource {
    /// Bare positional word that precedes every `key=value`, as in `{r setup}`.
    InlinePositional,
    /// Value of an explicit `label=` argument, as in `{r, label=setup}` or
    /// `{r, label="my label"}`.
    InlineKey,
    /// `#| label: setup` hashpipe option found inside the block body.
    Hashpipe,
}
30
31/// One label found while parsing a chunk header or body.
32#[derive(Debug, Clone, PartialEq, Eq)]
33pub struct ChunkLabel {
34    pub value: String,
35    pub source: ChunkLabelSource,
36}
37
38/// Parsed inline chunk header — the part inside `{...}`.
39#[derive(Debug, Clone, PartialEq, Eq)]
40pub struct InlineChunkHeader {
41    /// Engine name, e.g. `r`, `python`. Empty if absent (malformed header).
42    pub engine: String,
43    /// Labels in declaration order. `InlinePositional` entries come first; if
44    /// multiple bare positionals appear before the first `key=value`, they are
45    /// all returned so MD079 can flag the implicit-spaces case.
46    pub labels: Vec<ChunkLabel>,
47}
48
49/// Try to parse the info string of a fenced code block as a Quarto inline
50/// chunk header. Accepts both `{r}` and `{r, label=foo}` shapes; returns
51/// `None` for plain display blocks like ` ```r `.
52pub fn parse_inline_chunk_header(info_string: &str) -> Option<InlineChunkHeader> {
53    let trimmed = info_string.trim();
54    let inner = trimmed.strip_prefix('{')?.strip_suffix('}')?;
55
56    let mut tokens = tokenize_chunk_args(inner);
57
58    // The engine must be a bare identifier at the head of the token stream.
59    // A leading `key=value` (e.g. `{=html}`) or no tokens at all means no
60    // engine, which `is_executable_chunk` uses to reject Pandoc raw fences.
61    let engine = match tokens.next() {
62        Some(tok) if matches!(tok.kind, TokenKind::Bare) => tok.value,
63        _ => String::new(),
64    };
65
66    let mut labels: Vec<ChunkLabel> = Vec::new();
67    let mut seen_kv = false;
68    for tok in tokens {
69        match tok.kind {
70            TokenKind::Bare => {
71                // Bare words before any key=value act as positional labels.
72                // Bare words AFTER the first key=value are not labels (knitr
73                // ignores stray bareword options).
74                if !seen_kv {
75                    labels.push(ChunkLabel {
76                        value: tok.value,
77                        source: ChunkLabelSource::InlinePositional,
78                    });
79                }
80            }
81            TokenKind::KeyValue { key } => {
82                seen_kv = true;
83                if key.eq_ignore_ascii_case("label") {
84                    labels.push(ChunkLabel {
85                        value: tok.value,
86                        source: ChunkLabelSource::InlineKey,
87                    });
88                }
89            }
90        }
91    }
92
93    Some(InlineChunkHeader { engine, labels })
94}
95
96/// Scan the body of a fenced code block for hashpipe label options
97/// (`#| label: setup`).
98///
99/// Only the contiguous run of hashpipe lines at the top of the block is
100/// inspected, matching Quarto's own behaviour: chunk options must appear
101/// before any code.
102pub fn parse_hashpipe_labels(body: &str) -> Vec<ChunkLabel> {
103    let mut out = Vec::new();
104    for line in body.lines() {
105        let Some(after) = line.trim_start().strip_prefix("#|") else {
106            // First non-hashpipe, non-blank line ends the option block.
107            if line.trim().is_empty() {
108                continue;
109            }
110            break;
111        };
112        let Some((key, value)) = after.split_once(':') else {
113            continue;
114        };
115        if !key.trim().eq_ignore_ascii_case("label") {
116            continue;
117        }
118        let value = value.trim().trim_matches(|c| c == '"' || c == '\'');
119        if value.is_empty() {
120            continue;
121        }
122        out.push(ChunkLabel {
123            value: value.to_string(),
124            source: ChunkLabelSource::Hashpipe,
125        });
126    }
127    out
128}
129
130/// Return `true` if the chunk header denotes an *executable* Quarto chunk.
131///
132/// Executable engines are identifiers like `r`, `python`, `julia`, `bash` -
133/// the first character is ASCII alphabetic. Pandoc attribute fences such as
134/// `{.python}` (display block with a class) and raw-format fences like
135/// `{=html}` are not executable and must not be flagged by MD078/MD079.
136pub fn is_executable_chunk(info_string: &str) -> bool {
137    parse_inline_chunk_header(info_string)
138        .is_some_and(|h| h.engine.chars().next().is_some_and(|c| c.is_ascii_alphabetic()))
139}
140
/// Shape of one lexed chunk-header argument.
#[derive(Clone, Debug, Eq, PartialEq)]
enum TokenKind {
    /// A standalone word or quoted string that is not followed by `=`.
    Bare,
    /// A `key=value` argument; `key` holds the text to the left of `=`.
    KeyValue { key: String },
}
146
147#[derive(Debug, Clone, PartialEq, Eq)]
148struct Token {
149    value: String,
150    kind: TokenKind,
151}
152
153/// Tokenize the body of a chunk header. Arguments are separated by commas or
154/// whitespace; quoted strings preserve their interior (including spaces).
155///
156/// Returns an iterator over tokens. Each token is either a bare word or a
157/// `key=value` pair (with the value unquoted).
158fn tokenize_chunk_args(input: &str) -> impl Iterator<Item = Token> + '_ {
159    ChunkArgIter {
160        input,
161        bytes: input.as_bytes(),
162        pos: 0,
163    }
164}
165
/// Cursor-based scanner over the text of a chunk header.
struct ChunkArgIter<'a> {
    /// The header text; emitted token text is sliced from it.
    input: &'a str,
    /// Byte view used for scanning; `tokenize_chunk_args` sets it to
    /// `input.as_bytes()`.
    bytes: &'a [u8],
    /// Byte offset of the next unread position.
    pos: usize,
}
171
172impl Iterator for ChunkArgIter<'_> {
173    type Item = Token;
174
175    fn next(&mut self) -> Option<Token> {
176        self.skip_separators();
177        if self.pos >= self.bytes.len() {
178            return None;
179        }
180
181        // A quoted string at this position is a standalone bare token.
182        if matches!(self.bytes[self.pos], b'"' | b'\'') {
183            let value = self.read_quoted();
184            return Some(Token {
185                value,
186                kind: TokenKind::Bare,
187            });
188        }
189
190        // Read a key or bare word: run of non-separator, non-`=`, non-quote chars.
191        let key_start = self.pos;
192        while self.pos < self.bytes.len() {
193            let b = self.bytes[self.pos];
194            if b == b',' || b == b'=' || b == b'"' || b == b'\'' || b.is_ascii_whitespace() {
195                break;
196            }
197            self.pos += 1;
198        }
199        let key = &self.input[key_start..self.pos];
200
201        // Allow optional whitespace between key and `=` so `label = setup`
202        // parses as a key/value, matching knitr's tolerance for spacing.
203        let lookahead = self.skip_inline_whitespace_peek();
204        if lookahead.is_none_or(|b| b != b'=') {
205            return Some(Token {
206                value: key.to_string(),
207                kind: TokenKind::Bare,
208            });
209        }
210
211        // Consume `=` and any whitespace before the value.
212        self.pos += 1;
213        self.skip_inline_whitespace();
214
215        let value = match self.bytes.get(self.pos).copied() {
216            Some(b'"') | Some(b'\'') => self.read_quoted(),
217            Some(_) => {
218                let val_start = self.pos;
219                while self.pos < self.bytes.len() {
220                    let b = self.bytes[self.pos];
221                    if b == b',' || b.is_ascii_whitespace() {
222                        break;
223                    }
224                    self.pos += 1;
225                }
226                self.input[val_start..self.pos].to_string()
227            }
228            None => String::new(),
229        };
230
231        Some(Token {
232            value,
233            kind: TokenKind::KeyValue { key: key.to_string() },
234        })
235    }
236}
237
238impl ChunkArgIter<'_> {
239    fn skip_separators(&mut self) {
240        while self.pos < self.bytes.len() {
241            let b = self.bytes[self.pos];
242            if b == b',' || b.is_ascii_whitespace() {
243                self.pos += 1;
244            } else {
245                break;
246            }
247        }
248    }
249
250    fn skip_inline_whitespace(&mut self) {
251        while self.pos < self.bytes.len() && self.bytes[self.pos].is_ascii_whitespace() {
252            self.pos += 1;
253        }
254    }
255
256    /// Peek past inline whitespace without committing the advance. Returns the
257    /// next non-whitespace byte if any. Used to look for `=` after a key.
258    fn skip_inline_whitespace_peek(&mut self) -> Option<u8> {
259        let saved = self.pos;
260        self.skip_inline_whitespace();
261        let next = self.bytes.get(self.pos).copied();
262        if next != Some(b'=') {
263            self.pos = saved;
264        }
265        next
266    }
267
268    /// Consume a quoted string starting at the current position. Advances `pos`
269    /// past the closing quote (or to end of input if unterminated). Always
270    /// advances at least one byte, so callers cannot livelock on a stray quote.
271    fn read_quoted(&mut self) -> String {
272        let q = self.bytes[self.pos];
273        self.pos += 1;
274        let start = self.pos;
275        while self.pos < self.bytes.len() && self.bytes[self.pos] != q {
276            self.pos += 1;
277        }
278        let val = self.input[start..self.pos].to_string();
279        if self.pos < self.bytes.len() {
280            self.pos += 1;
281        }
282        val
283    }
284}
285
/// Unit tests for the chunk-header parser, the hashpipe scanner and the
/// executable-chunk predicate.
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `info` as an inline chunk header, panicking if it is rejected.
    fn header(info: &str) -> InlineChunkHeader {
        parse_inline_chunk_header(info).expect("should parse")
    }

    #[test]
    fn plain_display_block_is_not_a_chunk_header() {
        assert!(parse_inline_chunk_header("r").is_none());
        assert!(parse_inline_chunk_header("python").is_none());
        assert!(parse_inline_chunk_header("").is_none());
    }

    #[test]
    fn bare_engine_has_no_label() {
        let h = header("{r}");
        assert_eq!(h.engine, "r");
        assert!(h.labels.is_empty());
    }

    #[test]
    fn inline_positional_label() {
        let h = header("{r setup}");
        assert_eq!(h.engine, "r");
        assert_eq!(h.labels.len(), 1);
        assert_eq!(h.labels[0].value, "setup");
        assert_eq!(h.labels[0].source, ChunkLabelSource::InlinePositional);
    }

    #[test]
    fn multiple_bare_words_are_all_positional() {
        let h = header("{r several words}");
        assert_eq!(h.engine, "r");
        let vals: Vec<&str> = h.labels.iter().map(|l| l.value.as_str()).collect();
        assert_eq!(vals, vec!["several", "words"]);
        assert!(h.labels.iter().all(|l| l.source == ChunkLabelSource::InlinePositional));
    }

    #[test]
    fn explicit_label_key() {
        let h = header("{r, label=setup}");
        assert_eq!(h.engine, "r");
        assert_eq!(h.labels.len(), 1);
        assert_eq!(h.labels[0].value, "setup");
        assert_eq!(h.labels[0].source, ChunkLabelSource::InlineKey);
    }

    #[test]
    fn quoted_label_with_spaces() {
        let h = header(r#"{r, label="my label"}"#);
        assert_eq!(h.labels.len(), 1);
        assert_eq!(h.labels[0].value, "my label");
        assert_eq!(h.labels[0].source, ChunkLabelSource::InlineKey);
    }

    #[test]
    fn positional_then_options_only_collects_first_as_label() {
        let h = header("{r setup, echo=FALSE}");
        assert_eq!(h.labels.len(), 1);
        assert_eq!(h.labels[0].value, "setup");
        assert_eq!(h.labels[0].source, ChunkLabelSource::InlinePositional);
    }

    #[test]
    fn bareword_after_kv_is_not_a_label() {
        // knitr ignores stray barewords after the first kv; we must not treat
        // them as labels or MD079 would falsely flag them.
        let h = header("{r, echo=FALSE stray}");
        assert!(h.labels.is_empty());
    }

    #[test]
    fn hashpipe_label_is_picked_up() {
        let labels = parse_hashpipe_labels("#| label: setup\n#| echo: false\n1 + 1\n");
        assert_eq!(labels.len(), 1);
        assert_eq!(labels[0].value, "setup");
        assert_eq!(labels[0].source, ChunkLabelSource::Hashpipe);
    }

    #[test]
    fn hashpipe_label_with_quotes() {
        let labels = parse_hashpipe_labels("#| label: \"setup\"\n");
        assert_eq!(labels.len(), 1);
        assert_eq!(labels[0].value, "setup");
    }

    #[test]
    fn hashpipe_options_must_be_at_top_of_block() {
        // Once real code appears, later #| comments are not options.
        let labels = parse_hashpipe_labels("1 + 1\n#| label: too-late\n");
        assert!(labels.is_empty());
    }

    #[test]
    fn hashpipe_blank_lines_at_top_are_skipped() {
        let labels = parse_hashpipe_labels("\n#| label: setup\n");
        assert_eq!(labels.len(), 1);
        assert_eq!(labels[0].value, "setup");
    }

    #[test]
    fn hashpipe_value_without_colon_is_ignored() {
        let labels = parse_hashpipe_labels("#| label\n");
        assert!(labels.is_empty());
    }

    #[test]
    fn hashpipe_empty_value_is_ignored() {
        let labels = parse_hashpipe_labels("#| label:\n");
        assert!(labels.is_empty());
    }

    #[test]
    fn is_executable_chunk_recognises_braced_engines() {
        assert!(is_executable_chunk("{r}"));
        assert!(is_executable_chunk("{python}"));
        assert!(is_executable_chunk("{r, label=foo}"));
        assert!(!is_executable_chunk("r"));
        assert!(!is_executable_chunk("python"));
        assert!(!is_executable_chunk(""));
    }

    #[test]
    fn is_executable_chunk_rejects_empty_engine() {
        // `{}`, `{ }` and `{ , label=foo}` have no engine.
        assert!(!is_executable_chunk("{}"));
        assert!(!is_executable_chunk("{ }"));
        assert!(!is_executable_chunk("{ , label=foo}"));
    }

    #[test]
    fn pandoc_attribute_fences_are_not_executable() {
        // `{.python}` is a display block with a class, not an executable chunk.
        assert!(!is_executable_chunk("{.python}"));
        assert!(!is_executable_chunk("{.haskell .numberLines}"));
        assert!(!is_executable_chunk("{#snippet .python startFrom=\"10\"}"));
    }

    #[test]
    fn pandoc_raw_format_fences_are_not_executable() {
        // `{=html}`, `{=latex}` are raw-format blocks, not executable chunks.
        assert!(!is_executable_chunk("{=html}"));
        assert!(!is_executable_chunk("{=latex}"));
    }

    #[test]
    fn spaces_around_equals_in_key_value() {
        // knitr accepts `label = setup`; we must parse the assignment, not
        // treat `label` as a bare positional.
        let h = header("{r, label = setup}");
        assert_eq!(h.labels.len(), 1);
        assert_eq!(h.labels[0].value, "setup");
        assert_eq!(h.labels[0].source, ChunkLabelSource::InlineKey);
    }

    #[test]
    fn spaces_around_equals_with_quoted_value() {
        let h = header(r#"{r, label = "my label"}"#);
        assert_eq!(h.labels.len(), 1);
        assert_eq!(h.labels[0].value, "my label");
        assert_eq!(h.labels[0].source, ChunkLabelSource::InlineKey);
    }

    #[test]
    fn quoted_bare_token_does_not_livelock() {
        // A quoted positional like `{r "setup"}` must terminate parsing.
        let h = header(r#"{r "setup"}"#);
        assert_eq!(h.engine, "r");
        // The quoted bare token is captured as a positional label.
        assert_eq!(h.labels.len(), 1);
        assert_eq!(h.labels[0].value, "setup");
        assert_eq!(h.labels[0].source, ChunkLabelSource::InlinePositional);
    }

    #[test]
    fn stray_quote_does_not_livelock() {
        // Malformed header with an unterminated quote must still terminate.
        let h = header(r#"{r, label="oops}"#);
        assert_eq!(h.engine, "r");
        // The unterminated string captures the rest as the value.
        assert_eq!(h.labels.len(), 1);
        assert_eq!(h.labels[0].value, "oops");
        assert_eq!(h.labels[0].source, ChunkLabelSource::InlineKey);
    }
}