Skip to main content

panache_parser/parser/blocks/
metadata.rs

1//! YAML metadata block parsing utilities.
2
3use crate::parser::diagnostics::{Diagnostics, SyntaxError, SyntaxErrorSource};
4use crate::parser::utils::helpers::{emit_line_tokens, strip_newline};
5use crate::parser::utils::tree_copy::copy_green_children;
6use crate::parser::yaml::{locate_yaml_diagnostic, parse_stream};
7use crate::syntax::SyntaxKind;
8use rowan::{GreenNodeBuilder, TextRange};
9
10/// Try to parse a YAML metadata block starting at the given position.
11/// Returns the new position after the block if successful, None otherwise.
12///
13/// A YAML block:
14/// - Starts with `---` (not followed by blank line)
15/// - Ends with `---` or `...`
16/// - At document start OR preceded by blank line
17pub(crate) fn try_parse_yaml_block(
18    lines: &[&str],
19    pos: usize,
20    builder: &mut GreenNodeBuilder<'static>,
21    at_document_start: bool,
22    diags: &Diagnostics,
23) -> Option<usize> {
24    let closing_pos = find_yaml_block_closing_pos(lines, pos, at_document_start)?;
25    emit_yaml_block(lines, pos, closing_pos, builder, diags)
26}
27
/// Locate the closing delimiter of a YAML metadata block that opens at `pos`.
///
/// Returns the index of the first line after `pos` whose trimmed text is
/// `---` or `...`, or `None` when the line at `pos` cannot open a YAML block
/// or no closing delimiter exists.
pub(crate) fn find_yaml_block_closing_pos(
    lines: &[&str],
    pos: usize,
    at_document_start: bool,
) -> Option<usize> {
    // The opening line must exist and be exactly `---` once trimmed.
    let opener = lines.get(pos)?;
    if opener.trim() != "---" {
        return None;
    }

    // Away from the document start, the block must follow a blank line.
    let needs_blank_before = !at_document_start && pos > 0;
    if needs_blank_before && !lines[pos - 1].trim().is_empty() {
        return None;
    }

    // A `---` with nothing after it, or followed by a blank line, is treated
    // as something else (most likely a horizontal rule), not YAML.
    let first_content = lines.get(pos + 1)?;
    if first_content.trim().is_empty() {
        return None;
    }

    // The first `---` / `...` after the opener closes the block.
    lines[pos + 1..]
        .iter()
        .position(|candidate| matches!(candidate.trim(), "---" | "..."))
        .map(|offset| pos + 1 + offset)
}
74
75pub(crate) fn emit_yaml_block(
76    lines: &[&str],
77    pos: usize,
78    closing_pos: usize,
79    builder: &mut GreenNodeBuilder<'static>,
80    diags: &Diagnostics,
81) -> Option<usize> {
82    if pos >= lines.len() || closing_pos <= pos || closing_pos >= lines.len() {
83        return None;
84    }
85    // Start metadata node
86    builder.start_node(SyntaxKind::YAML_METADATA.into());
87
88    // Opening delimiter - strip newline before emitting
89    let (text, newline_str) = strip_newline(lines[pos]);
90    builder.token(SyntaxKind::YAML_METADATA_DELIM.into(), text);
91    if !newline_str.is_empty() {
92        builder.token(SyntaxKind::NEWLINE.into(), newline_str);
93    }
94
95    builder.start_node(SyntaxKind::YAML_METADATA_CONTENT.into());
96    // Reconstruct the frontmatter content as a contiguous byte string. The
97    // lines returned by `split_lines_inclusive` are non-overlapping slices
98    // of the original input that retain their trailing LF / CRLF, so
99    // concatenating them rebuilds the source bytes between the delimiters
100    // exactly (including CRLF).
101    let mut content = String::new();
102    for content_line in lines.iter().take(closing_pos).skip(pos + 1) {
103        content.push_str(content_line);
104    }
105
106    // Embed the in-tree YAML CST under YAML_METADATA_CONTENT when the
107    // content validates. On validation failure, fall back to the
108    // opaque line-token shape so downstream re-parse (and the host
109    // CST snapshot of malformed YAML) keep their current behavior.
110    //
111    // `parse_stream` returns a `YAML_STREAM` wrapping one or more
112    // `YAML_DOCUMENT` children. The wrapper is the YAML-spec stream
113    // container — but inside frontmatter the host's
114    // `YAML_METADATA_CONTENT` already plays that role (and
115    // `find_yaml_block_closing_pos` guarantees a single document by
116    // stopping at the first internal `---` / `...`). Splice the stream's
117    // children in directly to avoid the redundant wrapper.
118    if let Some((diag, start_off, end_off)) = locate_yaml_diagnostic(&content, "") {
119        // Malformed frontmatter YAML: record the syntax error at its host
120        // position (the parser already has the verdict), then fall back to the
121        // opaque line-token shape. `content` begins at `lines[pos + 1]`, a
122        // subslice of the host input, so its host start is the pointer offset
123        // from line 0; offsets are identity (no per-line prefix).
124        let host_start = lines[pos + 1].as_ptr() as usize - lines[0].as_ptr() as usize;
125        diags.push(SyntaxError {
126            range: TextRange::new(
127                ((host_start + start_off) as u32).into(),
128                ((host_start + end_off) as u32).into(),
129            ),
130            message: diag.message.to_string(),
131            source: SyntaxErrorSource::Yaml,
132        });
133        for content_line in lines.iter().take(closing_pos).skip(pos + 1) {
134            emit_line_tokens(builder, content_line);
135        }
136    } else {
137        let stream_green = parse_stream(&content).green().into_owned();
138        copy_green_children(builder, &stream_green);
139    }
140    builder.finish_node(); // YAML_METADATA_CONTENT
141
142    let (closing_text, closing_newline) = strip_newline(lines[closing_pos]);
143    builder.token(SyntaxKind::YAML_METADATA_DELIM.into(), closing_text);
144    if !closing_newline.is_empty() {
145        builder.token(SyntaxKind::NEWLINE.into(), closing_newline);
146    }
147
148    builder.finish_node(); // YamlMetadata
149
150    Some(closing_pos + 1)
151}
152
153/// Try to parse a Pandoc title block starting at the beginning of document.
154/// Returns the new position after the block if successful, None otherwise.
155///
156/// A Pandoc title block:
157/// - Must be at document start (pos == 0)
158/// - Has 1-3 lines starting with `%`
159/// - Format: % title, % author(s), % date
160/// - Continuation lines start with leading space
161pub(crate) fn try_parse_pandoc_title_block(
162    lines: &[&str],
163    pos: usize,
164    builder: &mut GreenNodeBuilder<'static>,
165) -> Option<usize> {
166    if pos != 0 || lines.is_empty() {
167        return None;
168    }
169
170    let first_line = lines[0];
171    if !first_line.trim_start().starts_with('%') {
172        return None;
173    }
174
175    // Start title block node
176    builder.start_node(SyntaxKind::PANDOC_TITLE_BLOCK.into());
177
178    let mut current_pos = 0;
179    let mut field_count = 0;
180
181    // Parse up to 3 fields (title, author, date)
182    while current_pos < lines.len() && field_count < 3 {
183        let line = lines[current_pos];
184
185        // Check if this line starts a field (begins with %)
186        if line.trim_start().starts_with('%') {
187            emit_line_tokens(builder, line);
188            field_count += 1;
189            current_pos += 1;
190
191            // Collect continuation lines (start with leading space, not with %)
192            while current_pos < lines.len() {
193                let cont_line = lines[current_pos];
194                if cont_line.is_empty() {
195                    // Blank line ends title block
196                    break;
197                }
198                if cont_line.trim_start().starts_with('%') {
199                    // Next field
200                    break;
201                }
202                if cont_line.starts_with(' ') || cont_line.starts_with('\t') {
203                    // Continuation line
204                    emit_line_tokens(builder, cont_line);
205                    current_pos += 1;
206                } else {
207                    // Non-continuation, non-% line ends title block
208                    break;
209                }
210            }
211        } else {
212            // Line doesn't start with %, title block ends
213            break;
214        }
215    }
216
217    builder.finish_node(); // PandocTitleBlock
218
219    if field_count > 0 {
220        Some(current_pos)
221    } else {
222        None
223    }
224}
225
/// Split `line` at its first `:` into a `(key, value)` pair, both trimmed of
/// surrounding whitespace.
///
/// Returns `None` when the line contains no `:` or when the key is empty
/// after trimming. The value may be empty.
fn mmd_key_value(line: &str) -> Option<(String, String)> {
    line.split_once(':')
        .map(|(raw_key, raw_value)| (raw_key.trim(), raw_value.trim()))
        .filter(|(key, _)| !key.is_empty())
        .map(|(key, value)| (key.to_owned(), value.to_owned()))
}
234
235/// Try to parse a MultiMarkdown title block starting at the beginning of document.
236/// Returns the new position after the block if successful, None otherwise.
237///
238/// A MultiMarkdown title block:
239/// - Must be at document start (pos == 0)
240/// - Contains one or more `Key: Value` lines
241/// - The first field value must be non-empty
242/// - Continuation lines start with leading space or tab
243/// - Terminates with a blank line
244pub(crate) fn try_parse_mmd_title_block(
245    lines: &[&str],
246    pos: usize,
247    builder: &mut GreenNodeBuilder<'static>,
248) -> Option<usize> {
249    if pos != 0 || lines.is_empty() {
250        return None;
251    }
252
253    let mut current_pos = pos;
254
255    // First line must be a key-value pair with non-empty value.
256    let first = lines[current_pos];
257    let (_first_key, first_value) = mmd_key_value(first)?;
258    if first_value.is_empty() {
259        return None;
260    }
261
262    builder.start_node(SyntaxKind::MMD_TITLE_BLOCK.into());
263
264    while current_pos < lines.len() {
265        let line = lines[current_pos];
266
267        if line.trim().is_empty() {
268            break;
269        }
270
271        if mmd_key_value(line).is_none() {
272            builder.finish_node();
273            return None;
274        }
275
276        emit_line_tokens(builder, line);
277        current_pos += 1;
278
279        // Optional continuation lines (must be indented and not key-value starts).
280        while current_pos < lines.len() {
281            let cont_line = lines[current_pos];
282            if cont_line.trim().is_empty() {
283                break;
284            }
285
286            let trimmed = cont_line.trim_start();
287            if mmd_key_value(trimmed).is_some() {
288                break;
289            }
290
291            if cont_line.starts_with(' ') || cont_line.starts_with('\t') {
292                emit_line_tokens(builder, cont_line);
293                current_pos += 1;
294            } else {
295                builder.finish_node();
296                return None;
297            }
298        }
299    }
300
301    if current_pos >= lines.len() || !lines[current_pos].trim().is_empty() {
302        builder.finish_node();
303        return None;
304    }
305
306    emit_line_tokens(builder, lines[current_pos]);
307    current_pos += 1;
308
309    builder.finish_node(); // MMD_TITLE_BLOCK
310    Some(current_pos)
311}
312
#[cfg(test)]
mod tests {
    use super::*;

    // YAML metadata blocks, driven directly through the line-based API.

    #[test]
    fn test_yaml_block_at_start() {
        let lines = ["---", "title: Test", "---", "Content"];
        let diags = Diagnostics::default();
        let mut builder = GreenNodeBuilder::new();
        let next = try_parse_yaml_block(&lines, 0, &mut builder, true, &diags);
        assert_eq!(next, Some(3));
    }

    #[test]
    fn test_yaml_block_not_at_start() {
        let lines = ["Paragraph", "", "---", "title: Test", "---", "Content"];
        let diags = Diagnostics::default();
        let mut builder = GreenNodeBuilder::new();
        let next = try_parse_yaml_block(&lines, 2, &mut builder, false, &diags);
        assert_eq!(next, Some(5));
    }

    #[test]
    fn test_horizontal_rule_not_yaml() {
        // `---` followed by a blank line is not a YAML opener.
        let lines = ["---", "", "Content"];
        let diags = Diagnostics::default();
        let mut builder = GreenNodeBuilder::new();
        let next = try_parse_yaml_block(&lines, 0, &mut builder, true, &diags);
        assert_eq!(next, None);
    }

    #[test]
    fn test_yaml_with_dots_closer() {
        let lines = ["---", "title: Test", "...", "Content"];
        let diags = Diagnostics::default();
        let mut builder = GreenNodeBuilder::new();
        let next = try_parse_yaml_block(&lines, 0, &mut builder, true, &diags);
        assert_eq!(next, Some(3));
    }

    #[test]
    fn test_yaml_without_closing_delimiter_is_not_yaml_block() {
        let lines = ["---", "title: Test", "Content"];
        let diags = Diagnostics::default();
        let mut builder = GreenNodeBuilder::new();
        let next = try_parse_yaml_block(&lines, 0, &mut builder, true, &diags);
        assert_eq!(next, None);
    }

    #[test]
    fn test_find_yaml_block_closing_pos() {
        let lines = ["---", "title: Test", "---", "Content"];
        assert_eq!(find_yaml_block_closing_pos(&lines, 0, true), Some(2));
    }

    #[test]
    fn test_yaml_block_emits_content_node() {
        let input = "---\ntitle: Test\nlist:\n  - a\n---\n";
        let tree = crate::parse(input, Some(crate::ParserOptions::default()));
        let metadata = tree
            .descendants()
            .find(|node| node.kind() == SyntaxKind::YAML_METADATA)
            .expect("yaml metadata node");
        let content = metadata
            .children()
            .find(|node| node.kind() == SyntaxKind::YAML_METADATA_CONTENT)
            .expect("yaml metadata content node");
        assert_eq!(content.text().to_string(), "title: Test\nlist:\n  - a\n");
    }

    // Pandoc `%` title blocks.

    #[test]
    fn test_pandoc_title_simple() {
        let lines = ["% My Title", "% Author", "% Date", "", "Content"];
        let mut builder = GreenNodeBuilder::new();
        let next = try_parse_pandoc_title_block(&lines, 0, &mut builder);
        assert_eq!(next, Some(3));
    }

    #[test]
    fn test_pandoc_title_with_continuation() {
        let lines = [
            "% My Title",
            "  on multiple lines",
            "% Author One",
            "  Author Two",
            "% June 15, 2006",
            "",
            "Content",
        ];
        let mut builder = GreenNodeBuilder::new();
        let next = try_parse_pandoc_title_block(&lines, 0, &mut builder);
        assert_eq!(next, Some(5));
    }

    #[test]
    fn test_pandoc_title_partial() {
        let lines = ["% My Title", "%", "% June 15, 2006", "", "Content"];
        let mut builder = GreenNodeBuilder::new();
        let next = try_parse_pandoc_title_block(&lines, 0, &mut builder);
        assert_eq!(next, Some(3));
    }

    #[test]
    fn test_pandoc_title_not_at_start() {
        let lines = ["Content", "% Title"];
        let mut builder = GreenNodeBuilder::new();
        let next = try_parse_pandoc_title_block(&lines, 1, &mut builder);
        assert_eq!(next, None);
    }

    // MultiMarkdown `Key: Value` title blocks.

    #[test]
    fn test_mmd_title_simple() {
        let lines = ["Title: My Title", "Author: Jane Doe", "", "Content"];
        let mut builder = GreenNodeBuilder::new();
        let next = try_parse_mmd_title_block(&lines, 0, &mut builder);
        assert_eq!(next, Some(3));
    }

    #[test]
    fn test_mmd_title_with_continuation() {
        let lines = [
            "Title: My title",
            "Author: John Doe",
            "Comment: This is a sample mmd title block, with",
            "  a field spanning multiple lines.",
            "",
            "Body",
        ];
        let mut builder = GreenNodeBuilder::new();
        let next = try_parse_mmd_title_block(&lines, 0, &mut builder);
        assert_eq!(next, Some(5));
    }

    #[test]
    fn test_mmd_title_requires_non_empty_first_value() {
        let lines = ["Title:", "Author: Jane Doe", "", "Body"];
        let mut builder = GreenNodeBuilder::new();
        let next = try_parse_mmd_title_block(&lines, 0, &mut builder);
        assert_eq!(next, None);
    }

    #[test]
    fn test_mmd_title_requires_trailing_blank_line() {
        let lines = ["Title: My Title", "Author: Jane Doe"];
        let mut builder = GreenNodeBuilder::new();
        let next = try_parse_mmd_title_block(&lines, 0, &mut builder);
        assert_eq!(next, None);
    }

    #[test]
    fn test_mmd_title_not_at_start() {
        let lines = ["Body", "Title: My Title", ""];
        let mut builder = GreenNodeBuilder::new();
        let next = try_parse_mmd_title_block(&lines, 1, &mut builder);
        assert_eq!(next, None);
    }

    // End-to-end checks through the full parser.

    #[test]
    fn test_indented_yaml_delimiters_are_lossless() {
        let input = "    ---\n    title: Test\n    ...\n";
        let tree = crate::parse(input, Some(crate::ParserOptions::default()));
        assert_eq!(tree.text().to_string(), input);
    }

    #[test]
    fn test_valid_yaml_content_embeds_yaml_document_subtree() {
        let input = "---\ntitle: Test\nlist:\n  - a\n---\n";
        let tree = crate::parse(input, Some(crate::ParserOptions::default()));
        assert_eq!(tree.text().to_string(), input);

        let metadata = tree
            .descendants()
            .find(|node| node.kind() == SyntaxKind::YAML_METADATA);
        let content = metadata
            .and_then(|node| {
                node.children()
                    .find(|child| child.kind() == SyntaxKind::YAML_METADATA_CONTENT)
            })
            .expect("yaml metadata content node");

        // The content node stands in for the YAML stream, so the embedded
        // subtree starts directly with a YAML_DOCUMENT spanning the whole
        // content range and no YAML_STREAM wrapper appears below it.
        let first_child = content
            .children()
            .next()
            .expect("embedded yaml subtree child");
        assert_eq!(first_child.kind(), SyntaxKind::YAML_DOCUMENT);
        assert_eq!(first_child.text_range(), content.text_range());

        let has_stream_wrapper = content
            .descendants()
            .any(|node| node.kind() == SyntaxKind::YAML_STREAM);
        assert!(
            !has_stream_wrapper,
            "host embed should not carry the redundant YAML_STREAM wrapper"
        );
    }

    #[test]
    fn test_invalid_yaml_content_falls_back_to_line_tokens() {
        // An unterminated single-quoted scalar is rejected by the YAML
        // validator. The host parser must then keep the legacy line-token
        // shape so the tree stays lossless and the downstream re-parse
        // still reports the diagnostic.
        let input = "---\ntitle: 'unterminated\n---\n";
        let tree = crate::parse(input, Some(crate::ParserOptions::default()));
        assert_eq!(tree.text().to_string(), input);

        let metadata = tree
            .descendants()
            .find(|node| node.kind() == SyntaxKind::YAML_METADATA);
        let content = metadata
            .and_then(|node| {
                node.children()
                    .find(|child| child.kind() == SyntaxKind::YAML_METADATA_CONTENT)
            })
            .expect("yaml metadata content node");

        let embeds_document = content
            .children()
            .any(|child| child.kind() == SyntaxKind::YAML_DOCUMENT);
        assert!(
            !embeds_document,
            "invalid YAML must not embed a YAML_DOCUMENT subtree"
        );
    }
}