Skip to main content

panache_parser/parser/blocks/
metadata.rs

1//! YAML metadata block parsing utilities.
2
3use crate::options::Flavor;
4use crate::parser::diagnostics::{Diagnostics, SyntaxError, SyntaxErrorSource};
5use crate::parser::utils::helpers::{emit_line_tokens, strip_newline};
6use crate::parser::utils::tree_copy::copy_green_children;
7use crate::parser::yaml::{YamlValidationContext, locate_yaml_diagnostic_ctx, parse_stream};
8use crate::syntax::SyntaxKind;
9use rowan::{GreenNodeBuilder, TextRange};
10
11/// Try to parse a YAML metadata block starting at the given position.
12/// Returns the new position after the block if successful, None otherwise.
13///
14/// A YAML block:
15/// - Starts with `---` (not followed by blank line)
16/// - Ends with `---` or `...`
17/// - At document start OR preceded by blank line
18pub(crate) fn try_parse_yaml_block(
19    lines: &[&str],
20    pos: usize,
21    builder: &mut GreenNodeBuilder<'static>,
22    at_document_start: bool,
23    diags: &Diagnostics,
24    flavor: Flavor,
25) -> Option<usize> {
26    let closing_pos = find_yaml_block_closing_pos(lines, pos, at_document_start)?;
27    emit_yaml_block(lines, pos, closing_pos, builder, diags, flavor)
28}
29
/// Locate the closing fence of a YAML metadata block that opens at `lines[pos]`.
///
/// Nothing is emitted here; this is the pure "is there a block?" check.
/// Returns the index of the first `---` / `...` line after `pos`, or `None`
/// when any of the following holds:
/// - `pos` is past the end of `lines`
/// - `lines[pos]` is not `---` (surrounding whitespace ignored)
/// - `at_document_start` is false, `pos > 0`, and the previous line is not blank
/// - there is no following line, or the following line is blank
/// - no closing fence exists after the opening one
pub(crate) fn find_yaml_block_closing_pos(
    lines: &[&str],
    pos: usize,
    at_document_start: bool,
) -> Option<usize> {
    // The opening fence must exist and be exactly `---`.
    let opener = *lines.get(pos)?;
    if opener.trim() != "---" {
        return None;
    }

    // Away from the document start, the fence needs a blank line above it.
    let needs_blank_above = !at_document_start && pos > 0;
    if needs_blank_above && !lines[pos - 1].trim().is_empty() {
        return None;
    }

    // A fence with nothing after it cannot open a block, and a fence followed
    // by a blank line is most likely a horizontal rule.
    let following = *lines.get(pos + 1)?;
    if following.trim().is_empty() {
        return None;
    }

    // The first `---` / `...` after the opener closes the block; without one
    // this is not a YAML block at all.
    let body_start = pos + 1;
    lines[body_start..]
        .iter()
        .position(|candidate| matches!(candidate.trim(), "---" | "..."))
        .map(|offset| body_start + offset)
}
76
77pub(crate) fn emit_yaml_block(
78    lines: &[&str],
79    pos: usize,
80    closing_pos: usize,
81    builder: &mut GreenNodeBuilder<'static>,
82    diags: &Diagnostics,
83    flavor: Flavor,
84) -> Option<usize> {
85    if pos >= lines.len() || closing_pos <= pos || closing_pos >= lines.len() {
86        return None;
87    }
88    // Start metadata node
89    builder.start_node(SyntaxKind::YAML_METADATA.into());
90
91    // Opening delimiter - strip newline before emitting
92    let (text, newline_str) = strip_newline(lines[pos]);
93    builder.token(SyntaxKind::YAML_METADATA_DELIM.into(), text);
94    if !newline_str.is_empty() {
95        builder.token(SyntaxKind::NEWLINE.into(), newline_str);
96    }
97
98    builder.start_node(SyntaxKind::YAML_METADATA_CONTENT.into());
99    // Reconstruct the frontmatter content as a contiguous byte string. The
100    // lines returned by `split_lines_inclusive` are non-overlapping slices
101    // of the original input that retain their trailing LF / CRLF, so
102    // concatenating them rebuilds the source bytes between the delimiters
103    // exactly (including CRLF).
104    let mut content = String::new();
105    for content_line in lines.iter().take(closing_pos).skip(pos + 1) {
106        content.push_str(content_line);
107    }
108
109    // Embed the in-tree YAML CST under YAML_METADATA_CONTENT when the
110    // content validates. On validation failure, fall back to the
111    // opaque line-token shape so downstream re-parse (and the host
112    // CST snapshot of malformed YAML) keep their current behavior.
113    //
114    // `parse_stream` returns a `YAML_STREAM` wrapping one or more
115    // `YAML_DOCUMENT` children. The wrapper is the YAML-spec stream
116    // container — but inside frontmatter the host's
117    // `YAML_METADATA_CONTENT` already plays that role (and
118    // `find_yaml_block_closing_pos` guarantees a single document by
119    // stopping at the first internal `---` / `...`). Splice the stream's
120    // children in directly to avoid the redundant wrapper.
121    let yaml_ctx = YamlValidationContext::frontmatter(flavor);
122    if let Some((diag, start_off, end_off)) = locate_yaml_diagnostic_ctx(&content, "", yaml_ctx) {
123        // Malformed frontmatter YAML: record the syntax error at its host
124        // position (the parser already has the verdict), then fall back to the
125        // opaque line-token shape. `content` begins at `lines[pos + 1]`, a
126        // subslice of the host input, so its host start is the pointer offset
127        // from line 0; offsets are identity (no per-line prefix).
128        let host_start = lines[pos + 1].as_ptr() as usize - lines[0].as_ptr() as usize;
129        diags.push(SyntaxError {
130            range: TextRange::new(
131                ((host_start + start_off) as u32).into(),
132                ((host_start + end_off) as u32).into(),
133            ),
134            message: diag.message.to_string(),
135            source: SyntaxErrorSource::Yaml,
136        });
137        for content_line in lines.iter().take(closing_pos).skip(pos + 1) {
138            emit_line_tokens(builder, content_line);
139        }
140    } else {
141        let stream_green = parse_stream(&content).green().into_owned();
142        copy_green_children(builder, &stream_green);
143    }
144    builder.finish_node(); // YAML_METADATA_CONTENT
145
146    let (closing_text, closing_newline) = strip_newline(lines[closing_pos]);
147    builder.token(SyntaxKind::YAML_METADATA_DELIM.into(), closing_text);
148    if !closing_newline.is_empty() {
149        builder.token(SyntaxKind::NEWLINE.into(), closing_newline);
150    }
151
152    builder.finish_node(); // YamlMetadata
153
154    Some(closing_pos + 1)
155}
156
157/// Try to parse a Pandoc title block starting at the beginning of document.
158/// Returns the new position after the block if successful, None otherwise.
159///
160/// A Pandoc title block:
161/// - Must be at document start (pos == 0)
162/// - Has 1-3 lines starting with `%`
163/// - Format: % title, % author(s), % date
164/// - Continuation lines start with leading space
165pub(crate) fn try_parse_pandoc_title_block(
166    lines: &[&str],
167    pos: usize,
168    builder: &mut GreenNodeBuilder<'static>,
169) -> Option<usize> {
170    if pos != 0 || lines.is_empty() {
171        return None;
172    }
173
174    let first_line = lines[0];
175    if !first_line.trim_start().starts_with('%') {
176        return None;
177    }
178
179    // Start title block node
180    builder.start_node(SyntaxKind::PANDOC_TITLE_BLOCK.into());
181
182    let mut current_pos = 0;
183    let mut field_count = 0;
184
185    // Parse up to 3 fields (title, author, date)
186    while current_pos < lines.len() && field_count < 3 {
187        let line = lines[current_pos];
188
189        // Check if this line starts a field (begins with %)
190        if line.trim_start().starts_with('%') {
191            emit_line_tokens(builder, line);
192            field_count += 1;
193            current_pos += 1;
194
195            // Collect continuation lines (start with leading space, not with %)
196            while current_pos < lines.len() {
197                let cont_line = lines[current_pos];
198                if cont_line.is_empty() {
199                    // Blank line ends title block
200                    break;
201                }
202                if cont_line.trim_start().starts_with('%') {
203                    // Next field
204                    break;
205                }
206                if cont_line.starts_with(' ') || cont_line.starts_with('\t') {
207                    // Continuation line
208                    emit_line_tokens(builder, cont_line);
209                    current_pos += 1;
210                } else {
211                    // Non-continuation, non-% line ends title block
212                    break;
213                }
214            }
215        } else {
216            // Line doesn't start with %, title block ends
217            break;
218        }
219    }
220
221    builder.finish_node(); // PandocTitleBlock
222
223    if field_count > 0 {
224        Some(current_pos)
225    } else {
226        None
227    }
228}
229
/// Split a `Key: Value` line at its first `:`.
///
/// Both halves are trimmed of surrounding whitespace. Returns `None` when the
/// line has no `:` or when the key is empty after trimming; an empty value is
/// allowed.
fn mmd_key_value(line: &str) -> Option<(String, String)> {
    let colon = line.find(':')?;
    let key = line[..colon].trim();
    let value = line[colon + 1..].trim();
    (!key.is_empty()).then(|| (key.to_owned(), value.to_owned()))
}
238
239/// Try to parse a MultiMarkdown title block starting at the beginning of document.
240/// Returns the new position after the block if successful, None otherwise.
241///
242/// A MultiMarkdown title block:
243/// - Must be at document start (pos == 0)
244/// - Contains one or more `Key: Value` lines
245/// - The first field value must be non-empty
246/// - Continuation lines start with leading space or tab
247/// - Terminates with a blank line
248pub(crate) fn try_parse_mmd_title_block(
249    lines: &[&str],
250    pos: usize,
251    builder: &mut GreenNodeBuilder<'static>,
252) -> Option<usize> {
253    if pos != 0 || lines.is_empty() {
254        return None;
255    }
256
257    let mut current_pos = pos;
258
259    // First line must be a key-value pair with non-empty value.
260    let first = lines[current_pos];
261    let (_first_key, first_value) = mmd_key_value(first)?;
262    if first_value.is_empty() {
263        return None;
264    }
265
266    builder.start_node(SyntaxKind::MMD_TITLE_BLOCK.into());
267
268    while current_pos < lines.len() {
269        let line = lines[current_pos];
270
271        if line.trim().is_empty() {
272            break;
273        }
274
275        if mmd_key_value(line).is_none() {
276            builder.finish_node();
277            return None;
278        }
279
280        emit_line_tokens(builder, line);
281        current_pos += 1;
282
283        // Optional continuation lines (must be indented and not key-value starts).
284        while current_pos < lines.len() {
285            let cont_line = lines[current_pos];
286            if cont_line.trim().is_empty() {
287                break;
288            }
289
290            let trimmed = cont_line.trim_start();
291            if mmd_key_value(trimmed).is_some() {
292                break;
293            }
294
295            if cont_line.starts_with(' ') || cont_line.starts_with('\t') {
296                emit_line_tokens(builder, cont_line);
297                current_pos += 1;
298            } else {
299                builder.finish_node();
300                return None;
301            }
302        }
303    }
304
305    if current_pos >= lines.len() || !lines[current_pos].trim().is_empty() {
306        builder.finish_node();
307        return None;
308    }
309
310    emit_line_tokens(builder, lines[current_pos]);
311    current_pos += 1;
312
313    builder.finish_node(); // MMD_TITLE_BLOCK
314    Some(current_pos)
315}
316
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `try_parse_yaml_block` against a scratch builder with fresh
    /// diagnostics and the Pandoc flavor.
    fn yaml_block_end(lines: &[&str], pos: usize, at_document_start: bool) -> Option<usize> {
        let diags = Diagnostics::default();
        let mut builder = GreenNodeBuilder::new();
        try_parse_yaml_block(
            lines,
            pos,
            &mut builder,
            at_document_start,
            &diags,
            Flavor::Pandoc,
        )
    }

    /// Runs `try_parse_pandoc_title_block` against a scratch builder.
    fn pandoc_title_end(lines: &[&str], pos: usize) -> Option<usize> {
        let mut builder = GreenNodeBuilder::new();
        try_parse_pandoc_title_block(lines, pos, &mut builder)
    }

    /// Runs `try_parse_mmd_title_block` against a scratch builder.
    fn mmd_title_end(lines: &[&str], pos: usize) -> Option<usize> {
        let mut builder = GreenNodeBuilder::new();
        try_parse_mmd_title_block(lines, pos, &mut builder)
    }

    // ---- YAML metadata blocks -------------------------------------------

    #[test]
    fn test_yaml_block_at_start() {
        let end = yaml_block_end(&["---", "title: Test", "---", "Content"], 0, true);
        assert_eq!(end, Some(3));
    }

    #[test]
    fn test_yaml_block_not_at_start() {
        let doc = ["Paragraph", "", "---", "title: Test", "---", "Content"];
        assert_eq!(yaml_block_end(&doc, 2, false), Some(5));
    }

    #[test]
    fn test_horizontal_rule_not_yaml() {
        // Followed by blank line, so not YAML
        assert_eq!(yaml_block_end(&["---", "", "Content"], 0, true), None);
    }

    #[test]
    fn test_yaml_with_dots_closer() {
        let end = yaml_block_end(&["---", "title: Test", "...", "Content"], 0, true);
        assert_eq!(end, Some(3));
    }

    #[test]
    fn test_yaml_without_closing_delimiter_is_not_yaml_block() {
        let end = yaml_block_end(&["---", "title: Test", "Content"], 0, true);
        assert_eq!(end, None);
    }

    #[test]
    fn test_find_yaml_block_closing_pos() {
        let doc = ["---", "title: Test", "---", "Content"];
        assert_eq!(find_yaml_block_closing_pos(&doc, 0, true), Some(2));
    }

    #[test]
    fn test_yaml_block_emits_content_node() {
        let tree = crate::parse(
            "---\ntitle: Test\nlist:\n  - a\n---\n",
            Some(crate::ParserOptions::default()),
        );
        let metadata = tree
            .descendants()
            .find(|node| node.kind() == SyntaxKind::YAML_METADATA)
            .expect("yaml metadata node");
        let content = metadata
            .children()
            .find(|node| node.kind() == SyntaxKind::YAML_METADATA_CONTENT)
            .expect("yaml metadata content node");
        assert_eq!(content.text().to_string(), "title: Test\nlist:\n  - a\n");
    }

    // ---- Pandoc title blocks --------------------------------------------

    #[test]
    fn test_pandoc_title_simple() {
        let doc = ["% My Title", "% Author", "% Date", "", "Content"];
        assert_eq!(pandoc_title_end(&doc, 0), Some(3));
    }

    #[test]
    fn test_pandoc_title_with_continuation() {
        let doc = [
            "% My Title",
            "  on multiple lines",
            "% Author One",
            "  Author Two",
            "% June 15, 2006",
            "",
            "Content",
        ];
        assert_eq!(pandoc_title_end(&doc, 0), Some(5));
    }

    #[test]
    fn test_pandoc_title_partial() {
        let doc = ["% My Title", "%", "% June 15, 2006", "", "Content"];
        assert_eq!(pandoc_title_end(&doc, 0), Some(3));
    }

    #[test]
    fn test_pandoc_title_not_at_start() {
        assert_eq!(pandoc_title_end(&["Content", "% Title"], 1), None);
    }

    // ---- MultiMarkdown title blocks -------------------------------------

    #[test]
    fn test_mmd_title_simple() {
        let doc = ["Title: My Title", "Author: Jane Doe", "", "Content"];
        assert_eq!(mmd_title_end(&doc, 0), Some(3));
    }

    #[test]
    fn test_mmd_title_with_continuation() {
        let doc = [
            "Title: My title",
            "Author: John Doe",
            "Comment: This is a sample mmd title block, with",
            "  a field spanning multiple lines.",
            "",
            "Body",
        ];
        assert_eq!(mmd_title_end(&doc, 0), Some(5));
    }

    #[test]
    fn test_mmd_title_requires_non_empty_first_value() {
        let doc = ["Title:", "Author: Jane Doe", "", "Body"];
        assert_eq!(mmd_title_end(&doc, 0), None);
    }

    #[test]
    fn test_mmd_title_requires_trailing_blank_line() {
        let doc = ["Title: My Title", "Author: Jane Doe"];
        assert_eq!(mmd_title_end(&doc, 0), None);
    }

    #[test]
    fn test_mmd_title_not_at_start() {
        assert_eq!(mmd_title_end(&["Body", "Title: My Title", ""], 1), None);
    }

    // ---- Whole-document checks ------------------------------------------

    #[test]
    fn test_indented_yaml_delimiters_are_lossless() {
        let input = "    ---\n    title: Test\n    ...\n";
        let tree = crate::parse(input, Some(crate::ParserOptions::default()));
        assert_eq!(tree.text().to_string(), input);
    }

    #[test]
    fn test_valid_yaml_content_embeds_yaml_document_subtree() {
        let input = "---\ntitle: Test\nlist:\n  - a\n---\n";
        let tree = crate::parse(input, Some(crate::ParserOptions::default()));
        assert_eq!(tree.text().to_string(), input);

        let metadata = tree
            .descendants()
            .find(|node| node.kind() == SyntaxKind::YAML_METADATA);
        let content = metadata
            .and_then(|node| {
                node.children()
                    .find(|child| child.kind() == SyntaxKind::YAML_METADATA_CONTENT)
            })
            .expect("yaml metadata content node");

        // YAML_METADATA_CONTENT stands in for the stream, so the YAML_STREAM
        // wrapper is dropped on embedding and the direct child is the
        // YAML_DOCUMENT spanning the whole content range.
        let first_child = content
            .children()
            .next()
            .expect("embedded yaml subtree child");
        assert_eq!(first_child.kind(), SyntaxKind::YAML_DOCUMENT);
        assert_eq!(first_child.text_range(), content.text_range());

        let has_stream_wrapper = content
            .descendants()
            .any(|node| node.kind() == SyntaxKind::YAML_STREAM);
        assert!(
            !has_stream_wrapper,
            "host embed should not carry the redundant YAML_STREAM wrapper"
        );
    }

    #[test]
    fn test_invalid_yaml_content_falls_back_to_line_tokens() {
        // The YAML validator rejects an unterminated single-quoted scalar.
        // The host parser must then keep the legacy line-token shape so the
        // tree stays lossless and the downstream re-parse still reports the
        // diagnostic.
        let input = "---\ntitle: 'unterminated\n---\n";
        let tree = crate::parse(input, Some(crate::ParserOptions::default()));
        assert_eq!(tree.text().to_string(), input);

        let metadata = tree
            .descendants()
            .find(|node| node.kind() == SyntaxKind::YAML_METADATA);
        let content = metadata
            .and_then(|node| {
                node.children()
                    .find(|child| child.kind() == SyntaxKind::YAML_METADATA_CONTENT)
            })
            .expect("yaml metadata content node");

        let embeds_document = content
            .children()
            .any(|child| child.kind() == SyntaxKind::YAML_DOCUMENT);
        assert!(
            !embeds_document,
            "invalid YAML must not embed a YAML_DOCUMENT subtree"
        );
    }
}