Skip to main content

panache_parser/parser/blocks/
headings.rs

1//! ATX heading parsing utilities.
2
3use crate::options::ParserOptions;
4use crate::syntax::SyntaxKind;
5use rowan::GreenNodeBuilder;
6
7use crate::parser::utils::attributes::{
8    emit_attribute_node, try_parse_trailing_attributes_with_pos,
9};
10use crate::parser::utils::helpers::trim_end_spaces_tabs;
11use crate::parser::utils::inline_emission;
12
13fn try_parse_mmd_header_identifier_with_pos(content: &str) -> Option<(String, usize, usize)> {
14    let trimmed = trim_end_spaces_tabs(content);
15    let end = trimmed.len();
16    let bytes = trimmed.as_bytes();
17
18    if end == 0 || bytes[end - 1] != b']' {
19        return None;
20    }
21
22    let start = trimmed[..end - 1].rfind('[')?;
23    let raw = &trimmed[start..end];
24    let inner = &raw[1..raw.len() - 1];
25    if inner.trim().is_empty() {
26        return None;
27    }
28
29    let normalized = inner.split_whitespace().collect::<String>().to_lowercase();
30    if normalized.is_empty() {
31        return None;
32    }
33
34    Some((normalized, start, end))
35}
36
/// Detect an ATX heading and report its level.
///
/// `content` may be a full source line: one trailing `\r\n` or `\n` is ignored,
/// so empty headings such as `##\n` are recognised.
///
/// A line qualifies when, after its leading whitespace, it starts with a run of
/// one to six `#` characters that is followed by a space, a tab, or the end of
/// the line, and the leading whitespace is at most three bytes long.
///
/// NOTE: the indent is measured as the byte length of whatever `trim_start`
/// removes, so a tab counts as one, not as four columns.
///
/// Returns `Some(level)` with `level` in `1..=6`, otherwise `None`.
pub fn try_parse_atx_heading(content: &str) -> Option<usize> {
    let line = content
        .strip_suffix("\r\n")
        .or_else(|| content.strip_suffix('\n'))
        .unwrap_or(content);

    let body = line.trim_start();
    let indent = line.len() - body.len();

    // `#` is ASCII, so counting bytes equals counting characters here.
    let level = body.bytes().take_while(|&b| b == b'#').count();
    if !(1..=6).contains(&level) {
        return None;
    }

    // The marker must be terminated by end-of-line, a space, or a tab.
    let terminated = match body.as_bytes().get(level) {
        None => true,
        Some(&next) => next == b' ' || next == b'\t',
    };
    if !terminated {
        return None;
    }

    // More than three leading bytes of whitespace disqualifies the line.
    if indent > 3 {
        return None;
    }

    Some(level)
}
71
72/// Try to parse a setext heading from lines, returns (level, underline_char) if found.
73///
74/// Setext headings consist of:
75/// 1. A non-empty text line (heading content)
76/// 2. An underline of `=` (level 1) or `-` (level 2) characters
77///
78/// Rules:
79/// - Underline can be any non-zero length (CommonMark §4.3 / Pandoc both)
80/// - Underline can have leading/trailing spaces (up to 3 leading spaces)
81/// - All underline characters must be the same (`=` or `-`)
82/// - Text line cannot be indented 4+ spaces (would be code block)
83/// - Text line cannot be empty/blank
84pub fn try_parse_setext_heading(lines: &[&str], pos: usize) -> Option<(usize, char)> {
85    // Need current line (text) and next line (underline)
86    if pos >= lines.len() {
87        return None;
88    }
89
90    let text_line = lines[pos];
91    let next_pos = pos + 1;
92    if next_pos >= lines.len() {
93        return None;
94    }
95
96    let underline = lines[next_pos];
97
98    // Text line cannot be empty or blank
99    if crate::parser::utils::helpers::is_blank_line(text_line) {
100        return None;
101    }
102
103    // Text line cannot be indented 4+ spaces (would be code block)
104    let leading_spaces = text_line.len() - text_line.trim_start().len();
105    if leading_spaces >= 4 {
106        return None;
107    }
108
109    // Check if underline is valid
110    let underline_trimmed = underline.trim();
111
112    // Must be non-empty
113    if underline_trimmed.is_empty() {
114        return None;
115    }
116
117    // Determine underline character and check consistency
118    let first_char = underline_trimmed.chars().next()?;
119    if first_char != '=' && first_char != '-' {
120        return None;
121    }
122
123    // All characters must be the same
124    if !underline_trimmed.chars().all(|c| c == first_char) {
125        return None;
126    }
127
128    // Leading spaces in underline (max 3 for consistency with other block rules)
129    let underline_leading_spaces = underline.len() - underline.trim_start().len();
130    if underline_leading_spaces >= 4 {
131        return None;
132    }
133
134    // Determine level: '=' is level 1, '-' is level 2
135    let level = if first_char == '=' { 1 } else { 2 };
136
137    Some((level, first_char))
138}
139
140/// Emit a setext heading node to the builder.
141///
142/// Setext headings consist of a text line followed by an underline.
143/// This function emits the complete HEADING node with both lines.
144pub(crate) fn emit_setext_heading(
145    builder: &mut GreenNodeBuilder<'static>,
146    text_line: &str,
147    underline_line: &str,
148    level: usize,
149    config: &ParserOptions,
150) {
151    builder.start_node(SyntaxKind::HEADING.into());
152    emit_setext_heading_body(builder, text_line, underline_line, level, config);
153    builder.finish_node(); // HEADING
154}
155
156/// Emit the body of a setext heading (HEADING_CONTENT + underline + newlines).
157///
158/// The caller is responsible for the surrounding `HEADING` start/finish node.
159/// This split lets multi-line setext headings retroactively wrap a previously
160/// open paragraph by combining its buffered content with the underline line.
161pub(crate) fn emit_setext_heading_body(
162    builder: &mut GreenNodeBuilder<'static>,
163    text_line: &str,
164    underline_line: &str,
165    _level: usize,
166    config: &ParserOptions,
167) {
168    // Strip trailing newline from text line for processing
169    let (text_without_newline, text_newline_str) =
170        if let Some(stripped) = text_line.strip_suffix("\r\n") {
171            (stripped, "\r\n")
172        } else if let Some(stripped) = text_line.strip_suffix('\n') {
173            (stripped, "\n")
174        } else {
175            (text_line, "")
176        };
177
178    // Handle leading spaces in text line
179    let text_trimmed = text_without_newline.trim_start();
180    let leading_spaces = text_without_newline.len() - text_trimmed.len();
181
182    if leading_spaces > 0 {
183        builder.token(
184            SyntaxKind::WHITESPACE.into(),
185            &text_without_newline[..leading_spaces],
186        );
187    }
188
189    // Try to parse trailing attributes from heading text
190    let (text_content, attr_text, space_before_attrs) =
191        if let Some((_attrs, text_before, start_brace_pos)) =
192            try_parse_trailing_attributes_with_pos(text_trimmed)
193        {
194            let space = &text_trimmed[text_before.len()..start_brace_pos];
195            let raw_attrs = &text_trimmed[start_brace_pos..];
196            (text_before, Some(raw_attrs), space)
197        } else if config.extensions.mmd_header_identifiers {
198            if let Some((_normalized, start_bracket_pos, end_bracket_pos)) =
199                try_parse_mmd_header_identifier_with_pos(text_trimmed)
200            {
201                let text_before = trim_end_spaces_tabs(&text_trimmed[..start_bracket_pos]);
202                let space = &text_trimmed[text_before.len()..start_bracket_pos];
203                let raw_attrs = &text_trimmed[start_bracket_pos..end_bracket_pos];
204                (text_before, Some(raw_attrs), space)
205            } else {
206                (text_trimmed, None, "")
207            }
208        } else {
209            (text_trimmed, None, "")
210        };
211
212    // Emit heading content with inline parsing
213    builder.start_node(SyntaxKind::HEADING_CONTENT.into());
214    if !text_content.is_empty() {
215        inline_emission::emit_inlines(builder, text_content, config, false);
216    }
217    builder.finish_node();
218
219    // Emit space before attributes if present
220    if !space_before_attrs.is_empty() {
221        builder.token(SyntaxKind::WHITESPACE.into(), space_before_attrs);
222    }
223
224    // Emit attributes if present
225    if let Some(attr_text) = attr_text {
226        emit_attribute_node(builder, attr_text);
227    }
228
229    // Emit newline after text line
230    if !text_newline_str.is_empty() {
231        builder.token(SyntaxKind::NEWLINE.into(), text_newline_str);
232    }
233
234    // Strip trailing newline from underline for processing
235    let (underline_without_newline, underline_newline_str) =
236        if let Some(stripped) = underline_line.strip_suffix("\r\n") {
237            (stripped, "\r\n")
238        } else if let Some(stripped) = underline_line.strip_suffix('\n') {
239            (stripped, "\n")
240        } else {
241            (underline_line, "")
242        };
243
244    // Emit underline leading spaces if present
245    let underline_trimmed = underline_without_newline.trim_start();
246    let underline_leading_spaces = underline_without_newline.len() - underline_trimmed.len();
247
248    if underline_leading_spaces > 0 {
249        builder.token(
250            SyntaxKind::WHITESPACE.into(),
251            &underline_without_newline[..underline_leading_spaces],
252        );
253    }
254
255    // Emit the setext underline as a node containing a token
256    builder.start_node(SyntaxKind::SETEXT_HEADING_UNDERLINE.into());
257    builder.token(
258        SyntaxKind::SETEXT_HEADING_UNDERLINE.into(),
259        underline_trimmed,
260    );
261    builder.finish_node();
262
263    // Emit trailing newline after underline
264    if !underline_newline_str.is_empty() {
265        builder.token(SyntaxKind::NEWLINE.into(), underline_newline_str);
266    }
267}
268
269/// Emit an ATX heading node to the builder.
270pub(crate) fn emit_atx_heading(
271    builder: &mut GreenNodeBuilder<'static>,
272    content: &str,
273    level: usize,
274    config: &ParserOptions,
275) {
276    builder.start_node(SyntaxKind::HEADING.into());
277
278    // Strip trailing newline (LF or CRLF) for processing but remember to emit it later
279    let (content_without_newline, newline_str) =
280        if let Some(stripped) = content.strip_suffix("\r\n") {
281            (stripped, "\r\n")
282        } else if let Some(stripped) = content.strip_suffix('\n') {
283            (stripped, "\n")
284        } else {
285            (content, "")
286        };
287
288    let trimmed = content_without_newline.trim_start();
289    let leading_spaces = content_without_newline.len() - trimmed.len();
290
291    // Emit leading spaces if present
292    if leading_spaces > 0 {
293        builder.token(
294            SyntaxKind::WHITESPACE.into(),
295            &content_without_newline[..leading_spaces],
296        );
297    }
298
299    // Marker node for the hashes (must be a node containing a token, not just a token)
300    builder.start_node(SyntaxKind::ATX_HEADING_MARKER.into());
301    builder.token(SyntaxKind::ATX_HEADING_MARKER.into(), &trimmed[..level]);
302    builder.finish_node();
303
304    // Get content after marker
305    let after_marker = &trimmed[level..];
306    let spaces_after_marker_count = after_marker
307        .find(|c: char| !c.is_whitespace())
308        .unwrap_or(after_marker.len());
309
310    // Emit spaces after marker
311    if spaces_after_marker_count > 0 {
312        builder.token(
313            SyntaxKind::WHITESPACE.into(),
314            &after_marker[..spaces_after_marker_count],
315        );
316    }
317
318    // Get actual heading text
319    let heading_text = &after_marker[spaces_after_marker_count..];
320
321    // Parse optional closing ATX marker (` ###`) while preserving bytes.
322    let (heading_content, closing_suffix) = {
323        let without_trailing_ws = trim_end_spaces_tabs(heading_text);
324        let trailing_hashes = without_trailing_ws
325            .chars()
326            .rev()
327            .take_while(|&c| c == '#')
328            .count();
329
330        if trailing_hashes > 0 {
331            let hashes_start = without_trailing_ws.len() - trailing_hashes;
332            let before_hashes = &without_trailing_ws[..hashes_start];
333            // Closing fence requires the hashes to be preceded by whitespace.
334            // That whitespace can be in `before_hashes` (non-empty content case),
335            // or it can be the post-marker spaces we already consumed when content
336            // is empty (e.g. `### ###` → empty heading with closing fence).
337            let preceded_by_ws = before_hashes
338                .chars()
339                .last()
340                .is_some_and(|c| c == ' ' || c == '\t')
341                || (before_hashes.is_empty() && spaces_after_marker_count > 0);
342            if preceded_by_ws {
343                let content_end = trim_end_spaces_tabs(before_hashes).len();
344                (&heading_text[..content_end], &heading_text[content_end..])
345            } else {
346                (heading_text, "")
347            }
348        } else {
349            (heading_text, "")
350        }
351    };
352
353    // Try to parse trailing attributes
354    let (text_content, attr_text, space_before_attrs) =
355        if let Some((_attrs, text_before, start_brace_pos)) =
356            try_parse_trailing_attributes_with_pos(heading_content)
357        {
358            let space = &heading_content[text_before.len()..start_brace_pos];
359            let raw_attrs = &heading_content[start_brace_pos..];
360            (text_before, Some(raw_attrs), space)
361        } else if config.extensions.mmd_header_identifiers {
362            if let Some((_normalized, start_bracket_pos, end_bracket_pos)) =
363                try_parse_mmd_header_identifier_with_pos(heading_content)
364            {
365                let text_before = trim_end_spaces_tabs(&heading_content[..start_bracket_pos]);
366                let space = &heading_content[text_before.len()..start_bracket_pos];
367                let raw_attrs = &heading_content[start_bracket_pos..end_bracket_pos];
368                (text_before, Some(raw_attrs), space)
369            } else {
370                (heading_content, None, "")
371            }
372        } else {
373            (heading_content, None, "")
374        };
375
376    // Heading content node
377    builder.start_node(SyntaxKind::HEADING_CONTENT.into());
378    if !text_content.is_empty() {
379        inline_emission::emit_inlines(builder, text_content, config, false);
380    }
381    builder.finish_node();
382
383    // Emit space before attributes if present
384    if !space_before_attrs.is_empty() {
385        builder.token(SyntaxKind::WHITESPACE.into(), space_before_attrs);
386    }
387
388    // Emit attributes if present
389    if let Some(attr_text) = attr_text {
390        emit_attribute_node(builder, attr_text);
391    }
392
393    if !closing_suffix.is_empty() {
394        let closing_trimmed = trim_end_spaces_tabs(
395            crate::parser::utils::helpers::trim_start_spaces_tabs(closing_suffix),
396        );
397        let leading_ws_len = closing_suffix
398            .find(|c: char| c != ' ' && c != '\t')
399            .unwrap_or(closing_suffix.len());
400        let trailing_ws_len = closing_suffix.len() - leading_ws_len - closing_trimmed.len();
401
402        if leading_ws_len > 0 {
403            builder.token(
404                SyntaxKind::WHITESPACE.into(),
405                &closing_suffix[..leading_ws_len],
406            );
407        }
408        if !closing_trimmed.is_empty() {
409            builder.token(SyntaxKind::ATX_HEADING_MARKER.into(), closing_trimmed);
410        }
411        if trailing_ws_len > 0 {
412            builder.token(
413                SyntaxKind::WHITESPACE.into(),
414                &closing_suffix[closing_suffix.len() - trailing_ws_len..],
415            );
416        }
417    }
418
419    // Emit trailing newline if present
420    if !newline_str.is_empty() {
421        builder.token(SyntaxKind::NEWLINE.into(), newline_str);
422    }
423
424    builder.finish_node(); // Heading
425}
426
#[cfg(test)]
mod tests {
    use super::*;

    /// Run setext detection on a two-line document made of `text` and `underline`.
    fn setext(text: &str, underline: &str) -> Option<(usize, char)> {
        try_parse_setext_heading(&[text, underline], 0)
    }

    // ATX detection

    #[test]
    fn test_simple_heading() {
        let level = try_parse_atx_heading("# Heading");
        assert_eq!(level, Some(1));
    }

    #[test]
    fn test_level_3_heading() {
        let level = try_parse_atx_heading("### Level 3");
        assert_eq!(level, Some(3));
    }

    #[test]
    fn test_heading_with_leading_spaces() {
        let level = try_parse_atx_heading("   # Heading");
        assert_eq!(level, Some(1));
    }

    #[test]
    fn test_atx_heading_with_attributes_losslessness() {
        use crate::ParserOptions;

        // Regression guard: the space in front of the attribute block used to be dropped.
        let input = "# Test {#id}\n";
        let tree = crate::parse(input, Some(ParserOptions::default()));

        // The tree must reproduce the input byte for byte.
        assert_eq!(
            tree.text().to_string(),
            input,
            "Parser must preserve all bytes including space before attributes"
        );

        // The first child of the root is the heading itself.
        let heading = tree.first_child().unwrap();
        assert_eq!(heading.kind(), SyntaxKind::HEADING);

        // Look for a single-space WHITESPACE token starting at offset 6,
        // i.e. right between the heading text and `{#id}`.
        let found_whitespace = heading.children_with_tokens().any(|child| {
            if child.kind() != SyntaxKind::WHITESPACE {
                return false;
            }
            let Some(token) = child.as_token() else {
                return false;
            };
            let start: usize = token.text_range().start().into();
            token.text() == " " && start == 6
        });
        assert!(
            found_whitespace,
            "Whitespace token between heading content and attributes must be present"
        );
    }

    #[test]
    fn test_atx_heading_closing_hashes_are_lossless() {
        let input = "### Extension: `smart` ###\n";
        let options = crate::ParserOptions::default();
        let tree = crate::parse(input, Some(options));
        assert_eq!(tree.text().to_string(), input);
    }

    #[test]
    fn test_four_spaces_not_heading() {
        let level = try_parse_atx_heading("    # Not heading");
        assert_eq!(level, None);
    }

    #[test]
    fn test_no_space_after_hash() {
        let level = try_parse_atx_heading("#NoSpace");
        assert_eq!(level, None);
    }

    #[test]
    fn test_empty_heading() {
        let level = try_parse_atx_heading("# ");
        assert_eq!(level, Some(1));
    }

    #[test]
    fn test_level_7_invalid() {
        let level = try_parse_atx_heading("####### Too many");
        assert_eq!(level, None);
    }

    // Setext detection

    #[test]
    fn test_setext_level_1() {
        assert_eq!(setext("Heading", "======="), Some((1, '=')));
    }

    #[test]
    fn test_setext_level_2() {
        assert_eq!(setext("Heading", "-------"), Some((2, '-')));
    }

    #[test]
    fn test_setext_any_underline_length() {
        // CommonMark §4.3 and Pandoc both accept an underline of any non-zero
        // length, so even a single `=` under a non-blank line counts.
        for underline in ["=", "==", "==="] {
            assert_eq!(setext("Heading", underline), Some((1, '=')));
        }
    }

    #[test]
    fn test_setext_mixed_chars_invalid() {
        assert_eq!(setext("Heading", "==-=="), None);
    }

    #[test]
    fn test_setext_with_leading_spaces() {
        assert_eq!(setext("Heading", "   ======="), Some((1, '=')));
    }

    #[test]
    fn test_setext_with_trailing_spaces() {
        assert_eq!(setext("Heading", "=======   "), Some((1, '=')));
    }

    #[test]
    fn test_setext_empty_text_line() {
        assert_eq!(setext("", "======="), None);
    }

    #[test]
    fn test_setext_no_next_line() {
        assert_eq!(try_parse_setext_heading(&["Heading"], 0), None);
    }

    #[test]
    fn test_setext_four_spaces_indent() {
        // Four or more leading spaces make an indented code block, not a heading.
        assert_eq!(setext("    Heading", "    ======="), None);
    }

    #[test]
    fn test_setext_long_underline() {
        let underline = "=".repeat(100);
        assert_eq!(setext("Heading", &underline), Some((1, '=')));
    }

    // MultiMarkdown header identifiers

    #[test]
    fn test_parse_mmd_header_identifier_normalizes_like_pandoc() {
        let (identifier, start, _end) =
            try_parse_mmd_header_identifier_with_pos("A heading [My ID]")
                .expect("should parse mmd header identifier");
        assert_eq!(identifier, "myid");
        assert_eq!(start, 10);
    }
}