panache_parser/parser/blocks/
headings.rs

//! ATX and setext heading parsing and emission utilities.
2
3use crate::options::ParserOptions;
4use crate::syntax::SyntaxKind;
5use rowan::GreenNodeBuilder;
6
7use crate::parser::utils::attributes::try_parse_trailing_attributes_with_pos;
8use crate::parser::utils::helpers::trim_end_spaces_tabs;
9use crate::parser::utils::inline_emission;
10
11fn try_parse_mmd_header_identifier_with_pos(content: &str) -> Option<(String, usize, usize)> {
12    let trimmed = trim_end_spaces_tabs(content);
13    let end = trimmed.len();
14    let bytes = trimmed.as_bytes();
15
16    if end == 0 || bytes[end - 1] != b']' {
17        return None;
18    }
19
20    let start = trimmed[..end - 1].rfind('[')?;
21    let raw = &trimmed[start..end];
22    let inner = &raw[1..raw.len() - 1];
23    if inner.trim().is_empty() {
24        return None;
25    }
26
27    let normalized = inner.split_whitespace().collect::<String>().to_lowercase();
28    if normalized.is_empty() {
29        return None;
30    }
31
32    Some((normalized, start, end))
33}
34
/// Detect an ATX heading and report its level.
///
/// One trailing line ending (`\r\n` or `\n`) is ignored, so full source lines
/// such as `##\n` are accepted as empty headings.
///
/// A line qualifies when:
/// - at most 3 bytes of leading whitespace precede the marker,
/// - the marker is a run of 1 to 6 `#` characters, and
/// - the marker is followed by a space, a tab, or the end of the line.
///
/// Returns `Some(level)` with `level` in `1..=6`, or `None` otherwise.
pub fn try_parse_atx_heading(content: &str) -> Option<usize> {
    let line = content
        .strip_suffix("\r\n")
        .or_else(|| content.strip_suffix('\n'))
        .unwrap_or(content);

    let body = line.trim_start();
    let indent = line.len() - body.len();
    if indent > 3 {
        return None;
    }

    // `#` is a single byte, so the byte difference equals the marker length.
    let rest = body.trim_start_matches('#');
    let level = body.len() - rest.len();
    if !(1..=6).contains(&level) {
        return None;
    }

    match rest.chars().next() {
        None | Some(' ' | '\t') => Some(level),
        Some(_) => None,
    }
}
69
70/// Try to parse a setext heading from lines, returns (level, underline_char) if found.
71///
72/// Setext headings consist of:
73/// 1. A non-empty text line (heading content)
74/// 2. An underline of `=` (level 1) or `-` (level 2) characters
75///
76/// Rules:
77/// - Underline can be any non-zero length (CommonMark §4.3 / Pandoc both)
78/// - Underline can have leading/trailing spaces (up to 3 leading spaces)
79/// - All underline characters must be the same (`=` or `-`)
80/// - Text line cannot be indented 4+ spaces (would be code block)
81/// - Text line cannot be empty/blank
82pub fn try_parse_setext_heading(lines: &[&str], pos: usize) -> Option<(usize, char)> {
83    // Need current line (text) and next line (underline)
84    if pos >= lines.len() {
85        return None;
86    }
87
88    let text_line = lines[pos];
89    let next_pos = pos + 1;
90    if next_pos >= lines.len() {
91        return None;
92    }
93
94    let underline = lines[next_pos];
95
96    // Text line cannot be empty or blank
97    if crate::parser::utils::helpers::is_blank_line(text_line) {
98        return None;
99    }
100
101    // Text line cannot be indented 4+ spaces (would be code block)
102    let leading_spaces = text_line.len() - text_line.trim_start().len();
103    if leading_spaces >= 4 {
104        return None;
105    }
106
107    // Check if underline is valid
108    let underline_trimmed = underline.trim();
109
110    // Must be non-empty
111    if underline_trimmed.is_empty() {
112        return None;
113    }
114
115    // Determine underline character and check consistency
116    let first_char = underline_trimmed.chars().next()?;
117    if first_char != '=' && first_char != '-' {
118        return None;
119    }
120
121    // All characters must be the same
122    if !underline_trimmed.chars().all(|c| c == first_char) {
123        return None;
124    }
125
126    // Leading spaces in underline (max 3 for consistency with other block rules)
127    let underline_leading_spaces = underline.len() - underline.trim_start().len();
128    if underline_leading_spaces >= 4 {
129        return None;
130    }
131
132    // Determine level: '=' is level 1, '-' is level 2
133    let level = if first_char == '=' { 1 } else { 2 };
134
135    Some((level, first_char))
136}
137
138/// Emit a setext heading node to the builder.
139///
140/// Setext headings consist of a text line followed by an underline.
141/// This function emits the complete HEADING node with both lines.
142pub(crate) fn emit_setext_heading(
143    builder: &mut GreenNodeBuilder<'static>,
144    text_line: &str,
145    underline_line: &str,
146    level: usize,
147    config: &ParserOptions,
148) {
149    builder.start_node(SyntaxKind::HEADING.into());
150    emit_setext_heading_body(builder, text_line, underline_line, level, config);
151    builder.finish_node(); // HEADING
152}
153
154/// Emit the body of a setext heading (HEADING_CONTENT + underline + newlines).
155///
156/// The caller is responsible for the surrounding `HEADING` start/finish node.
157/// This split lets multi-line setext headings retroactively wrap a previously
158/// open paragraph by combining its buffered content with the underline line.
159pub(crate) fn emit_setext_heading_body(
160    builder: &mut GreenNodeBuilder<'static>,
161    text_line: &str,
162    underline_line: &str,
163    _level: usize,
164    config: &ParserOptions,
165) {
166    // Strip trailing newline from text line for processing
167    let (text_without_newline, text_newline_str) =
168        if let Some(stripped) = text_line.strip_suffix("\r\n") {
169            (stripped, "\r\n")
170        } else if let Some(stripped) = text_line.strip_suffix('\n') {
171            (stripped, "\n")
172        } else {
173            (text_line, "")
174        };
175
176    // Handle leading spaces in text line
177    let text_trimmed = text_without_newline.trim_start();
178    let leading_spaces = text_without_newline.len() - text_trimmed.len();
179
180    if leading_spaces > 0 {
181        builder.token(
182            SyntaxKind::WHITESPACE.into(),
183            &text_without_newline[..leading_spaces],
184        );
185    }
186
187    // Try to parse trailing attributes from heading text
188    let (text_content, attr_text, space_before_attrs) =
189        if let Some((_attrs, text_before, start_brace_pos)) =
190            try_parse_trailing_attributes_with_pos(text_trimmed)
191        {
192            let space = &text_trimmed[text_before.len()..start_brace_pos];
193            let raw_attrs = &text_trimmed[start_brace_pos..];
194            (text_before, Some(raw_attrs), space)
195        } else if config.extensions.mmd_header_identifiers {
196            if let Some((_normalized, start_bracket_pos, end_bracket_pos)) =
197                try_parse_mmd_header_identifier_with_pos(text_trimmed)
198            {
199                let text_before = trim_end_spaces_tabs(&text_trimmed[..start_bracket_pos]);
200                let space = &text_trimmed[text_before.len()..start_bracket_pos];
201                let raw_attrs = &text_trimmed[start_bracket_pos..end_bracket_pos];
202                (text_before, Some(raw_attrs), space)
203            } else {
204                (text_trimmed, None, "")
205            }
206        } else {
207            (text_trimmed, None, "")
208        };
209
210    // Emit heading content with inline parsing
211    builder.start_node(SyntaxKind::HEADING_CONTENT.into());
212    if !text_content.is_empty() {
213        inline_emission::emit_inlines(builder, text_content, config, false);
214    }
215    builder.finish_node();
216
217    // Emit space before attributes if present
218    if !space_before_attrs.is_empty() {
219        builder.token(SyntaxKind::WHITESPACE.into(), space_before_attrs);
220    }
221
222    // Emit attributes if present
223    if let Some(attr_text) = attr_text {
224        builder.start_node(SyntaxKind::ATTRIBUTE.into());
225        builder.token(SyntaxKind::ATTRIBUTE.into(), attr_text);
226        builder.finish_node();
227    }
228
229    // Emit newline after text line
230    if !text_newline_str.is_empty() {
231        builder.token(SyntaxKind::NEWLINE.into(), text_newline_str);
232    }
233
234    // Strip trailing newline from underline for processing
235    let (underline_without_newline, underline_newline_str) =
236        if let Some(stripped) = underline_line.strip_suffix("\r\n") {
237            (stripped, "\r\n")
238        } else if let Some(stripped) = underline_line.strip_suffix('\n') {
239            (stripped, "\n")
240        } else {
241            (underline_line, "")
242        };
243
244    // Emit underline leading spaces if present
245    let underline_trimmed = underline_without_newline.trim_start();
246    let underline_leading_spaces = underline_without_newline.len() - underline_trimmed.len();
247
248    if underline_leading_spaces > 0 {
249        builder.token(
250            SyntaxKind::WHITESPACE.into(),
251            &underline_without_newline[..underline_leading_spaces],
252        );
253    }
254
255    // Emit the setext underline as a node containing a token
256    builder.start_node(SyntaxKind::SETEXT_HEADING_UNDERLINE.into());
257    builder.token(
258        SyntaxKind::SETEXT_HEADING_UNDERLINE.into(),
259        underline_trimmed,
260    );
261    builder.finish_node();
262
263    // Emit trailing newline after underline
264    if !underline_newline_str.is_empty() {
265        builder.token(SyntaxKind::NEWLINE.into(), underline_newline_str);
266    }
267}
268
269/// Emit an ATX heading node to the builder.
270pub(crate) fn emit_atx_heading(
271    builder: &mut GreenNodeBuilder<'static>,
272    content: &str,
273    level: usize,
274    config: &ParserOptions,
275) {
276    builder.start_node(SyntaxKind::HEADING.into());
277
278    // Strip trailing newline (LF or CRLF) for processing but remember to emit it later
279    let (content_without_newline, newline_str) =
280        if let Some(stripped) = content.strip_suffix("\r\n") {
281            (stripped, "\r\n")
282        } else if let Some(stripped) = content.strip_suffix('\n') {
283            (stripped, "\n")
284        } else {
285            (content, "")
286        };
287
288    let trimmed = content_without_newline.trim_start();
289    let leading_spaces = content_without_newline.len() - trimmed.len();
290
291    // Emit leading spaces if present
292    if leading_spaces > 0 {
293        builder.token(
294            SyntaxKind::WHITESPACE.into(),
295            &content_without_newline[..leading_spaces],
296        );
297    }
298
299    // Marker node for the hashes (must be a node containing a token, not just a token)
300    builder.start_node(SyntaxKind::ATX_HEADING_MARKER.into());
301    builder.token(SyntaxKind::ATX_HEADING_MARKER.into(), &trimmed[..level]);
302    builder.finish_node();
303
304    // Get content after marker
305    let after_marker = &trimmed[level..];
306    let spaces_after_marker_count = after_marker
307        .find(|c: char| !c.is_whitespace())
308        .unwrap_or(after_marker.len());
309
310    // Emit spaces after marker
311    if spaces_after_marker_count > 0 {
312        builder.token(
313            SyntaxKind::WHITESPACE.into(),
314            &after_marker[..spaces_after_marker_count],
315        );
316    }
317
318    // Get actual heading text
319    let heading_text = &after_marker[spaces_after_marker_count..];
320
321    // Parse optional closing ATX marker (` ###`) while preserving bytes.
322    let (heading_content, closing_suffix) = {
323        let without_trailing_ws = trim_end_spaces_tabs(heading_text);
324        let trailing_hashes = without_trailing_ws
325            .chars()
326            .rev()
327            .take_while(|&c| c == '#')
328            .count();
329
330        if trailing_hashes > 0 {
331            let hashes_start = without_trailing_ws.len() - trailing_hashes;
332            let before_hashes = &without_trailing_ws[..hashes_start];
333            // Closing fence requires the hashes to be preceded by whitespace.
334            // That whitespace can be in `before_hashes` (non-empty content case),
335            // or it can be the post-marker spaces we already consumed when content
336            // is empty (e.g. `### ###` → empty heading with closing fence).
337            let preceded_by_ws = before_hashes
338                .chars()
339                .last()
340                .is_some_and(|c| c == ' ' || c == '\t')
341                || (before_hashes.is_empty() && spaces_after_marker_count > 0);
342            if preceded_by_ws {
343                let content_end = trim_end_spaces_tabs(before_hashes).len();
344                (&heading_text[..content_end], &heading_text[content_end..])
345            } else {
346                (heading_text, "")
347            }
348        } else {
349            (heading_text, "")
350        }
351    };
352
353    // Try to parse trailing attributes
354    let (text_content, attr_text, space_before_attrs) =
355        if let Some((_attrs, text_before, start_brace_pos)) =
356            try_parse_trailing_attributes_with_pos(heading_content)
357        {
358            let space = &heading_content[text_before.len()..start_brace_pos];
359            let raw_attrs = &heading_content[start_brace_pos..];
360            (text_before, Some(raw_attrs), space)
361        } else if config.extensions.mmd_header_identifiers {
362            if let Some((_normalized, start_bracket_pos, end_bracket_pos)) =
363                try_parse_mmd_header_identifier_with_pos(heading_content)
364            {
365                let text_before = trim_end_spaces_tabs(&heading_content[..start_bracket_pos]);
366                let space = &heading_content[text_before.len()..start_bracket_pos];
367                let raw_attrs = &heading_content[start_bracket_pos..end_bracket_pos];
368                (text_before, Some(raw_attrs), space)
369            } else {
370                (heading_content, None, "")
371            }
372        } else {
373            (heading_content, None, "")
374        };
375
376    // Heading content node
377    builder.start_node(SyntaxKind::HEADING_CONTENT.into());
378    if !text_content.is_empty() {
379        inline_emission::emit_inlines(builder, text_content, config, false);
380    }
381    builder.finish_node();
382
383    // Emit space before attributes if present
384    if !space_before_attrs.is_empty() {
385        builder.token(SyntaxKind::WHITESPACE.into(), space_before_attrs);
386    }
387
388    // Emit attributes if present
389    if let Some(attr_text) = attr_text {
390        builder.start_node(SyntaxKind::ATTRIBUTE.into());
391        builder.token(SyntaxKind::ATTRIBUTE.into(), attr_text);
392        builder.finish_node();
393    }
394
395    if !closing_suffix.is_empty() {
396        let closing_trimmed = trim_end_spaces_tabs(
397            crate::parser::utils::helpers::trim_start_spaces_tabs(closing_suffix),
398        );
399        let leading_ws_len = closing_suffix
400            .find(|c: char| c != ' ' && c != '\t')
401            .unwrap_or(closing_suffix.len());
402        let trailing_ws_len = closing_suffix.len() - leading_ws_len - closing_trimmed.len();
403
404        if leading_ws_len > 0 {
405            builder.token(
406                SyntaxKind::WHITESPACE.into(),
407                &closing_suffix[..leading_ws_len],
408            );
409        }
410        if !closing_trimmed.is_empty() {
411            builder.token(SyntaxKind::ATX_HEADING_MARKER.into(), closing_trimmed);
412        }
413        if trailing_ws_len > 0 {
414            builder.token(
415                SyntaxKind::WHITESPACE.into(),
416                &closing_suffix[closing_suffix.len() - trailing_ws_len..],
417            );
418        }
419    }
420
421    // Emit trailing newline if present
422    if !newline_str.is_empty() {
423        builder.token(SyntaxKind::NEWLINE.into(), newline_str);
424    }
425
426    builder.finish_node(); // Heading
427}
428
#[cfg(test)]
mod tests {
    use super::*;

    /// Run setext detection with the text line at index 0 of `lines`.
    fn setext_at_start(lines: &[&str]) -> Option<(usize, char)> {
        try_parse_setext_heading(lines, 0)
    }

    // ATX heading detection

    #[test]
    fn test_simple_heading() {
        let level = try_parse_atx_heading("# Heading");
        assert_eq!(level, Some(1));
    }

    #[test]
    fn test_level_3_heading() {
        let level = try_parse_atx_heading("### Level 3");
        assert_eq!(level, Some(3));
    }

    #[test]
    fn test_heading_with_leading_spaces() {
        let level = try_parse_atx_heading("   # Heading");
        assert_eq!(level, Some(1));
    }

    #[test]
    fn test_atx_heading_with_attributes_losslessness() {
        // Regression test for losslessness bug where space before attributes was dropped
        let input = "# Test {#id}\n";
        let tree = crate::parse(input, Some(crate::ParserOptions::default()));

        // The tree text must reproduce the input exactly.
        assert_eq!(
            tree.text().to_string(),
            input,
            "Parser must preserve all bytes including space before attributes"
        );

        // The first child of the root must be the heading itself.
        let heading = tree.first_child().unwrap();
        assert_eq!(heading.kind(), SyntaxKind::HEADING);

        // Look for the single-space token at offset 6, between content and attribute.
        let found_whitespace = heading.children_with_tokens().any(|child| {
            child.kind() == SyntaxKind::WHITESPACE
                && child.as_token().is_some_and(|token| {
                    let start: usize = token.text_range().start().into();
                    token.text() == " " && start == 6
                })
        });
        assert!(
            found_whitespace,
            "Whitespace token between heading content and attributes must be present"
        );
    }

    #[test]
    fn test_atx_heading_closing_hashes_are_lossless() {
        let input = "### Extension: `smart` ###\n";
        let options = crate::ParserOptions::default();
        let tree = crate::parse(input, Some(options));
        assert_eq!(tree.text().to_string(), input);
    }

    #[test]
    fn test_four_spaces_not_heading() {
        let level = try_parse_atx_heading("    # Not heading");
        assert_eq!(level, None);
    }

    #[test]
    fn test_no_space_after_hash() {
        let level = try_parse_atx_heading("#NoSpace");
        assert_eq!(level, None);
    }

    #[test]
    fn test_empty_heading() {
        let level = try_parse_atx_heading("# ");
        assert_eq!(level, Some(1));
    }

    #[test]
    fn test_level_7_invalid() {
        let level = try_parse_atx_heading("####### Too many");
        assert_eq!(level, None);
    }

    // Setext heading detection

    #[test]
    fn test_setext_level_1() {
        assert_eq!(setext_at_start(&["Heading", "======="]), Some((1, '=')));
    }

    #[test]
    fn test_setext_level_2() {
        assert_eq!(setext_at_start(&["Heading", "-------"]), Some((2, '-')));
    }

    #[test]
    fn test_setext_any_underline_length() {
        // Per CommonMark §4.3 and Pandoc, the setext underline can be any
        // non-zero length. Single `=` or `-` after a non-blank line is a
        // valid setext underline.
        for underline in ["=", "==", "==="] {
            assert_eq!(setext_at_start(&["Heading", underline]), Some((1, '=')));
        }
    }

    #[test]
    fn test_setext_mixed_chars_invalid() {
        assert_eq!(setext_at_start(&["Heading", "==-=="]), None);
    }

    #[test]
    fn test_setext_with_leading_spaces() {
        assert_eq!(setext_at_start(&["Heading", "   ======="]), Some((1, '=')));
    }

    #[test]
    fn test_setext_with_trailing_spaces() {
        assert_eq!(setext_at_start(&["Heading", "=======   "]), Some((1, '=')));
    }

    #[test]
    fn test_setext_empty_text_line() {
        assert_eq!(setext_at_start(&["", "======="]), None);
    }

    #[test]
    fn test_setext_no_next_line() {
        assert_eq!(setext_at_start(&["Heading"]), None);
    }

    #[test]
    fn test_setext_four_spaces_indent() {
        // 4+ spaces means code block, not setext
        assert_eq!(setext_at_start(&["    Heading", "    ======="]), None);
    }

    #[test]
    fn test_setext_long_underline() {
        let underline = "=".repeat(100);
        assert_eq!(
            setext_at_start(&["Heading", underline.as_str()]),
            Some((1, '='))
        );
    }

    // MultiMarkdown header identifiers

    #[test]
    fn test_parse_mmd_header_identifier_normalizes_like_pandoc() {
        let (identifier, start, _end) =
            try_parse_mmd_header_identifier_with_pos("A heading [My ID]")
                .expect("should parse mmd header identifier");
        assert_eq!(identifier, "myid");
        assert_eq!(start, 10);
    }
}
panache_parser/parser/blocks/headings.rs

panache_parser/parser/blocks/
headings.rs