Skip to main content

panache_parser/parser/blocks/
headings.rs

1//! ATX heading parsing utilities.
2
3use crate::options::ParserOptions;
4use crate::syntax::SyntaxKind;
5use rowan::GreenNodeBuilder;
6
7use crate::parser::utils::attributes::try_parse_trailing_attributes_with_pos;
8use crate::parser::utils::inline_emission;
9
/// Look for a trailing `[identifier]` at the end of `content`.
///
/// Trailing spaces and tabs are ignored when searching. On success returns
/// `(normalized, start, end)` where:
/// - `normalized` is the bracket text with all whitespace removed and
///   lowercased,
/// - `start` is the byte offset of the opening `[`,
/// - `end` is the byte offset just past the closing `]` (i.e. the length of
///   `content` once trailing spaces/tabs are removed).
///
/// Returns `None` when the content does not end in `]`, has no preceding
/// `[`, or the bracket text is empty or whitespace-only.
fn try_parse_mmd_header_identifier_with_pos(content: &str) -> Option<(String, usize, usize)> {
    let candidate = content.trim_end_matches([' ', '\t']);

    // Everything before the final `]`; bails out if there is no such bracket.
    let before_close = candidate.strip_suffix(']')?;

    // The last `[` before the closing bracket opens the identifier.
    let open = before_close.rfind('[')?;
    let label = &before_close[open + 1..];

    // Dropping all whitespace yields an empty string exactly when the label
    // is blank, so a single emptiness check suffices.
    let squeezed: String = label.split_whitespace().collect();
    (!squeezed.is_empty()).then(|| (squeezed.to_lowercase(), open, candidate.len()))
}
33
/// Detect an ATX heading and report its level.
///
/// A single trailing line ending (`\r\n` or `\n`) is ignored, so the function
/// can be called on full source lines and still accept empty headings such as
/// `##\n`.
///
/// Returns `Some(level)` with `level` in `1..=6` when all of these hold:
/// - the line starts with at most 3 bytes of leading whitespace,
/// - followed by 1 to 6 `#` characters,
/// - followed by end of line, a space, or a tab.
///
/// Returns `None` otherwise.
pub fn try_parse_atx_heading(content: &str) -> Option<usize> {
    let line = content
        .strip_suffix("\r\n")
        .or_else(|| content.strip_suffix('\n'))
        .unwrap_or(content);

    let body = line.trim_start();
    let indent = line.len() - body.len();

    // `#` is ASCII, so counting bytes gives a valid slice boundary.
    let level = body.bytes().take_while(|&b| b == b'#').count();
    let rest = &body[level..];

    let level_in_range = (1..=6).contains(&level);
    let marker_terminated = rest.is_empty() || rest.starts_with([' ', '\t']);

    (level_in_range && marker_terminated && indent <= 3).then_some(level)
}
68
/// Detect a setext heading starting at `lines[pos]`.
///
/// A setext heading is a non-blank text line immediately followed by an
/// underline made solely of `=` (level 1) or `-` (level 2).
///
/// Requirements checked here:
/// - both `lines[pos]` and `lines[pos + 1]` exist,
/// - the text line is not blank and has fewer than 4 bytes of leading
///   whitespace (4+ would be a code block),
/// - the underline, after trimming surrounding whitespace, is non-empty and
///   consists of a single repeated `=` or `-` (any non-zero length),
/// - the underline has fewer than 4 bytes of leading whitespace.
///
/// Returns `Some((level, underline_char))` on success, `None` otherwise.
pub fn try_parse_setext_heading(lines: &[&str], pos: usize) -> Option<(usize, char)> {
    /// Number of bytes of leading whitespace on `s`.
    fn indent_width(s: &str) -> usize {
        s.len() - s.trim_start().len()
    }

    // Both the text line and the line after it must exist.
    let text = *lines.get(pos)?;
    let rule = *lines.get(pos + 1)?;

    if text.trim().is_empty() || indent_width(text) >= 4 || indent_width(rule) >= 4 {
        return None;
    }

    let marks = rule.trim();
    let marker = match marks.chars().next()? {
        c @ ('=' | '-') => c,
        _ => return None,
    };

    // Any character differing from the first disqualifies the underline.
    if marks.chars().any(|c| c != marker) {
        return None;
    }

    let level = match marker {
        '=' => 1,
        _ => 2,
    };
    Some((level, marker))
}
136
137/// Emit a setext heading node to the builder.
138///
139/// Setext headings consist of a text line followed by an underline.
140/// This function emits the complete HEADING node with both lines.
141pub(crate) fn emit_setext_heading(
142    builder: &mut GreenNodeBuilder<'static>,
143    text_line: &str,
144    underline_line: &str,
145    _level: usize,
146    config: &ParserOptions,
147) {
148    builder.start_node(SyntaxKind::HEADING.into());
149
150    // Strip trailing newline from text line for processing
151    let (text_without_newline, text_newline_str) =
152        if let Some(stripped) = text_line.strip_suffix("\r\n") {
153            (stripped, "\r\n")
154        } else if let Some(stripped) = text_line.strip_suffix('\n') {
155            (stripped, "\n")
156        } else {
157            (text_line, "")
158        };
159
160    // Handle leading spaces in text line
161    let text_trimmed = text_without_newline.trim_start();
162    let leading_spaces = text_without_newline.len() - text_trimmed.len();
163
164    if leading_spaces > 0 {
165        builder.token(
166            SyntaxKind::WHITESPACE.into(),
167            &text_without_newline[..leading_spaces],
168        );
169    }
170
171    // Try to parse trailing attributes from heading text
172    let (text_content, attr_text, space_before_attrs) =
173        if let Some((_attrs, text_before, start_brace_pos)) =
174            try_parse_trailing_attributes_with_pos(text_trimmed)
175        {
176            let space = &text_trimmed[text_before.len()..start_brace_pos];
177            let raw_attrs = &text_trimmed[start_brace_pos..];
178            (text_before, Some(raw_attrs), space)
179        } else if config.extensions.mmd_header_identifiers {
180            if let Some((_normalized, start_bracket_pos, end_bracket_pos)) =
181                try_parse_mmd_header_identifier_with_pos(text_trimmed)
182            {
183                let text_before = text_trimmed[..start_bracket_pos].trim_end_matches([' ', '\t']);
184                let space = &text_trimmed[text_before.len()..start_bracket_pos];
185                let raw_attrs = &text_trimmed[start_bracket_pos..end_bracket_pos];
186                (text_before, Some(raw_attrs), space)
187            } else {
188                (text_trimmed, None, "")
189            }
190        } else {
191            (text_trimmed, None, "")
192        };
193
194    // Emit heading content with inline parsing
195    builder.start_node(SyntaxKind::HEADING_CONTENT.into());
196    if !text_content.is_empty() {
197        inline_emission::emit_inlines(builder, text_content, config);
198    }
199    builder.finish_node();
200
201    // Emit space before attributes if present
202    if !space_before_attrs.is_empty() {
203        builder.token(SyntaxKind::WHITESPACE.into(), space_before_attrs);
204    }
205
206    // Emit attributes if present
207    if let Some(attr_text) = attr_text {
208        builder.start_node(SyntaxKind::ATTRIBUTE.into());
209        builder.token(SyntaxKind::ATTRIBUTE.into(), attr_text);
210        builder.finish_node();
211    }
212
213    // Emit newline after text line
214    if !text_newline_str.is_empty() {
215        builder.token(SyntaxKind::NEWLINE.into(), text_newline_str);
216    }
217
218    // Strip trailing newline from underline for processing
219    let (underline_without_newline, underline_newline_str) =
220        if let Some(stripped) = underline_line.strip_suffix("\r\n") {
221            (stripped, "\r\n")
222        } else if let Some(stripped) = underline_line.strip_suffix('\n') {
223            (stripped, "\n")
224        } else {
225            (underline_line, "")
226        };
227
228    // Emit underline leading spaces if present
229    let underline_trimmed = underline_without_newline.trim_start();
230    let underline_leading_spaces = underline_without_newline.len() - underline_trimmed.len();
231
232    if underline_leading_spaces > 0 {
233        builder.token(
234            SyntaxKind::WHITESPACE.into(),
235            &underline_without_newline[..underline_leading_spaces],
236        );
237    }
238
239    // Emit the setext underline as a node containing a token
240    builder.start_node(SyntaxKind::SETEXT_HEADING_UNDERLINE.into());
241    builder.token(
242        SyntaxKind::SETEXT_HEADING_UNDERLINE.into(),
243        underline_trimmed,
244    );
245    builder.finish_node();
246
247    // Emit trailing newline after underline
248    if !underline_newline_str.is_empty() {
249        builder.token(SyntaxKind::NEWLINE.into(), underline_newline_str);
250    }
251
252    builder.finish_node(); // HEADING
253}
254
255/// Emit an ATX heading node to the builder.
256pub(crate) fn emit_atx_heading(
257    builder: &mut GreenNodeBuilder<'static>,
258    content: &str,
259    level: usize,
260    config: &ParserOptions,
261) {
262    builder.start_node(SyntaxKind::HEADING.into());
263
264    // Strip trailing newline (LF or CRLF) for processing but remember to emit it later
265    let (content_without_newline, newline_str) =
266        if let Some(stripped) = content.strip_suffix("\r\n") {
267            (stripped, "\r\n")
268        } else if let Some(stripped) = content.strip_suffix('\n') {
269            (stripped, "\n")
270        } else {
271            (content, "")
272        };
273
274    let trimmed = content_without_newline.trim_start();
275    let leading_spaces = content_without_newline.len() - trimmed.len();
276
277    // Emit leading spaces if present
278    if leading_spaces > 0 {
279        builder.token(
280            SyntaxKind::WHITESPACE.into(),
281            &content_without_newline[..leading_spaces],
282        );
283    }
284
285    // Marker node for the hashes (must be a node containing a token, not just a token)
286    builder.start_node(SyntaxKind::ATX_HEADING_MARKER.into());
287    builder.token(SyntaxKind::ATX_HEADING_MARKER.into(), &trimmed[..level]);
288    builder.finish_node();
289
290    // Get content after marker
291    let after_marker = &trimmed[level..];
292    let spaces_after_marker_count = after_marker
293        .find(|c: char| !c.is_whitespace())
294        .unwrap_or(after_marker.len());
295
296    // Emit spaces after marker
297    if spaces_after_marker_count > 0 {
298        builder.token(
299            SyntaxKind::WHITESPACE.into(),
300            &after_marker[..spaces_after_marker_count],
301        );
302    }
303
304    // Get actual heading text
305    let heading_text = &after_marker[spaces_after_marker_count..];
306
307    // Parse optional closing ATX marker (` ###`) while preserving bytes.
308    let (heading_content, closing_suffix) = {
309        let without_trailing_ws = heading_text.trim_end_matches([' ', '\t']);
310        let trailing_hashes = without_trailing_ws
311            .chars()
312            .rev()
313            .take_while(|&c| c == '#')
314            .count();
315
316        if trailing_hashes > 0 {
317            let hashes_start = without_trailing_ws.len() - trailing_hashes;
318            let before_hashes = &without_trailing_ws[..hashes_start];
319            // Closing fence requires the hashes to be preceded by whitespace.
320            // That whitespace can be in `before_hashes` (non-empty content case),
321            // or it can be the post-marker spaces we already consumed when content
322            // is empty (e.g. `### ###` → empty heading with closing fence).
323            let preceded_by_ws = before_hashes
324                .chars()
325                .last()
326                .is_some_and(|c| c == ' ' || c == '\t')
327                || (before_hashes.is_empty() && spaces_after_marker_count > 0);
328            if preceded_by_ws {
329                let content_end = before_hashes.trim_end_matches([' ', '\t']).len();
330                (&heading_text[..content_end], &heading_text[content_end..])
331            } else {
332                (heading_text, "")
333            }
334        } else {
335            (heading_text, "")
336        }
337    };
338
339    // Try to parse trailing attributes
340    let (text_content, attr_text, space_before_attrs) =
341        if let Some((_attrs, text_before, start_brace_pos)) =
342            try_parse_trailing_attributes_with_pos(heading_content)
343        {
344            let space = &heading_content[text_before.len()..start_brace_pos];
345            let raw_attrs = &heading_content[start_brace_pos..];
346            (text_before, Some(raw_attrs), space)
347        } else if config.extensions.mmd_header_identifiers {
348            if let Some((_normalized, start_bracket_pos, end_bracket_pos)) =
349                try_parse_mmd_header_identifier_with_pos(heading_content)
350            {
351                let text_before =
352                    heading_content[..start_bracket_pos].trim_end_matches([' ', '\t']);
353                let space = &heading_content[text_before.len()..start_bracket_pos];
354                let raw_attrs = &heading_content[start_bracket_pos..end_bracket_pos];
355                (text_before, Some(raw_attrs), space)
356            } else {
357                (heading_content, None, "")
358            }
359        } else {
360            (heading_content, None, "")
361        };
362
363    // Heading content node
364    builder.start_node(SyntaxKind::HEADING_CONTENT.into());
365    if !text_content.is_empty() {
366        inline_emission::emit_inlines(builder, text_content, config);
367    }
368    builder.finish_node();
369
370    // Emit space before attributes if present
371    if !space_before_attrs.is_empty() {
372        builder.token(SyntaxKind::WHITESPACE.into(), space_before_attrs);
373    }
374
375    // Emit attributes if present
376    if let Some(attr_text) = attr_text {
377        builder.start_node(SyntaxKind::ATTRIBUTE.into());
378        builder.token(SyntaxKind::ATTRIBUTE.into(), attr_text);
379        builder.finish_node();
380    }
381
382    if !closing_suffix.is_empty() {
383        let closing_trimmed = closing_suffix.trim_matches(|c| c == ' ' || c == '\t');
384        let leading_ws_len = closing_suffix
385            .find(|c: char| c != ' ' && c != '\t')
386            .unwrap_or(closing_suffix.len());
387        let trailing_ws_len = closing_suffix.len() - leading_ws_len - closing_trimmed.len();
388
389        if leading_ws_len > 0 {
390            builder.token(
391                SyntaxKind::WHITESPACE.into(),
392                &closing_suffix[..leading_ws_len],
393            );
394        }
395        if !closing_trimmed.is_empty() {
396            builder.token(SyntaxKind::ATX_HEADING_MARKER.into(), closing_trimmed);
397        }
398        if trailing_ws_len > 0 {
399            builder.token(
400                SyntaxKind::WHITESPACE.into(),
401                &closing_suffix[closing_suffix.len() - trailing_ws_len..],
402            );
403        }
404    }
405
406    // Emit trailing newline if present
407    if !newline_str.is_empty() {
408        builder.token(SyntaxKind::NEWLINE.into(), newline_str);
409    }
410
411    builder.finish_node(); // Heading
412}
413
#[cfg(test)]
mod tests {
    use super::*;

    // ---- ATX heading detection ----

    #[test]
    fn test_simple_heading() {
        let level = try_parse_atx_heading("# Heading");
        assert_eq!(level, Some(1));
    }

    #[test]
    fn test_level_3_heading() {
        let level = try_parse_atx_heading("### Level 3");
        assert_eq!(level, Some(3));
    }

    #[test]
    fn test_heading_with_leading_spaces() {
        let level = try_parse_atx_heading("   # Heading");
        assert_eq!(level, Some(1));
    }

    /// Regression test: the space between heading text and a trailing
    /// attribute block used to be dropped from the tree.
    #[test]
    fn test_atx_heading_with_attributes_losslessness() {
        let input = "# Test {#id}\n";
        let tree = crate::parse(input, Some(crate::ParserOptions::default()));

        // The tree must reproduce the input byte-for-byte.
        assert_eq!(
            tree.text().to_string(),
            input,
            "Parser must preserve all bytes including space before attributes"
        );

        // The first child of the root is expected to be the heading.
        let heading = tree.first_child().unwrap();
        assert_eq!(heading.kind(), SyntaxKind::HEADING);

        // A single-space WHITESPACE token must sit at offset 6, i.e. right
        // between the content and the attribute.
        let found_whitespace = heading.children_with_tokens().any(|child| {
            child.as_token().is_some_and(|token| {
                let start: usize = token.text_range().start().into();
                token.kind() == SyntaxKind::WHITESPACE && token.text() == " " && start == 6
            })
        });
        assert!(
            found_whitespace,
            "Whitespace token between heading content and attributes must be present"
        );
    }

    #[test]
    fn test_atx_heading_closing_hashes_are_lossless() {
        let input = "### Extension: `smart` ###\n";
        let options = crate::ParserOptions::default();
        let round_tripped = crate::parse(input, Some(options)).text().to_string();
        assert_eq!(round_tripped, input);
    }

    #[test]
    fn test_four_spaces_not_heading() {
        assert!(try_parse_atx_heading("    # Not heading").is_none());
    }

    #[test]
    fn test_no_space_after_hash() {
        assert!(try_parse_atx_heading("#NoSpace").is_none());
    }

    #[test]
    fn test_empty_heading() {
        let level = try_parse_atx_heading("# ");
        assert_eq!(level, Some(1));
    }

    #[test]
    fn test_level_7_invalid() {
        assert!(try_parse_atx_heading("####### Too many").is_none());
    }

    // ---- Setext heading detection ----

    #[test]
    fn test_setext_level_1() {
        let doc = ["Heading", "======="];
        assert_eq!(try_parse_setext_heading(&doc, 0), Some((1, '=')));
    }

    #[test]
    fn test_setext_level_2() {
        let doc = ["Heading", "-------"];
        assert_eq!(try_parse_setext_heading(&doc, 0), Some((2, '-')));
    }

    /// Per CommonMark §4.3 and Pandoc, a setext underline may be any
    /// non-zero length, so even a single `=` after a non-blank line counts.
    #[test]
    fn test_setext_any_underline_length() {
        for underline in ["=", "==", "==="] {
            let doc = ["Heading", underline];
            assert_eq!(try_parse_setext_heading(&doc, 0), Some((1, '=')));
        }
    }

    #[test]
    fn test_setext_mixed_chars_invalid() {
        let doc = ["Heading", "==-=="];
        assert!(try_parse_setext_heading(&doc, 0).is_none());
    }

    #[test]
    fn test_setext_with_leading_spaces() {
        let doc = ["Heading", "   ======="];
        assert_eq!(try_parse_setext_heading(&doc, 0), Some((1, '=')));
    }

    #[test]
    fn test_setext_with_trailing_spaces() {
        let doc = ["Heading", "=======   "];
        assert_eq!(try_parse_setext_heading(&doc, 0), Some((1, '=')));
    }

    #[test]
    fn test_setext_empty_text_line() {
        let doc = ["", "======="];
        assert!(try_parse_setext_heading(&doc, 0).is_none());
    }

    #[test]
    fn test_setext_no_next_line() {
        let doc = ["Heading"];
        assert!(try_parse_setext_heading(&doc, 0).is_none());
    }

    /// Four or more leading spaces make the text a code block, not a heading.
    #[test]
    fn test_setext_four_spaces_indent() {
        let doc = ["    Heading", "    ======="];
        assert!(try_parse_setext_heading(&doc, 0).is_none());
    }

    #[test]
    fn test_setext_long_underline() {
        let rule = "=".repeat(100);
        let doc = ["Heading", rule.as_str()];
        assert_eq!(try_parse_setext_heading(&doc, 0), Some((1, '=')));
    }

    // ---- MMD header identifiers ----

    #[test]
    fn test_parse_mmd_header_identifier_normalizes_like_pandoc() {
        let (normalized, start, _end) =
            try_parse_mmd_header_identifier_with_pos("A heading [My ID]")
                .expect("should parse mmd header identifier");
        assert_eq!(normalized, "myid");
        assert_eq!(start, 10);
    }
}