Skip to main content

panache_parser/parser/blocks/
headings.rs

//! ATX and setext heading parsing utilities.
2
3use crate::options::ParserOptions;
4use crate::syntax::SyntaxKind;
5use rowan::GreenNodeBuilder;
6
7use crate::parser::utils::attributes::try_parse_trailing_attributes_with_pos;
8use crate::parser::utils::inline_emission;
9
/// Look for a trailing MultiMarkdown-style `[identifier]` at the end of `content`.
///
/// Trailing spaces and tabs are ignored when locating the closing `]`. On
/// success returns `(identifier, start, end)` where:
/// - `identifier` is the bracketed text with all whitespace removed and then
///   lowercased,
/// - `start` is the byte offset of the opening `[` (the last `[` before the
///   closing bracket),
/// - `end` is the byte offset just past the closing `]`, i.e. the length of
///   `content` without its trailing spaces/tabs.
///
/// Returns `None` when `content` does not end in `]`, when no `[` precedes
/// it, or when the bracketed text is empty or whitespace-only.
fn try_parse_mmd_header_identifier_with_pos(content: &str) -> Option<(String, usize, usize)> {
    let candidate = content.trim_end_matches([' ', '\t']);

    // Everything up to (but excluding) the final `]`; bails out if there is none.
    let before_close = candidate.strip_suffix(']')?;
    let open = before_close.rfind('[')?;
    let label = &before_close[open + 1..];

    // Concatenate the whitespace-separated pieces first and lowercase the
    // result as a whole afterwards.
    let mut squeezed = String::with_capacity(label.len());
    for piece in label.split_whitespace() {
        squeezed.push_str(piece);
    }
    if squeezed.is_empty() {
        // Covers both `[]` and a label made up solely of whitespace.
        return None;
    }

    Some((squeezed.to_lowercase(), open, candidate.len()))
}
33
/// Try to parse an ATX heading from content, returns heading level (1-6) if found.
///
/// A line is recognised as an ATX heading when:
/// - it is indented by at most three literal spaces (a leading tab or any
///   other whitespace character is not treated as heading indentation, so a
///   tab-indented line is never reported as a heading),
/// - the indentation is followed by one to six `#` characters,
/// - the hashes are followed by a space, a tab, or the end of the line. The
///   end of the line may be the end of `content` or a `\n` / `\r\n`
///   terminator, so callers may pass the line with or without its newline.
///
/// Returns `None` for anything else.
pub fn try_parse_atx_heading(content: &str) -> Option<usize> {
    // Only literal spaces count as indentation; measuring after a Unicode
    // `trim_start` would let a tab (4 columns) or a non-breaking space pass
    // as "fewer than four spaces".
    let trimmed = content.trim_start_matches(' ');
    let leading_spaces = content.len() - trimmed.len();
    if leading_spaces > 3 {
        return None;
    }

    // Must start with 1-6 # characters ('#' is one byte, so the count is
    // also a valid byte offset).
    let hash_count = trimmed.bytes().take_while(|&b| b == b'#').count();
    if !(1..=6).contains(&hash_count) {
        return None;
    }

    // After hashes, must be end of line, space, or tab.
    let after_hashes = &trimmed[hash_count..];
    let marker_terminated = after_hashes.is_empty()
        || after_hashes.starts_with([' ', '\t', '\n'])
        || after_hashes.starts_with("\r\n");

    marker_terminated.then_some(hash_count)
}
59
/// Try to parse a setext heading from lines, returns (level, underline_char) if found.
///
/// `lines[pos]` is taken as the heading text and `lines[pos + 1]` as the
/// underline. The pair is accepted when all of the following hold:
/// - both lines exist,
/// - the text line is not blank and has fewer than 4 bytes of leading
///   whitespace,
/// - the underline has fewer than 4 bytes of leading whitespace,
/// - the underline, with surrounding whitespace removed, is at least 3
///   characters long and consists solely of `=` or solely of `-`.
///
/// An `=` underline yields level 1, a `-` underline yields level 2.
pub fn try_parse_setext_heading(lines: &[&str], pos: usize) -> Option<(usize, char)> {
    // Both the text line and the line after it must exist.
    let heading_text = *lines.get(pos)?;
    let rule_line = *lines.get(pos + 1)?;

    // Width of the leading whitespace run, in bytes.
    let indent_of = |line: &str| line.len() - line.trim_start().len();

    // Blank text cannot be a heading; 4+ indent would be a code block.
    if heading_text.trim().is_empty() || indent_of(heading_text) >= 4 {
        return None;
    }

    // Same indentation limit for the underline itself.
    if indent_of(rule_line) >= 4 {
        return None;
    }

    let marks = rule_line.trim();
    if marks.len() < 3 {
        return None;
    }

    // The first character decides the level; every other one must match it.
    let (level, marker) = match marks.chars().next()? {
        '=' => (1, '='),
        '-' => (2, '-'),
        _ => return None,
    };

    marks
        .chars()
        .all(|c| c == marker)
        .then_some((level, marker))
}
127
128/// Emit a setext heading node to the builder.
129///
130/// Setext headings consist of a text line followed by an underline.
131/// This function emits the complete HEADING node with both lines.
132pub(crate) fn emit_setext_heading(
133    builder: &mut GreenNodeBuilder<'static>,
134    text_line: &str,
135    underline_line: &str,
136    _level: usize,
137    config: &ParserOptions,
138) {
139    builder.start_node(SyntaxKind::HEADING.into());
140
141    // Strip trailing newline from text line for processing
142    let (text_without_newline, text_newline_str) =
143        if let Some(stripped) = text_line.strip_suffix("\r\n") {
144            (stripped, "\r\n")
145        } else if let Some(stripped) = text_line.strip_suffix('\n') {
146            (stripped, "\n")
147        } else {
148            (text_line, "")
149        };
150
151    // Handle leading spaces in text line
152    let text_trimmed = text_without_newline.trim_start();
153    let leading_spaces = text_without_newline.len() - text_trimmed.len();
154
155    if leading_spaces > 0 {
156        builder.token(
157            SyntaxKind::WHITESPACE.into(),
158            &text_without_newline[..leading_spaces],
159        );
160    }
161
162    // Try to parse trailing attributes from heading text
163    let (text_content, attr_text, space_before_attrs) =
164        if let Some((_attrs, text_before, start_brace_pos)) =
165            try_parse_trailing_attributes_with_pos(text_trimmed)
166        {
167            let space = &text_trimmed[text_before.len()..start_brace_pos];
168            let raw_attrs = &text_trimmed[start_brace_pos..];
169            (text_before, Some(raw_attrs), space)
170        } else if config.extensions.mmd_header_identifiers {
171            if let Some((_normalized, start_bracket_pos, end_bracket_pos)) =
172                try_parse_mmd_header_identifier_with_pos(text_trimmed)
173            {
174                let text_before = text_trimmed[..start_bracket_pos].trim_end_matches([' ', '\t']);
175                let space = &text_trimmed[text_before.len()..start_bracket_pos];
176                let raw_attrs = &text_trimmed[start_bracket_pos..end_bracket_pos];
177                (text_before, Some(raw_attrs), space)
178            } else {
179                (text_trimmed, None, "")
180            }
181        } else {
182            (text_trimmed, None, "")
183        };
184
185    // Emit heading content with inline parsing
186    builder.start_node(SyntaxKind::HEADING_CONTENT.into());
187    if !text_content.is_empty() {
188        inline_emission::emit_inlines(builder, text_content, config);
189    }
190    builder.finish_node();
191
192    // Emit space before attributes if present
193    if !space_before_attrs.is_empty() {
194        builder.token(SyntaxKind::WHITESPACE.into(), space_before_attrs);
195    }
196
197    // Emit attributes if present
198    if let Some(attr_text) = attr_text {
199        builder.start_node(SyntaxKind::ATTRIBUTE.into());
200        builder.token(SyntaxKind::ATTRIBUTE.into(), attr_text);
201        builder.finish_node();
202    }
203
204    // Emit newline after text line
205    if !text_newline_str.is_empty() {
206        builder.token(SyntaxKind::NEWLINE.into(), text_newline_str);
207    }
208
209    // Strip trailing newline from underline for processing
210    let (underline_without_newline, underline_newline_str) =
211        if let Some(stripped) = underline_line.strip_suffix("\r\n") {
212            (stripped, "\r\n")
213        } else if let Some(stripped) = underline_line.strip_suffix('\n') {
214            (stripped, "\n")
215        } else {
216            (underline_line, "")
217        };
218
219    // Emit underline leading spaces if present
220    let underline_trimmed = underline_without_newline.trim_start();
221    let underline_leading_spaces = underline_without_newline.len() - underline_trimmed.len();
222
223    if underline_leading_spaces > 0 {
224        builder.token(
225            SyntaxKind::WHITESPACE.into(),
226            &underline_without_newline[..underline_leading_spaces],
227        );
228    }
229
230    // Emit the setext underline as a node containing a token
231    builder.start_node(SyntaxKind::SETEXT_HEADING_UNDERLINE.into());
232    builder.token(
233        SyntaxKind::SETEXT_HEADING_UNDERLINE.into(),
234        underline_trimmed,
235    );
236    builder.finish_node();
237
238    // Emit trailing newline after underline
239    if !underline_newline_str.is_empty() {
240        builder.token(SyntaxKind::NEWLINE.into(), underline_newline_str);
241    }
242
243    builder.finish_node(); // HEADING
244}
245
246/// Emit an ATX heading node to the builder.
247pub(crate) fn emit_atx_heading(
248    builder: &mut GreenNodeBuilder<'static>,
249    content: &str,
250    level: usize,
251    config: &ParserOptions,
252) {
253    builder.start_node(SyntaxKind::HEADING.into());
254
255    // Strip trailing newline (LF or CRLF) for processing but remember to emit it later
256    let (content_without_newline, newline_str) =
257        if let Some(stripped) = content.strip_suffix("\r\n") {
258            (stripped, "\r\n")
259        } else if let Some(stripped) = content.strip_suffix('\n') {
260            (stripped, "\n")
261        } else {
262            (content, "")
263        };
264
265    let trimmed = content_without_newline.trim_start();
266    let leading_spaces = content_without_newline.len() - trimmed.len();
267
268    // Emit leading spaces if present
269    if leading_spaces > 0 {
270        builder.token(
271            SyntaxKind::WHITESPACE.into(),
272            &content_without_newline[..leading_spaces],
273        );
274    }
275
276    // Marker node for the hashes (must be a node containing a token, not just a token)
277    builder.start_node(SyntaxKind::ATX_HEADING_MARKER.into());
278    builder.token(SyntaxKind::ATX_HEADING_MARKER.into(), &trimmed[..level]);
279    builder.finish_node();
280
281    // Get content after marker
282    let after_marker = &trimmed[level..];
283    let spaces_after_marker_count = after_marker
284        .find(|c: char| !c.is_whitespace())
285        .unwrap_or(after_marker.len());
286
287    // Emit spaces after marker
288    if spaces_after_marker_count > 0 {
289        builder.token(
290            SyntaxKind::WHITESPACE.into(),
291            &after_marker[..spaces_after_marker_count],
292        );
293    }
294
295    // Get actual heading text
296    let heading_text = &after_marker[spaces_after_marker_count..];
297
298    // Parse optional closing ATX marker (` ###`) while preserving bytes.
299    let (heading_content, closing_suffix) = {
300        let without_trailing_ws = heading_text.trim_end_matches([' ', '\t']);
301        let trailing_hashes = without_trailing_ws
302            .chars()
303            .rev()
304            .take_while(|&c| c == '#')
305            .count();
306
307        if trailing_hashes > 0 {
308            let hashes_start = without_trailing_ws.len() - trailing_hashes;
309            let before_hashes = &without_trailing_ws[..hashes_start];
310            if before_hashes
311                .chars()
312                .last()
313                .is_some_and(|c| c == ' ' || c == '\t')
314            {
315                let content_end = before_hashes.trim_end_matches([' ', '\t']).len();
316                (&heading_text[..content_end], &heading_text[content_end..])
317            } else {
318                (heading_text, "")
319            }
320        } else {
321            (heading_text, "")
322        }
323    };
324
325    // Try to parse trailing attributes
326    let (text_content, attr_text, space_before_attrs) =
327        if let Some((_attrs, text_before, start_brace_pos)) =
328            try_parse_trailing_attributes_with_pos(heading_content)
329        {
330            let space = &heading_content[text_before.len()..start_brace_pos];
331            let raw_attrs = &heading_content[start_brace_pos..];
332            (text_before, Some(raw_attrs), space)
333        } else if config.extensions.mmd_header_identifiers {
334            if let Some((_normalized, start_bracket_pos, end_bracket_pos)) =
335                try_parse_mmd_header_identifier_with_pos(heading_content)
336            {
337                let text_before =
338                    heading_content[..start_bracket_pos].trim_end_matches([' ', '\t']);
339                let space = &heading_content[text_before.len()..start_bracket_pos];
340                let raw_attrs = &heading_content[start_bracket_pos..end_bracket_pos];
341                (text_before, Some(raw_attrs), space)
342            } else {
343                (heading_content, None, "")
344            }
345        } else {
346            (heading_content, None, "")
347        };
348
349    // Heading content node
350    builder.start_node(SyntaxKind::HEADING_CONTENT.into());
351    if !text_content.is_empty() {
352        inline_emission::emit_inlines(builder, text_content, config);
353    }
354    builder.finish_node();
355
356    // Emit space before attributes if present
357    if !space_before_attrs.is_empty() {
358        builder.token(SyntaxKind::WHITESPACE.into(), space_before_attrs);
359    }
360
361    // Emit attributes if present
362    if let Some(attr_text) = attr_text {
363        builder.start_node(SyntaxKind::ATTRIBUTE.into());
364        builder.token(SyntaxKind::ATTRIBUTE.into(), attr_text);
365        builder.finish_node();
366    }
367
368    if !closing_suffix.is_empty() {
369        let closing_trimmed = closing_suffix.trim_matches(|c| c == ' ' || c == '\t');
370        let leading_ws_len = closing_suffix
371            .find(|c: char| c != ' ' && c != '\t')
372            .unwrap_or(closing_suffix.len());
373        let trailing_ws_len = closing_suffix.len() - leading_ws_len - closing_trimmed.len();
374
375        if leading_ws_len > 0 {
376            builder.token(
377                SyntaxKind::WHITESPACE.into(),
378                &closing_suffix[..leading_ws_len],
379            );
380        }
381        if !closing_trimmed.is_empty() {
382            builder.token(SyntaxKind::ATX_HEADING_MARKER.into(), closing_trimmed);
383        }
384        if trailing_ws_len > 0 {
385            builder.token(
386                SyntaxKind::WHITESPACE.into(),
387                &closing_suffix[closing_suffix.len() - trailing_ws_len..],
388            );
389        }
390    }
391
392    // Emit trailing newline if present
393    if !newline_str.is_empty() {
394        builder.token(SyntaxKind::NEWLINE.into(), newline_str);
395    }
396
397    builder.finish_node(); // Heading
398}
399
#[cfg(test)]
mod tests {
    use super::*;

    /// Run the setext detector with the heading text on the first line.
    fn setext_at_start(lines: &[&str]) -> Option<(usize, char)> {
        try_parse_setext_heading(lines, 0)
    }

    // ATX detection

    #[test]
    fn test_simple_heading() {
        let level = try_parse_atx_heading("# Heading");
        assert_eq!(level, Some(1));
    }

    #[test]
    fn test_level_3_heading() {
        let level = try_parse_atx_heading("### Level 3");
        assert_eq!(level, Some(3));
    }

    #[test]
    fn test_heading_with_leading_spaces() {
        let level = try_parse_atx_heading("   # Heading");
        assert_eq!(level, Some(1));
    }

    #[test]
    fn test_atx_heading_with_attributes_losslessness() {
        // Regression test: the space before the attribute block used to be dropped.
        let input = "# Test {#id}\n";
        let tree = crate::parse(input, Some(crate::ParserOptions::default()));

        // Round trip: the tree text has to reproduce the input byte for byte.
        assert_eq!(
            tree.text().to_string(),
            input,
            "Parser must preserve all bytes including space before attributes"
        );

        // The first child of the tree is the heading itself.
        let heading = tree.first_child().unwrap();
        assert_eq!(heading.kind(), SyntaxKind::HEADING);

        // A single-space WHITESPACE token must sit at offset 6, between the
        // heading content and the attribute.
        let found_whitespace = heading
            .children_with_tokens()
            .filter_map(|element| element.into_token())
            .any(|token| {
                let start: usize = token.text_range().start().into();
                token.kind() == SyntaxKind::WHITESPACE && token.text() == " " && start == 6
            });
        assert!(
            found_whitespace,
            "Whitespace token between heading content and attributes must be present"
        );
    }

    #[test]
    fn test_atx_heading_closing_hashes_are_lossless() {
        let input = "### Extension: `smart` ###\n";
        let options = crate::ParserOptions::default();
        let round_tripped = crate::parse(input, Some(options)).text().to_string();
        assert_eq!(round_tripped, input);
    }

    #[test]
    fn test_four_spaces_not_heading() {
        assert!(try_parse_atx_heading("    # Not heading").is_none());
    }

    #[test]
    fn test_no_space_after_hash() {
        assert!(try_parse_atx_heading("#NoSpace").is_none());
    }

    #[test]
    fn test_empty_heading() {
        let level = try_parse_atx_heading("# ");
        assert_eq!(level, Some(1));
    }

    #[test]
    fn test_level_7_invalid() {
        assert!(try_parse_atx_heading("####### Too many").is_none());
    }

    // Setext heading tests

    #[test]
    fn test_setext_level_1() {
        assert_eq!(setext_at_start(&["Heading", "======="]), Some((1, '=')));
    }

    #[test]
    fn test_setext_level_2() {
        assert_eq!(setext_at_start(&["Heading", "-------"]), Some((2, '-')));
    }

    #[test]
    fn test_setext_minimum_three_chars() {
        // Two characters are too few, three are enough.
        assert_eq!(setext_at_start(&["Heading", "=="]), None);
        assert_eq!(setext_at_start(&["Heading", "==="]), Some((1, '=')));
    }

    #[test]
    fn test_setext_mixed_chars_invalid() {
        assert_eq!(setext_at_start(&["Heading", "==-=="]), None);
    }

    #[test]
    fn test_setext_with_leading_spaces() {
        assert_eq!(setext_at_start(&["Heading", "   ======="]), Some((1, '=')));
    }

    #[test]
    fn test_setext_with_trailing_spaces() {
        assert_eq!(setext_at_start(&["Heading", "=======   "]), Some((1, '=')));
    }

    #[test]
    fn test_setext_empty_text_line() {
        assert_eq!(setext_at_start(&["", "======="]), None);
    }

    #[test]
    fn test_setext_no_next_line() {
        assert_eq!(setext_at_start(&["Heading"]), None);
    }

    #[test]
    fn test_setext_four_spaces_indent() {
        // 4+ spaces means code block, not setext
        assert_eq!(setext_at_start(&["    Heading", "    ======="]), None);
    }

    #[test]
    fn test_setext_long_underline() {
        let rule = "=".repeat(100);
        assert_eq!(setext_at_start(&["Heading", rule.as_str()]), Some((1, '=')));
    }

    #[test]
    fn test_parse_mmd_header_identifier_normalizes_like_pandoc() {
        let (identifier, open_bracket, _end) =
            try_parse_mmd_header_identifier_with_pos("A heading [My ID]")
                .expect("should parse mmd header identifier");
        assert_eq!(identifier, "myid");
        assert_eq!(open_bracket, 10);
    }
}