Skip to main content

panache_parser/parser/blocks/
headings.rs

1//! ATX heading parsing utilities.
2
3use crate::options::ParserOptions;
4use crate::syntax::SyntaxKind;
5use rowan::GreenNodeBuilder;
6
7use crate::parser::utils::attributes::try_parse_trailing_attributes_with_pos;
8use crate::parser::utils::inline_emission;
9
/// Look for a trailing `[identifier]` at the end of `content`.
///
/// Trailing spaces and tabs are ignored. On success returns
/// `(normalized, start, end)` where `normalized` is the bracket contents with
/// all whitespace removed and lowercased, `start` is the byte offset of the
/// last `[` before the closing bracket, and `end` is the byte offset just past
/// the closing `]`. Both offsets are valid indices into `content`.
///
/// Returns `None` when the text does not end in `]`, has no `[` before it, or
/// the brackets contain nothing but whitespace.
fn try_parse_mmd_header_identifier_with_pos(content: &str) -> Option<(String, usize, usize)> {
    let text = content.trim_end_matches([' ', '\t']);
    let before_close = text.strip_suffix(']')?;
    let open = before_close.rfind('[')?;
    let label = &before_close[open + 1..];

    // Join the whitespace-separated pieces first and lowercase the result as a
    // whole, so the identifier is empty exactly when the label is blank.
    let identifier = label.split_whitespace().collect::<String>().to_lowercase();
    if identifier.is_empty() {
        None
    } else {
        Some((identifier, open, text.len()))
    }
}
33
/// Detect an ATX heading and report its level.
///
/// `content` may be a full source line: one trailing `\r\n` or `\n` is ignored,
/// so an empty heading such as `##\n` is still recognised.
///
/// Returns `Some(level)` with `level` in `1..=6` when, after at most 3 bytes of
/// leading whitespace, the line starts with 1-6 `#` characters followed by the
/// end of the line, a space, or a tab. Returns `None` otherwise.
pub fn try_parse_atx_heading(content: &str) -> Option<usize> {
    let line = content
        .strip_suffix("\r\n")
        .or_else(|| content.strip_suffix('\n'))
        .unwrap_or(content);
    let body = line.trim_start();

    // More than three bytes of indentation disqualifies the line.
    let indent = line.len() - body.len();
    if indent > 3 {
        return None;
    }

    // `#` is ASCII, so counting bytes gives the same result as counting chars
    // and `level` is always a valid slice boundary.
    let level = body.bytes().take_while(|&b| b == b'#').count();
    if !(1..=6).contains(&level) {
        return None;
    }

    // The marker must be followed by nothing, a space, or a tab.
    match body[level..].chars().next() {
        None | Some(' ') | Some('\t') => Some(level),
        Some(_) => None,
    }
}
68
/// Detect a setext heading starting at `lines[pos]`.
///
/// A setext heading is a text line immediately followed by an underline made
/// of `=` (level 1) or `-` (level 2). On success returns
/// `(level, underline_char)`.
///
/// Returns `None` when:
/// - `pos` or `pos + 1` is out of range,
/// - the text line is blank, or has 4+ bytes of leading whitespace
///   (it would be a code block),
/// - the underline, with surrounding whitespace removed, is shorter than 3
///   characters, does not start with `=` or `-`, or mixes characters,
/// - the underline has 4+ bytes of leading whitespace.
pub fn try_parse_setext_heading(lines: &[&str], pos: usize) -> Option<(usize, char)> {
    let heading_text = *lines.get(pos)?;
    let marker_line = *lines.get(pos + 1)?;

    // Indentation is measured as the byte length of the leading whitespace.
    let indent_of = |s: &str| s.len() - s.trim_start().len();

    if heading_text.trim().is_empty() || indent_of(heading_text) >= 4 {
        return None;
    }

    let marker = marker_line.trim();
    if marker.len() < 3 {
        return None;
    }

    let underline_char = match marker.chars().next()? {
        c @ ('=' | '-') => c,
        _ => return None,
    };
    if marker.chars().any(|c| c != underline_char) {
        return None;
    }

    // Same indentation limit as for the text line.
    if indent_of(marker_line) >= 4 {
        return None;
    }

    let level = match underline_char {
        '=' => 1,
        _ => 2,
    };
    Some((level, underline_char))
}
136
137/// Emit a setext heading node to the builder.
138///
139/// Setext headings consist of a text line followed by an underline.
140/// This function emits the complete HEADING node with both lines.
141pub(crate) fn emit_setext_heading(
142    builder: &mut GreenNodeBuilder<'static>,
143    text_line: &str,
144    underline_line: &str,
145    _level: usize,
146    config: &ParserOptions,
147) {
148    builder.start_node(SyntaxKind::HEADING.into());
149
150    // Strip trailing newline from text line for processing
151    let (text_without_newline, text_newline_str) =
152        if let Some(stripped) = text_line.strip_suffix("\r\n") {
153            (stripped, "\r\n")
154        } else if let Some(stripped) = text_line.strip_suffix('\n') {
155            (stripped, "\n")
156        } else {
157            (text_line, "")
158        };
159
160    // Handle leading spaces in text line
161    let text_trimmed = text_without_newline.trim_start();
162    let leading_spaces = text_without_newline.len() - text_trimmed.len();
163
164    if leading_spaces > 0 {
165        builder.token(
166            SyntaxKind::WHITESPACE.into(),
167            &text_without_newline[..leading_spaces],
168        );
169    }
170
171    // Try to parse trailing attributes from heading text
172    let (text_content, attr_text, space_before_attrs) =
173        if let Some((_attrs, text_before, start_brace_pos)) =
174            try_parse_trailing_attributes_with_pos(text_trimmed)
175        {
176            let space = &text_trimmed[text_before.len()..start_brace_pos];
177            let raw_attrs = &text_trimmed[start_brace_pos..];
178            (text_before, Some(raw_attrs), space)
179        } else if config.extensions.mmd_header_identifiers {
180            if let Some((_normalized, start_bracket_pos, end_bracket_pos)) =
181                try_parse_mmd_header_identifier_with_pos(text_trimmed)
182            {
183                let text_before = text_trimmed[..start_bracket_pos].trim_end_matches([' ', '\t']);
184                let space = &text_trimmed[text_before.len()..start_bracket_pos];
185                let raw_attrs = &text_trimmed[start_bracket_pos..end_bracket_pos];
186                (text_before, Some(raw_attrs), space)
187            } else {
188                (text_trimmed, None, "")
189            }
190        } else {
191            (text_trimmed, None, "")
192        };
193
194    // Emit heading content with inline parsing
195    builder.start_node(SyntaxKind::HEADING_CONTENT.into());
196    if !text_content.is_empty() {
197        inline_emission::emit_inlines(builder, text_content, config);
198    }
199    builder.finish_node();
200
201    // Emit space before attributes if present
202    if !space_before_attrs.is_empty() {
203        builder.token(SyntaxKind::WHITESPACE.into(), space_before_attrs);
204    }
205
206    // Emit attributes if present
207    if let Some(attr_text) = attr_text {
208        builder.start_node(SyntaxKind::ATTRIBUTE.into());
209        builder.token(SyntaxKind::ATTRIBUTE.into(), attr_text);
210        builder.finish_node();
211    }
212
213    // Emit newline after text line
214    if !text_newline_str.is_empty() {
215        builder.token(SyntaxKind::NEWLINE.into(), text_newline_str);
216    }
217
218    // Strip trailing newline from underline for processing
219    let (underline_without_newline, underline_newline_str) =
220        if let Some(stripped) = underline_line.strip_suffix("\r\n") {
221            (stripped, "\r\n")
222        } else if let Some(stripped) = underline_line.strip_suffix('\n') {
223            (stripped, "\n")
224        } else {
225            (underline_line, "")
226        };
227
228    // Emit underline leading spaces if present
229    let underline_trimmed = underline_without_newline.trim_start();
230    let underline_leading_spaces = underline_without_newline.len() - underline_trimmed.len();
231
232    if underline_leading_spaces > 0 {
233        builder.token(
234            SyntaxKind::WHITESPACE.into(),
235            &underline_without_newline[..underline_leading_spaces],
236        );
237    }
238
239    // Emit the setext underline as a node containing a token
240    builder.start_node(SyntaxKind::SETEXT_HEADING_UNDERLINE.into());
241    builder.token(
242        SyntaxKind::SETEXT_HEADING_UNDERLINE.into(),
243        underline_trimmed,
244    );
245    builder.finish_node();
246
247    // Emit trailing newline after underline
248    if !underline_newline_str.is_empty() {
249        builder.token(SyntaxKind::NEWLINE.into(), underline_newline_str);
250    }
251
252    builder.finish_node(); // HEADING
253}
254
255/// Emit an ATX heading node to the builder.
256pub(crate) fn emit_atx_heading(
257    builder: &mut GreenNodeBuilder<'static>,
258    content: &str,
259    level: usize,
260    config: &ParserOptions,
261) {
262    builder.start_node(SyntaxKind::HEADING.into());
263
264    // Strip trailing newline (LF or CRLF) for processing but remember to emit it later
265    let (content_without_newline, newline_str) =
266        if let Some(stripped) = content.strip_suffix("\r\n") {
267            (stripped, "\r\n")
268        } else if let Some(stripped) = content.strip_suffix('\n') {
269            (stripped, "\n")
270        } else {
271            (content, "")
272        };
273
274    let trimmed = content_without_newline.trim_start();
275    let leading_spaces = content_without_newline.len() - trimmed.len();
276
277    // Emit leading spaces if present
278    if leading_spaces > 0 {
279        builder.token(
280            SyntaxKind::WHITESPACE.into(),
281            &content_without_newline[..leading_spaces],
282        );
283    }
284
285    // Marker node for the hashes (must be a node containing a token, not just a token)
286    builder.start_node(SyntaxKind::ATX_HEADING_MARKER.into());
287    builder.token(SyntaxKind::ATX_HEADING_MARKER.into(), &trimmed[..level]);
288    builder.finish_node();
289
290    // Get content after marker
291    let after_marker = &trimmed[level..];
292    let spaces_after_marker_count = after_marker
293        .find(|c: char| !c.is_whitespace())
294        .unwrap_or(after_marker.len());
295
296    // Emit spaces after marker
297    if spaces_after_marker_count > 0 {
298        builder.token(
299            SyntaxKind::WHITESPACE.into(),
300            &after_marker[..spaces_after_marker_count],
301        );
302    }
303
304    // Get actual heading text
305    let heading_text = &after_marker[spaces_after_marker_count..];
306
307    // Parse optional closing ATX marker (` ###`) while preserving bytes.
308    let (heading_content, closing_suffix) = {
309        let without_trailing_ws = heading_text.trim_end_matches([' ', '\t']);
310        let trailing_hashes = without_trailing_ws
311            .chars()
312            .rev()
313            .take_while(|&c| c == '#')
314            .count();
315
316        if trailing_hashes > 0 {
317            let hashes_start = without_trailing_ws.len() - trailing_hashes;
318            let before_hashes = &without_trailing_ws[..hashes_start];
319            if before_hashes
320                .chars()
321                .last()
322                .is_some_and(|c| c == ' ' || c == '\t')
323            {
324                let content_end = before_hashes.trim_end_matches([' ', '\t']).len();
325                (&heading_text[..content_end], &heading_text[content_end..])
326            } else {
327                (heading_text, "")
328            }
329        } else {
330            (heading_text, "")
331        }
332    };
333
334    // Try to parse trailing attributes
335    let (text_content, attr_text, space_before_attrs) =
336        if let Some((_attrs, text_before, start_brace_pos)) =
337            try_parse_trailing_attributes_with_pos(heading_content)
338        {
339            let space = &heading_content[text_before.len()..start_brace_pos];
340            let raw_attrs = &heading_content[start_brace_pos..];
341            (text_before, Some(raw_attrs), space)
342        } else if config.extensions.mmd_header_identifiers {
343            if let Some((_normalized, start_bracket_pos, end_bracket_pos)) =
344                try_parse_mmd_header_identifier_with_pos(heading_content)
345            {
346                let text_before =
347                    heading_content[..start_bracket_pos].trim_end_matches([' ', '\t']);
348                let space = &heading_content[text_before.len()..start_bracket_pos];
349                let raw_attrs = &heading_content[start_bracket_pos..end_bracket_pos];
350                (text_before, Some(raw_attrs), space)
351            } else {
352                (heading_content, None, "")
353            }
354        } else {
355            (heading_content, None, "")
356        };
357
358    // Heading content node
359    builder.start_node(SyntaxKind::HEADING_CONTENT.into());
360    if !text_content.is_empty() {
361        inline_emission::emit_inlines(builder, text_content, config);
362    }
363    builder.finish_node();
364
365    // Emit space before attributes if present
366    if !space_before_attrs.is_empty() {
367        builder.token(SyntaxKind::WHITESPACE.into(), space_before_attrs);
368    }
369
370    // Emit attributes if present
371    if let Some(attr_text) = attr_text {
372        builder.start_node(SyntaxKind::ATTRIBUTE.into());
373        builder.token(SyntaxKind::ATTRIBUTE.into(), attr_text);
374        builder.finish_node();
375    }
376
377    if !closing_suffix.is_empty() {
378        let closing_trimmed = closing_suffix.trim_matches(|c| c == ' ' || c == '\t');
379        let leading_ws_len = closing_suffix
380            .find(|c: char| c != ' ' && c != '\t')
381            .unwrap_or(closing_suffix.len());
382        let trailing_ws_len = closing_suffix.len() - leading_ws_len - closing_trimmed.len();
383
384        if leading_ws_len > 0 {
385            builder.token(
386                SyntaxKind::WHITESPACE.into(),
387                &closing_suffix[..leading_ws_len],
388            );
389        }
390        if !closing_trimmed.is_empty() {
391            builder.token(SyntaxKind::ATX_HEADING_MARKER.into(), closing_trimmed);
392        }
393        if trailing_ws_len > 0 {
394            builder.token(
395                SyntaxKind::WHITESPACE.into(),
396                &closing_suffix[closing_suffix.len() - trailing_ws_len..],
397            );
398        }
399    }
400
401    // Emit trailing newline if present
402    if !newline_str.is_empty() {
403        builder.token(SyntaxKind::NEWLINE.into(), newline_str);
404    }
405
406    builder.finish_node(); // Heading
407}
408
#[cfg(test)]
mod tests {
    use super::*;

    /// Run setext detection on `lines`, treating the first line as the text line.
    fn setext_at_start(lines: &[&str]) -> Option<(usize, char)> {
        try_parse_setext_heading(lines, 0)
    }

    // ATX heading detection

    #[test]
    fn test_simple_heading() {
        let level = try_parse_atx_heading("# Heading");
        assert_eq!(level, Some(1));
    }

    #[test]
    fn test_level_3_heading() {
        let level = try_parse_atx_heading("### Level 3");
        assert_eq!(level, Some(3));
    }

    #[test]
    fn test_heading_with_leading_spaces() {
        let level = try_parse_atx_heading("   # Heading");
        assert_eq!(level, Some(1));
    }

    #[test]
    fn test_atx_heading_with_attributes_losslessness() {
        use crate::ParserOptions;

        // Regression test: the space before the attributes used to be dropped.
        let input = "# Test {#id}\n";
        let tree = crate::parse(input, Some(ParserOptions::default()));

        // The tree must reproduce the input exactly.
        assert_eq!(
            tree.text().to_string(),
            input,
            "Parser must preserve all bytes including space before attributes"
        );

        // The first child of the root is the heading.
        let heading = tree.first_child().unwrap();
        assert_eq!(heading.kind(), SyntaxKind::HEADING);

        // Look for a single-space whitespace token at byte offset 6, i.e.
        // between the heading content and the attribute.
        let found_whitespace = heading.children_with_tokens().any(|child| {
            child.kind() == SyntaxKind::WHITESPACE
                && child.as_token().is_some_and(|token| {
                    let start: usize = token.text_range().start().into();
                    token.text() == " " && start == 6
                })
        });
        assert!(
            found_whitespace,
            "Whitespace token between heading content and attributes must be present"
        );
    }

    #[test]
    fn test_atx_heading_closing_hashes_are_lossless() {
        let input = "### Extension: `smart` ###\n";
        let options = crate::ParserOptions::default();
        let tree = crate::parse(input, Some(options));
        assert_eq!(tree.text().to_string(), input);
    }

    #[test]
    fn test_four_spaces_not_heading() {
        assert!(try_parse_atx_heading("    # Not heading").is_none());
    }

    #[test]
    fn test_no_space_after_hash() {
        assert!(try_parse_atx_heading("#NoSpace").is_none());
    }

    #[test]
    fn test_empty_heading() {
        let level = try_parse_atx_heading("# ");
        assert_eq!(level, Some(1));
    }

    #[test]
    fn test_level_7_invalid() {
        assert!(try_parse_atx_heading("####### Too many").is_none());
    }

    // Setext heading detection

    #[test]
    fn test_setext_level_1() {
        assert_eq!(setext_at_start(&["Heading", "======="]), Some((1, '=')));
    }

    #[test]
    fn test_setext_level_2() {
        assert_eq!(setext_at_start(&["Heading", "-------"]), Some((2, '-')));
    }

    #[test]
    fn test_setext_minimum_three_chars() {
        // Two characters are too short; three are enough.
        assert_eq!(setext_at_start(&["Heading", "=="]), None);
        assert_eq!(setext_at_start(&["Heading", "==="]), Some((1, '=')));
    }

    #[test]
    fn test_setext_mixed_chars_invalid() {
        assert_eq!(setext_at_start(&["Heading", "==-=="]), None);
    }

    #[test]
    fn test_setext_with_leading_spaces() {
        assert_eq!(setext_at_start(&["Heading", "   ======="]), Some((1, '=')));
    }

    #[test]
    fn test_setext_with_trailing_spaces() {
        assert_eq!(setext_at_start(&["Heading", "=======   "]), Some((1, '=')));
    }

    #[test]
    fn test_setext_empty_text_line() {
        assert_eq!(setext_at_start(&["", "======="]), None);
    }

    #[test]
    fn test_setext_no_next_line() {
        assert_eq!(setext_at_start(&["Heading"]), None);
    }

    #[test]
    fn test_setext_four_spaces_indent() {
        // 4+ spaces means code block, not setext
        assert_eq!(setext_at_start(&["    Heading", "    ======="]), None);
    }

    #[test]
    fn test_setext_long_underline() {
        let underline = "=".repeat(100);
        let lines = ["Heading", underline.as_str()];
        assert_eq!(setext_at_start(&lines), Some((1, '=')));
    }

    // MMD header identifiers

    #[test]
    fn test_parse_mmd_header_identifier_normalizes_like_pandoc() {
        let (identifier, start, _end) =
            try_parse_mmd_header_identifier_with_pos("A heading [My ID]")
                .expect("should parse mmd header identifier");
        assert_eq!(identifier, "myid");
        assert_eq!(start, 10);
    }
}