Skip to main content

panache_parser/parser/blocks/
html_blocks.rs

1//! HTML block parsing utilities.
2
3use crate::syntax::SyntaxKind;
4use rowan::GreenNodeBuilder;
5
6use super::blockquotes::count_blockquote_markers;
7use crate::parser::utils::helpers::{strip_leading_spaces, strip_newline};
8
/// HTML block-level tags as defined by CommonMark spec.
/// These tags start an HTML block when found at the start of a line.
///
/// Every entry is lowercase: `try_parse_html_block_start` lowercases the tag
/// name it extracted before looking it up here, which makes the match
/// case-insensitive. The entries are kept in alphabetical order. Tags with
/// verbatim content (`script`, `style`, ...) are intentionally not part of
/// this list; they live in `VERBATIM_TAGS`.
const BLOCK_TAGS: &[&str] = &[
    "address",
    "article",
    "aside",
    "base",
    "basefont",
    "blockquote",
    "body",
    "caption",
    "center",
    "col",
    "colgroup",
    "dd",
    "details",
    "dialog",
    "dir",
    "div",
    "dl",
    "dt",
    "fieldset",
    "figcaption",
    "figure",
    "footer",
    "form",
    "frame",
    "frameset",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "head",
    "header",
    "hr",
    "html",
    "iframe",
    "legend",
    "li",
    "link",
    "main",
    "menu",
    "menuitem",
    "nav",
    "noframes",
    "ol",
    "optgroup",
    "option",
    "p",
    "param",
    "section",
    "source",
    "summary",
    "table",
    "tbody",
    "td",
    "tfoot",
    "th",
    "thead",
    "title",
    "tr",
    "track",
    "ul",
];
75
/// Tags that contain raw/verbatim content (no Markdown processing inside).
///
/// Entries are lowercase and are compared against the lowercased tag name in
/// `try_parse_html_block_start`; a match yields a `BlockTag` with
/// `is_verbatim: true` even though none of these tags appear in `BLOCK_TAGS`.
const VERBATIM_TAGS: &[&str] = &["script", "style", "pre", "textarea"];
78
/// Information about a detected HTML block opening.
///
/// Returned by `try_parse_html_block_start` and passed to `parse_html_block`,
/// where the variant decides which closing marker ends the block
/// (see `is_closing_marker`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) enum HtmlBlockType {
    /// HTML comment: `<!-- ... -->` (closed by a line containing `-->`).
    Comment,
    /// Processing instruction: `<? ... ?>` (closed by a line containing `?>`).
    ProcessingInstruction,
    /// Declaration: `<!...>` such as `<!DOCTYPE html>` (closed by a line containing `>`).
    Declaration,
    /// CDATA section: `<![CDATA[ ... ]]>` (closed by a line containing `]]>`).
    CData,
    /// Block-level tag (closed by a line containing `</tag_name>`).
    ///
    /// `tag_name` is stored lowercased by `try_parse_html_block_start`;
    /// `is_verbatim` is true when the tag is one of `VERBATIM_TAGS`.
    BlockTag { tag_name: String, is_verbatim: bool },
}
93
94/// Try to detect an HTML block opening from content.
95/// Returns block type if this is a valid HTML block start.
96pub(crate) fn try_parse_html_block_start(content: &str) -> Option<HtmlBlockType> {
97    let trimmed = strip_leading_spaces(content);
98
99    // Must start with <
100    if !trimmed.starts_with('<') {
101        return None;
102    }
103
104    // HTML comment
105    if trimmed.starts_with("<!--") {
106        return Some(HtmlBlockType::Comment);
107    }
108
109    // Processing instruction
110    if trimmed.starts_with("<?") {
111        return Some(HtmlBlockType::ProcessingInstruction);
112    }
113
114    // CDATA section
115    if trimmed.starts_with("<![CDATA[") {
116        return Some(HtmlBlockType::CData);
117    }
118
119    // Declaration (DOCTYPE, etc.)
120    if trimmed.starts_with("<!") && trimmed.len() > 2 {
121        let after_bang = &trimmed[2..];
122        if after_bang.chars().next()?.is_ascii_uppercase() {
123            return Some(HtmlBlockType::Declaration);
124        }
125    }
126
127    // Try to parse as opening tag
128    if let Some(tag_name) = extract_opening_tag_name(trimmed) {
129        let tag_lower = tag_name.to_lowercase();
130
131        // Check if it's a block-level tag
132        if BLOCK_TAGS.contains(&tag_lower.as_str()) {
133            let is_verbatim = VERBATIM_TAGS.contains(&tag_lower.as_str());
134            return Some(HtmlBlockType::BlockTag {
135                tag_name: tag_lower,
136                is_verbatim,
137            });
138        }
139
140        // Also accept verbatim tags even if not in BLOCK_TAGS list
141        if VERBATIM_TAGS.contains(&tag_lower.as_str()) {
142            return Some(HtmlBlockType::BlockTag {
143                tag_name: tag_lower,
144                is_verbatim: true,
145            });
146        }
147    }
148
149    None
150}
151
/// Pull the tag name out of an opening tag such as `<div class="x">`.
///
/// The name is the text between the leading `<` and the first whitespace,
/// `>` or `/` (or the end of `text`). It is returned unchanged (no case
/// folding) when it starts with an ASCII letter and consists solely of ASCII
/// letters and digits.
///
/// Returns `None` when `text` does not start with `<`, when it is a closing
/// tag (`</...`), or when the name is empty or contains any other character.
fn extract_opening_tag_name(text: &str) -> Option<String> {
    let after_bracket = text.strip_prefix('<')?;

    // `split` always yields a first piece; it is empty for `</div>`, `<>` and
    // `< div>`, all of which are rejected by the `chars.next()?` below.
    let candidate = after_bracket
        .split(|c: char| c.is_whitespace() || c == '>' || c == '/')
        .next()?;

    let mut chars = candidate.chars();
    let first = chars.next()?;

    let is_valid = first.is_ascii_alphabetic() && chars.all(|c| c.is_ascii_alphanumeric());
    if is_valid {
        Some(candidate.to_string())
    } else {
        None
    }
}
188
189/// Check if a line contains the closing marker for the given HTML block type.
190fn is_closing_marker(line: &str, block_type: &HtmlBlockType) -> bool {
191    match block_type {
192        HtmlBlockType::Comment => line.contains("-->"),
193        HtmlBlockType::ProcessingInstruction => line.contains("?>"),
194        HtmlBlockType::Declaration => line.contains('>'),
195        HtmlBlockType::CData => line.contains("]]>"),
196        HtmlBlockType::BlockTag { tag_name, .. } => {
197            // Look for closing tag </tagname>
198            let closing_tag = format!("</{}>", tag_name);
199            line.to_lowercase().contains(&closing_tag)
200        }
201    }
202}
203
204/// Parse an HTML block, consuming lines from the parser.
205/// Returns the new position after the HTML block.
206pub(crate) fn parse_html_block(
207    builder: &mut GreenNodeBuilder<'static>,
208    lines: &[&str],
209    start_pos: usize,
210    block_type: HtmlBlockType,
211    bq_depth: usize,
212) -> usize {
213    // Start HTML block
214    builder.start_node(SyntaxKind::HTML_BLOCK.into());
215
216    let first_line = lines[start_pos];
217
218    // Emit opening line
219    builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
220
221    // Split off trailing newline if present
222    let (line_without_newline, newline_str) = strip_newline(first_line);
223
224    if !line_without_newline.is_empty() {
225        builder.token(SyntaxKind::TEXT.into(), line_without_newline);
226    }
227
228    if !newline_str.is_empty() {
229        builder.token(SyntaxKind::NEWLINE.into(), newline_str);
230    }
231
232    builder.finish_node(); // HtmlBlockTag
233
234    // Check if opening line also contains closing marker
235    let closes_on_first_line = is_closing_marker(first_line, &block_type);
236
237    if closes_on_first_line {
238        log::trace!(
239            "HTML block at line {} opens and closes on same line",
240            start_pos + 1
241        );
242        builder.finish_node(); // HtmlBlock
243        return start_pos + 1;
244    }
245
246    let mut current_pos = start_pos + 1;
247    let mut content_lines: Vec<&str> = Vec::new();
248    let mut found_closing = false;
249
250    // Parse content until we find the closing marker
251    while current_pos < lines.len() {
252        let line = lines[current_pos];
253        let (line_bq_depth, _inner_content) = count_blockquote_markers(line);
254
255        // Only process lines at the same or deeper blockquote depth
256        if line_bq_depth < bq_depth {
257            break;
258        }
259
260        // Check for closing marker
261        if is_closing_marker(line, &block_type) {
262            log::trace!("Found HTML block closing at line {}", current_pos + 1);
263            found_closing = true;
264
265            // Emit content
266            if !content_lines.is_empty() {
267                builder.start_node(SyntaxKind::HTML_BLOCK_CONTENT.into());
268                for content_line in &content_lines {
269                    // Split off trailing newline if present
270                    let (line_without_newline, newline_str) = strip_newline(content_line);
271
272                    if !line_without_newline.is_empty() {
273                        builder.token(SyntaxKind::TEXT.into(), line_without_newline);
274                    }
275
276                    if !newline_str.is_empty() {
277                        builder.token(SyntaxKind::NEWLINE.into(), newline_str);
278                    }
279                }
280                builder.finish_node(); // HtmlBlockContent
281            }
282
283            // Emit closing line
284            builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
285
286            // Split off trailing newline if present
287            let (line_without_newline, newline_str) = strip_newline(line);
288
289            if !line_without_newline.is_empty() {
290                builder.token(SyntaxKind::TEXT.into(), line_without_newline);
291            }
292
293            if !newline_str.is_empty() {
294                builder.token(SyntaxKind::NEWLINE.into(), newline_str);
295            }
296
297            builder.finish_node(); // HtmlBlockTag
298
299            current_pos += 1;
300            break;
301        }
302
303        // Regular content line
304        content_lines.push(line);
305        current_pos += 1;
306    }
307
308    // If we didn't find a closing marker, emit what we collected
309    if !found_closing {
310        log::trace!("HTML block at line {} has no closing marker", start_pos + 1);
311        if !content_lines.is_empty() {
312            builder.start_node(SyntaxKind::HTML_BLOCK_CONTENT.into());
313            for content_line in &content_lines {
314                // Split off trailing newline if present
315                let (line_without_newline, newline_str) = strip_newline(content_line);
316
317                if !line_without_newline.is_empty() {
318                    builder.token(SyntaxKind::TEXT.into(), line_without_newline);
319                }
320
321                if !newline_str.is_empty() {
322                    builder.token(SyntaxKind::NEWLINE.into(), newline_str);
323                }
324            }
325            builder.finish_node(); // HtmlBlockContent
326        }
327    }
328
329    builder.finish_node(); // HtmlBlock
330    current_pos
331}
332
#[cfg(test)]
mod tests {
    use super::*;

    /// Expected result for a recognised block tag.
    fn block_tag(name: &str, is_verbatim: bool) -> Option<HtmlBlockType> {
        Some(HtmlBlockType::BlockTag {
            tag_name: name.to_string(),
            is_verbatim,
        })
    }

    /// Detects the block type from the first line of `input`, parses the
    /// block from line 0 at blockquote depth 0 and returns the end position.
    fn end_position_after_parsing(input: &str) -> usize {
        let lines: Vec<&str> = input.lines().collect();
        let mut builder = GreenNodeBuilder::new();

        let block_type = try_parse_html_block_start(lines[0]).unwrap();
        parse_html_block(&mut builder, &lines, 0, block_type, 0)
    }

    #[test]
    fn test_try_parse_html_comment() {
        // With and without leading indentation.
        for line in &["<!-- comment -->", "  <!-- comment -->"] {
            assert_eq!(
                try_parse_html_block_start(line),
                Some(HtmlBlockType::Comment)
            );
        }
    }

    #[test]
    fn test_try_parse_div_tag() {
        // Bare tag and tag with attributes.
        for line in &["<div>", "<div class=\"test\">"] {
            assert_eq!(try_parse_html_block_start(line), block_tag("div", false));
        }
    }

    #[test]
    fn test_try_parse_script_tag() {
        assert_eq!(
            try_parse_html_block_start("<script>"),
            block_tag("script", true)
        );
    }

    #[test]
    fn test_try_parse_processing_instruction() {
        let detected = try_parse_html_block_start("<?xml version=\"1.0\"?>");
        assert_eq!(detected, Some(HtmlBlockType::ProcessingInstruction));
    }

    #[test]
    fn test_try_parse_declaration() {
        let detected = try_parse_html_block_start("<!DOCTYPE html>");
        assert_eq!(detected, Some(HtmlBlockType::Declaration));
    }

    #[test]
    fn test_try_parse_cdata() {
        let detected = try_parse_html_block_start("<![CDATA[content]]>");
        assert_eq!(detected, Some(HtmlBlockType::CData));
    }

    #[test]
    fn test_extract_opening_tag_name() {
        let cases: &[(&str, Option<&str>)] = &[
            ("<div>", Some("div")),
            ("<div class=\"test\">", Some("div")),
            ("<div/>", Some("div")),
            ("</div>", None),
            ("<>", None),
            ("< div>", None),
        ];

        for (input, expected) in cases {
            assert_eq!(extract_opening_tag_name(input).as_deref(), *expected);
        }
    }

    #[test]
    fn test_is_closing_marker_comment() {
        let comment = HtmlBlockType::Comment;

        for closing in &["-->", "end -->"] {
            assert!(is_closing_marker(closing, &comment));
        }
        assert!(!is_closing_marker("<!--", &comment));
    }

    #[test]
    fn test_is_closing_marker_tag() {
        let div = HtmlBlockType::BlockTag {
            tag_name: "div".to_string(),
            is_verbatim: false,
        };

        // `</DIV>` checks that the match is case insensitive.
        for closing in &["</div>", "</DIV>", "content</div>"] {
            assert!(is_closing_marker(closing, &div));
        }
        assert!(!is_closing_marker("<div>", &div));
    }

    #[test]
    fn test_parse_html_comment_block() {
        assert_eq!(end_position_after_parsing("<!-- comment -->\n"), 1);
    }

    #[test]
    fn test_parse_div_block() {
        assert_eq!(end_position_after_parsing("<div>\ncontent\n</div>\n"), 3);
    }

    #[test]
    fn test_parse_html_block_no_closing() {
        // Should consume all lines even without closing tag
        assert_eq!(end_position_after_parsing("<div>\ncontent\n"), 2);
    }
}