1use crate::parser::inlines::inline_html::{parse_close_tag, parse_open_tag};
4use crate::syntax::SyntaxKind;
5use rowan::GreenNodeBuilder;
6
7use super::blockquotes::{count_blockquote_markers, strip_n_blockquote_markers};
8use crate::parser::utils::helpers::{strip_leading_spaces, strip_newline};
9
/// Lowercase HTML element names that start a named block-level HTML block.
///
/// `try_parse_html_block_start` lowercases the tag name found at the start of
/// a line and, when it appears in this list, reports
/// `HtmlBlockType::BlockTag`. Matching is exact, so every entry must stay
/// lowercase.
///
/// NOTE(review): the entries look like the CommonMark "type 6" HTML block tag
/// list (the tests in this file refer to `div` as type 6) — confirm which
/// spec revision this is meant to track before adding or removing names.
const BLOCK_TAGS: &[&str] = &[
    "address",
    "article",
    "aside",
    "base",
    "basefont",
    "blockquote",
    "body",
    "caption",
    "center",
    "col",
    "colgroup",
    "dd",
    "details",
    "dialog",
    "dir",
    "div",
    "dl",
    "dt",
    "fieldset",
    "figcaption",
    "figure",
    "footer",
    "form",
    "frame",
    "frameset",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "head",
    "header",
    "hr",
    "html",
    "iframe",
    "legend",
    "li",
    "link",
    "main",
    "menu",
    "menuitem",
    "nav",
    "noframes",
    "ol",
    "optgroup",
    "option",
    "p",
    "param",
    "section",
    "source",
    "summary",
    "table",
    "tbody",
    "td",
    "tfoot",
    "th",
    "thead",
    "title",
    "tr",
    "track",
    "ul",
];
76
/// Lowercase tag names whose *opening* tag starts a verbatim HTML block.
///
/// `try_parse_html_block_start` reports these with `is_verbatim: true` and
/// `closed_by_blank_line: false`, so the block runs until its matching closing
/// tag. A closing tag of one of these names does not start such a block, and
/// these names are also excluded from `HtmlBlockType::Type7` detection.
const VERBATIM_TAGS: &[&str] = &["script", "style", "pre", "textarea"];
79
/// The kind of HTML block recognised at the start of a line.
///
/// The kind decides how the block ends: most kinds end on the first line that
/// contains a specific closing marker, while some end at a blank line instead
/// (see `ends_at_blank_line` and `is_closing_marker`).
#[derive(Clone, Debug, Eq, PartialEq)]
pub(crate) enum HtmlBlockType {
    /// Line starts with `<!--`; the block ends on a line containing `-->`.
    Comment,
    /// Line starts with `<?`; the block ends on a line containing `?>`.
    ProcessingInstruction,
    /// Line starts with `<!` followed by an uppercase ASCII letter; the block
    /// ends on a line containing `>`.
    Declaration,
    /// Line starts with `<![CDATA[`; the block ends on a line containing `]]>`.
    CData,
    /// Line starts with a recognised named tag.
    BlockTag {
        /// The tag name, lowercased by the detector.
        tag_name: String,
        /// `true` when the tag is one of the verbatim tags (`script`, `style`,
        /// `pre`, `textarea`).
        is_verbatim: bool,
        /// `true` when the block ends at the first blank line rather than at
        /// the matching closing tag.
        closed_by_blank_line: bool,
    },
    /// A line consisting solely of one complete open or close tag that is not
    /// covered by the other kinds (only detected in CommonMark mode). Such a
    /// block ends at the first blank line.
    Type7,
}
105
106pub(crate) fn try_parse_html_block_start(
113 content: &str,
114 is_commonmark: bool,
115) -> Option<HtmlBlockType> {
116 let trimmed = strip_leading_spaces(content);
117
118 if !trimmed.starts_with('<') {
120 return None;
121 }
122
123 if trimmed.starts_with("<!--") {
125 return Some(HtmlBlockType::Comment);
126 }
127
128 if trimmed.starts_with("<?") {
130 return Some(HtmlBlockType::ProcessingInstruction);
131 }
132
133 if trimmed.starts_with("<![CDATA[") {
135 return Some(HtmlBlockType::CData);
136 }
137
138 if trimmed.starts_with("<!") && trimmed.len() > 2 {
140 let after_bang = &trimmed[2..];
141 if after_bang.chars().next()?.is_ascii_uppercase() {
142 return Some(HtmlBlockType::Declaration);
143 }
144 }
145
146 if let Some(tag_name) = extract_block_tag_name(trimmed, is_commonmark) {
148 let tag_lower = tag_name.to_lowercase();
149 let is_closing = trimmed.starts_with("</");
150
151 if BLOCK_TAGS.contains(&tag_lower.as_str()) {
153 let is_verbatim = VERBATIM_TAGS.contains(&tag_lower.as_str());
154 return Some(HtmlBlockType::BlockTag {
155 tag_name: tag_lower,
156 is_verbatim,
157 closed_by_blank_line: is_commonmark && !is_verbatim,
158 });
159 }
160
161 if !is_closing && VERBATIM_TAGS.contains(&tag_lower.as_str()) {
167 return Some(HtmlBlockType::BlockTag {
168 tag_name: tag_lower,
169 is_verbatim: true,
170 closed_by_blank_line: false,
171 });
172 }
173 }
174
175 if is_commonmark && let Some(end) = parse_open_tag(trimmed).or_else(|| parse_close_tag(trimmed))
178 {
179 let rest = &trimmed[end..];
180 let only_ws = rest
181 .bytes()
182 .all(|b| matches!(b, b' ' | b'\t' | b'\n' | b'\r'));
183 if only_ws {
184 let leading = trimmed.strip_prefix("</").unwrap_or_else(|| &trimmed[1..]);
190 let name_end = leading
191 .find(|c: char| !(c.is_ascii_alphanumeric() || c == '-'))
192 .unwrap_or(leading.len());
193 let name = leading[..name_end].to_ascii_lowercase();
194 if !VERBATIM_TAGS.contains(&name.as_str()) {
195 return Some(HtmlBlockType::Type7);
196 }
197 }
198 }
199
200 None
201}
202
/// Extracts the tag name from text that begins with `<name` or `</name`.
///
/// The name ends at the first whitespace character, `>` or `/` (or at the end
/// of `text`). It must start with an ASCII letter and consist only of ASCII
/// letters and digits. The name is returned with its original casing.
///
/// * `text` - input that is expected to start with `<`.
/// * `accept_closing` - when `false`, closing tags (`</name`) yield `None`.
///
/// Returns `None` when `text` does not start with `<`, when the name is empty
/// or malformed, or when a closing tag is found but not accepted.
fn extract_block_tag_name(text: &str, accept_closing: bool) -> Option<String> {
    let body = text.strip_prefix('<')?;

    // A leading `/` marks a closing tag, which is only allowed on request.
    let candidate = match body.strip_prefix('/') {
        Some(_) if !accept_closing => return None,
        Some(rest) => rest,
        None => body,
    };

    // `split` always yields at least one (possibly empty) piece.
    let name = candidate
        .split(|c: char| c.is_whitespace() || c == '>' || c == '/')
        .next()
        .unwrap_or("");

    // An empty name has no first character and is rejected by `?`.
    let first = name.chars().next()?;
    let well_formed =
        first.is_ascii_alphabetic() && name.chars().all(|c| c.is_ascii_alphanumeric());

    if well_formed {
        Some(name.to_owned())
    } else {
        None
    }
}
247
248fn ends_at_blank_line(block_type: &HtmlBlockType) -> bool {
252 matches!(
253 block_type,
254 HtmlBlockType::Type7
255 | HtmlBlockType::BlockTag {
256 closed_by_blank_line: true,
257 ..
258 }
259 )
260}
261
262fn is_closing_marker(line: &str, block_type: &HtmlBlockType) -> bool {
266 match block_type {
267 HtmlBlockType::Comment => line.contains("-->"),
268 HtmlBlockType::ProcessingInstruction => line.contains("?>"),
269 HtmlBlockType::Declaration => line.contains('>'),
270 HtmlBlockType::CData => line.contains("]]>"),
271 HtmlBlockType::BlockTag {
272 tag_name,
273 closed_by_blank_line: false,
274 ..
275 } => {
276 let closing_tag = format!("</{}>", tag_name);
278 line.to_lowercase().contains(&closing_tag)
279 }
280 HtmlBlockType::BlockTag {
281 closed_by_blank_line: true,
282 ..
283 }
284 | HtmlBlockType::Type7 => false,
285 }
286}
287
288pub(crate) fn parse_html_block(
291 builder: &mut GreenNodeBuilder<'static>,
292 lines: &[&str],
293 start_pos: usize,
294 block_type: HtmlBlockType,
295 bq_depth: usize,
296) -> usize {
297 builder.start_node(SyntaxKind::HTML_BLOCK.into());
299
300 let first_line = lines[start_pos];
301 let blank_terminated = ends_at_blank_line(&block_type);
302
303 let first_inner = if bq_depth > 0 {
307 strip_n_blockquote_markers(first_line, bq_depth)
308 } else {
309 first_line
310 };
311
312 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
314
315 let (line_without_newline, newline_str) = strip_newline(first_inner);
316 if !line_without_newline.is_empty() {
317 builder.token(SyntaxKind::TEXT.into(), line_without_newline);
318 }
319 if !newline_str.is_empty() {
320 builder.token(SyntaxKind::NEWLINE.into(), newline_str);
321 }
322
323 builder.finish_node(); if !blank_terminated && is_closing_marker(first_inner, &block_type) {
329 log::trace!(
330 "HTML block at line {} opens and closes on same line",
331 start_pos + 1
332 );
333 builder.finish_node(); return start_pos + 1;
335 }
336
337 let mut current_pos = start_pos + 1;
338 let mut content_lines: Vec<&str> = Vec::new();
339 let mut found_closing = false;
340
341 while current_pos < lines.len() {
343 let line = lines[current_pos];
344 let (line_bq_depth, inner) = count_blockquote_markers(line);
345
346 if line_bq_depth < bq_depth {
348 break;
349 }
350
351 if blank_terminated && inner.trim().is_empty() {
354 break;
355 }
356
357 if is_closing_marker(inner, &block_type) {
360 log::trace!("Found HTML block closing at line {}", current_pos + 1);
361 found_closing = true;
362
363 if !content_lines.is_empty() {
364 builder.start_node(SyntaxKind::HTML_BLOCK_CONTENT.into());
365 for content_line in &content_lines {
366 emit_html_block_line(builder, content_line, bq_depth);
367 }
368 builder.finish_node();
369 }
370
371 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
372 emit_html_block_line(builder, line, bq_depth);
373 builder.finish_node();
374
375 current_pos += 1;
376 break;
377 }
378
379 content_lines.push(line);
381 current_pos += 1;
382 }
383
384 if !found_closing {
386 log::trace!("HTML block at line {} has no closing marker", start_pos + 1);
387 if !content_lines.is_empty() {
388 builder.start_node(SyntaxKind::HTML_BLOCK_CONTENT.into());
389 for content_line in &content_lines {
390 emit_html_block_line(builder, content_line, bq_depth);
391 }
392 builder.finish_node();
393 }
394 }
395
396 builder.finish_node(); current_pos
398}
399
400fn emit_html_block_line(builder: &mut GreenNodeBuilder<'static>, line: &str, bq_depth: usize) {
404 let inner = if bq_depth > 0 {
405 let stripped = strip_n_blockquote_markers(line, bq_depth);
406 let prefix_len = line.len() - stripped.len();
407 if prefix_len > 0 {
408 for ch in line[..prefix_len].chars() {
409 if ch == '>' {
410 builder.token(SyntaxKind::BLOCK_QUOTE_MARKER.into(), ">");
411 } else {
412 let mut buf = [0u8; 4];
413 builder.token(SyntaxKind::WHITESPACE.into(), ch.encode_utf8(&mut buf));
414 }
415 }
416 }
417 stripped
418 } else {
419 line
420 };
421
422 let (line_without_newline, newline_str) = strip_newline(inner);
423 if !line_without_newline.is_empty() {
424 builder.token(SyntaxKind::TEXT.into(), line_without_newline);
425 }
426 if !newline_str.is_empty() {
427 builder.token(SyntaxKind::NEWLINE.into(), newline_str);
428 }
429}
430
// Unit tests for HTML block detection, closing-marker matching and block
// parsing. The parsing tests only check the returned line index; they do not
// inspect the tree produced by the builder.
#[cfg(test)]
mod tests {
    use super::*;

    // `<!--` starts a comment block, with or without leading spaces.
    #[test]
    fn test_try_parse_html_comment() {
        assert_eq!(
            try_parse_html_block_start("<!-- comment -->", false),
            Some(HtmlBlockType::Comment)
        );
        assert_eq!(
            try_parse_html_block_start(" <!-- comment -->", false),
            Some(HtmlBlockType::Comment)
        );
    }

    // Outside CommonMark mode a `<div>` block is closed by its closing tag,
    // whether or not the opening tag carries attributes.
    #[test]
    fn test_try_parse_div_tag() {
        assert_eq!(
            try_parse_html_block_start("<div>", false),
            Some(HtmlBlockType::BlockTag {
                tag_name: "div".to_string(),
                is_verbatim: false,
                closed_by_blank_line: false,
            })
        );
        assert_eq!(
            try_parse_html_block_start("<div class=\"test\">", false),
            Some(HtmlBlockType::BlockTag {
                tag_name: "div".to_string(),
                is_verbatim: false,
                closed_by_blank_line: false,
            })
        );
    }

    // `<script>` is a verbatim tag.
    #[test]
    fn test_try_parse_script_tag() {
        assert_eq!(
            try_parse_html_block_start("<script>", false),
            Some(HtmlBlockType::BlockTag {
                tag_name: "script".to_string(),
                is_verbatim: true,
                closed_by_blank_line: false,
            })
        );
    }

    // `<?` starts a processing-instruction block.
    #[test]
    fn test_try_parse_processing_instruction() {
        assert_eq!(
            try_parse_html_block_start("<?xml version=\"1.0\"?>", false),
            Some(HtmlBlockType::ProcessingInstruction)
        );
    }

    // `<!` followed by an uppercase letter starts a declaration block.
    #[test]
    fn test_try_parse_declaration() {
        assert_eq!(
            try_parse_html_block_start("<!DOCTYPE html>", false),
            Some(HtmlBlockType::Declaration)
        );
    }

    // `<![CDATA[` starts a CDATA block (checked before declarations).
    #[test]
    fn test_try_parse_cdata() {
        assert_eq!(
            try_parse_html_block_start("<![CDATA[content]]>", false),
            Some(HtmlBlockType::CData)
        );
    }

    // With `accept_closing == false` only opening tags yield a name; empty or
    // space-prefixed names are rejected.
    #[test]
    fn test_extract_block_tag_name_open_only() {
        assert_eq!(
            extract_block_tag_name("<div>", false),
            Some("div".to_string())
        );
        assert_eq!(
            extract_block_tag_name("<div class=\"test\">", false),
            Some("div".to_string())
        );
        assert_eq!(
            extract_block_tag_name("<div/>", false),
            Some("div".to_string())
        );
        assert_eq!(extract_block_tag_name("</div>", false), None);
        assert_eq!(extract_block_tag_name("<>", false), None);
        assert_eq!(extract_block_tag_name("< div>", false), None);
    }

    // With `accept_closing == true` closing tags yield their name, including
    // when whitespace precedes the `>`.
    #[test]
    fn test_extract_block_tag_name_with_closing() {
        assert_eq!(
            extract_block_tag_name("</div>", true),
            Some("div".to_string())
        );
        assert_eq!(
            extract_block_tag_name("</div >", true),
            Some("div".to_string())
        );
    }

    // In CommonMark mode a closing block tag starts a blank-line-terminated
    // block.
    #[test]
    fn test_commonmark_type6_closing_tag_start() {
        assert_eq!(
            try_parse_html_block_start("</div>", true),
            Some(HtmlBlockType::BlockTag {
                tag_name: "div".to_string(),
                is_verbatim: false,
                closed_by_blank_line: true,
            })
        );
    }

    // A lone complete opening tag that is not a block tag is `Type7`, but
    // only in CommonMark mode.
    #[test]
    fn test_commonmark_type7_open_tag() {
        assert_eq!(
            try_parse_html_block_start("<a href=\"foo\">", true),
            Some(HtmlBlockType::Type7)
        );
        assert_eq!(try_parse_html_block_start("<a href=\"foo\">", false), None);
    }

    // A lone complete closing tag is also `Type7` in CommonMark mode.
    #[test]
    fn test_commonmark_type7_close_tag() {
        assert_eq!(
            try_parse_html_block_start("</ins>", true),
            Some(HtmlBlockType::Type7)
        );
    }

    // Text after the tag on the same line prevents `Type7` detection.
    #[test]
    fn test_commonmark_type7_rejects_with_trailing_text() {
        assert_eq!(try_parse_html_block_start("<a> hi", true), None);
    }

    // A comment block closes on any line containing `-->`.
    #[test]
    fn test_is_closing_marker_comment() {
        let block_type = HtmlBlockType::Comment;
        assert!(is_closing_marker("-->", &block_type));
        assert!(is_closing_marker("end -->", &block_type));
        assert!(!is_closing_marker("<!--", &block_type));
    }

    // A tag block closes on its closing tag, in any letter case and anywhere
    // on the line; the opening tag is not a closing marker.
    #[test]
    fn test_is_closing_marker_tag() {
        let block_type = HtmlBlockType::BlockTag {
            tag_name: "div".to_string(),
            is_verbatim: false,
            closed_by_blank_line: false,
        };
        assert!(is_closing_marker("</div>", &block_type));
        assert!(is_closing_marker("</DIV>", &block_type));
        assert!(is_closing_marker("content</div>", &block_type));
        assert!(!is_closing_marker("<div>", &block_type));
    }

    // A comment that opens and closes on one line consumes exactly that line.
    // Note: `str::lines` drops the line terminators, so the parsed lines carry
    // no trailing newline.
    #[test]
    fn test_parse_html_comment_block() {
        let input = "<!-- comment -->\n";
        let lines: Vec<&str> = input.lines().collect();
        let mut builder = GreenNodeBuilder::new();

        let block_type = try_parse_html_block_start(lines[0], false).unwrap();
        let new_pos = parse_html_block(&mut builder, &lines, 0, block_type, 0);

        assert_eq!(new_pos, 1);
    }

    // Opening tag, one content line and closing tag are all consumed.
    #[test]
    fn test_parse_div_block() {
        let input = "<div>\ncontent\n</div>\n";
        let lines: Vec<&str> = input.lines().collect();
        let mut builder = GreenNodeBuilder::new();

        let block_type = try_parse_html_block_start(lines[0], false).unwrap();
        let new_pos = parse_html_block(&mut builder, &lines, 0, block_type, 0);

        assert_eq!(new_pos, 3);
    }

    // Without a closing tag the block runs to the end of the input.
    #[test]
    fn test_parse_html_block_no_closing() {
        let input = "<div>\ncontent\n";
        let lines: Vec<&str> = input.lines().collect();
        let mut builder = GreenNodeBuilder::new();

        let block_type = try_parse_html_block_start(lines[0], false).unwrap();
        let new_pos = parse_html_block(&mut builder, &lines, 0, block_type, 0);

        assert_eq!(new_pos, 2);
    }

    // In CommonMark mode the block stops at the blank line, which is not
    // consumed (the returned index points at it).
    #[test]
    fn test_commonmark_type6_blank_line_terminates() {
        let input = "<div>\nfoo\n\nbar\n";
        let lines: Vec<&str> = input.lines().collect();
        let mut builder = GreenNodeBuilder::new();

        let block_type = try_parse_html_block_start(lines[0], true).unwrap();
        let new_pos = parse_html_block(&mut builder, &lines, 0, block_type, 0);

        assert_eq!(new_pos, 2);
    }
}