panache_parser/parser/blocks/
html_blocks.rs1use crate::syntax::SyntaxKind;
4use rowan::GreenNodeBuilder;
5
6use super::blockquotes::count_blockquote_markers;
7use crate::parser::utils::helpers::{strip_leading_spaces, strip_newline};
8
/// Lowercase names of the HTML elements that open a (non-verbatim) HTML block.
///
/// Lookups are done against a lowercased tag name, so every entry here must
/// stay lowercase. Entries are grouped by initial letter purely for
/// readability; the order has no effect on matching.
#[rustfmt::skip]
const BLOCK_TAGS: &[&str] = &[
    "address", "article", "aside",
    "base", "basefont", "blockquote", "body",
    "caption", "center", "col", "colgroup",
    "dd", "details", "dialog", "dir", "div", "dl", "dt",
    "fieldset", "figcaption", "figure", "footer", "form", "frame", "frameset",
    "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hr", "html",
    "iframe",
    "legend", "li", "link",
    "main", "menu", "menuitem",
    "nav", "noframes",
    "ol", "optgroup", "option",
    "p", "param",
    "section", "source", "summary",
    "table", "tbody", "td", "tfoot", "th", "thead", "title", "tr", "track",
    "ul",
];
75
/// Lowercase names of the elements reported with `is_verbatim: true` when
/// they open an HTML block.
const VERBATIM_TAGS: &[&str] = &[
    "script",
    "style",
    "pre",
    "textarea",
];
78
/// The kind of HTML block recognised from its opening line.
///
/// The kind also determines which marker ends the block.
#[derive(Clone, Debug, Eq, PartialEq)]
pub(crate) enum HtmlBlockType {
    /// Opens with `<!--`; ends on a line containing `-->`.
    Comment,
    /// Opens with `<?`; ends on a line containing `?>`.
    ProcessingInstruction,
    /// Opens with `<!` followed by an uppercase ASCII letter; ends on a line
    /// containing `>`.
    Declaration,
    /// Opens with `<![CDATA[`; ends on a line containing `]]>`.
    CData,
    /// Opens with a recognised element tag; ends on a line containing the
    /// matching `</tag_name>`.
    BlockTag {
        /// Lowercased element name taken from the opening tag.
        tag_name: String,
        /// Whether the element is one of the verbatim tags.
        is_verbatim: bool,
    },
}
93
94pub(crate) fn try_parse_html_block_start(content: &str) -> Option<HtmlBlockType> {
97 let trimmed = strip_leading_spaces(content);
98
99 if !trimmed.starts_with('<') {
101 return None;
102 }
103
104 if trimmed.starts_with("<!--") {
106 return Some(HtmlBlockType::Comment);
107 }
108
109 if trimmed.starts_with("<?") {
111 return Some(HtmlBlockType::ProcessingInstruction);
112 }
113
114 if trimmed.starts_with("<![CDATA[") {
116 return Some(HtmlBlockType::CData);
117 }
118
119 if trimmed.starts_with("<!") && trimmed.len() > 2 {
121 let after_bang = &trimmed[2..];
122 if after_bang.chars().next()?.is_ascii_uppercase() {
123 return Some(HtmlBlockType::Declaration);
124 }
125 }
126
127 if let Some(tag_name) = extract_opening_tag_name(trimmed) {
129 let tag_lower = tag_name.to_lowercase();
130
131 if BLOCK_TAGS.contains(&tag_lower.as_str()) {
133 let is_verbatim = VERBATIM_TAGS.contains(&tag_lower.as_str());
134 return Some(HtmlBlockType::BlockTag {
135 tag_name: tag_lower,
136 is_verbatim,
137 });
138 }
139
140 if VERBATIM_TAGS.contains(&tag_lower.as_str()) {
142 return Some(HtmlBlockType::BlockTag {
143 tag_name: tag_lower,
144 is_verbatim: true,
145 });
146 }
147 }
148
149 None
150}
151
/// Extracts the element name from an opening tag at the start of `text`.
///
/// The name is everything between the leading `<` and the first whitespace,
/// `>` or `/` (or the end of `text`). It must start with an ASCII letter and
/// consist solely of ASCII letters and digits. The name is returned with its
/// original casing.
///
/// Returns `None` when `text` does not start with `<`, when the name is empty
/// (which covers closing tags such as `</div>` and `< div>`), or when the name
/// contains any other character.
fn extract_opening_tag_name(text: &str) -> Option<String> {
    let rest = text.strip_prefix('<')?;

    // `split` always yields at least one piece; it is empty when `rest`
    // begins with a delimiter, e.g. the `/` of a closing tag.
    let candidate = rest
        .split(|c: char| c.is_whitespace() || c == '>' || c == '/')
        .next()?;

    let mut letters = candidate.chars();
    let leading = letters.next()?;

    if leading.is_ascii_alphabetic() && letters.all(|c| c.is_ascii_alphanumeric()) {
        Some(candidate.to_string())
    } else {
        None
    }
}
188
189fn is_closing_marker(line: &str, block_type: &HtmlBlockType) -> bool {
191 match block_type {
192 HtmlBlockType::Comment => line.contains("-->"),
193 HtmlBlockType::ProcessingInstruction => line.contains("?>"),
194 HtmlBlockType::Declaration => line.contains('>'),
195 HtmlBlockType::CData => line.contains("]]>"),
196 HtmlBlockType::BlockTag { tag_name, .. } => {
197 let closing_tag = format!("</{}>", tag_name);
199 line.to_lowercase().contains(&closing_tag)
200 }
201 }
202}
203
204pub(crate) fn parse_html_block(
207 builder: &mut GreenNodeBuilder<'static>,
208 lines: &[&str],
209 start_pos: usize,
210 block_type: HtmlBlockType,
211 bq_depth: usize,
212) -> usize {
213 builder.start_node(SyntaxKind::HTML_BLOCK.into());
215
216 let first_line = lines[start_pos];
217
218 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
220
221 let (line_without_newline, newline_str) = strip_newline(first_line);
223
224 if !line_without_newline.is_empty() {
225 builder.token(SyntaxKind::TEXT.into(), line_without_newline);
226 }
227
228 if !newline_str.is_empty() {
229 builder.token(SyntaxKind::NEWLINE.into(), newline_str);
230 }
231
232 builder.finish_node(); let closes_on_first_line = is_closing_marker(first_line, &block_type);
236
237 if closes_on_first_line {
238 log::trace!(
239 "HTML block at line {} opens and closes on same line",
240 start_pos + 1
241 );
242 builder.finish_node(); return start_pos + 1;
244 }
245
246 let mut current_pos = start_pos + 1;
247 let mut content_lines: Vec<&str> = Vec::new();
248 let mut found_closing = false;
249
250 while current_pos < lines.len() {
252 let line = lines[current_pos];
253 let (line_bq_depth, _inner_content) = count_blockquote_markers(line);
254
255 if line_bq_depth < bq_depth {
257 break;
258 }
259
260 if is_closing_marker(line, &block_type) {
262 log::trace!("Found HTML block closing at line {}", current_pos + 1);
263 found_closing = true;
264
265 if !content_lines.is_empty() {
267 builder.start_node(SyntaxKind::HTML_BLOCK_CONTENT.into());
268 for content_line in &content_lines {
269 let (line_without_newline, newline_str) = strip_newline(content_line);
271
272 if !line_without_newline.is_empty() {
273 builder.token(SyntaxKind::TEXT.into(), line_without_newline);
274 }
275
276 if !newline_str.is_empty() {
277 builder.token(SyntaxKind::NEWLINE.into(), newline_str);
278 }
279 }
280 builder.finish_node(); }
282
283 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
285
286 let (line_without_newline, newline_str) = strip_newline(line);
288
289 if !line_without_newline.is_empty() {
290 builder.token(SyntaxKind::TEXT.into(), line_without_newline);
291 }
292
293 if !newline_str.is_empty() {
294 builder.token(SyntaxKind::NEWLINE.into(), newline_str);
295 }
296
297 builder.finish_node(); current_pos += 1;
300 break;
301 }
302
303 content_lines.push(line);
305 current_pos += 1;
306 }
307
308 if !found_closing {
310 log::trace!("HTML block at line {} has no closing marker", start_pos + 1);
311 if !content_lines.is_empty() {
312 builder.start_node(SyntaxKind::HTML_BLOCK_CONTENT.into());
313 for content_line in &content_lines {
314 let (line_without_newline, newline_str) = strip_newline(content_line);
316
317 if !line_without_newline.is_empty() {
318 builder.token(SyntaxKind::TEXT.into(), line_without_newline);
319 }
320
321 if !newline_str.is_empty() {
322 builder.token(SyntaxKind::NEWLINE.into(), newline_str);
323 }
324 }
325 builder.finish_node(); }
327 }
328
329 builder.finish_node(); current_pos
331}
332
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the `BlockTag` value expected for element `name`.
    fn block_tag(name: &str, is_verbatim: bool) -> HtmlBlockType {
        HtmlBlockType::BlockTag {
            tag_name: name.to_string(),
            is_verbatim,
        }
    }

    /// Splits `input` with `str::lines`, classifies the first line, and parses
    /// the block from line 0 at blockquote depth 0. Returns the position
    /// reported by `parse_html_block`.
    fn parse_from_first_line(input: &str) -> usize {
        let lines: Vec<&str> = input.lines().collect();
        let mut builder = GreenNodeBuilder::new();

        let block_type = try_parse_html_block_start(lines[0]).unwrap();
        parse_html_block(&mut builder, &lines, 0, block_type, 0)
    }

    #[test]
    fn test_try_parse_html_comment() {
        for opener in ["<!-- comment -->", " <!-- comment -->"] {
            assert_eq!(
                try_parse_html_block_start(opener),
                Some(HtmlBlockType::Comment)
            );
        }
    }

    #[test]
    fn test_try_parse_div_tag() {
        for opener in ["<div>", "<div class=\"test\">"] {
            assert_eq!(
                try_parse_html_block_start(opener),
                Some(block_tag("div", false))
            );
        }
    }

    #[test]
    fn test_try_parse_script_tag() {
        assert_eq!(
            try_parse_html_block_start("<script>"),
            Some(block_tag("script", true))
        );
    }

    #[test]
    fn test_try_parse_processing_instruction() {
        let parsed = try_parse_html_block_start("<?xml version=\"1.0\"?>");
        assert_eq!(parsed, Some(HtmlBlockType::ProcessingInstruction));
    }

    #[test]
    fn test_try_parse_declaration() {
        let parsed = try_parse_html_block_start("<!DOCTYPE html>");
        assert_eq!(parsed, Some(HtmlBlockType::Declaration));
    }

    #[test]
    fn test_try_parse_cdata() {
        let parsed = try_parse_html_block_start("<![CDATA[content]]>");
        assert_eq!(parsed, Some(HtmlBlockType::CData));
    }

    #[test]
    fn test_extract_opening_tag_name() {
        // Inputs that yield the tag name.
        for accepted in ["<div>", "<div class=\"test\">", "<div/>"] {
            assert_eq!(
                extract_opening_tag_name(accepted),
                Some("div".to_string())
            );
        }

        // Closing tag, empty name, and whitespace before the name.
        for rejected in ["</div>", "<>", "< div>"] {
            assert_eq!(extract_opening_tag_name(rejected), None);
        }
    }

    #[test]
    fn test_is_closing_marker_comment() {
        let comment = HtmlBlockType::Comment;

        assert!(is_closing_marker("-->", &comment));
        assert!(is_closing_marker("end -->", &comment));
        assert!(!is_closing_marker("<!--", &comment));
    }

    #[test]
    fn test_is_closing_marker_tag() {
        let div = block_tag("div", false);

        assert!(is_closing_marker("</div>", &div));
        // The closing tag is matched regardless of its case.
        assert!(is_closing_marker("</DIV>", &div));
        assert!(is_closing_marker("content</div>", &div));
        assert!(!is_closing_marker("<div>", &div));
    }

    #[test]
    fn test_parse_html_comment_block() {
        // Opens and closes on the same line: exactly one line is consumed.
        assert_eq!(parse_from_first_line("<!-- comment -->\n"), 1);
    }

    #[test]
    fn test_parse_div_block() {
        // Opener, one content line, closer.
        assert_eq!(parse_from_first_line("<div>\ncontent\n</div>\n"), 3);
    }

    #[test]
    fn test_parse_html_block_no_closing() {
        // Without a closer the block runs to the end of the input.
        assert_eq!(parse_from_first_line("<div>\ncontent\n"), 2);
    }
}