panache_parser/parser/inlines/
inline_html.rs1use crate::options::Dialect;
19use crate::syntax::SyntaxKind;
20use rowan::GreenNodeBuilder;
21
22pub fn try_parse_inline_html(text: &str, dialect: Dialect) -> Option<usize> {
31 if !text.starts_with('<') {
32 return None;
33 }
34 let cdata_decl_allowed = dialect == Dialect::CommonMark;
35 parse_html_comment(text)
36 .or_else(|| {
37 if cdata_decl_allowed {
38 parse_cdata(text)
39 } else {
40 None
41 }
42 })
43 .or_else(|| {
44 if cdata_decl_allowed {
45 parse_declaration(text)
46 } else {
47 None
48 }
49 })
50 .or_else(|| parse_processing_instruction(text))
51 .or_else(|| parse_close_tag(text))
52 .or_else(|| parse_open_tag(text))
53}
54
55pub fn emit_inline_html(builder: &mut GreenNodeBuilder, raw: &str) {
57 builder.start_node(SyntaxKind::INLINE_HTML.into());
58 builder.token(SyntaxKind::INLINE_HTML_CONTENT.into(), raw);
59 builder.finish_node();
60}
61
/// Recognizes an HTML comment at the start of `text` and returns its byte
/// length.
///
/// Accepts the two degenerate forms `<!-->` and `<!--->`, and otherwise
/// `<!--` followed by everything up to and including the first `-->`.
/// Returns `None` when `text` does not start with `<!--` or the comment is
/// never closed.
fn parse_html_comment(text: &str) -> Option<usize> {
    let body = text.strip_prefix("<!--")?;

    // Degenerate comments close immediately after the opener.
    if body.starts_with('>') {
        return Some(5);
    }
    if body.starts_with("->") {
        return Some(6);
    }

    // Opener (4 bytes) + body up to the terminator + terminator (3 bytes).
    body.find("-->").map(|close_at| 4 + close_at + 3)
}
77
/// Recognizes a processing instruction (`<?` … `?>`) at the start of `text`
/// and returns its byte length, or `None` when the opener is missing or no
/// `?>` follows it.
fn parse_processing_instruction(text: &str) -> Option<usize> {
    const OPEN: &str = "<?";
    const CLOSE: &str = "?>";

    let rest = text.strip_prefix(OPEN)?;
    let close_at = rest.find(CLOSE)?;
    Some(OPEN.len() + close_at + CLOSE.len())
}
86
/// Recognizes a CDATA section (`<![CDATA[` … `]]>`) at the start of `text`
/// and returns its byte length, or `None` when the opener is missing or the
/// section is never terminated.
fn parse_cdata(text: &str) -> Option<usize> {
    const OPEN: &str = "<![CDATA[";
    const CLOSE: &str = "]]>";

    let payload = text.strip_prefix(OPEN)?;
    payload
        .find(CLOSE)
        .map(|close_at| OPEN.len() + close_at + CLOSE.len())
}
96
/// Recognizes a declaration at the start of `text`: `<!`, one ASCII letter,
/// then any bytes up to and including the first `>`.
///
/// Returns the byte length of the declaration, or `None` when the prefix
/// does not match or no `>` follows.
fn parse_declaration(text: &str) -> Option<usize> {
    match text.as_bytes() {
        [b'<', b'!', letter, tail @ ..] if letter.is_ascii_alphabetic() => tail
            .iter()
            .position(|&byte| byte == b'>')
            // Three bytes of prefix, the bytes before `>`, and `>` itself.
            .map(|offset| 3 + offset + 1),
        _ => None,
    }
}
114
115pub(crate) fn parse_close_tag(text: &str) -> Option<usize> {
116 let bytes = text.as_bytes();
117 if !text.starts_with("</") {
118 return None;
119 }
120 let mut i = 2;
121 if i >= bytes.len() || !bytes[i].is_ascii_alphabetic() {
122 return None;
123 }
124 i += 1;
125 while i < bytes.len() && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'-') {
126 i += 1;
127 }
128 i = skip_ws_with_optional_lf(bytes, i);
129 if bytes.get(i) == Some(&b'>') {
130 Some(i + 1)
131 } else {
132 None
133 }
134}
135
136pub(crate) fn parse_open_tag(text: &str) -> Option<usize> {
137 let bytes = text.as_bytes();
138 if !text.starts_with('<') {
139 return None;
140 }
141 let mut i = 1;
142 if i >= bytes.len() || !bytes[i].is_ascii_alphabetic() {
143 return None;
144 }
145 i += 1;
146 while i < bytes.len() && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'-') {
147 i += 1;
148 }
149 while let Some(after) = parse_attribute(bytes, i) {
150 i = after;
151 }
152 i = skip_ws_with_optional_lf(bytes, i);
153 if bytes.get(i) == Some(&b'/') {
154 i += 1;
155 }
156 if bytes.get(i) == Some(&b'>') {
157 Some(i + 1)
158 } else {
159 None
160 }
161}
162
163fn parse_attribute(bytes: &[u8], start: usize) -> Option<usize> {
164 let after_ws = skip_ws_required_with_optional_lf(bytes, start)?;
165 let mut i = after_ws;
166 let first = *bytes.get(i)?;
167 if !is_attr_name_start(first) {
168 return None;
169 }
170 i += 1;
171 while i < bytes.len() && is_attr_name_cont(bytes[i]) {
172 i += 1;
173 }
174 if let Some(after_value) = parse_attr_value_spec(bytes, i) {
175 i = after_value;
176 }
177 Some(i)
178}
179
180fn parse_attr_value_spec(bytes: &[u8], start: usize) -> Option<usize> {
181 let i_after_ws1 = skip_ws_with_optional_lf(bytes, start);
182 if bytes.get(i_after_ws1) != Some(&b'=') {
183 return None;
184 }
185 let mut i = i_after_ws1 + 1;
186 i = skip_ws_with_optional_lf(bytes, i);
187 parse_attr_value(bytes, i)
188}
189
/// Parses an attribute value beginning at `start` and returns the index
/// just past it.
///
/// * Quoted (`"…"` or `'…'`): runs to the next occurrence of the same
///   quote byte; `None` if that quote never appears.
/// * Unquoted: a non-empty run of bytes that are not space, tab, LF, CR,
///   `"`, `'`, `=`, `<`, `>` or a backtick; `None` if the run is empty.
///
/// Returns `None` when `start` is at or beyond the end of `bytes`.
fn parse_attr_value(bytes: &[u8], start: usize) -> Option<usize> {
    let tail = bytes.get(start..)?;
    let (&lead, after_lead) = tail.split_first()?;

    if lead == b'"' || lead == b'\'' {
        let close_at = after_lead.iter().position(|&byte| byte == lead)?;
        // Opening quote + content + closing quote.
        return Some(start + 1 + close_at + 1);
    }

    let unquoted_len = tail
        .iter()
        .take_while(|&&byte| {
            !matches!(
                byte,
                b' ' | b'\t' | b'\n' | b'\r' | b'"' | b'\'' | b'=' | b'<' | b'>' | b'`'
            )
        })
        .count();

    if unquoted_len == 0 {
        None
    } else {
        Some(start + unquoted_len)
    }
}
219
/// Returns `true` if `b` may begin an attribute name: an ASCII letter, `_`
/// or `:`.
fn is_attr_name_start(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'_' | b':')
}
223
/// Returns `true` if `b` may continue an attribute name: an ASCII letter or
/// digit, `_`, `.`, `:` or `-`.
fn is_attr_name_cont(b: u8) -> bool {
    matches!(
        b,
        b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'.' | b':' | b'-'
    )
}
227
/// Skips spaces and tabs starting at `start`, allowing at most one line
/// ending (`\n`, `\r` or `\r\n`) anywhere within the run.
///
/// Returns the index of the first byte that is not consumed; a second line
/// ending stops the scan. Returns `start` unchanged when nothing can be
/// skipped, including when `start` is at or beyond the end of `bytes`.
fn skip_ws_with_optional_lf(bytes: &[u8], start: usize) -> usize {
    // Index just past the run of spaces/tabs that begins at `from`.
    let past_blanks = |from: usize| -> usize {
        let run = bytes.get(from..).map_or(0, |rest| {
            rest.iter()
                .take_while(|&&byte| byte == b' ' || byte == b'\t')
                .count()
        });
        from + run
    };

    let mut pos = past_blanks(start);

    // At most one line ending is allowed; `\r\n` counts as a single one.
    match bytes.get(pos) {
        Some(b'\n') => pos += 1,
        Some(b'\r') => {
            pos += 1;
            if bytes.get(pos) == Some(&b'\n') {
                pos += 1;
            }
        }
        _ => return pos,
    }

    past_blanks(pos)
}
258
259fn skip_ws_required_with_optional_lf(bytes: &[u8], start: usize) -> Option<usize> {
262 let after = skip_ws_with_optional_lf(bytes, start);
263 if after == start { None } else { Some(after) }
264}
265
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `input` is recognized with byte length `expected_len`
    /// under both the CommonMark and the Pandoc dialect.
    fn matches(input: &str, expected_len: usize) {
        assert_eq!(
            try_parse_inline_html(input, Dialect::CommonMark),
            Some(expected_len),
            "expected {input:?} to match {expected_len} under CommonMark",
        );
        assert_eq!(
            try_parse_inline_html(input, Dialect::Pandoc),
            Some(expected_len),
            "expected {input:?} to match {expected_len} under Pandoc",
        );
    }

    /// Asserts that `input` is recognized with byte length `expected_len`
    /// under the CommonMark dialect only.
    fn matches_cm(input: &str, expected_len: usize) {
        assert_eq!(
            try_parse_inline_html(input, Dialect::CommonMark),
            Some(expected_len),
            "expected {input:?} to match {expected_len} under CommonMark",
        );
    }

    /// Asserts that `input` is rejected under both dialects.
    fn no_match(input: &str) {
        assert_eq!(
            try_parse_inline_html(input, Dialect::CommonMark),
            None,
            "expected no match for {input:?} under CommonMark",
        );
        assert_eq!(
            try_parse_inline_html(input, Dialect::Pandoc),
            None,
            "expected no match for {input:?} under Pandoc",
        );
    }

    /// Asserts that `input` is rejected under the Pandoc dialect.
    fn no_match_pandoc(input: &str) {
        assert_eq!(
            try_parse_inline_html(input, Dialect::Pandoc),
            None,
            "expected no match for {input:?} under Pandoc dialect",
        );
    }

    #[test]
    fn simple_open_tag() {
        matches("<a>", 3);
        matches("<bab>", 5);
        matches("<c2c>", 5);
    }

    #[test]
    fn empty_element() {
        matches("<a/>", 4);
        matches("<b2/>", 5);
        // Two spaces before the slash, so the input really is 6 bytes long;
        // a single-space `<a />` is only 5 bytes and could never match 6.
        matches("<a  />", 6);
    }

    #[test]
    fn open_tag_with_attrs() {
        matches(r#"<a href="x">"#, r#"<a href="x">"#.len());
        matches(
            r#"<a foo="bar" baz='qux'>"#,
            r#"<a foo="bar" baz='qux'>"#.len(),
        );
        matches(r#"<a foo=bar>"#, r#"<a foo=bar>"#.len());
    }

    #[test]
    fn open_tag_attr_value_spans_lines() {
        matches("<a href=\"foo\nbar\">", "<a href=\"foo\nbar\">".len());
    }

    #[test]
    fn close_tag() {
        matches("</a>", 4);
        matches("</foo >", 7);
    }

    #[test]
    fn comment_forms() {
        matches("<!-->", 5);
        matches("<!--->", 6);
        matches("<!---->", 7);
        matches("<!-- hi -->", 11);
        matches("<!-- a\nb -->", 12);
    }

    #[test]
    fn processing_instruction() {
        matches("<?php $x; ?>", 12);
    }

    #[test]
    fn cdata() {
        matches_cm("<![CDATA[a]]>", 13);
        no_match_pandoc("<![CDATA[a]]>");
    }

    #[test]
    fn declaration() {
        matches_cm("<!ELEMENT br EMPTY>", 19);
        matches_cm("<!DOCTYPE html>", 15);
        no_match_pandoc("<!ELEMENT br EMPTY>");
        no_match_pandoc("<!DOCTYPE html>");
    }

    #[test]
    fn rejects_illegal() {
        no_match("<33>");
        no_match("<__>");
        no_match("<a h*#ref=\"hi\">");
        no_match(r#"<a href="hi'>"#);
        no_match("< a>");
        no_match("<bar/ >");
        no_match("<a href='bar'title=title>");
        no_match("<");
        no_match("<a");
        no_match("<!--");
        no_match("<![CDATA[abc");
    }

    #[test]
    fn rejects_unclosed_quoted_value() {
        no_match("<a href=\"foo");
    }

    #[test]
    fn ignores_non_lt_prefix() {
        no_match("foo");
        no_match("a<b>");
    }
}