panache_parser/parser/inlines/
inline_html.rs1use crate::syntax::SyntaxKind;
19use rowan::GreenNodeBuilder;
20
21pub fn try_parse_inline_html(text: &str) -> Option<usize> {
24 if !text.starts_with('<') {
25 return None;
26 }
27 parse_html_comment(text)
28 .or_else(|| parse_cdata(text))
29 .or_else(|| parse_declaration(text))
30 .or_else(|| parse_processing_instruction(text))
31 .or_else(|| parse_close_tag(text))
32 .or_else(|| parse_open_tag(text))
33}
34
35pub fn emit_inline_html(builder: &mut GreenNodeBuilder, raw: &str) {
37 builder.start_node(SyntaxKind::INLINE_HTML.into());
38 builder.token(SyntaxKind::INLINE_HTML_CONTENT.into(), raw);
39 builder.finish_node();
40}
41
/// Matches an HTML comment at the start of `text` and returns its byte length.
///
/// Accepted forms are the degenerate `<!-->` and `<!--->`, and otherwise
/// `<!--` followed by anything up to and including the first `-->`.
/// Returns `None` when `text` does not start with `<!--` or no terminator
/// is found.
fn parse_html_comment(text: &str) -> Option<usize> {
    const OPEN: &str = "<!--";
    const CLOSE: &str = "-->";

    let body = text.strip_prefix(OPEN)?;

    // The two degenerate short forms are checked before searching for `-->`.
    if body.starts_with('>') {
        return Some(OPEN.len() + 1);
    }
    if body.starts_with("->") {
        return Some(OPEN.len() + 2);
    }

    body.find(CLOSE)
        .map(|close_at| OPEN.len() + close_at + CLOSE.len())
}
57
/// Matches a processing instruction (`<?` … `?>`) at the start of `text`.
///
/// Returns the byte length up to and including the first `?>` that follows
/// the opening `<?`, or `None` if either delimiter is missing.
fn parse_processing_instruction(text: &str) -> Option<usize> {
    const OPEN: &str = "<?";
    const CLOSE: &str = "?>";

    let body = text.strip_prefix(OPEN)?;
    body.find(CLOSE)
        .map(|close_at| OPEN.len() + close_at + CLOSE.len())
}
66
/// Matches a CDATA section (`<![CDATA[` … `]]>`) at the start of `text`.
///
/// Returns the byte length up to and including the first `]]>` after the
/// prefix, or `None` if the prefix or the terminator is missing. The prefix
/// comparison is case-sensitive.
fn parse_cdata(text: &str) -> Option<usize> {
    const OPEN: &str = "<![CDATA[";
    const CLOSE: &str = "]]>";

    let body = text.strip_prefix(OPEN)?;
    let close_at = body.find(CLOSE)?;
    Some(OPEN.len() + close_at + CLOSE.len())
}
76
/// Matches a declaration at the start of `text`: `<!`, one ASCII letter,
/// then anything up to and including the first `>`.
///
/// Returns the byte length of the match, or `None` if the shape is not met
/// or no `>` follows.
fn parse_declaration(text: &str) -> Option<usize> {
    const OPEN: &str = "<!";

    let body = text.strip_prefix(OPEN)?.as_bytes();
    let (lead, tail) = body.split_first()?;
    if !lead.is_ascii_alphabetic() {
        return None;
    }

    // `tail` starts right after the leading letter, i.e. at byte offset 3 of `text`.
    let gt_offset = tail.iter().position(|&b| b == b'>')?;
    Some(OPEN.len() + 1 + gt_offset + 1)
}
94
95pub(crate) fn parse_close_tag(text: &str) -> Option<usize> {
96 let bytes = text.as_bytes();
97 if !text.starts_with("</") {
98 return None;
99 }
100 let mut i = 2;
101 if i >= bytes.len() || !bytes[i].is_ascii_alphabetic() {
102 return None;
103 }
104 i += 1;
105 while i < bytes.len() && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'-') {
106 i += 1;
107 }
108 i = skip_ws_with_optional_lf(bytes, i);
109 if bytes.get(i) == Some(&b'>') {
110 Some(i + 1)
111 } else {
112 None
113 }
114}
115
116pub(crate) fn parse_open_tag(text: &str) -> Option<usize> {
117 let bytes = text.as_bytes();
118 if !text.starts_with('<') {
119 return None;
120 }
121 let mut i = 1;
122 if i >= bytes.len() || !bytes[i].is_ascii_alphabetic() {
123 return None;
124 }
125 i += 1;
126 while i < bytes.len() && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'-') {
127 i += 1;
128 }
129 while let Some(after) = parse_attribute(bytes, i) {
130 i = after;
131 }
132 i = skip_ws_with_optional_lf(bytes, i);
133 if bytes.get(i) == Some(&b'/') {
134 i += 1;
135 }
136 if bytes.get(i) == Some(&b'>') {
137 Some(i + 1)
138 } else {
139 None
140 }
141}
142
143fn parse_attribute(bytes: &[u8], start: usize) -> Option<usize> {
144 let after_ws = skip_ws_required_with_optional_lf(bytes, start)?;
145 let mut i = after_ws;
146 let first = *bytes.get(i)?;
147 if !is_attr_name_start(first) {
148 return None;
149 }
150 i += 1;
151 while i < bytes.len() && is_attr_name_cont(bytes[i]) {
152 i += 1;
153 }
154 if let Some(after_value) = parse_attr_value_spec(bytes, i) {
155 i = after_value;
156 }
157 Some(i)
158}
159
160fn parse_attr_value_spec(bytes: &[u8], start: usize) -> Option<usize> {
161 let i_after_ws1 = skip_ws_with_optional_lf(bytes, start);
162 if bytes.get(i_after_ws1) != Some(&b'=') {
163 return None;
164 }
165 let mut i = i_after_ws1 + 1;
166 i = skip_ws_with_optional_lf(bytes, i);
167 parse_attr_value(bytes, i)
168}
169
/// Matches an attribute value beginning at `start`.
///
/// * Quoted (`"…"` or `'…'`): runs to the next occurrence of the same quote
///   byte; `None` if it is never closed.
/// * Unquoted: a non-empty run of bytes containing none of space, tab, LF,
///   CR, `"`, `'`, `=`, `<`, `>` or a backtick.
///
/// Returns the index just past the value, or `None` if nothing matches
/// (including when `start` is at or beyond the end of `bytes`).
fn parse_attr_value(bytes: &[u8], start: usize) -> Option<usize> {
    let tail = bytes.get(start..)?;
    let (&lead, after_lead) = tail.split_first()?;

    if lead == b'"' || lead == b'\'' {
        // Quoted value: find the matching closing quote.
        let close_offset = after_lead.iter().position(|&b| b == lead)?;
        return Some(start + 1 + close_offset + 1);
    }

    // Unquoted value: count bytes until the first terminator.
    let run_len = tail
        .iter()
        .take_while(|&&b| {
            !matches!(
                b,
                b' ' | b'\t' | b'\n' | b'\r' | b'"' | b'\'' | b'=' | b'<' | b'>' | b'`'
            )
        })
        .count();

    if run_len == 0 {
        None
    } else {
        Some(start + run_len)
    }
}
199
/// Returns `true` if `b` may begin an attribute name: an ASCII letter, `_`
/// or `:`.
fn is_attr_name_start(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'_' | b':')
}
203
/// Returns `true` if `b` may continue an attribute name: an ASCII letter or
/// digit, `_`, `.`, `:` or `-`.
fn is_attr_name_cont(b: u8) -> bool {
    matches!(
        b,
        b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'.' | b':' | b'-'
    )
}
207
/// Skips spaces and tabs starting at `start`, together with at most one line
/// ending (`\n`, `\r`, or `\r\n`, the pair counting as a single ending).
///
/// Returns the index of the first byte that was not skipped. If nothing is
/// skippable (or `start` is past the end of `bytes`), `start` is returned
/// unchanged.
fn skip_ws_with_optional_lf(bytes: &[u8], start: usize) -> usize {
    let mut pos = start;
    let mut line_ending_used = false;

    loop {
        let advance = match bytes.get(pos).copied() {
            Some(b' ') | Some(b'\t') => 1,
            Some(b'\n') if !line_ending_used => {
                line_ending_used = true;
                1
            }
            Some(b'\r') if !line_ending_used => {
                line_ending_used = true;
                // Treat CRLF as one line ending.
                if bytes.get(pos + 1) == Some(&b'\n') {
                    2
                } else {
                    1
                }
            }
            // A second line ending, any other byte, or end of input stops the scan.
            _ => return pos,
        };
        pos += advance;
    }
}
238
239fn skip_ws_required_with_optional_lf(bytes: &[u8], start: usize) -> Option<usize> {
242 let after = skip_ws_with_optional_lf(bytes, start);
243 if after == start { None } else { Some(after) }
244}
245
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `try_parse_inline_html` matches exactly `expected_len`
    /// bytes at the start of `input`.
    fn matches(input: &str, expected_len: usize) {
        assert_eq!(
            try_parse_inline_html(input),
            Some(expected_len),
            "expected {input:?} to match {expected_len}",
        );
    }

    /// Asserts that `try_parse_inline_html` rejects `input`.
    fn no_match(input: &str) {
        assert_eq!(
            try_parse_inline_html(input),
            None,
            "expected no match for {input:?}"
        );
    }

    #[test]
    fn simple_open_tag() {
        matches("<a>", 3);
        matches("<bab>", 5);
        matches("<c2c>", 5);
    }

    #[test]
    fn empty_element() {
        matches("<a/>", 4);
        matches("<b2/>", 5);
        // One space before `/` is a 5-byte tag; two spaces make it 6 bytes.
        matches("<a />", 5);
        matches("<a  />", 6);
    }

    #[test]
    fn open_tag_with_attrs() {
        matches(r#"<a href="x">"#, r#"<a href="x">"#.len());
        matches(
            r#"<a foo="bar" baz='qux'>"#,
            r#"<a foo="bar" baz='qux'>"#.len(),
        );
        matches(r#"<a foo=bar>"#, r#"<a foo=bar>"#.len());
    }

    #[test]
    fn open_tag_attr_value_spans_lines() {
        matches("<a href=\"foo\nbar\">", "<a href=\"foo\nbar\">".len());
    }

    #[test]
    fn close_tag() {
        matches("</a>", 4);
        matches("</foo >", 7);
    }

    #[test]
    fn comment_forms() {
        matches("<!-->", 5);
        matches("<!--->", 6);
        matches("<!---->", 7);
        matches("<!-- hi -->", 11);
        matches("<!-- a\nb -->", 12);
    }

    #[test]
    fn processing_instruction() {
        matches("<?php $x; ?>", 12);
    }

    #[test]
    fn cdata() {
        matches("<![CDATA[a]]>", 13);
    }

    #[test]
    fn declaration() {
        matches("<!ELEMENT br EMPTY>", 19);
    }

    #[test]
    fn rejects_illegal() {
        no_match("<33>");
        no_match("<__>");
        no_match("<a h*#ref=\"hi\">");
        no_match(r#"<a href="hi'>"#);
        no_match("< a>");
        no_match("<bar/ >");
        no_match("<a href='bar'title=title>");
        no_match("<");
        no_match("<a");
        no_match("<!--");
        no_match("<![CDATA[abc");
    }

    #[test]
    fn rejects_unclosed_quoted_value() {
        no_match("<a href=\"foo");
    }

    #[test]
    fn ignores_non_lt_prefix() {
        no_match("foo");
        no_match("a<b>");
    }
}