1#![allow(clippy::type_complexity)]
2
3use std::{
4 borrow::Cow,
5 collections::BTreeMap,
6 marker::PhantomData,
7};
8
9use nom::{
10 branch::alt,
11 bytes::complete::{
12 is_not,
13 tag,
14 tag_no_case,
15 take_until,
16 },
17 character::complete::{
18 alphanumeric1,
19 char,
20 multispace0,
21 },
22 combinator::map,
23 multi::many0,
24 sequence::{
25 delimited,
26 pair,
27 preceded,
28 separated_pair,
29 terminated,
30 tuple,
31 },
32 IResult,
33 Parser,
34};
35use regex::Regex;
36
37use crate::parser::html::{
38 entities::{
39 CODEPOINTS,
40 ENTITIES,
41 },
42 HTMLNode,
43};
44
/// HTML parser type whose [`crate::parser::Parser`] implementation only
/// succeeds when the whole input is consumed by the grammar in this module.
///
/// The struct stores no data; the lifetime `'a` ties the borrowed input text
/// to the nodes produced from it.
#[derive(Clone, Debug)]
pub struct StrictHTMLParser<'a> {
    // Zero-sized marker so the struct can carry `'a` without holding a reference.
    _marker: PhantomData<&'a ()>,
}
52
53impl<'a> crate::parser::Parser for StrictHTMLParser<'a> {
54 type Input = &'a str;
55 type Node = HTMLNode<Cow<'a, str>>;
56 type Error = nom::Err<nom::error::Error<&'a str>>;
57
58 fn parse(text: &'a str) -> Result<Vec<Self::Node>, Self::Error> {
59 nom::combinator::all_consuming(parse_escaped)(text).map(|r| r.1)
60 }
61}
62
63fn attr<'a, E>(i: &'a str) -> IResult<&'a str, &'a str, E>
64where
65 E: nom::error::ParseError<&'a str>,
66{
67 is_not(r#" "'>/="#)(i)
68}
69
70fn ws<'a, F, O, E: nom::error::ParseError<&'a str>>(
71 inner: F,
72) -> impl FnMut(&'a str) -> IResult<&'a str, O, E>
73where
74 F: Fn(&'a str) -> IResult<&'a str, O, E>,
75{
76 delimited(multispace0, inner, multispace0)
77}
78
79fn take_to<'a, E: nom::error::ParseError<&'a str>>(
80 i: &'a str,
81) -> impl FnMut(&'a str) -> IResult<&'a str, &'a str, E> {
82 terminated(take_until(i), tag(i))
83}
84
85fn comment(i: &str) -> IResult<&str, HTMLNode<&str>> {
86 map(preceded(tag("<!--"), take_to("-->")), HTMLNode::Comment)(i)
87}
88
89fn doctype(i: &str) -> IResult<&str, HTMLNode<&str>> {
90 map(
91 preceded(tag_no_case("<!doctype "), take_to(">")),
92 HTMLNode::Doctype,
93 )(i)
94}
95
96fn start_tag<'a, F, E>(
97 inner: F,
98) -> impl FnMut(&'a str) -> IResult<&'a str, (&'a str, Vec<(&'a str, &'a str)>, bool), E>
99where
100 F: Parser<&'a str, &'a str, E>,
101 E: nom::error::ParseError<&'a str>,
102{
103 preceded(
104 tag("<"),
105 tuple((
106 inner,
107 many0(preceded(
108 multispace0,
109 alt((
110 separated_pair(attr, ws(char('=')), is_not(r#"\t\n\f\r "'=<>`"#)),
112 separated_pair(
114 attr,
115 ws(char('=')),
116 alt((
117 delimited(char('\''), take_until("'"), char('\'')),
118 delimited(char('"'), take_until("\""), char('"')),
119 )),
120 ),
121 pair(attr, |i| Ok((i, ""))),
123 )),
124 )),
125 preceded(
126 multispace0,
127 alt((map(tag("/>"), |_| true), map(tag(">"), |_| false))),
128 ),
129 )),
130 )
131}
132
133fn void(i: &str) -> IResult<&str, HTMLNode<&str>> {
134 map(
135 start_tag(alt((
136 tag_no_case("area"),
137 tag_no_case("base"),
138 tag_no_case("br"),
139 tag_no_case("col"),
140 tag_no_case("embed"),
141 tag_no_case("hr"),
142 tag_no_case("img"),
143 tag_no_case("input"),
144 tag_no_case("link"),
145 tag_no_case("meta"),
146 tag_no_case("source"),
147 tag_no_case("track"),
148 tag_no_case("wbr"),
149 ))),
150 |(name, attrs, _)| HTMLNode::Void {
151 name,
152 attrs: attrs.into_iter().collect(),
153 },
154 )(i)
155}
156
157fn raw_element(i: &str) -> IResult<&str, HTMLNode<&str>> {
158 let start = start_tag(alt((tag_no_case("script"), tag_no_case("style"))))(i)?;
159
160 let (left, (name, attrs, closed)) = start;
161
162 if closed {
163 return Ok((left, HTMLNode::RawElement {
164 name,
165 attrs: attrs.into_iter().collect(),
166 content: "",
167 }));
168 }
169
170 let (left, content) = terminated(
171 take_until(&*format!("</{name}")),
172 delimited(
173 tag("</"),
174 tag_no_case(name),
175 preceded(multispace0, char('>')),
176 ),
177 )(left)?;
178
179 Ok((left, HTMLNode::RawElement {
180 name,
181 attrs: attrs.into_iter().collect(),
182 content: content.trim(),
183 }))
184}
185
186fn element(i: &str) -> IResult<&str, HTMLNode<&str>> {
187 let start = start_tag(alphanumeric1)(i)?;
188
189 let (left, (name, attrs, closed)) = start;
190
191 if closed {
192 return Ok((left, HTMLNode::Element {
193 name,
194 attrs: attrs.into_iter().collect(),
195 children: vec![],
196 }));
197 }
198
199 let (left, children) = terminated(
200 parse,
201 delimited(
202 tag("</"),
203 tag_no_case(name),
204 preceded(multispace0, char('>')),
205 ),
206 )(left)?;
207
208 Ok((left, HTMLNode::Element {
209 name,
210 attrs: attrs.into_iter().collect(),
211 children,
212 }))
213}
214
215fn text(i: &str) -> IResult<&str, HTMLNode<&str>> {
216 map(is_not("<"), HTMLNode::Text)(i)
217}
218
219fn single(i: &str) -> IResult<&str, HTMLNode<&str>> {
220 alt((comment, doctype, void, raw_element, element, text))(i)
221}
222
223fn parse(i: &str) -> IResult<&str, Vec<HTMLNode<&str>>> {
224 many0(single)(i)
225}
226
227lazy_static::lazy_static! {
228 static ref ESCAPE: Regex = Regex::new(r"&(([a-zA-Z]+;?)|(#[0-9]+;)|(#[xX][a-fA-F0-9]+;))").unwrap();
229}
230
231fn escape_ref(text: &str) -> Option<&str> {
232 if let Some(text) = ENTITIES.get(text) {
233 Some(text)
234 } else {
235 let val = text.trim_start_matches("&#").trim_end_matches(';');
236
237 let codepoint = if let Some(hex) = val.strip_prefix(['x', 'X']) {
238 u32::from_str_radix(hex, 16)
239 } else {
240 val.parse::<u32>()
241 }
242 .ok()?;
243
244 CODEPOINTS.get(&codepoint).copied()
245 }
246}
247
248fn escape_text(text: &str) -> Cow<str> {
249 let mut new = String::with_capacity(text.len());
250 let mut last = 0;
251 for m in ESCAPE.find_iter(text) {
252 new.push_str(&text[last..m.start()]);
253 if let Some(escape) = escape_ref(m.as_str()) {
254 new.push_str(escape);
255 } else {
256 new.push_str(&text[m.start()..m.end()]);
257 }
258 last = m.end();
259 }
260 new.push_str(&text[last..]);
261 new.into()
262}
263
264fn escape_attrs<'a>(attrs: BTreeMap<&'a str, &'a str>) -> BTreeMap<Cow<'a, str>, Cow<'a, str>> {
265 attrs
266 .into_iter()
267 .map(|(k, v)| (k.into(), escape_text(v)))
268 .collect()
269}
270
271fn escape_node(node: HTMLNode<&str>) -> HTMLNode<Cow<str>> {
272 #[allow(clippy::enum_glob_use)]
273 use HTMLNode::*;
274
275 match node {
276 Comment(t) => Comment(t.into()),
277 Doctype(t) => Doctype(t.into()),
278 Element {
279 name,
280 attrs,
281 children,
282 } => Element {
283 name: name.into(),
284 attrs: escape_attrs(attrs),
285 children: children.into_iter().map(escape_node).collect(),
286 },
287 RawElement {
288 name,
289 attrs,
290 content,
291 } => RawElement {
292 name: name.into(),
293 attrs: escape_attrs(attrs),
294 content: content.into(),
295 },
296 Void { name, attrs } => Void {
297 name: name.into(),
298 attrs: escape_attrs(attrs),
299 },
300 Text(t) => Text(escape_text(t)),
301 }
302}
303
304fn parse_escaped(i: &str) -> IResult<&str, Vec<HTMLNode<Cow<str>>>> {
305 let (left, nodes) = parse(i)?;
306
307 Ok((left, nodes.into_iter().map(escape_node).collect()))
308}
309
// Unit tests for the individual parsers and for whole-document parsing.
#[allow(clippy::too_many_lines)]
#[cfg(test)]
mod test {
    use std::collections::BTreeMap;

    use super::*;

    // Comments keep their inner text verbatim, including `>` and `<!`.
    #[test]
    fn test_comment() {
        assert_eq!(
            comment("<!-- Hello, world! -->"),
            Ok(("", HTMLNode::Comment(" Hello, world! ")))
        );
        assert_eq!(
            comment("<!--My favorite operators are > and <!-->"),
            Ok(("", HTMLNode::Comment("My favorite operators are > and <!")))
        );
    }

    // The doctype keyword is case-insensitive; the rest is returned as written.
    #[test]
    fn test_doctype() {
        assert_eq!(
            doctype("<!DOCTYPE html>"),
            Ok(("", HTMLNode::Doctype("html")))
        );
        assert_eq!(
            doctype("<!doctype html>"),
            Ok(("", HTMLNode::Doctype("html")))
        );
        assert_eq!(
            doctype(r#"<!DOCTYPE html SYSTEM "about:legacy-compat">"#),
            Ok((
                "",
                HTMLNode::Doctype(r#"html SYSTEM "about:legacy-compat""#)
            ))
        );
    }

    // Void elements: closing styles, whitespace placement and every attribute
    // syntax (unquoted, double-quoted, single-quoted, valueless).
    #[test]
    fn test_void() {
        assert_eq!(
            void("<hr>"),
            Ok(("", HTMLNode::Void {
                name: "hr",
                attrs: BTreeMap::new()
            }))
        );
        // The name is reported with the casing used in the input.
        assert_eq!(
            void("<HR>"),
            Ok(("", HTMLNode::Void {
                name: "HR",
                attrs: BTreeMap::new()
            }))
        );
        assert_eq!(
            void("<hr/>"),
            Ok(("", HTMLNode::Void {
                name: "hr",
                attrs: BTreeMap::new()
            }))
        );
        assert_eq!(
            void("<hr >"),
            Ok(("", HTMLNode::Void {
                name: "hr",
                attrs: BTreeMap::new()
            }))
        );
        assert_eq!(
            void("<hr />"),
            Ok(("", HTMLNode::Void {
                name: "hr",
                attrs: BTreeMap::new()
            }))
        );

        // Unquoted attribute values.
        assert_eq!(
            void("<hr value=yes>"),
            Ok(("", HTMLNode::Void {
                name: "hr",
                attrs: [("value", "yes")].into()
            }))
        );
        assert_eq!(
            void("<hr value=yes >"),
            Ok(("", HTMLNode::Void {
                name: "hr",
                attrs: [("value", "yes")].into()
            }))
        );
        assert_eq!(
            void("<hr value = yes >"),
            Ok(("", HTMLNode::Void {
                name: "hr",
                attrs: [("value", "yes")].into()
            }))
        );

        // Double-quoted attribute values.
        assert_eq!(
            void(r#"<hr value="yes">"#),
            Ok(("", HTMLNode::Void {
                name: "hr",
                attrs: [("value", "yes")].into()
            }))
        );
        assert_eq!(
            void(r#"<hr value= "yes" >"#),
            Ok(("", HTMLNode::Void {
                name: "hr",
                attrs: [("value", "yes")].into()
            }))
        );
        assert_eq!(
            void(r#"<hr value ="yes">"#),
            Ok(("", HTMLNode::Void {
                name: "hr",
                attrs: [("value", "yes")].into()
            }))
        );

        // Single-quoted attribute values.
        assert_eq!(
            void("<hr value='yes'>"),
            Ok(("", HTMLNode::Void {
                name: "hr",
                attrs: [("value", "yes")].into()
            }))
        );
        assert_eq!(
            void("<hr value='yes' >"),
            Ok(("", HTMLNode::Void {
                name: "hr",
                attrs: [("value", "yes")].into()
            }))
        );
        assert_eq!(
            void("<hr value = 'yes' >"),
            Ok(("", HTMLNode::Void {
                name: "hr",
                attrs: [("value", "yes")].into()
            }))
        );

        // An attribute without a value maps to the empty string.
        assert_eq!(
            void("<hr disabled>"),
            Ok(("", HTMLNode::Void {
                name: "hr",
                attrs: [("disabled", "")].into()
            }))
        );

        // All attribute syntaxes mixed in one tag.
        assert_eq!(
            void(r#"<hr value="yes" next='good' final=ok boolean>"#),
            Ok(("", HTMLNode::Void {
                name: "hr",
                attrs: [
                    ("value", "yes"),
                    ("next", "good"),
                    ("final", "ok"),
                    ("boolean", "")
                ]
                .into()
            }))
        );
    }

    // Ordinary elements: self-closed, empty, whitespace-only and text children.
    #[test]
    fn test_element() {
        assert_eq!(
            element("<a/>"),
            Ok(("", HTMLNode::Element {
                name: "a",
                attrs: [].into(),
                children: [].into()
            }))
        );
        assert_eq!(
            element("<a></a>"),
            Ok(("", HTMLNode::Element {
                name: "a",
                attrs: [].into(),
                children: [].into()
            }))
        );
        assert_eq!(
            element("<a> </a>"),
            Ok(("", HTMLNode::Element {
                name: "a",
                attrs: [].into(),
                children: [HTMLNode::Text(" ")].into()
            }))
        );
        assert_eq!(
            element(r#"<a rel=""></a>"#),
            Ok(("", HTMLNode::Element {
                name: "a",
                attrs: [("rel", "")].into(),
                children: [].into()
            }))
        );
        assert_eq!(
            element(r#"<a href="https://example.com"></a>"#),
            Ok(("", HTMLNode::Element {
                name: "a",
                attrs: [("href", "https://example.com")].into(),
                children: [].into()
            }))
        );
        assert_eq!(
            element(r#"<a href="https://example.com">Example Link</a>"#),
            Ok(("", HTMLNode::Element {
                name: "a",
                attrs: [("href", "https://example.com")].into(),
                children: [HTMLNode::Text("Example Link")].into()
            }))
        );
    }

    // Whole-document parsing: node sequences, whitespace text nodes, nesting
    // and raw `script` content.
    #[test]
    fn test_parse() {
        assert_eq!(
            parse("<!-- Hello --><!doctype html><!-- second -->"),
            Ok(("", vec![
                HTMLNode::Comment(" Hello "),
                HTMLNode::Doctype("html"),
                HTMLNode::Comment(" second ")
            ]))
        );

        // Whitespace between nodes is preserved as text nodes.
        assert_eq!(
            parse("\t\t<!-- Hello -->\n\t<!doctype html>\n<!-- second -->"),
            Ok(("", vec![
                HTMLNode::Text("\t\t"),
                HTMLNode::Comment(" Hello "),
                HTMLNode::Text("\n\t"),
                HTMLNode::Doctype("html"),
                HTMLNode::Text("\n"),
                HTMLNode::Comment(" second ")
            ]))
        );

        assert_eq!(
            parse(
                r#"<!--Here's a link.-->
 <a href="https://example.com"/>
 With some text."#
            ),
            Ok(("", vec![
                HTMLNode::Comment("Here's a link."),
                HTMLNode::Text("\n "),
                HTMLNode::Element {
                    name: "a",
                    attrs: [("href", "https://example.com")].into(),
                    children: [].into()
                },
                HTMLNode::Text("\n With some text.")
            ])),
        );

        // Nested elements.
        assert_eq!(
            parse(
                r#"
 <div class="outer">
 <div class="inner">
 <p>Hello, world!</p>
 </div>
 </div>
 "#
            ),
            Ok(("", vec![
                HTMLNode::Text("\n "),
                HTMLNode::Element {
                    name: "div",
                    attrs: [("class", "outer")].into(),
                    children: vec![
                        HTMLNode::Text("\n "),
                        HTMLNode::Element {
                            name: "div",
                            attrs: [("class", "inner")].into(),
                            children: vec![
                                HTMLNode::Text("\n "),
                                HTMLNode::Element {
                                    name: "p",
                                    attrs: [].into(),
                                    children: vec![HTMLNode::Text("Hello, world!")],
                                },
                                HTMLNode::Text("\n "),
                            ],
                        },
                        HTMLNode::Text("\n "),
                    ],
                },
                HTMLNode::Text("\n "),
            ])),
        );

        // Script content containing `<` is kept raw and trimmed.
        assert_eq!(
            parse(
                r#"<script type="application/javascript">
if (1 < 2) {
 console.log("Hello, world!");
}
</script>
<div class="outer">
 <div class="inner">
 <p>Hello, world!</p>
 <p>Another element...</p>
 Just some text...
 </div>
 <div>
 <p>Fancy nesting</p>
 </div>
</div>
"#
            ),
            Ok(("", vec![
                HTMLNode::RawElement {
                    name: "script",
                    attrs: [("type", "application/javascript")].into(),
                    content: "if (1 < 2) {\n console.log(\"Hello, world!\");\n}",
                },
                HTMLNode::Text("\n"),
                HTMLNode::Element {
                    name: "div",
                    attrs: [("class", "outer")].into(),
                    children: vec![
                        HTMLNode::Text("\n "),
                        HTMLNode::Element {
                            name: "div",
                            attrs: [("class", "inner")].into(),
                            children: vec![
                                HTMLNode::Text("\n "),
                                HTMLNode::Element {
                                    name: "p",
                                    attrs: [].into(),
                                    children: vec![HTMLNode::Text("Hello, world!")],
                                },
                                HTMLNode::Text("\n "),
                                HTMLNode::Element {
                                    name: "p",
                                    attrs: [].into(),
                                    children: vec![HTMLNode::Text("Another element...")],
                                },
                                HTMLNode::Text("\n Just some text...\n ")
                            ],
                        },
                        HTMLNode::Text("\n "),
                        HTMLNode::Element {
                            name: "div",
                            attrs: [].into(),
                            children: vec![
                                HTMLNode::Text("\n "),
                                HTMLNode::Element {
                                    name: "p",
                                    attrs: [].into(),
                                    children: vec![HTMLNode::Text("Fancy nesting")],
                                },
                                HTMLNode::Text("\n "),
                            ]
                        },
                        HTMLNode::Text("\n"),
                    ],
                },
                HTMLNode::Text("\n"),
            ])),
        );
    }

    // `parse_escaped` on input whose only `&` is followed by a space, so the
    // text comes back unchanged.
    #[test]
    fn test_escaping() {
        assert_eq!(
            parse_escaped(r#"<a href="/index.html">Hello & Goodbye!</a>"#),
            Ok(("", vec![HTMLNode::Element {
                name: "a".into(),
                attrs: [("href".into(), "/index.html".into())].into(),
                children: [HTMLNode::Text("Hello & Goodbye!".into())].into(),
            }]))
        );
    }
}