soupy/parser/html/
strict.rs

1#![allow(clippy::type_complexity)]
2
3use std::{
4    borrow::Cow,
5    collections::BTreeMap,
6    marker::PhantomData,
7};
8
9use nom::{
10    branch::alt,
11    bytes::complete::{
12        is_not,
13        tag,
14        tag_no_case,
15        take_until,
16    },
17    character::complete::{
18        alphanumeric1,
19        char,
20        multispace0,
21    },
22    combinator::map,
23    multi::many0,
24    sequence::{
25        delimited,
26        pair,
27        preceded,
28        separated_pair,
29        terminated,
30        tuple,
31    },
32    IResult,
33    Parser,
34};
35use regex::Regex;
36
37use crate::parser::html::{
38    entities::{
39        CODEPOINTS,
40        ENTITIES,
41    },
42    HTMLNode,
43};
44
/// A simple HTML parser that is strict about its input.
///
/// Errors on malformed HTML.
///
/// The type carries no data: its only field is a `PhantomData` that ties the
/// lifetime `'a` to the type.
#[derive(Debug, Clone)]
pub struct StrictHTMLParser<'a> {
    _marker: PhantomData<&'a ()>,
}
52
53impl<'a> crate::parser::Parser for StrictHTMLParser<'a> {
54    type Input = &'a str;
55    type Node = HTMLNode<Cow<'a, str>>;
56    type Error = nom::Err<nom::error::Error<&'a str>>;
57
58    fn parse(text: &'a str) -> Result<Vec<Self::Node>, Self::Error> {
59        nom::combinator::all_consuming(parse_escaped)(text).map(|r| r.1)
60    }
61}
62
63fn attr<'a, E>(i: &'a str) -> IResult<&'a str, &'a str, E>
64where
65    E: nom::error::ParseError<&'a str>,
66{
67    is_not(r#" "'>/="#)(i)
68}
69
70fn ws<'a, F, O, E: nom::error::ParseError<&'a str>>(
71    inner: F,
72) -> impl FnMut(&'a str) -> IResult<&'a str, O, E>
73where
74    F: Fn(&'a str) -> IResult<&'a str, O, E>,
75{
76    delimited(multispace0, inner, multispace0)
77}
78
79fn take_to<'a, E: nom::error::ParseError<&'a str>>(
80    i: &'a str,
81) -> impl FnMut(&'a str) -> IResult<&'a str, &'a str, E> {
82    terminated(take_until(i), tag(i))
83}
84
85fn comment(i: &str) -> IResult<&str, HTMLNode<&str>> {
86    map(preceded(tag("<!--"), take_to("-->")), HTMLNode::Comment)(i)
87}
88
89fn doctype(i: &str) -> IResult<&str, HTMLNode<&str>> {
90    map(
91        preceded(tag_no_case("<!doctype "), take_to(">")),
92        HTMLNode::Doctype,
93    )(i)
94}
95
96fn start_tag<'a, F, E>(
97    inner: F,
98) -> impl FnMut(&'a str) -> IResult<&'a str, (&'a str, Vec<(&'a str, &'a str)>, bool), E>
99where
100    F: Parser<&'a str, &'a str, E>,
101    E: nom::error::ParseError<&'a str>,
102{
103    preceded(
104        tag("<"),
105        tuple((
106            inner,
107            many0(preceded(
108                multispace0,
109                alt((
110                    // unquoted
111                    separated_pair(attr, ws(char('=')), is_not(r#"\t\n\f\r "'=<>`"#)),
112                    // quoted
113                    separated_pair(
114                        attr,
115                        ws(char('=')),
116                        alt((
117                            delimited(char('\''), take_until("'"), char('\'')),
118                            delimited(char('"'), take_until("\""), char('"')),
119                        )),
120                    ),
121                    // boolean
122                    pair(attr, |i| Ok((i, ""))),
123                )),
124            )),
125            preceded(
126                multispace0,
127                alt((map(tag("/>"), |_| true), map(tag(">"), |_| false))),
128            ),
129        )),
130    )
131}
132
133fn void(i: &str) -> IResult<&str, HTMLNode<&str>> {
134    map(
135        start_tag(alt((
136            tag_no_case("area"),
137            tag_no_case("base"),
138            tag_no_case("br"),
139            tag_no_case("col"),
140            tag_no_case("embed"),
141            tag_no_case("hr"),
142            tag_no_case("img"),
143            tag_no_case("input"),
144            tag_no_case("link"),
145            tag_no_case("meta"),
146            tag_no_case("source"),
147            tag_no_case("track"),
148            tag_no_case("wbr"),
149        ))),
150        |(name, attrs, _)| HTMLNode::Void {
151            name,
152            attrs: attrs.into_iter().collect(),
153        },
154    )(i)
155}
156
157fn raw_element(i: &str) -> IResult<&str, HTMLNode<&str>> {
158    let start = start_tag(alt((tag_no_case("script"), tag_no_case("style"))))(i)?;
159
160    let (left, (name, attrs, closed)) = start;
161
162    if closed {
163        return Ok((left, HTMLNode::RawElement {
164            name,
165            attrs: attrs.into_iter().collect(),
166            content: "",
167        }));
168    }
169
170    let (left, content) = terminated(
171        take_until(&*format!("</{name}")),
172        delimited(
173            tag("</"),
174            tag_no_case(name),
175            preceded(multispace0, char('>')),
176        ),
177    )(left)?;
178
179    Ok((left, HTMLNode::RawElement {
180        name,
181        attrs: attrs.into_iter().collect(),
182        content: content.trim(),
183    }))
184}
185
186fn element(i: &str) -> IResult<&str, HTMLNode<&str>> {
187    let start = start_tag(alphanumeric1)(i)?;
188
189    let (left, (name, attrs, closed)) = start;
190
191    if closed {
192        return Ok((left, HTMLNode::Element {
193            name,
194            attrs: attrs.into_iter().collect(),
195            children: vec![],
196        }));
197    }
198
199    let (left, children) = terminated(
200        parse,
201        delimited(
202            tag("</"),
203            tag_no_case(name),
204            preceded(multispace0, char('>')),
205        ),
206    )(left)?;
207
208    Ok((left, HTMLNode::Element {
209        name,
210        attrs: attrs.into_iter().collect(),
211        children,
212    }))
213}
214
215fn text(i: &str) -> IResult<&str, HTMLNode<&str>> {
216    map(is_not("<"), HTMLNode::Text)(i)
217}
218
219fn single(i: &str) -> IResult<&str, HTMLNode<&str>> {
220    alt((comment, doctype, void, raw_element, element, text))(i)
221}
222
223fn parse(i: &str) -> IResult<&str, Vec<HTMLNode<&str>>> {
224    many0(single)(i)
225}
226
227lazy_static::lazy_static! {
228    static ref ESCAPE: Regex = Regex::new(r"&(([a-zA-Z]+;?)|(#[0-9]+;)|(#[xX][a-fA-F0-9]+;))").unwrap();
229}
230
231fn escape_ref(text: &str) -> Option<&str> {
232    if let Some(text) = ENTITIES.get(text) {
233        Some(text)
234    } else {
235        let val = text.trim_start_matches("&#").trim_end_matches(';');
236
237        let codepoint = if let Some(hex) = val.strip_prefix(['x', 'X']) {
238            u32::from_str_radix(hex, 16)
239        } else {
240            val.parse::<u32>()
241        }
242        .ok()?;
243
244        CODEPOINTS.get(&codepoint).copied()
245    }
246}
247
248fn escape_text(text: &str) -> Cow<str> {
249    let mut new = String::with_capacity(text.len());
250    let mut last = 0;
251    for m in ESCAPE.find_iter(text) {
252        new.push_str(&text[last..m.start()]);
253        if let Some(escape) = escape_ref(m.as_str()) {
254            new.push_str(escape);
255        } else {
256            new.push_str(&text[m.start()..m.end()]);
257        }
258        last = m.end();
259    }
260    new.push_str(&text[last..]);
261    new.into()
262}
263
264fn escape_attrs<'a>(attrs: BTreeMap<&'a str, &'a str>) -> BTreeMap<Cow<'a, str>, Cow<'a, str>> {
265    attrs
266        .into_iter()
267        .map(|(k, v)| (k.into(), escape_text(v)))
268        .collect()
269}
270
271fn escape_node(node: HTMLNode<&str>) -> HTMLNode<Cow<str>> {
272    #[allow(clippy::enum_glob_use)]
273    use HTMLNode::*;
274
275    match node {
276        Comment(t) => Comment(t.into()),
277        Doctype(t) => Doctype(t.into()),
278        Element {
279            name,
280            attrs,
281            children,
282        } => Element {
283            name: name.into(),
284            attrs: escape_attrs(attrs),
285            children: children.into_iter().map(escape_node).collect(),
286        },
287        RawElement {
288            name,
289            attrs,
290            content,
291        } => RawElement {
292            name: name.into(),
293            attrs: escape_attrs(attrs),
294            content: content.into(),
295        },
296        Void { name, attrs } => Void {
297            name: name.into(),
298            attrs: escape_attrs(attrs),
299        },
300        Text(t) => Text(escape_text(t)),
301    }
302}
303
304fn parse_escaped(i: &str) -> IResult<&str, Vec<HTMLNode<Cow<str>>>> {
305    let (left, nodes) = parse(i)?;
306
307    Ok((left, nodes.into_iter().map(escape_node).collect()))
308}
309
// Unit tests for the individual node parsers and for the entity-decoding pass.
#[allow(clippy::too_many_lines)]
#[cfg(test)]
mod test {
    use std::collections::BTreeMap;

    use super::*;

    // Comments keep their body verbatim, including `>` and `<!` inside it.
    #[test]
    fn test_comment() {
        assert_eq!(
            comment("<!-- Hello, world! -->"),
            Ok(("", HTMLNode::Comment(" Hello, world! ")))
        );
        assert_eq!(
            comment("<!--My favorite operators are > and <!-->"),
            Ok(("", HTMLNode::Comment("My favorite operators are > and <!")))
        );
    }

    // The doctype keyword is case-insensitive; the rest of the declaration is kept as-is.
    #[test]
    fn test_doctype() {
        assert_eq!(
            doctype("<!DOCTYPE html>"),
            Ok(("", HTMLNode::Doctype("html")))
        );
        assert_eq!(
            doctype("<!doctype html>"),
            Ok(("", HTMLNode::Doctype("html")))
        );
        assert_eq!(
            doctype(r#"<!DOCTYPE html SYSTEM "about:legacy-compat">"#),
            Ok((
                "",
                HTMLNode::Doctype(r#"html SYSTEM "about:legacy-compat""#)
            ))
        );
    }

    // Void elements: tag spelling variants, then each attribute syntax in turn.
    #[test]
    fn test_void() {
        // Bare tag, with and without `/` and whitespace before the end of the tag.
        assert_eq!(
            void("<hr>"),
            Ok(("", HTMLNode::Void {
                name: "hr",
                attrs: BTreeMap::new()
            }))
        );
        assert_eq!(
            void("<HR>"),
            Ok(("", HTMLNode::Void {
                name: "HR",
                attrs: BTreeMap::new()
            }))
        ); // TODO: convert to lowercase
        assert_eq!(
            void("<hr/>"),
            Ok(("", HTMLNode::Void {
                name: "hr",
                attrs: BTreeMap::new()
            }))
        );
        assert_eq!(
            void("<hr >"),
            Ok(("", HTMLNode::Void {
                name: "hr",
                attrs: BTreeMap::new()
            }))
        );
        assert_eq!(
            void("<hr />"),
            Ok(("", HTMLNode::Void {
                name: "hr",
                attrs: BTreeMap::new()
            }))
        );

        // Unquoted attribute values, with optional whitespace around `=`.
        assert_eq!(
            void("<hr value=yes>"),
            Ok(("", HTMLNode::Void {
                name: "hr",
                attrs: [("value", "yes")].into()
            }))
        );
        assert_eq!(
            void("<hr value=yes >"),
            Ok(("", HTMLNode::Void {
                name: "hr",
                attrs: [("value", "yes")].into()
            }))
        );
        assert_eq!(
            void("<hr value  = yes >"),
            Ok(("", HTMLNode::Void {
                name: "hr",
                attrs: [("value", "yes")].into()
            }))
        );

        // Double-quoted attribute values.
        assert_eq!(
            void(r#"<hr value="yes">"#),
            Ok(("", HTMLNode::Void {
                name: "hr",
                attrs: [("value", "yes")].into()
            }))
        );
        assert_eq!(
            void(r#"<hr value= "yes" >"#),
            Ok(("", HTMLNode::Void {
                name: "hr",
                attrs: [("value", "yes")].into()
            }))
        );
        assert_eq!(
            void(r#"<hr value  ="yes">"#),
            Ok(("", HTMLNode::Void {
                name: "hr",
                attrs: [("value", "yes")].into()
            }))
        );

        // Single-quoted attribute values.
        assert_eq!(
            void("<hr value='yes'>"),
            Ok(("", HTMLNode::Void {
                name: "hr",
                attrs: [("value", "yes")].into()
            }))
        );
        assert_eq!(
            void("<hr value='yes' >"),
            Ok(("", HTMLNode::Void {
                name: "hr",
                attrs: [("value", "yes")].into()
            }))
        );
        assert_eq!(
            void("<hr value  = 'yes' >"),
            Ok(("", HTMLNode::Void {
                name: "hr",
                attrs: [("value", "yes")].into()
            }))
        );

        // Boolean attribute: a bare name maps to the empty string.
        assert_eq!(
            void("<hr disabled>"),
            Ok(("", HTMLNode::Void {
                name: "hr",
                attrs: [("disabled", "")].into()
            }))
        );

        // All attribute syntaxes mixed in a single tag.
        assert_eq!(
            void(r#"<hr value="yes" next='good' final=ok boolean>"#),
            Ok(("", HTMLNode::Void {
                name: "hr",
                attrs: [
                    ("value", "yes"),
                    ("next", "good"),
                    ("final", "ok"),
                    ("boolean", "")
                ]
                .into()
            }))
        );
    }

    // Regular elements: self-closed, empty, with text children and with attributes.
    #[test]
    fn test_element() {
        assert_eq!(
            element("<a/>"),
            Ok(("", HTMLNode::Element {
                name: "a",
                attrs: [].into(),
                children: [].into()
            }))
        );
        assert_eq!(
            element("<a></a>"),
            Ok(("", HTMLNode::Element {
                name: "a",
                attrs: [].into(),
                children: [].into()
            }))
        );
        assert_eq!(
            element("<a> </a>"),
            Ok(("", HTMLNode::Element {
                name: "a",
                attrs: [].into(),
                children: [HTMLNode::Text(" ")].into()
            }))
        );
        assert_eq!(
            element(r#"<a rel=""></a>"#),
            Ok(("", HTMLNode::Element {
                name: "a",
                attrs: [("rel", "")].into(),
                children: [].into()
            }))
        );
        assert_eq!(
            element(r#"<a href="https://example.com"></a>"#),
            Ok(("", HTMLNode::Element {
                name: "a",
                attrs: [("href", "https://example.com")].into(),
                children: [].into()
            }))
        );
        assert_eq!(
            element(r#"<a href="https://example.com">Example Link</a>"#),
            Ok(("", HTMLNode::Element {
                name: "a",
                attrs: [("href", "https://example.com")].into(),
                children: [HTMLNode::Text("Example Link")].into()
            }))
        );
    }

    // Whole-document parsing: node sequences, whitespace text nodes and nesting.
    #[test]
    fn test_parse() {
        assert_eq!(
            parse("<!-- Hello --><!doctype html><!-- second -->"),
            Ok(("", vec![
                HTMLNode::Comment(" Hello "),
                HTMLNode::Doctype("html"),
                HTMLNode::Comment(" second ")
            ]))
        );

        // Whitespace between nodes is preserved as text nodes.
        assert_eq!(
            parse("\t\t<!-- Hello -->\n\t<!doctype html>\n<!-- second -->"),
            Ok(("", vec![
                HTMLNode::Text("\t\t"),
                HTMLNode::Comment(" Hello "),
                HTMLNode::Text("\n\t"),
                HTMLNode::Doctype("html"),
                HTMLNode::Text("\n"),
                HTMLNode::Comment(" second ")
            ]))
        );

        assert_eq!(
            parse(
                r#"<!--Here's a link.-->
                <a href="https://example.com"/>
                With some text."#
            ),
            Ok(("", vec![
                HTMLNode::Comment("Here's a link."),
                HTMLNode::Text("\n                "),
                HTMLNode::Element {
                    name: "a",
                    attrs: [("href", "https://example.com")].into(),
                    children: [].into()
                },
                HTMLNode::Text("\n                With some text.")
            ])),
        );

        // Nested elements; the indentation inside the raw string shows up in the text nodes.
        assert_eq!(
            parse(
                r#"
                <div class="outer">
                    <div class="inner">
                        <p>Hello, world!</p>
                    </div>
                </div>
            "#
            ),
            Ok(("", vec![
                HTMLNode::Text("\n                "),
                HTMLNode::Element {
                    name: "div",
                    attrs: [("class", "outer")].into(),
                    children: vec![
                        HTMLNode::Text("\n                    "),
                        HTMLNode::Element {
                            name: "div",
                            attrs: [("class", "inner")].into(),
                            children: vec![
                                HTMLNode::Text("\n                        "),
                                HTMLNode::Element {
                                    name: "p",
                                    attrs: [].into(),
                                    children: vec![HTMLNode::Text("Hello, world!")],
                                },
                                HTMLNode::Text("\n                    "),
                            ],
                        },
                        HTMLNode::Text("\n                "),
                    ],
                },
                HTMLNode::Text("\n            "),
            ])),
        );

        // A raw element (script content containing `<`, trimmed) followed by nested markup.
        assert_eq!(
            parse(
                r#"<script type="application/javascript">
if (1 < 2) {
    console.log("Hello, world!");
}
</script>
<div class="outer">
    <div class="inner">
        <p>Hello, world!</p>
        <p>Another element...</p>
        Just some text...
    </div>
    <div>
        <p>Fancy nesting</p>
    </div>
</div>
"#
            ),
            Ok(("", vec![
                HTMLNode::RawElement {
                    name: "script",
                    attrs: [("type", "application/javascript")].into(),
                    content: "if (1 < 2) {\n    console.log(\"Hello, world!\");\n}",
                },
                HTMLNode::Text("\n"),
                HTMLNode::Element {
                    name: "div",
                    attrs: [("class", "outer")].into(),
                    children: vec![
                        HTMLNode::Text("\n    "),
                        HTMLNode::Element {
                            name: "div",
                            attrs: [("class", "inner")].into(),
                            children: vec![
                                HTMLNode::Text("\n        "),
                                HTMLNode::Element {
                                    name: "p",
                                    attrs: [].into(),
                                    children: vec![HTMLNode::Text("Hello, world!")],
                                },
                                HTMLNode::Text("\n        "),
                                HTMLNode::Element {
                                    name: "p",
                                    attrs: [].into(),
                                    children: vec![HTMLNode::Text("Another element...")],
                                },
                                HTMLNode::Text("\n        Just some text...\n    ")
                            ],
                        },
                        HTMLNode::Text("\n    "),
                        HTMLNode::Element {
                            name: "div",
                            attrs: [].into(),
                            children: vec![
                                HTMLNode::Text("\n        "),
                                HTMLNode::Element {
                                    name: "p",
                                    attrs: [].into(),
                                    children: vec![HTMLNode::Text("Fancy nesting")],
                                },
                                HTMLNode::Text("\n    "),
                            ]
                        },
                        HTMLNode::Text("\n"),
                    ],
                },
                HTMLNode::Text("\n"),
            ])),
        );
    }

    // Character references are decoded in attribute values and in text nodes.
    #[test]
    fn test_escaping() {
        assert_eq!(
            parse_escaped(r#"<a href="&#x2F;index.html">Hello &amp; Goodbye!</a>"#),
            Ok(("", vec![HTMLNode::Element {
                name: "a".into(),
                attrs: [("href".into(), "/index.html".into())].into(),
                children: [HTMLNode::Text("Hello & Goodbye!".into())].into(),
            }]))
        );
    }
}