parse_hyperlinks/parser/
html.rs

1//! This module implements parsers for HTML hyperlinks.
2#![allow(dead_code)]
3#![allow(clippy::type_complexity)]
4
5use crate::parser::Link;
6use html_escape::decode_html_entities;
7use nom::branch::alt;
8use nom::bytes::complete::is_not;
9use nom::bytes::complete::tag;
10use nom::character::complete::alphanumeric1;
11use nom::error::Error;
12use nom::error::ErrorKind;
13use std::borrow::Cow;
14
15/// Wrapper around `html_text2dest()` that packs the result in
16/// `Link::Text2Dest`.
17pub fn html_text2dest_link(i: &str) -> nom::IResult<&str, Link> {
18    let (i, (te, de, ti)) = html_text2dest(i)?;
19    Ok((i, Link::Text2Dest(te, de, ti)))
20}
21
22/// Parse an HTML _inline hyperlink_.
23///
24/// It returns either `Ok((i, (link_text, link_destination, link_title)))` or some error.
25///
26/// The parser expects to start at the link start (`<`) to succeed.
27/// ```
28/// use parse_hyperlinks::parser::Link;
29/// use parse_hyperlinks::parser::html::html_text2dest;
30/// use std::borrow::Cow;
31///
32/// assert_eq!(
33///   html_text2dest(r#"<a href="destination" title="title">name</a>abc"#),
34///   Ok(("abc", (Cow::from("name"), Cow::from("destination"), Cow::from("title"))))
35/// );
36/// ```
37pub fn html_text2dest(i: &str) -> nom::IResult<&str, (Cow<str>, Cow<str>, Cow<str>)> {
38    let (i, ((link_destination, link_title), link_text)) = nom::sequence::terminated(
39        nom::sequence::pair(
40            tag_a_opening,
41            alt((
42                nom::bytes::complete::take_until("</a>"),
43                nom::bytes::complete::take_until("</A>"),
44            )),
45        ),
46        // HTML is case insensitive. XHTML, that is being XML is case sensitive.
47        // Here we deal with HTML.
48        alt((tag("</a>"), tag("</A>"))),
49    )(i)?;
50    let link_text = decode_html_entities(link_text);
51    Ok((i, (link_text, link_destination, link_title)))
52}
53
54/// Parses a `<a ...>` opening tag and returns
55/// either `Ok((i, (link_destination, link_title)))` or some error.
56pub(crate) fn tag_a_opening(i: &str) -> nom::IResult<&str, (Cow<str>, Cow<str>)> {
57    nom::sequence::delimited(
58        // HTML is case insensitive. XHTML, that is being XML is case sensitive.
59        // Here we deal with HTML.
60        alt((tag("<a "), tag("<A "))),
61        nom::combinator::map_parser(is_not(">"), parse_attributes),
62        tag(">"),
63    )(i)
64}
65
66/// Parses attributes and returns `Ok((name, value))`.
67/// Boolean attributes are ignored, but silently consumed.
68fn attribute(i: &str) -> nom::IResult<&str, (&str, Cow<str>)> {
69    alt((
70        nom::sequence::pair(
71            nom::combinator::verify(alphanumeric1, |s: &str| {
72                nom::character::is_alphabetic(s.as_bytes()[0])
73            }),
74            alt((
75                nom::combinator::value(Cow::from(""), tag(r#"="""#)),
76                nom::combinator::value(Cow::from(""), tag(r#"=''"#)),
77                nom::combinator::map(
78                    nom::sequence::delimited(tag("=\""), is_not("\""), tag("\"")),
79                    |s: &str| decode_html_entities(s),
80                ),
81                nom::combinator::map(
82                    nom::sequence::delimited(tag("='"), is_not("'"), tag("'")),
83                    |s: &str| decode_html_entities(s),
84                ),
85                nom::combinator::map(nom::sequence::preceded(tag("="), is_not(" ")), |s: &str| {
86                    decode_html_entities(s)
87                }),
88            )),
89        ),
90        // Consume boolean attributes.
91        nom::combinator::value(
92            ("", Cow::from("")),
93            nom::combinator::verify(alphanumeric1, |s: &str| {
94                nom::character::is_alphabetic(s.as_bytes()[0])
95            }),
96        ),
97    ))(i)
98}
99
100/// Parses a whitespace separated list of attributes and returns a vector of (name, value).
101pub fn attribute_list<'a>(i: &'a str) -> nom::IResult<&'a str, Vec<(&'a str, Cow<str>)>> {
102    let i = i.trim();
103    nom::multi::separated_list1(nom::character::complete::multispace1, attribute)(i)
104}
105
106/// Extracts the `href` and `title` attributes and returns
107/// `Ok((link_destination, link_title))`. `link_title` can be empty,
108/// `link_destination` not.
109fn parse_attributes(i: &str) -> nom::IResult<&str, (Cow<str>, Cow<str>)> {
110    let (i, attributes) = attribute_list(i)?;
111    let mut href = Cow::Borrowed("");
112    let mut title = Cow::Borrowed("");
113
114    for (name, value) in attributes {
115        if name == "href" {
116            // Make sure `href` is empty, it can appear only
117            // once.
118            if !(&*href).is_empty() {
119                return Err(nom::Err::Error(Error::new(name, ErrorKind::ManyMN)));
120            }
121            href = value;
122        } else if name == "title" {
123            // Make sure `title` is empty, it can appear only
124            // once.
125            if !(&*title).is_empty() {
126                return Err(nom::Err::Error(Error::new(name, ErrorKind::ManyMN)));
127            }
128            title = value;
129        }
130    }
131
132    // Assure that `href` is not empty.
133    if (&*href).is_empty() {
134        return Err(nom::Err::Error(Error::new(i, ErrorKind::Eof)));
135    };
136
137    Ok((i, (href, title)))
138}
139
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the recoverable parser error reported at `input` with `kind`.
    fn recoverable(input: &str, kind: ErrorKind) -> nom::Err<Error<&str>> {
        nom::Err::Error(Error::new(input, kind))
    }

    #[test]
    fn test_html_text2dest() {
        // The tag name is accepted in lower case and in upper case.
        let expected = (
            "abc",
            (
                Cow::from("W3Schools"),
                Cow::from("https://www.w3schools.com/"),
                Cow::from("W3S"),
            ),
        );
        for &input in &[
            r#"<a title="W3S" href="https://www.w3schools.com/">W3Schools</a>abc"#,
            r#"<A title="W3S" href="https://www.w3schools.com/">W3Schools</A>abc"#,
        ] {
            assert_eq!(html_text2dest(input).unwrap(), expected);
        }

        // HTML entities in the link text are decoded.
        assert_eq!(
            html_text2dest(r#"<a title="t" href="h">&lt;n&gt;</a>abc"#).unwrap(),
            ("abc", (Cow::from("<n>"), Cow::from("h"), Cow::from("t")))
        );

        // An empty title and trailing whitespace inside the opening tag.
        assert_eq!(
            html_text2dest(r#"<a href="url" title="" >name</a>abc"#).unwrap(),
            ("abc", (Cow::from("name"), Cow::from("url"), Cow::from("")))
        );

        // `</` alone does not terminate the link text.
        assert_eq!(
            html_text2dest(r#"<a href="url" title="" >na</me</A>abc"#).unwrap(),
            ("abc", (Cow::from("na</me"), Cow::from("url"), Cow::from("")))
        );

        // `parse_attributes()` does not accept a complete tag as input.
        let input = r#"<a href="url" title="" >name</a abc"#;
        assert_eq!(
            parse_attributes(input).unwrap_err(),
            recoverable(input, ErrorKind::AlphaNumeric)
        );

        // Markup inside the link text is returned verbatim.
        let image = r#"<img src="w3html.gif" alt="W3Schools.com "width="100" height="132">"#;
        let input = format!(r#"<a href="https://blog.getreu.net">{}</a>abc"#, image);
        assert_eq!(
            html_text2dest(&input).unwrap(),
            (
                "abc",
                (
                    Cow::from(image),
                    Cow::from("https://blog.getreu.net"),
                    Cow::from(""),
                ),
            )
        );
    }

    #[test]
    fn test_tag_a_opening() {
        let expected = (
            "abc",
            (Cow::from("http://getreu.net"), Cow::from("My blog")),
        );
        for &input in &[
            r#"<a href="http://getreu.net" title="My blog">abc"#,
            r#"<A href="http://getreu.net" title="My blog">abc"#,
        ] {
            assert_eq!(tag_a_opening(input).unwrap(), expected);
        }
    }

    #[test]
    fn test_parse_attributes() {
        // Boolean attributes between `href` and `title` are skipped.
        assert_eq!(
            parse_attributes(r#"abc href="http://getreu.net" abc title="My blog" abc"#).unwrap(),
            ("", (Cow::from("http://getreu.net"), Cow::from("My blog")))
        );

        // `href` must not appear twice.
        assert_eq!(
            parse_attributes(r#" href="http://getreu.net" href="http://blog.getreu.net" "#)
                .unwrap_err(),
            recoverable("href", ErrorKind::ManyMN)
        );

        // `title` must not appear twice.
        assert_eq!(
            parse_attributes(r#" href="http://getreu.net" title="a" title="b" "#).unwrap_err(),
            recoverable("title", ErrorKind::ManyMN)
        );

        // `href` is mandatory.
        assert_eq!(
            parse_attributes(r#" title="title" "#).unwrap_err(),
            recoverable("", ErrorKind::Eof)
        );
    }

    #[test]
    fn test_attribute_list() {
        // Boolean attributes show up with an empty name and an empty value.
        let boolean: (&str, Cow<str>) = ("", Cow::Borrowed(""));
        let expected = vec![
            boolean.clone(),
            ("href", Cow::from("http://getreu.net")),
            boolean.clone(),
            ("title", Cow::from("My blog")),
            boolean,
        ];
        assert_eq!(
            attribute_list(r#"abc href="http://getreu.net" abc title="My blog" abc"#).unwrap(),
            ("", expected)
        );
    }

    #[test]
    fn test_attribute() {
        let expected = (" abc", ("href", Cow::from("http://getreu.net")));
        for &input in &[
            r#"href="http://getreu.net" abc"#,
            r#"href='http://getreu.net' abc"#,
            // Only allowed when no space in value.
            r#"href=http://getreu.net abc"#,
        ] {
            assert_eq!(attribute(input).unwrap(), expected);
        }

        // HTML entities in the value are decoded.
        let expected = (" abc", ("href", Cow::from("http://getreu.net/<>")));
        for &input in &[
            r#"href="http://getreu.net/&lt;&gt;" abc"#,
            r#"href='http://getreu.net/&lt;&gt;' abc"#,
            // Only allowed when no space in value.
            r#"href=http://getreu.net/&lt;&gt; abc"#,
        ] {
            assert_eq!(attribute(input).unwrap(), expected);
        }

        // A boolean attribute is consumed and reported empty.
        assert_eq!(
            attribute("bool abc").unwrap(),
            (" abc", ("", Cow::from("")))
        );

        // Attribute names must not start with a digit.
        assert_eq!(
            attribute("1name").unwrap_err(),
            recoverable("1name", ErrorKind::Verify)
        );

        let input = r#"1name="http://getreu.net"#;
        assert_eq!(
            attribute(input).unwrap_err(),
            recoverable(input, ErrorKind::Verify)
        );
    }
}