parse_hyperlinks/parser/
html.rs

1//! This module implements parsers for HTML hyperlinks.
2#![allow(dead_code)]
3#![allow(clippy::type_complexity)]
4
5use crate::parser::Link;
6use html_escape::decode_html_entities;
7use nom::AsChar;
8use nom::Parser;
9use nom::branch::alt;
10use nom::bytes::complete::is_not;
11use nom::bytes::complete::tag;
12use nom::character::complete::alphanumeric1;
13use nom::error::Error;
14use nom::error::ErrorKind;
15use std::borrow::Cow;
16
17/// Wrapper around `html_text2dest()` that packs the result in
18/// `Link::Text2Dest`.
19pub fn html_text2dest_link(i: &str) -> nom::IResult<&str, Link> {
20    let (i, (te, de, ti)) = html_text2dest(i)?;
21    Ok((i, Link::Text2Dest(te, de, ti)))
22}
23
24/// Parse an HTML _inline hyperlink_.
25///
26/// It returns either `Ok((i, (link_text, link_destination, link_title)))` or some error.
27///
28/// The parser expects to start at the link start (`<`) to succeed.
29/// ```
30/// use parse_hyperlinks::parser::Link;
31/// use parse_hyperlinks::parser::html::html_text2dest;
32/// use std::borrow::Cow;
33///
34/// assert_eq!(
35///   html_text2dest(r#"<a href="destination" title="title">name</a>abc"#),
36///   Ok(("abc", (Cow::from("name"), Cow::from("destination"), Cow::from("title"))))
37/// );
38/// ```
39pub fn html_text2dest(i: &str) -> nom::IResult<&str, (Cow<str>, Cow<str>, Cow<str>)> {
40    let (i, ((link_destination, link_title), link_text)) = nom::sequence::terminated(
41        nom::sequence::pair(
42            tag_a_opening,
43            alt((
44                nom::bytes::complete::take_until("</a>"),
45                nom::bytes::complete::take_until("</A>"),
46            )),
47        ),
48        // HTML is case insensitive. XHTML, that is being XML is case sensitive.
49        // Here we deal with HTML.
50        alt((tag("</a>"), tag("</A>"))),
51    )
52    .parse(i)?;
53    let link_text = decode_html_entities(link_text);
54    Ok((i, (link_text, link_destination, link_title)))
55}
56
57/// Parses a `<a ...>` opening tag and returns
58/// either `Ok((i, (link_destination, link_title)))` or some error.
59pub(crate) fn tag_a_opening(i: &str) -> nom::IResult<&str, (Cow<str>, Cow<str>)> {
60    nom::sequence::delimited(
61        // HTML is case insensitive. XHTML, that is being XML is case sensitive.
62        // Here we deal with HTML.
63        alt((tag("<a "), tag("<A "))),
64        nom::combinator::map_parser(is_not(">"), parse_attributes),
65        tag(">"),
66    )
67    .parse(i)
68}
69
70/// Parses attributes and returns `Ok((name, value))`.
71/// Boolean attributes are ignored, but silently consumed.
72fn attribute(i: &str) -> nom::IResult<&str, (&str, Cow<str>)> {
73    alt((
74        nom::sequence::pair(
75            nom::combinator::verify(alphanumeric1, |s: &str| {
76                s.chars().next().is_some_and(|c| c.is_alpha())
77            }),
78            alt((
79                nom::combinator::value(Cow::from(""), tag(r#"="""#)),
80                nom::combinator::value(Cow::from(""), tag(r#"=''"#)),
81                nom::combinator::map(
82                    nom::sequence::delimited(tag("=\""), is_not("\""), tag("\"")),
83                    |s: &str| decode_html_entities(s),
84                ),
85                nom::combinator::map(
86                    nom::sequence::delimited(tag("='"), is_not("'"), tag("'")),
87                    |s: &str| decode_html_entities(s),
88                ),
89                nom::combinator::map(nom::sequence::preceded(tag("="), is_not(" ")), |s: &str| {
90                    decode_html_entities(s)
91                }),
92            )),
93        ),
94        // Consume boolean attributes.
95        nom::combinator::value(
96            ("", Cow::from("")),
97            nom::combinator::verify(alphanumeric1, |s: &str| {
98                s.chars().next().is_some_and(|c| c.is_alpha())
99            }),
100        ),
101    ))
102    .parse(i)
103}
104
105/// Parses a whitespace separated list of attributes and returns a vector of (name, value).
106pub fn attribute_list(i: &str) -> nom::IResult<&str, Vec<(&str, Cow<str>)>> {
107    let i = i.trim();
108    nom::multi::separated_list1(nom::character::complete::multispace1, attribute).parse(i)
109}
110
111/// Extracts the `href` and `title` attributes and returns
112/// `Ok((link_destination, link_title))`. `link_title` can be empty,
113/// `link_destination` not.
114fn parse_attributes(i: &str) -> nom::IResult<&str, (Cow<str>, Cow<str>)> {
115    let (i, attributes) = attribute_list(i)?;
116    let mut href = Cow::Borrowed("");
117    let mut title = Cow::Borrowed("");
118
119    for (name, value) in attributes {
120        if name == "href" {
121            // Make sure `href` is empty, it can appear only
122            // once.
123            if !(*href).is_empty() {
124                return Err(nom::Err::Error(Error::new(name, ErrorKind::ManyMN)));
125            }
126            href = value;
127        } else if name == "title" {
128            // Make sure `title` is empty, it can appear only
129            // once.
130            if !(*title).is_empty() {
131                return Err(nom::Err::Error(Error::new(name, ErrorKind::ManyMN)));
132            }
133            title = value;
134        }
135    }
136
137    // Assure that `href` is not empty.
138    if (*href).is_empty() {
139        return Err(nom::Err::Error(Error::new(i, ErrorKind::Eof)));
140    };
141
142    Ok((i, (href, title)))
143}
144
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the `(link_text, link_destination, link_title)` triple that
    /// `html_text2dest()` is expected to yield.
    fn link<'a>(
        text: &'a str,
        dest: &'a str,
        title: &'a str,
    ) -> (Cow<'a, str>, Cow<'a, str>, Cow<'a, str>) {
        (Cow::Borrowed(text), Cow::Borrowed(dest), Cow::Borrowed(title))
    }

    #[test]
    fn test_html_text2dest() {
        // Lower case and upper case tag names give the same result.
        let w3s = ("abc", link("W3Schools", "https://www.w3schools.com/", "W3S"));
        for input in [
            r#"<a title="W3S" href="https://www.w3schools.com/">W3Schools</a>abc"#,
            r#"<A title="W3S" href="https://www.w3schools.com/">W3Schools</A>abc"#,
        ] {
            assert_eq!(html_text2dest(input).unwrap(), w3s);
        }

        // HTML entities in the link text are decoded.
        assert_eq!(
            html_text2dest(r#"<a title="t" href="h">&lt;n&gt;</a>abc"#).unwrap(),
            ("abc", link("<n>", "h", "t"))
        );

        // An empty title and whitespace before `>` are accepted.
        assert_eq!(
            html_text2dest(r#"<a href="url" title="" >name</a>abc"#).unwrap(),
            ("abc", link("name", "url", ""))
        );

        // `</` inside the link text does not end the link.
        assert_eq!(
            html_text2dest(r#"<a href="url" title="" >na</me</A>abc"#).unwrap(),
            ("abc", link("na</me", "url", ""))
        );

        // `parse_attributes()` rejects input that starts with `<`.
        let not_attributes = r#"<a href="url" title="" >name</a abc"#;
        assert_eq!(
            parse_attributes(not_attributes).unwrap_err(),
            nom::Err::Error(Error::new(not_attributes, ErrorKind::AlphaNumeric))
        );

        // The link text may itself contain markup.
        let image = r#"<img src="w3html.gif" alt="W3Schools.com "width="100" height="132">"#;
        let input = r#"<a href="https://blog.getreu.net"><img src="w3html.gif" alt="W3Schools.com "width="100" height="132"></a>abc"#;
        assert_eq!(
            html_text2dest(input).unwrap(),
            ("abc", link(image, "https://blog.getreu.net", ""))
        );
    }

    #[test]
    fn test_tag_a_opening() {
        let my_blog = (
            "abc",
            (Cow::Borrowed("http://getreu.net"), Cow::Borrowed("My blog")),
        );
        for input in [
            r#"<a href="http://getreu.net" title="My blog">abc"#,
            r#"<A href="http://getreu.net" title="My blog">abc"#,
        ] {
            assert_eq!(tag_a_opening(input).unwrap(), my_blog);
        }
    }

    #[test]
    fn test_parse_attributes() {
        // Boolean attributes in between are skipped.
        let parsed =
            parse_attributes(r#"abc href="http://getreu.net" abc title="My blog" abc"#).unwrap();
        assert_eq!(
            parsed,
            (
                "",
                (Cow::Borrowed("http://getreu.net"), Cow::Borrowed("My blog"))
            )
        );

        // `href` given twice.
        let failure =
            parse_attributes(r#" href="http://getreu.net" href="http://blog.getreu.net" "#)
                .unwrap_err();
        assert_eq!(
            failure,
            nom::Err::Error(Error::new("href", ErrorKind::ManyMN))
        );

        // `title` given twice.
        let failure =
            parse_attributes(r#" href="http://getreu.net" title="a" title="b" "#).unwrap_err();
        assert_eq!(
            failure,
            nom::Err::Error(Error::new("title", ErrorKind::ManyMN))
        );

        // `href` missing.
        let failure = parse_attributes(r#" title="title" "#).unwrap_err();
        assert_eq!(failure, nom::Err::Error(Error::new("", ErrorKind::Eof)));
    }

    #[test]
    fn test_attribute_list() {
        // Boolean attributes show up as pairs of empty strings.
        let boolean = || ("", Cow::Borrowed(""));
        let attributes = vec![
            boolean(),
            ("href", Cow::Borrowed("http://getreu.net")),
            boolean(),
            ("title", Cow::Borrowed("My blog")),
            boolean(),
        ];
        assert_eq!(
            attribute_list(r#"abc href="http://getreu.net" abc title="My blog" abc"#).unwrap(),
            ("", attributes)
        );
    }

    #[test]
    fn test_attribute() {
        // Double quoted, single quoted and unquoted values. The unquoted
        // form is only allowed when there is no space in the value.
        let plain = (" abc", ("href", Cow::Borrowed("http://getreu.net")));
        for input in [
            r#"href="http://getreu.net" abc"#,
            r#"href='http://getreu.net' abc"#,
            r#"href=http://getreu.net abc"#,
        ] {
            assert_eq!(attribute(input).unwrap(), plain);
        }

        // The same three forms with HTML entities in the value.
        let decoded = (" abc", ("href", Cow::Borrowed("http://getreu.net/<>")));
        for input in [
            r#"href="http://getreu.net/&lt;&gt;" abc"#,
            r#"href='http://getreu.net/&lt;&gt;' abc"#,
            r#"href=http://getreu.net/&lt;&gt; abc"#,
        ] {
            assert_eq!(attribute(input).unwrap(), decoded);
        }

        // A boolean attribute is consumed and reported with an empty name.
        assert_eq!(
            attribute("bool abc").unwrap(),
            (" abc", ("", Cow::Borrowed("")))
        );

        // Attribute names must not start with a digit.
        for input in ["1name", r#"1name="http://getreu.net"#] {
            assert_eq!(
                attribute(input).unwrap_err(),
                nom::Err::Error(Error::new(input, ErrorKind::Verify))
            );
        }
    }
}