parse_hyperlinks/parser/
html.rs

1//! This module implements parsers for HTML hyperlinks.
2#![allow(dead_code)]
3#![allow(clippy::type_complexity)]
4
5use crate::parser::Link;
6use html_escape::decode_html_entities;
7use nom::AsChar;
8use nom::Parser;
9use nom::branch::alt;
10use nom::bytes::complete::is_not;
11use nom::bytes::complete::tag;
12use nom::character::complete::alphanumeric1;
13use nom::error::Error;
14use nom::error::ErrorKind;
15use std::borrow::Cow;
16
17/// Wrapper around `html_text2dest()` that packs the result in
18/// `Link::Text2Dest`.
19pub fn html_text2dest_link(i: &'_ str) -> nom::IResult<&'_ str, Link<'_>> {
20    let (i, (te, de, ti)) = html_text2dest(i)?;
21    Ok((i, Link::Text2Dest(te, de, ti)))
22}
23
24/// Parse an HTML _inline hyperlink_.
25///
26/// It returns either `Ok((i, (link_text, link_destination, link_title)))` or some error.
27///
28/// The parser expects to start at the link start (`<`) to succeed.
29/// ```
30/// use parse_hyperlinks::parser::Link;
31/// use parse_hyperlinks::parser::html::html_text2dest;
32/// use std::borrow::Cow;
33///
34/// assert_eq!(
35///   html_text2dest(r#"<a href="destination" title="title">name</a>abc"#),
36///   Ok(("abc", (Cow::from("name"), Cow::from("destination"), Cow::from("title"))))
37/// );
38/// ```
39pub fn html_text2dest(
40    i: &'_ str,
41) -> nom::IResult<&'_ str, (Cow<'_, str>, Cow<'_, str>, Cow<'_, str>)> {
42    let (i, ((link_destination, link_title), link_text)) = nom::sequence::terminated(
43        nom::sequence::pair(
44            tag_a_opening,
45            alt((
46                nom::bytes::complete::take_until("</a>"),
47                nom::bytes::complete::take_until("</A>"),
48            )),
49        ),
50        // HTML is case insensitive. XHTML, that is being XML is case sensitive.
51        // Here we deal with HTML.
52        alt((tag("</a>"), tag("</A>"))),
53    )
54    .parse(i)?;
55    let link_text = decode_html_entities(link_text);
56    Ok((i, (link_text, link_destination, link_title)))
57}
58
59/// Parses a `<a ...>` opening tag and returns
60/// either `Ok((i, (link_destination, link_title)))` or some error.
61pub(crate) fn tag_a_opening(i: &'_ str) -> nom::IResult<&'_ str, (Cow<'_, str>, Cow<'_, str>)> {
62    nom::sequence::delimited(
63        // HTML is case insensitive. XHTML, that is being XML is case sensitive.
64        // Here we deal with HTML.
65        alt((tag("<a "), tag("<A "))),
66        nom::combinator::map_parser(is_not(">"), parse_attributes),
67        tag(">"),
68    )
69    .parse(i)
70}
71
72/// Parses attributes and returns `Ok((name, value))`.
73/// Boolean attributes are ignored, but silently consumed.
74fn attribute(i: &'_ str) -> nom::IResult<&'_ str, (&'_ str, Cow<'_, str>)> {
75    alt((
76        nom::sequence::pair(
77            nom::combinator::verify(alphanumeric1, |s: &str| {
78                s.chars().next().is_some_and(|c| c.is_alpha())
79            }),
80            alt((
81                nom::combinator::value(Cow::from(""), tag(r#"="""#)),
82                nom::combinator::value(Cow::from(""), tag(r#"=''"#)),
83                nom::combinator::map(
84                    nom::sequence::delimited(tag("=\""), is_not("\""), tag("\"")),
85                    |s: &str| decode_html_entities(s),
86                ),
87                nom::combinator::map(
88                    nom::sequence::delimited(tag("='"), is_not("'"), tag("'")),
89                    |s: &str| decode_html_entities(s),
90                ),
91                nom::combinator::map(nom::sequence::preceded(tag("="), is_not(" ")), |s: &str| {
92                    decode_html_entities(s)
93                }),
94            )),
95        ),
96        // Consume boolean attributes.
97        nom::combinator::value(
98            ("", Cow::from("")),
99            nom::combinator::verify(alphanumeric1, |s: &str| {
100                s.chars().next().is_some_and(|c| c.is_alpha())
101            }),
102        ),
103    ))
104    .parse(i)
105}
106
107/// Parses a whitespace separated list of attributes and returns a vector of (name, value).
108pub fn attribute_list(i: &'_ str) -> nom::IResult<&'_ str, Vec<(&'_ str, Cow<'_, str>)>> {
109    let i = i.trim();
110    nom::multi::separated_list1(nom::character::complete::multispace1, attribute).parse(i)
111}
112
113/// Extracts the `href` and `title` attributes and returns
114/// `Ok((link_destination, link_title))`. `link_title` can be empty,
115/// `link_destination` not.
116fn parse_attributes(i: &'_ str) -> nom::IResult<&'_ str, (Cow<'_, str>, Cow<'_, str>)> {
117    let (i, attributes) = attribute_list(i)?;
118    let mut href = Cow::Borrowed("");
119    let mut title = Cow::Borrowed("");
120
121    for (name, value) in attributes {
122        if name == "href" {
123            // Make sure `href` is empty, it can appear only
124            // once.
125            if !(*href).is_empty() {
126                return Err(nom::Err::Error(Error::new(name, ErrorKind::ManyMN)));
127            }
128            href = value;
129        } else if name == "title" {
130            // Make sure `title` is empty, it can appear only
131            // once.
132            if !(*title).is_empty() {
133                return Err(nom::Err::Error(Error::new(name, ErrorKind::ManyMN)));
134            }
135            title = value;
136        }
137    }
138
139    // Assure that `href` is not empty.
140    if (*href).is_empty() {
141        return Err(nom::Err::Error(Error::new(i, ErrorKind::Eof)));
142    };
143
144    Ok((i, (href, title)))
145}
146
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the recoverable `nom` error the parsers are expected to return.
    fn err(input: &str, kind: nom::error::ErrorKind) -> nom::Err<nom::error::Error<&str>> {
        nom::Err::Error(nom::error::Error::new(input, kind))
    }

    /// Builds the `(link_text, link_destination, link_title)` triple.
    fn link<'a>(
        text: &'a str,
        dest: &'a str,
        title: &'a str,
    ) -> (Cow<'a, str>, Cow<'a, str>, Cow<'a, str>) {
        (Cow::from(text), Cow::from(dest), Cow::from(title))
    }

    #[test]
    fn test_html_text2dest() {
        // Each case: input, expected link triple. The remaining input is
        // always `abc`.
        let cases = [
            // Lower case and upper case tag names.
            (
                r#"<a title="W3S" href="https://www.w3schools.com/">W3Schools</a>abc"#,
                link("W3Schools", "https://www.w3schools.com/", "W3S"),
            ),
            (
                r#"<A title="W3S" href="https://www.w3schools.com/">W3Schools</A>abc"#,
                link("W3Schools", "https://www.w3schools.com/", "W3S"),
            ),
            // HTML entities in the link text are decoded.
            (
                r#"<a title="t" href="h">&lt;n&gt;</a>abc"#,
                link("<n>", "h", "t"),
            ),
            // Empty title and trailing whitespace in the opening tag.
            (
                r#"<a href="url" title="" >name</a>abc"#,
                link("name", "url", ""),
            ),
            // `</` inside the link text does not end the link.
            (
                r#"<a href="url" title="" >na</me</A>abc"#,
                link("na</me", "url", ""),
            ),
            // Nested markup is returned verbatim as link text.
            (
                concat!(
                    r#"<a href="https://blog.getreu.net">"#,
                    r#"<img src="w3html.gif" alt="W3Schools.com "width="100" height="132">"#,
                    "</a>abc",
                ),
                link(
                    r#"<img src="w3html.gif" alt="W3Schools.com "width="100" height="132">"#,
                    "https://blog.getreu.net",
                    "",
                ),
            ),
        ];
        for (input, expected) in cases {
            assert_eq!(html_text2dest(input), Ok(("abc", expected)));
        }

        // `parse_attributes()` does not accept a whole tag.
        let input = r#"<a href="url" title="" >name</a abc"#;
        assert_eq!(
            parse_attributes(input),
            Err(err(input, nom::error::ErrorKind::AlphaNumeric))
        );
    }

    #[test]
    fn test_tag_a_opening() {
        for input in [
            r#"<a href="http://getreu.net" title="My blog">abc"#,
            r#"<A href="http://getreu.net" title="My blog">abc"#,
        ] {
            assert_eq!(
                tag_a_opening(input),
                Ok((
                    "abc",
                    (Cow::from("http://getreu.net"), Cow::from("My blog")),
                ))
            );
        }
    }

    #[test]
    fn test_parse_attributes() {
        // Boolean attributes are skipped.
        assert_eq!(
            parse_attributes(r#"abc href="http://getreu.net" abc title="My blog" abc"#),
            Ok(("", (Cow::from("http://getreu.net"), Cow::from("My blog"))))
        );

        // Duplicate `href`.
        assert_eq!(
            parse_attributes(r#" href="http://getreu.net" href="http://blog.getreu.net" "#),
            Err(err("href", nom::error::ErrorKind::ManyMN))
        );

        // Duplicate `title`.
        assert_eq!(
            parse_attributes(r#" href="http://getreu.net" title="a" title="b" "#),
            Err(err("title", nom::error::ErrorKind::ManyMN))
        );

        // Missing `href`.
        assert_eq!(
            parse_attributes(r#" title="title" "#),
            Err(err("", nom::error::ErrorKind::Eof))
        );
    }

    #[test]
    fn test_attribute_list() {
        let boolean = || ("", Cow::from(""));
        let parsed =
            attribute_list(r#"abc href="http://getreu.net" abc title="My blog" abc"#).unwrap();
        assert_eq!(
            parsed,
            (
                "",
                vec![
                    boolean(),
                    ("href", Cow::from("http://getreu.net")),
                    boolean(),
                    ("title", Cow::from("My blog")),
                    boolean(),
                ],
            )
        );
    }

    #[test]
    fn test_attribute() {
        // Double quoted, single quoted and unquoted values. The unquoted
        // form is only allowed when there is no space in the value.
        for input in [
            r#"href="http://getreu.net" abc"#,
            r#"href='http://getreu.net' abc"#,
            r#"href=http://getreu.net abc"#,
        ] {
            assert_eq!(
                attribute(input),
                Ok((" abc", ("href", Cow::from("http://getreu.net"))))
            );
        }

        // HTML entities are decoded in all three forms.
        for input in [
            r#"href="http://getreu.net/&lt;&gt;" abc"#,
            r#"href='http://getreu.net/&lt;&gt;' abc"#,
            r#"href=http://getreu.net/&lt;&gt; abc"#,
        ] {
            assert_eq!(
                attribute(input),
                Ok((" abc", ("href", Cow::from("http://getreu.net/<>"))))
            );
        }

        // Boolean attributes are consumed and reported with empty strings.
        assert_eq!(attribute("bool abc"), Ok((" abc", ("", Cow::from("")))));

        // Attribute names must not start with a digit.
        for input in ["1name", r#"1name="http://getreu.net"#] {
            assert_eq!(
                attribute(input),
                Err(err(input, nom::error::ErrorKind::Verify))
            );
        }
    }
}