parse_hyperlinks/parser/
html_img.rs

1//! This module implements parsers for HTML image elements.
2#![allow(dead_code)]
3
4use crate::parser::html::attribute_list;
5use crate::parser::html::tag_a_opening as href_tag_a_opening;
6use crate::parser::Link;
7use html_escape::decode_html_entities;
8use nom::branch::alt;
9use nom::bytes::complete::is_not;
10use nom::bytes::complete::tag;
11use nom::error::Error;
12use nom::error::ErrorKind;
13use nom::sequence::tuple;
14use std::borrow::Cow;
15
16/// Wrapper around `html_img()` that packs the result in
17/// `Link::Image`.
18pub fn html_img_link(i: &str) -> nom::IResult<&str, Link> {
19    let (i, (alt, src)) = html_img(i)?;
20    Ok((i, Link::Image(alt, src)))
21}
22
23/// Parse an HTML _image_.
24///
25/// It returns either `Ok((i, (img_alt, img_src)))` or some error.
26///
27/// The parser expects to start at the link start (`<`) to succeed.
28/// ```
29/// use parse_hyperlinks;
30/// use parse_hyperlinks::parser::html_img::html_img;
31/// use std::borrow::Cow;
32///
33/// assert_eq!(
34///   html_img(r#"<img src="/images/my&amp;dog.png" alt="my Dog" width="500">abc"#),
35///   Ok(("abc", (Cow::from("my Dog"), Cow::from("/images/my&dog.png"))))
36/// );
37/// ```
38pub fn html_img(i: &str) -> nom::IResult<&str, (Cow<str>, Cow<str>)> {
39    tag_img(i)
40}
41
42/// Parses a `<img ...>` tag and returns
43/// either `Ok((i, (img_alt, img_src)))` or some error.
44#[inline]
45fn tag_img(i: &str) -> nom::IResult<&str, (Cow<str>, Cow<str>)> {
46    nom::sequence::delimited(
47        // HTML is case insensitive. XHTML, that is being XML is case sensitive.
48        // Here we deal with HTML.
49        alt((tag("<img "), tag("<IMG "))),
50        nom::combinator::map_parser(is_not(">"), parse_attributes),
51        tag(">"),
52    )(i)
53}
54
55/// Wrapper around `html_img()` that packs the result in
56/// `Link::Image`.
57pub fn html_img2dest_link(i: &str) -> nom::IResult<&str, Link> {
58    let (i, (text1, img_alt, img_src, text2, dest, title)) = html_img2dest(i)?;
59    Ok((
60        i,
61        Link::Image2Dest(text1, img_alt, img_src, text2, dest, title),
62    ))
63}
64
65/// Parse an HTML inline hyperlink with embedded image.
66///
67/// It returns either
68// `Ok((i, (text1, img_alt, img_src, text2, dest, title)))` or some error.
69///
70///
71/// The parser expects to start at the link start (`<`) to succeed.
72/// ```
73/// use parse_hyperlinks::parser::Link;
74/// use parse_hyperlinks::parser::html_img::html_img2dest;
75/// use std::borrow::Cow;
76///
77/// assert_eq!(
78///   html_img2dest("<a href=\"my doc.html\" title=\"title\">\
79///                    before<img src=\"dog.png\" alt=\"alt dog\"/>after\
80///                    </a>abc"),
81///   Ok(("abc",
82///    (Cow::from("before"), Cow::from("alt dog"), Cow::from("dog.png"),
83///     Cow::from("after"), Cow::from("my doc.html"), Cow::from("title"),
84/// ))));
85/// ```
86#[allow(clippy::type_complexity)]
87pub fn html_img2dest(
88    i: &str,
89) -> nom::IResult<&str, (Cow<str>, Cow<str>, Cow<str>, Cow<str>, Cow<str>, Cow<str>)> {
90    let (i, ((dest, title), text)) = nom::sequence::terminated(
91        nom::sequence::pair(
92            href_tag_a_opening,
93            alt((
94                nom::bytes::complete::take_until("</a>"),
95                nom::bytes::complete::take_until("</A>"),
96            )),
97        ),
98        // HTML is case insensitive. XHTML, that is being XML is case sensitive.
99        // Here we deal with HTML.
100        alt((tag("</a>"), tag("</A>"))),
101    )(i)?;
102
103    let (_, (text1, (img_alt, img_src), text2)) = tuple((
104        nom::bytes::complete::take_until("<img"),
105        html_img,
106        nom::combinator::rest,
107    ))(text)?;
108
109    let text1 = decode_html_entities(text1);
110    let text2 = decode_html_entities(text2);
111
112    Ok((i, (text1, img_alt, img_src, text2, dest, title)))
113}
114
115/// Extracts the `src` and `alt` attributes and returns
116/// `Ok((img_alt, img_src))`. `img_alt` can be empty,
117/// `img_src` not.
118fn parse_attributes(i: &str) -> nom::IResult<&str, (Cow<str>, Cow<str>)> {
119    let (i, attributes) = attribute_list(i)?;
120    let mut src = Cow::Borrowed("");
121    let mut alt = Cow::Borrowed("");
122
123    for (name, value) in attributes {
124        if name == "src" {
125            // Make sure `src` is empty, it can appear only
126            // once.
127            if !(*src).is_empty() {
128                return Err(nom::Err::Error(Error::new(name, ErrorKind::ManyMN)));
129            }
130            src = value;
131        } else if name == "alt" {
132            // Make sure `title` is empty, it can appear only
133            // once.
134            if !(*alt).is_empty() {
135                return Err(nom::Err::Error(Error::new(name, ErrorKind::ManyMN)));
136            }
137            alt = value;
138        }
139    }
140
141    // Assure that `href` is not empty.
142    if (*src).is_empty() {
143        return Err(nom::Err::Error(Error::new(i, ErrorKind::Eof)));
144    };
145
146    Ok((i, (alt, src)))
147}
148
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parser::html::attribute_list;

    /// `tag_img()` accepts lowercase and uppercase tag names as well as
    /// the self-closing forms `/>` and ` />`.
    #[test]
    fn test_tag_img() {
        // All of these spellings must give the same result.
        let inputs = [
            r#"<img src="http://getreu.net/my&amp;dog.png" alt="My dog">abc"#,
            r#"<IMG src="http://getreu.net/my&amp;dog.png" alt="My dog">abc"#,
            r#"<IMG src="http://getreu.net/my&amp;dog.png" alt="My dog"/>abc"#,
            r#"<IMG src="http://getreu.net/my&amp;dog.png" alt="My dog" />abc"#,
        ];
        let expected = (
            "abc",
            (
                Cow::from("My dog"),
                Cow::from("http://getreu.net/my&dog.png"),
            ),
        );
        for input in &inputs {
            assert_eq!(tag_img(input).unwrap(), expected);
        }

        // Percent encoding in `src` is passed through untouched.
        let output = tag_img(r#"<img src="t%20m%20p.jpg" alt="Some picture" />abc"#).unwrap();
        assert_eq!(
            output,
            (
                "abc",
                (Cow::from("Some picture"), Cow::from("t%20m%20p.jpg")),
            )
        );
    }

    /// `parse_attributes()` picks `src` and `alt`, rejects duplicates and
    /// requires `src`.
    #[test]
    fn test_parse_attributes() {
        // Regular case, surrounded by tokens that are not attributes.
        assert_eq!(
            parse_attributes(r#"abc src="http://getreu.net/my&amp;dog.png" abc alt="My dog" abc"#),
            Ok((
                "",
                (
                    Cow::from("My dog"),
                    Cow::from("http://getreu.net/my&dog.png"),
                ),
            ))
        );

        // `src` given twice.
        assert_eq!(
            parse_attributes(r#" src="http://getreu.net" src="http://blog.getreu.net" "#),
            Err(nom::Err::Error(Error::new("src", ErrorKind::ManyMN)))
        );

        // `alt` given twice.
        assert_eq!(
            parse_attributes(r#" src="http://getreu.net" alt="a" alt="b" "#),
            Err(nom::Err::Error(Error::new("alt", ErrorKind::ManyMN)))
        );

        // No `src` at all.
        assert_eq!(
            parse_attributes(r#" title="title" "#),
            Err(nom::Err::Error(Error::new("", ErrorKind::Eof)))
        );
    }

    /// `attribute_list()` consumes the whole input; as the expected list
    /// shows, the `abc` tokens are reported as pairs of empty strings.
    #[test]
    fn test_attribute_list() {
        let input = r#"abc src="http://getreu.net/my&amp;dog.png" abc alt="My dog" abc"#;
        let (rest, attributes) = attribute_list(input).unwrap();

        assert_eq!(rest, "");
        assert_eq!(
            attributes,
            vec![
                ("", Cow::from("")),
                ("src", Cow::from("http://getreu.net/my&dog.png")),
                ("", Cow::from("")),
                ("alt", Cow::from("My dog")),
                ("", Cow::from("")),
            ]
        );
    }
}