parse_hyperlinks/parser/
html_img.rs

1//! This module implements parsers for HTML image elements.
2#![allow(dead_code)]
3
4use crate::parser::Link;
5use crate::parser::html::attribute_list;
6use crate::parser::html::tag_a_opening as href_tag_a_opening;
7use html_escape::decode_html_entities;
8use nom::Parser;
9use nom::branch::alt;
10use nom::bytes::complete::is_not;
11use nom::bytes::complete::tag;
12use nom::error::Error;
13use nom::error::ErrorKind;
14use std::borrow::Cow;
15
16/// Wrapper around `html_img()` that packs the result in
17/// `Link::Image`.
18pub fn html_img_link(i: &str) -> nom::IResult<&str, Link> {
19    let (i, (alt, src)) = html_img(i)?;
20    Ok((i, Link::Image(alt, src)))
21}
22
23/// Parse an HTML _image_.
24///
25/// It returns either `Ok((i, (img_alt, img_src)))` or some error.
26///
27/// The parser expects to start at the link start (`<`) to succeed.
28/// ```
29/// use parse_hyperlinks;
30/// use parse_hyperlinks::parser::html_img::html_img;
31/// use std::borrow::Cow;
32///
33/// assert_eq!(
34///   html_img(r#"<img src="/images/my&amp;dog.png" alt="my Dog" width="500">abc"#),
35///   Ok(("abc", (Cow::from("my Dog"), Cow::from("/images/my&dog.png"))))
36/// );
37/// ```
38pub fn html_img(i: &str) -> nom::IResult<&str, (Cow<str>, Cow<str>)> {
39    tag_img(i)
40}
41
42/// Parses a `<img ...>` tag and returns
43/// either `Ok((i, (img_alt, img_src)))` or some error.
44#[inline]
45fn tag_img(i: &str) -> nom::IResult<&str, (Cow<str>, Cow<str>)> {
46    nom::sequence::delimited(
47        // HTML is case insensitive. XHTML, that is being XML is case sensitive.
48        // Here we deal with HTML.
49        alt((tag("<img "), tag("<IMG "))),
50        nom::combinator::map_parser(is_not(">"), parse_attributes),
51        tag(">"),
52    )
53    .parse(i)
54}
55
56/// Wrapper around `html_img()` that packs the result in
57/// `Link::Image`.
58pub fn html_img2dest_link(i: &str) -> nom::IResult<&str, Link> {
59    let (i, (text1, img_alt, img_src, text2, dest, title)) = html_img2dest(i)?;
60    Ok((
61        i,
62        Link::Image2Dest(text1, img_alt, img_src, text2, dest, title),
63    ))
64}
65
66/// Parse an HTML inline hyperlink with embedded image.
67///
68/// It returns either
69// `Ok((i, (text1, img_alt, img_src, text2, dest, title)))` or some error.
70///
71///
72/// The parser expects to start at the link start (`<`) to succeed.
73/// ```
74/// use parse_hyperlinks::parser::Link;
75/// use parse_hyperlinks::parser::html_img::html_img2dest;
76/// use std::borrow::Cow;
77///
78/// assert_eq!(
79///   html_img2dest("<a href=\"my doc.html\" title=\"title\">\
80///                    before<img src=\"dog.png\" alt=\"alt dog\"/>after\
81///                    </a>abc"),
82///   Ok(("abc",
83///    (Cow::from("before"), Cow::from("alt dog"), Cow::from("dog.png"),
84///     Cow::from("after"), Cow::from("my doc.html"), Cow::from("title"),
85/// ))));
86/// ```
87#[allow(clippy::type_complexity)]
88pub fn html_img2dest(
89    i: &str,
90) -> nom::IResult<&str, (Cow<str>, Cow<str>, Cow<str>, Cow<str>, Cow<str>, Cow<str>)> {
91    let (i, ((dest, title), text)) = nom::sequence::terminated(
92        nom::sequence::pair(
93            href_tag_a_opening,
94            alt((
95                nom::bytes::complete::take_until("</a>"),
96                nom::bytes::complete::take_until("</A>"),
97            )),
98        ),
99        // HTML is case insensitive. XHTML, that is being XML is case sensitive.
100        // Here we deal with HTML.
101        alt((tag("</a>"), tag("</A>"))),
102    )
103    .parse(i)?;
104
105    let (_, (text1, (img_alt, img_src), text2)) = (
106        nom::bytes::complete::take_until("<img"),
107        html_img,
108        nom::combinator::rest,
109    )
110        .parse(text)?;
111
112    let text1 = decode_html_entities(text1);
113    let text2 = decode_html_entities(text2);
114
115    Ok((i, (text1, img_alt, img_src, text2, dest, title)))
116}
117
118/// Extracts the `src` and `alt` attributes and returns
119/// `Ok((img_alt, img_src))`. `img_alt` can be empty,
120/// `img_src` not.
121fn parse_attributes(i: &str) -> nom::IResult<&str, (Cow<str>, Cow<str>)> {
122    let (i, attributes) = attribute_list(i)?;
123    let mut src = Cow::Borrowed("");
124    let mut alt = Cow::Borrowed("");
125
126    for (name, value) in attributes {
127        if name == "src" {
128            // Make sure `src` is empty, it can appear only
129            // once.
130            if !(*src).is_empty() {
131                return Err(nom::Err::Error(Error::new(name, ErrorKind::ManyMN)));
132            }
133            src = value;
134        } else if name == "alt" {
135            // Make sure `title` is empty, it can appear only
136            // once.
137            if !(*alt).is_empty() {
138                return Err(nom::Err::Error(Error::new(name, ErrorKind::ManyMN)));
139            }
140            alt = value;
141        }
142    }
143
144    // Assure that `href` is not empty.
145    if (*src).is_empty() {
146        return Err(nom::Err::Error(Error::new(i, ErrorKind::Eof)));
147    };
148
149    Ok((i, (alt, src)))
150}
151
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parser::html::attribute_list;

    /// `tag_img()` must accept lower and upper case tag names as well as the
    /// optional self-closing slash, and must decode HTML entities in `src`.
    #[test]
    fn test_tag_img() {
        let want = (
            "abc",
            (
                Cow::from("My dog"),
                Cow::from("http://getreu.net/my&dog.png"),
            ),
        );
        let inputs = [
            r#"<img src="http://getreu.net/my&amp;dog.png" alt="My dog">abc"#,
            r#"<IMG src="http://getreu.net/my&amp;dog.png" alt="My dog">abc"#,
            r#"<IMG src="http://getreu.net/my&amp;dog.png" alt="My dog"/>abc"#,
            r#"<IMG src="http://getreu.net/my&amp;dog.png" alt="My dog" />abc"#,
        ];
        for &input in &inputs {
            assert_eq!(tag_img(input).unwrap(), want);
        }

        // Percent encoded characters in `src` are passed through untouched.
        let want = (
            "abc",
            (Cow::from("Some picture"), Cow::from("t%20m%20p.jpg")),
        );
        let got = tag_img(r#"<img src="t%20m%20p.jpg" alt="Some picture" />abc"#).unwrap();
        assert_eq!(got, want);
    }

    /// `parse_attributes()` picks `src` and `alt`, rejects duplicates and
    /// requires `src` to be present.
    #[test]
    fn test_parse_attributes() {
        // Happy path: unknown tokens between the attributes are ignored.
        let got =
            parse_attributes(r#"abc src="http://getreu.net/my&amp;dog.png" abc alt="My dog" abc"#)
                .unwrap();
        let want = (
            "",
            (
                Cow::from("My dog"),
                Cow::from("http://getreu.net/my&dog.png"),
            ),
        );
        assert_eq!(got, want);

        // A second `src` is an error.
        let got = parse_attributes(r#" src="http://getreu.net" src="http://blog.getreu.net" "#)
            .unwrap_err();
        assert_eq!(got, nom::Err::Error(Error::new("src", ErrorKind::ManyMN)));

        // A second `alt` is an error.
        let got = parse_attributes(r#" src="http://getreu.net" alt="a" alt="b" "#).unwrap_err();
        assert_eq!(got, nom::Err::Error(Error::new("alt", ErrorKind::ManyMN)));

        // A missing `src` is an error.
        let got = parse_attributes(r#" title="title" "#).unwrap_err();
        assert_eq!(got, nom::Err::Error(Error::new("", ErrorKind::Eof)));
    }

    /// `attribute_list()` yields an empty name/value pair for every token
    /// that is not an attribute.
    #[test]
    fn test_attribute_list() {
        let filler = || ("", Cow::from(""));
        let want = (
            "",
            vec![
                filler(),
                ("src", Cow::from("http://getreu.net/my&dog.png")),
                filler(),
                ("alt", Cow::from("My dog")),
                filler(),
            ],
        );
        let got =
            attribute_list(r#"abc src="http://getreu.net/my&amp;dog.png" abc alt="My dog" abc"#)
                .unwrap();
        assert_eq!(got, want);
    }
}