parse_hyperlinks_extras/parser/
parse_html.rs

1//! This module implements parsers to extract hyperlinks and image elements
2//! from HTML text input. The parsers in this module search for HTML only,
3//! no other markup languages are recognized.
4#![allow(dead_code)]
5
6use nom::bytes::complete::take_till;
7use nom::character::complete::anychar;
8use parse_hyperlinks::parser::Link;
9use parse_hyperlinks::parser::html::html_text2dest;
10use parse_hyperlinks::parser::html::html_text2dest_link;
11use parse_hyperlinks::parser::html_img::html_img;
12use parse_hyperlinks::parser::html_img::html_img_link;
13use parse_hyperlinks::parser::html_img::html_img2dest_link;
14use std::borrow::Cow;
15
16/// Consumes the input until the parser finds an HTML formatted _inline image_ (`Link::Image`).
17///
18/// The parser consumes the finding and returns
19/// `Ok((remaining_input, (skipped_input, (img_alt, img_src))))` or some error.
20///
21///
22/// # HTML
23///
24#[allow(clippy::type_complexity)]
25/// ```
26/// use parse_hyperlinks::parser::Link;
27/// use parse_hyperlinks_extras::parser::parse_html::take_img;
28/// use std::borrow::Cow;
29///
30/// let i = r#"abc<img src="destination1" alt="text1">abc
31/// abc<img src="destination2" alt="text2">abc
32/// "#;
33///
34/// let (i, r) = take_img(i).unwrap();
35/// assert_eq!(r.0, "abc");
36/// assert_eq!(r.1, (Cow::from("text1"), Cow::from("destination1")));
37/// let (i, r) = take_img(i).unwrap();
38/// assert_eq!(r.0, "abc\nabc");
39/// assert_eq!(r.1, (Cow::from("text2"), Cow::from("destination2")));
40/// ```
41pub fn take_img(i: &'_ str) -> nom::IResult<&'_ str, (&'_ str, (Cow<'_, str>, Cow<'_, str>))> {
42    let mut j = i;
43    let mut skip_count = 0;
44
45    let res = loop {
46        // Start searching for inline images.
47
48        // Regular `Link::Image` can start everywhere.
49        if let Ok((k, r)) = html_img(j) {
50            break (k, r);
51        };
52
53        // This makes sure that we advance.
54        let (k, _) = anychar(j)?;
55        skip_count += j.len() - k.len();
56        j = k;
57
58        // This might not consume bytes and never fails.
59        let (k, _) = take_till(|c| c == '<')(j)?;
60
61        skip_count += j.len() - k.len();
62        j = k;
63    };
64
65    // We found a link. Return it.
66    let (l, link) = res;
67
68    let skipped_input = &i[0..skip_count];
69
70    Ok((l, (skipped_input, link)))
71}
72
73/// Consumes the input until the parser finds an HTML formatted hyperlink _text2dest_
74/// (`Link::Text2Dest`).
75///
76/// The parser consumes the finding and returns
77/// `Ok((remaining_input, (skipped_input, (link_text, link_destination, link_title))))`
78/// or some error.
79///
80///
81/// # HTML
82///
83#[allow(clippy::type_complexity)]
84/// ```
85/// use parse_hyperlinks::parser::Link;
86/// use parse_hyperlinks_extras::parser::parse_html::take_text2dest;
87/// use std::borrow::Cow;
88///
89/// let i = r#"abc<a href="dest1" title="title1">text1</a>abc
90/// abc<a href="dest2" title="title2">text2</a>abc"#;
91///
92/// let (i, r) = take_text2dest(i).unwrap();
93/// assert_eq!(r.0, "abc");
94/// assert_eq!(r.1, (Cow::from("text1"), Cow::from("dest1"), Cow::from("title1")));
95/// let (i, r) = take_text2dest(i).unwrap();
96/// assert_eq!(r.0, "abc\nabc");
97/// assert_eq!(r.1, (Cow::from("text2"), Cow::from("dest2"), Cow::from("title2")));
98/// ```
99pub fn take_text2dest(
100    i: &'_ str,
101) -> nom::IResult<&'_ str, (&'_ str, (Cow<'_, str>, Cow<'_, str>, Cow<'_, str>))> {
102    let mut j = i;
103    let mut skip_count = 0;
104
105    let res = loop {
106        // Start searching for inline hyperlinks.
107
108        // Regular `Link::Text2Dest` can start everywhere.
109        if let Ok((k, r)) = html_text2dest(j) {
110            break (k, r);
111        };
112
113        // This makes sure that we advance.
114        let (k, _) = anychar(j)?;
115        skip_count += j.len() - k.len();
116        j = k;
117
118        // This might not consume bytes and never fails.
119        let (k, _) = take_till(|c| c == '<')(j)?;
120
121        skip_count += j.len() - k.len();
122        j = k;
123    };
124
125    // We found a link. Return it.
126    let (l, link) = res;
127
128    let skipped_input = &i[0..skip_count];
129
130    Ok((l, (skipped_input, link)))
131}
132
133/// Consumes the input until the parser finds an HTML formatted _inline
134/// image_ (`Link::Image`) and HTML formatted hyperlinks _text2dest_
135/// (`Link::Text2Dest`).
136///
137/// The parser consumes the finding and returns
138/// `Ok((remaining_input, (skipped_input, Link)))` or some error.
139///
140///
141/// # HTML
142///
143/// ```
144/// use parse_hyperlinks::parser::Link;
145/// use parse_hyperlinks_extras::parser::parse_html::take_link;
146/// use std::borrow::Cow;
147///
148/// let i = r#"abc<img src="dest1" alt="text1">abc
149/// abc<a href="dest2" title="title2">text2</a>abc
150/// abc<img src="dest3" alt="text3">abc
151/// abc<a href="dest4" title="title4">text4</a>abc
152/// abc<a href="dest5" title="title5">cde<img alt="alt5" src="src5"/>fgh</a>abc
153/// "#;
154///
155/// let (i, r) = take_link(i).unwrap();
156/// assert_eq!(r.0, "abc");
157/// assert_eq!(r.1, Link::Image(Cow::from("text1"), Cow::from("dest1")));
158/// let (i, r) = take_link(i).unwrap();
159/// assert_eq!(r.0, "abc\nabc");
160/// assert_eq!(r.1, Link::Text2Dest(Cow::from("text2"), Cow::from("dest2"), Cow::from("title2")));
161/// let (i, r) = take_link(i).unwrap();
162/// assert_eq!(r.0, "abc\nabc");
163/// assert_eq!(r.1, Link::Image(Cow::from("text3"), Cow::from("dest3")));
164/// let (i, r) = take_link(i).unwrap();
165/// assert_eq!(r.0, "abc\nabc");
166/// assert_eq!(r.1, Link::Text2Dest(Cow::from("text4"), Cow::from("dest4"), Cow::from("title4")));
167/// let (i, r) = take_link(i).unwrap();
168/// assert_eq!(r.0, "abc\nabc");
169/// assert_eq!(r.1, Link::Image2Dest(Cow::from("cde"), Cow::from("alt5"), Cow::from("src5"), Cow::from("fgh"), Cow::from("dest5"), Cow::from("title5")));
170/// ```
171pub fn take_link(i: &'_ str) -> nom::IResult<&'_ str, (&'_ str, Link<'_>)> {
172    let mut j = i;
173    let mut skip_count = 0;
174
175    let res = loop {
176        // Start searching for inline images.
177
178        // Regular `Link::Image` can start everywhere.
179        if let Ok((k, r)) = html_img2dest_link(j) {
180            break (k, r);
181        };
182        // Regular `Link::Image` can start everywhere.
183        if let Ok((k, r)) = html_img_link(j) {
184            break (k, r);
185        };
186        // Regular `Link::Text2Dest` can start everywhere.
187        if let Ok((k, r)) = html_text2dest_link(j) {
188            break (k, r);
189        };
190
191        // This makes sure that we advance.
192        let (k, _) = anychar(j)?;
193        skip_count += j.len() - k.len();
194        j = k;
195
196        // This might not consume bytes and never fails.
197        let (k, _) = take_till(|c| c == '<')(j)?;
198
199        skip_count += j.len() - k.len();
200        j = k;
201    };
202
203    // We found a link. Return it.
204    let (l, link) = res;
205
206    let skipped_input = &i[0..skip_count];
207
208    Ok((l, (skipped_input, link)))
209}
210
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parser::parse_html::take_link;

    /// Exercises `take_link` on inputs that start directly with a link and
    /// contain HTML entities and percent-escapes.
    #[test]
    fn test_take_link() {
        // Inline image: `&amp;` in `src` is expected back as `&`, while the
        // percent-escapes stay as they are.
        let input = r#"<img src="t%20m%20p&amp;.jpg" alt="test1" />"#;
        let (rest, (skipped, link)) = take_link(input).unwrap();
        assert_eq!(rest, "");
        assert_eq!(skipped, "");
        let expected = Link::Image(Cow::from("test1"), Cow::from("t%20m%20p&.jpg"));
        assert_eq!(link, expected);

        // Hyperlink whose text contains a percent-escape and an entity; the
        // title is expected to be empty.
        let input =
            r#"<a href="http://getreu.net/%C3%9C%20&amp;">http://getreu.net/Ü%20&amp;</a>abc"#;
        let (rest, (skipped, link)) = take_link(input).unwrap();
        assert_eq!(rest, "abc");
        assert_eq!(skipped, "");
        let expected = Link::Text2Dest(
            Cow::from("http://getreu.net/Ü%20&"),
            Cow::from("http://getreu.net/%C3%9C%20&"),
            Cow::from(""),
        );
        assert_eq!(link, expected);

        // Same hyperlink, but the text carries a literal space instead of `%20`.
        let input =
            r#"<a href="http://getreu.net/%C3%9C%20&amp;">http://getreu.net/Ü &amp;</a>abc"#;
        let (rest, (skipped, link)) = take_link(input).unwrap();
        assert_eq!(rest, "abc");
        assert_eq!(skipped, "");
        let expected = Link::Text2Dest(
            Cow::from("http://getreu.net/Ü &"),
            Cow::from("http://getreu.net/%C3%9C%20&"),
            Cow::from(""),
        );
        assert_eq!(link, expected);
    }
}