parse_hyperlinks_extras/parser/
parse_html.rs

1//! This module implements parsers to extract hyperlinks and image elements
2//! from HTML text input. The parsers in this module search for HTML only,
3//! no other markup languages are recognized.
4#![allow(dead_code)]
5
6use nom::bytes::complete::take_till;
7use nom::character::complete::anychar;
8use parse_hyperlinks::parser::html::html_text2dest;
9use parse_hyperlinks::parser::html::html_text2dest_link;
10use parse_hyperlinks::parser::html_img::html_img;
11use parse_hyperlinks::parser::html_img::html_img2dest_link;
12use parse_hyperlinks::parser::html_img::html_img_link;
13use parse_hyperlinks::parser::Link;
14use std::borrow::Cow;
15
16/// Consumes the input until the parser finds an HTML formatted _inline image_ (`Link::Image`).
17///
18/// The parser consumes the finding and returns
19/// `Ok((remaining_input, (skipped_input, (img_alt, img_src))))` or some error.
20///
21///
22/// # HTML
23///
24#[allow(clippy::type_complexity)]
25/// ```
26/// use parse_hyperlinks::parser::Link;
27/// use parse_hyperlinks_extras::parser::parse_html::take_img;
28/// use std::borrow::Cow;
29///
30/// let i = r#"abc<img src="destination1" alt="text1">abc
31/// abc<img src="destination2" alt="text2">abc
32/// "#;
33///
34/// let (i, r) = take_img(i).unwrap();
35/// assert_eq!(r.0, "abc");
36/// assert_eq!(r.1, (Cow::from("text1"), Cow::from("destination1")));
37/// let (i, r) = take_img(i).unwrap();
38/// assert_eq!(r.0, "abc\nabc");
39/// assert_eq!(r.1, (Cow::from("text2"), Cow::from("destination2")));
40/// ```
41pub fn take_img(i: &str) -> nom::IResult<&str, (&str, (Cow<str>, Cow<str>))> {
42    let mut j = i;
43    let mut skip_count = 0;
44
45    let res = loop {
46        // Start searching for inline images.
47
48        // Regular `Link::Image` can start everywhere.
49        if let Ok((k, r)) = html_img(j) {
50            break (k, r);
51        };
52
53        // This makes sure that we advance.
54        let (k, _) = anychar(j)?;
55        skip_count += j.len() - k.len();
56        j = k;
57
58        // This might not consume bytes and never fails.
59        let (k, _) = take_till(|c| c == '<')(j)?;
60
61        skip_count += j.len() - k.len();
62        j = k;
63    };
64
65    // We found a link. Return it.
66    let (l, link) = res;
67
68    let skipped_input = &i[0..skip_count];
69
70    Ok((l, (skipped_input, link)))
71}
72
73/// Consumes the input until the parser finds an HTML formatted hyperlink _text2dest_
74/// (`Link::Text2Dest`).
75///
76/// The parser consumes the finding and returns
77/// `Ok((remaining_input, (skipped_input, (link_text, link_destination, link_title))))`
78/// or some error.
79///
80///
81/// # HTML
82///
83#[allow(clippy::type_complexity)]
84/// ```
85/// use parse_hyperlinks::parser::Link;
86/// use parse_hyperlinks_extras::parser::parse_html::take_text2dest;
87/// use std::borrow::Cow;
88///
89/// let i = r#"abc<a href="dest1" title="title1">text1</a>abc
90/// abc<a href="dest2" title="title2">text2</a>abc"#;
91///
92/// let (i, r) = take_text2dest(i).unwrap();
93/// assert_eq!(r.0, "abc");
94/// assert_eq!(r.1, (Cow::from("text1"), Cow::from("dest1"), Cow::from("title1")));
95/// let (i, r) = take_text2dest(i).unwrap();
96/// assert_eq!(r.0, "abc\nabc");
97/// assert_eq!(r.1, (Cow::from("text2"), Cow::from("dest2"), Cow::from("title2")));
98/// ```
99pub fn take_text2dest(i: &str) -> nom::IResult<&str, (&str, (Cow<str>, Cow<str>, Cow<str>))> {
100    let mut j = i;
101    let mut skip_count = 0;
102
103    let res = loop {
104        // Start searching for inline hyperlinks.
105
106        // Regular `Link::Text2Dest` can start everywhere.
107        if let Ok((k, r)) = html_text2dest(j) {
108            break (k, r);
109        };
110
111        // This makes sure that we advance.
112        let (k, _) = anychar(j)?;
113        skip_count += j.len() - k.len();
114        j = k;
115
116        // This might not consume bytes and never fails.
117        let (k, _) = take_till(|c| c == '<')(j)?;
118
119        skip_count += j.len() - k.len();
120        j = k;
121    };
122
123    // We found a link. Return it.
124    let (l, link) = res;
125
126    let skipped_input = &i[0..skip_count];
127
128    Ok((l, (skipped_input, link)))
129}
130
131/// Consumes the input until the parser finds an HTML formatted _inline
132/// image_ (`Link::Image`) and HTML formatted hyperlinks _text2dest_
133/// (`Link::Text2Dest`).
134///
135/// The parser consumes the finding and returns
136/// `Ok((remaining_input, (skipped_input, Link)))` or some error.
137///
138///
139/// # HTML
140///
141/// ```
142/// use parse_hyperlinks::parser::Link;
143/// use parse_hyperlinks_extras::parser::parse_html::take_link;
144/// use std::borrow::Cow;
145///
146/// let i = r#"abc<img src="dest1" alt="text1">abc
147/// abc<a href="dest2" title="title2">text2</a>abc
148/// abc<img src="dest3" alt="text3">abc
149/// abc<a href="dest4" title="title4">text4</a>abc
150/// abc<a href="dest5" title="title5">cde<img alt="alt5" src="src5"/>fgh</a>abc
151/// "#;
152///
153/// let (i, r) = take_link(i).unwrap();
154/// assert_eq!(r.0, "abc");
155/// assert_eq!(r.1, Link::Image(Cow::from("text1"), Cow::from("dest1")));
156/// let (i, r) = take_link(i).unwrap();
157/// assert_eq!(r.0, "abc\nabc");
158/// assert_eq!(r.1, Link::Text2Dest(Cow::from("text2"), Cow::from("dest2"), Cow::from("title2")));
159/// let (i, r) = take_link(i).unwrap();
160/// assert_eq!(r.0, "abc\nabc");
161/// assert_eq!(r.1, Link::Image(Cow::from("text3"), Cow::from("dest3")));
162/// let (i, r) = take_link(i).unwrap();
163/// assert_eq!(r.0, "abc\nabc");
164/// assert_eq!(r.1, Link::Text2Dest(Cow::from("text4"), Cow::from("dest4"), Cow::from("title4")));
165/// let (i, r) = take_link(i).unwrap();
166/// assert_eq!(r.0, "abc\nabc");
167/// assert_eq!(r.1, Link::Image2Dest(Cow::from("cde"), Cow::from("alt5"), Cow::from("src5"), Cow::from("fgh"), Cow::from("dest5"), Cow::from("title5")));
168/// ```
169pub fn take_link(i: &str) -> nom::IResult<&str, (&str, Link)> {
170    let mut j = i;
171    let mut skip_count = 0;
172
173    let res = loop {
174        // Start searching for inline images.
175
176        // Regular `Link::Image` can start everywhere.
177        if let Ok((k, r)) = html_img2dest_link(j) {
178            break (k, r);
179        };
180        // Regular `Link::Image` can start everywhere.
181        if let Ok((k, r)) = html_img_link(j) {
182            break (k, r);
183        };
184        // Regular `Link::Text2Dest` can start everywhere.
185        if let Ok((k, r)) = html_text2dest_link(j) {
186            break (k, r);
187        };
188
189        // This makes sure that we advance.
190        let (k, _) = anychar(j)?;
191        skip_count += j.len() - k.len();
192        j = k;
193
194        // This might not consume bytes and never fails.
195        let (k, _) = take_till(|c| c == '<')(j)?;
196
197        skip_count += j.len() - k.len();
198        j = k;
199    };
200
201    // We found a link. Return it.
202    let (l, link) = res;
203
204    let skipped_input = &i[0..skip_count];
205
206    Ok((l, (skipped_input, link)))
207}
208
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parser::parse_html::take_link;

    /// Runs `take_link` on `input` and compares all three parts of the
    /// result: the remaining input, the skipped input and the link found.
    fn check(input: &str, expected_rest: &str, expected_skipped: &str, expected_link: Link) {
        let (rest, (skipped, link)) = take_link(input).unwrap();
        assert_eq!(rest, expected_rest);
        assert_eq!(skipped, expected_skipped);
        assert_eq!(link, expected_link);
    }

    #[test]
    fn test_take_link() {
        // Image: the HTML entity in `src` is decoded, the percent encoding
        // is left as it is.
        check(
            r#"<img src="t%20m%20p&amp;.jpg" alt="test1" />"#,
            "",
            "",
            Link::Image(Cow::from("test1"), Cow::from("t%20m%20p&.jpg")),
        );

        // Hyperlink whose link text contains a percent encoded space.
        check(
            r#"<a href="http://getreu.net/%C3%9C%20&amp;">http://getreu.net/Ü%20&amp;</a>abc"#,
            "abc",
            "",
            Link::Text2Dest(
                Cow::from("http://getreu.net/Ü%20&"),
                Cow::from("http://getreu.net/%C3%9C%20&"),
                Cow::from(""),
            ),
        );

        // Hyperlink whose link text contains a literal space.
        check(
            r#"<a href="http://getreu.net/%C3%9C%20&amp;">http://getreu.net/Ü &amp;</a>abc"#,
            "abc",
            "",
            Link::Text2Dest(
                Cow::from("http://getreu.net/Ü &"),
                Cow::from("http://getreu.net/%C3%9C%20&"),
                Cow::from(""),
            ),
        );
    }
}