parse_hyperlinks_extras/parser/parse_html.rs
1//! This module implements parsers to extract hyperlinks and image elements
2//! from HTML text input. The parsers in this module search for HTML only,
3//! no other markup languages are recognized.
4#![allow(dead_code)]
5
6use nom::bytes::complete::take_till;
7use nom::character::complete::anychar;
8use parse_hyperlinks::parser::Link;
9use parse_hyperlinks::parser::html::html_text2dest;
10use parse_hyperlinks::parser::html::html_text2dest_link;
11use parse_hyperlinks::parser::html_img::html_img;
12use parse_hyperlinks::parser::html_img::html_img_link;
13use parse_hyperlinks::parser::html_img::html_img2dest_link;
14use std::borrow::Cow;
15
16/// Consumes the input until the parser finds an HTML formatted _inline image_ (`Link::Image`).
17///
18/// The parser consumes the finding and returns
19/// `Ok((remaining_input, (skipped_input, (img_alt, img_src))))` or some error.
20///
21///
22/// # HTML
23///
24#[allow(clippy::type_complexity)]
25/// ```
26/// use parse_hyperlinks::parser::Link;
27/// use parse_hyperlinks_extras::parser::parse_html::take_img;
28/// use std::borrow::Cow;
29///
30/// let i = r#"abc<img src="destination1" alt="text1">abc
31/// abc<img src="destination2" alt="text2">abc
32/// "#;
33///
34/// let (i, r) = take_img(i).unwrap();
35/// assert_eq!(r.0, "abc");
36/// assert_eq!(r.1, (Cow::from("text1"), Cow::from("destination1")));
37/// let (i, r) = take_img(i).unwrap();
38/// assert_eq!(r.0, "abc\nabc");
39/// assert_eq!(r.1, (Cow::from("text2"), Cow::from("destination2")));
40/// ```
41pub fn take_img(i: &'_ str) -> nom::IResult<&'_ str, (&'_ str, (Cow<'_, str>, Cow<'_, str>))> {
42 let mut j = i;
43 let mut skip_count = 0;
44
45 let res = loop {
46 // Start searching for inline images.
47
48 // Regular `Link::Image` can start everywhere.
49 if let Ok((k, r)) = html_img(j) {
50 break (k, r);
51 };
52
53 // This makes sure that we advance.
54 let (k, _) = anychar(j)?;
55 skip_count += j.len() - k.len();
56 j = k;
57
58 // This might not consume bytes and never fails.
59 let (k, _) = take_till(|c| c == '<')(j)?;
60
61 skip_count += j.len() - k.len();
62 j = k;
63 };
64
65 // We found a link. Return it.
66 let (l, link) = res;
67
68 let skipped_input = &i[0..skip_count];
69
70 Ok((l, (skipped_input, link)))
71}
72
73/// Consumes the input until the parser finds an HTML formatted hyperlink _text2dest_
74/// (`Link::Text2Dest`).
75///
76/// The parser consumes the finding and returns
77/// `Ok((remaining_input, (skipped_input, (link_text, link_destination, link_title))))`
78/// or some error.
79///
80///
81/// # HTML
82///
83#[allow(clippy::type_complexity)]
84/// ```
85/// use parse_hyperlinks::parser::Link;
86/// use parse_hyperlinks_extras::parser::parse_html::take_text2dest;
87/// use std::borrow::Cow;
88///
89/// let i = r#"abc<a href="dest1" title="title1">text1</a>abc
90/// abc<a href="dest2" title="title2">text2</a>abc"#;
91///
92/// let (i, r) = take_text2dest(i).unwrap();
93/// assert_eq!(r.0, "abc");
94/// assert_eq!(r.1, (Cow::from("text1"), Cow::from("dest1"), Cow::from("title1")));
95/// let (i, r) = take_text2dest(i).unwrap();
96/// assert_eq!(r.0, "abc\nabc");
97/// assert_eq!(r.1, (Cow::from("text2"), Cow::from("dest2"), Cow::from("title2")));
98/// ```
99pub fn take_text2dest(
100 i: &'_ str,
101) -> nom::IResult<&'_ str, (&'_ str, (Cow<'_, str>, Cow<'_, str>, Cow<'_, str>))> {
102 let mut j = i;
103 let mut skip_count = 0;
104
105 let res = loop {
106 // Start searching for inline hyperlinks.
107
108 // Regular `Link::Text2Dest` can start everywhere.
109 if let Ok((k, r)) = html_text2dest(j) {
110 break (k, r);
111 };
112
113 // This makes sure that we advance.
114 let (k, _) = anychar(j)?;
115 skip_count += j.len() - k.len();
116 j = k;
117
118 // This might not consume bytes and never fails.
119 let (k, _) = take_till(|c| c == '<')(j)?;
120
121 skip_count += j.len() - k.len();
122 j = k;
123 };
124
125 // We found a link. Return it.
126 let (l, link) = res;
127
128 let skipped_input = &i[0..skip_count];
129
130 Ok((l, (skipped_input, link)))
131}
132
133/// Consumes the input until the parser finds an HTML formatted _inline
134/// image_ (`Link::Image`) and HTML formatted hyperlinks _text2dest_
135/// (`Link::Text2Dest`).
136///
137/// The parser consumes the finding and returns
138/// `Ok((remaining_input, (skipped_input, Link)))` or some error.
139///
140///
141/// # HTML
142///
143/// ```
144/// use parse_hyperlinks::parser::Link;
145/// use parse_hyperlinks_extras::parser::parse_html::take_link;
146/// use std::borrow::Cow;
147///
148/// let i = r#"abc<img src="dest1" alt="text1">abc
149/// abc<a href="dest2" title="title2">text2</a>abc
150/// abc<img src="dest3" alt="text3">abc
151/// abc<a href="dest4" title="title4">text4</a>abc
152/// abc<a href="dest5" title="title5">cde<img alt="alt5" src="src5"/>fgh</a>abc
153/// "#;
154///
155/// let (i, r) = take_link(i).unwrap();
156/// assert_eq!(r.0, "abc");
157/// assert_eq!(r.1, Link::Image(Cow::from("text1"), Cow::from("dest1")));
158/// let (i, r) = take_link(i).unwrap();
159/// assert_eq!(r.0, "abc\nabc");
160/// assert_eq!(r.1, Link::Text2Dest(Cow::from("text2"), Cow::from("dest2"), Cow::from("title2")));
161/// let (i, r) = take_link(i).unwrap();
162/// assert_eq!(r.0, "abc\nabc");
163/// assert_eq!(r.1, Link::Image(Cow::from("text3"), Cow::from("dest3")));
164/// let (i, r) = take_link(i).unwrap();
165/// assert_eq!(r.0, "abc\nabc");
166/// assert_eq!(r.1, Link::Text2Dest(Cow::from("text4"), Cow::from("dest4"), Cow::from("title4")));
167/// let (i, r) = take_link(i).unwrap();
168/// assert_eq!(r.0, "abc\nabc");
169/// assert_eq!(r.1, Link::Image2Dest(Cow::from("cde"), Cow::from("alt5"), Cow::from("src5"), Cow::from("fgh"), Cow::from("dest5"), Cow::from("title5")));
170/// ```
171pub fn take_link(i: &'_ str) -> nom::IResult<&'_ str, (&'_ str, Link<'_>)> {
172 let mut j = i;
173 let mut skip_count = 0;
174
175 let res = loop {
176 // Start searching for inline images.
177
178 // Regular `Link::Image` can start everywhere.
179 if let Ok((k, r)) = html_img2dest_link(j) {
180 break (k, r);
181 };
182 // Regular `Link::Image` can start everywhere.
183 if let Ok((k, r)) = html_img_link(j) {
184 break (k, r);
185 };
186 // Regular `Link::Text2Dest` can start everywhere.
187 if let Ok((k, r)) = html_text2dest_link(j) {
188 break (k, r);
189 };
190
191 // This makes sure that we advance.
192 let (k, _) = anychar(j)?;
193 skip_count += j.len() - k.len();
194 j = k;
195
196 // This might not consume bytes and never fails.
197 let (k, _) = take_till(|c| c == '<')(j)?;
198
199 skip_count += j.len() - k.len();
200 j = k;
201 };
202
203 // We found a link. Return it.
204 let (l, link) = res;
205
206 let skipped_input = &i[0..skip_count];
207
208 Ok((l, (skipped_input, link)))
209}
210
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parser::parse_html::take_link;

    /// Checks `take_link` on inputs that start directly with a link, so
    /// that nothing is skipped in front of the finding.
    #[test]
    fn test_take_link() {
        // A self-closing image: the whole input is consumed.
        let input = r#"<img src="t%20m%20p&.jpg" alt="test1" />"#;
        let (rest, (skipped, link)) = take_link(input).unwrap();
        assert_eq!(rest, "");
        assert_eq!(skipped, "");
        let expected = Link::Image(Cow::from("test1"), Cow::from("t%20m%20p&.jpg"));
        assert_eq!(link, expected);

        // A hyperlink without a title whose link text contains `%20`; the
        // trailing `abc` is handed back as remaining input.
        let input = r#"<a href="http://getreu.net/%C3%9C%20&">http://getreu.net/Ü%20&</a>abc"#;
        let (rest, (skipped, link)) = take_link(input).unwrap();
        assert_eq!(rest, "abc");
        assert_eq!(skipped, "");
        let expected = Link::Text2Dest(
            Cow::from("http://getreu.net/Ü%20&"),
            Cow::from("http://getreu.net/%C3%9C%20&"),
            Cow::from(""),
        );
        assert_eq!(link, expected);

        // The same hyperlink, this time with a literal space in the link text.
        let input = r#"<a href="http://getreu.net/%C3%9C%20&">http://getreu.net/Ü &</a>abc"#;
        let (rest, (skipped, link)) = take_link(input).unwrap();
        assert_eq!(rest, "abc");
        assert_eq!(skipped, "");
        let expected = Link::Text2Dest(
            Cow::from("http://getreu.net/Ü &"),
            Cow::from("http://getreu.net/%C3%9C%20&"),
            Cow::from(""),
        );
        assert_eq!(link, expected);
    }
}