parse_hyperlinks_extras/parser/parse_html.rs
1//! This module implements parsers to extract hyperlinks and image elements
2//! from HTML text input. The parsers in this module search for HTML only,
3//! no other markup languages are recognized.
4#![allow(dead_code)]
5
6use nom::bytes::complete::take_till;
7use nom::character::complete::anychar;
8use parse_hyperlinks::parser::html::html_text2dest;
9use parse_hyperlinks::parser::html::html_text2dest_link;
10use parse_hyperlinks::parser::html_img::html_img;
11use parse_hyperlinks::parser::html_img::html_img2dest_link;
12use parse_hyperlinks::parser::html_img::html_img_link;
13use parse_hyperlinks::parser::Link;
14use std::borrow::Cow;
15
16/// Consumes the input until the parser finds an HTML formatted _inline image_ (`Link::Image`).
17///
18/// The parser consumes the finding and returns
19/// `Ok((remaining_input, (skipped_input, (img_alt, img_src))))` or some error.
20///
21///
22/// # HTML
23///
24#[allow(clippy::type_complexity)]
25/// ```
26/// use parse_hyperlinks::parser::Link;
27/// use parse_hyperlinks_extras::parser::parse_html::take_img;
28/// use std::borrow::Cow;
29///
30/// let i = r#"abc<img src="destination1" alt="text1">abc
31/// abc<img src="destination2" alt="text2">abc
32/// "#;
33///
34/// let (i, r) = take_img(i).unwrap();
35/// assert_eq!(r.0, "abc");
36/// assert_eq!(r.1, (Cow::from("text1"), Cow::from("destination1")));
37/// let (i, r) = take_img(i).unwrap();
38/// assert_eq!(r.0, "abc\nabc");
39/// assert_eq!(r.1, (Cow::from("text2"), Cow::from("destination2")));
40/// ```
41pub fn take_img(i: &str) -> nom::IResult<&str, (&str, (Cow<str>, Cow<str>))> {
42 let mut j = i;
43 let mut skip_count = 0;
44
45 let res = loop {
46 // Start searching for inline images.
47
48 // Regular `Link::Image` can start everywhere.
49 if let Ok((k, r)) = html_img(j) {
50 break (k, r);
51 };
52
53 // This makes sure that we advance.
54 let (k, _) = anychar(j)?;
55 skip_count += j.len() - k.len();
56 j = k;
57
58 // This might not consume bytes and never fails.
59 let (k, _) = take_till(|c| c == '<')(j)?;
60
61 skip_count += j.len() - k.len();
62 j = k;
63 };
64
65 // We found a link. Return it.
66 let (l, link) = res;
67
68 let skipped_input = &i[0..skip_count];
69
70 Ok((l, (skipped_input, link)))
71}
72
73/// Consumes the input until the parser finds an HTML formatted hyperlink _text2dest_
74/// (`Link::Text2Dest`).
75///
76/// The parser consumes the finding and returns
77/// `Ok((remaining_input, (skipped_input, (link_text, link_destination, link_title))))`
78/// or some error.
79///
80///
81/// # HTML
82///
83#[allow(clippy::type_complexity)]
84/// ```
85/// use parse_hyperlinks::parser::Link;
86/// use parse_hyperlinks_extras::parser::parse_html::take_text2dest;
87/// use std::borrow::Cow;
88///
89/// let i = r#"abc<a href="dest1" title="title1">text1</a>abc
90/// abc<a href="dest2" title="title2">text2</a>abc"#;
91///
92/// let (i, r) = take_text2dest(i).unwrap();
93/// assert_eq!(r.0, "abc");
94/// assert_eq!(r.1, (Cow::from("text1"), Cow::from("dest1"), Cow::from("title1")));
95/// let (i, r) = take_text2dest(i).unwrap();
96/// assert_eq!(r.0, "abc\nabc");
97/// assert_eq!(r.1, (Cow::from("text2"), Cow::from("dest2"), Cow::from("title2")));
98/// ```
99pub fn take_text2dest(i: &str) -> nom::IResult<&str, (&str, (Cow<str>, Cow<str>, Cow<str>))> {
100 let mut j = i;
101 let mut skip_count = 0;
102
103 let res = loop {
104 // Start searching for inline hyperlinks.
105
106 // Regular `Link::Text2Dest` can start everywhere.
107 if let Ok((k, r)) = html_text2dest(j) {
108 break (k, r);
109 };
110
111 // This makes sure that we advance.
112 let (k, _) = anychar(j)?;
113 skip_count += j.len() - k.len();
114 j = k;
115
116 // This might not consume bytes and never fails.
117 let (k, _) = take_till(|c| c == '<')(j)?;
118
119 skip_count += j.len() - k.len();
120 j = k;
121 };
122
123 // We found a link. Return it.
124 let (l, link) = res;
125
126 let skipped_input = &i[0..skip_count];
127
128 Ok((l, (skipped_input, link)))
129}
130
131/// Consumes the input until the parser finds an HTML formatted _inline
132/// image_ (`Link::Image`) and HTML formatted hyperlinks _text2dest_
133/// (`Link::Text2Dest`).
134///
135/// The parser consumes the finding and returns
136/// `Ok((remaining_input, (skipped_input, Link)))` or some error.
137///
138///
139/// # HTML
140///
141/// ```
142/// use parse_hyperlinks::parser::Link;
143/// use parse_hyperlinks_extras::parser::parse_html::take_link;
144/// use std::borrow::Cow;
145///
146/// let i = r#"abc<img src="dest1" alt="text1">abc
147/// abc<a href="dest2" title="title2">text2</a>abc
148/// abc<img src="dest3" alt="text3">abc
149/// abc<a href="dest4" title="title4">text4</a>abc
150/// abc<a href="dest5" title="title5">cde<img alt="alt5" src="src5"/>fgh</a>abc
151/// "#;
152///
153/// let (i, r) = take_link(i).unwrap();
154/// assert_eq!(r.0, "abc");
155/// assert_eq!(r.1, Link::Image(Cow::from("text1"), Cow::from("dest1")));
156/// let (i, r) = take_link(i).unwrap();
157/// assert_eq!(r.0, "abc\nabc");
158/// assert_eq!(r.1, Link::Text2Dest(Cow::from("text2"), Cow::from("dest2"), Cow::from("title2")));
159/// let (i, r) = take_link(i).unwrap();
160/// assert_eq!(r.0, "abc\nabc");
161/// assert_eq!(r.1, Link::Image(Cow::from("text3"), Cow::from("dest3")));
162/// let (i, r) = take_link(i).unwrap();
163/// assert_eq!(r.0, "abc\nabc");
164/// assert_eq!(r.1, Link::Text2Dest(Cow::from("text4"), Cow::from("dest4"), Cow::from("title4")));
165/// let (i, r) = take_link(i).unwrap();
166/// assert_eq!(r.0, "abc\nabc");
167/// assert_eq!(r.1, Link::Image2Dest(Cow::from("cde"), Cow::from("alt5"), Cow::from("src5"), Cow::from("fgh"), Cow::from("dest5"), Cow::from("title5")));
168/// ```
169pub fn take_link(i: &str) -> nom::IResult<&str, (&str, Link)> {
170 let mut j = i;
171 let mut skip_count = 0;
172
173 let res = loop {
174 // Start searching for inline images.
175
176 // Regular `Link::Image` can start everywhere.
177 if let Ok((k, r)) = html_img2dest_link(j) {
178 break (k, r);
179 };
180 // Regular `Link::Image` can start everywhere.
181 if let Ok((k, r)) = html_img_link(j) {
182 break (k, r);
183 };
184 // Regular `Link::Text2Dest` can start everywhere.
185 if let Ok((k, r)) = html_text2dest_link(j) {
186 break (k, r);
187 };
188
189 // This makes sure that we advance.
190 let (k, _) = anychar(j)?;
191 skip_count += j.len() - k.len();
192 j = k;
193
194 // This might not consume bytes and never fails.
195 let (k, _) = take_till(|c| c == '<')(j)?;
196
197 skip_count += j.len() - k.len();
198 j = k;
199 };
200
201 // We found a link. Return it.
202 let (l, link) = res;
203
204 let skipped_input = &i[0..skip_count];
205
206 Ok((l, (skipped_input, link)))
207}
208
#[cfg(test)]
mod tests {
    // The glob import brings `take_link`, `Link` and `Cow` into scope.
    use super::*;

    /// Exercises `take_link` with inputs that start directly with a link,
    /// so nothing is skipped in any of the cases.
    #[test]
    fn test_take_link() {
        // Self-closing image whose `src` holds percent escapes and a bare `&`.
        let input = r#"<img src="t%20m%20p&.jpg" alt="test1" />"#;
        let (rest, (skipped, link)) = take_link(input).unwrap();
        assert_eq!(rest, "");
        assert_eq!(skipped, "");
        let expected = Link::Image(Cow::from("test1"), Cow::from("t%20m%20p&.jpg"));
        assert_eq!(link, expected);

        // Hyperlink whose link text contains a percent escape; no title given.
        let input = r#"<a href="http://getreu.net/%C3%9C%20&">http://getreu.net/Ü%20&</a>abc"#;
        let (rest, (skipped, link)) = take_link(input).unwrap();
        assert_eq!(rest, "abc");
        assert_eq!(skipped, "");
        let expected = Link::Text2Dest(
            Cow::from("http://getreu.net/Ü%20&"),
            Cow::from("http://getreu.net/%C3%9C%20&"),
            Cow::from(""),
        );
        assert_eq!(link, expected);

        // Same destination, but the link text contains a literal space.
        let input = r#"<a href="http://getreu.net/%C3%9C%20&">http://getreu.net/Ü &</a>abc"#;
        let (rest, (skipped, link)) = take_link(input).unwrap();
        assert_eq!(rest, "abc");
        assert_eq!(skipped, "");
        let expected = Link::Text2Dest(
            Cow::from("http://getreu.net/Ü &"),
            Cow::from("http://getreu.net/%C3%9C%20&"),
            Cow::from(""),
        );
        assert_eq!(link, expected);
    }
}