1use bytes::Bytes;
11use kuchiki::traits::TendrilSink;
12use kuchiki::{parse_html, NodeData};
13use std::collections::HashMap;
14use url::Url;
15
/// Table of leading byte signatures ("magic numbers") and the MIME type each
/// one identifies.
///
/// `mimetype_from_response` scans the entries in order and returns the MIME
/// type of the first signature that matches the start of the data, so earlier
/// entries take precedence over later ones.
///
/// NOTE(review): the `....` runs inside some signatures look like placeholders
/// for four arbitrary bytes (e.g. a RIFF chunk size or an MP4 box length) --
/// confirm that the matching code treats them that way rather than as literal
/// dots.
const MAGIC: [(&[u8], &str); 18] = [
    // Images
    (b"GIF87a", "image/gif"),
    (b"GIF89a", "image/gif"),
    (b"\xFF\xD8\xFF", "image/jpeg"),
    (b"\x89PNG\x0D\x0A\x1A\x0A", "image/png"),
    (b"<svg ", "image/svg+xml"),
    (b"RIFF....WEBPVP8 ", "image/webp"),
    (b"\x00\x00\x01\x00", "image/x-icon"),
    // Audio
    (b"ID3", "audio/mpeg"),
    (b"\xFF\x0E", "audio/mpeg"),
    (b"\xFF\x0F", "audio/mpeg"),
    (b"OggS", "audio/ogg"),
    (b"RIFF....WAVEfmt ", "audio/wav"),
    (b"fLaC", "audio/x-flac"),
    // Video
    (b"RIFF....AVI LIST", "video/avi"),
    (b"....ftyp", "video/mp4"),
    (b"\x00\x00\x01\x0B", "video/mpeg"),
    (b"....moov", "video/quicktime"),
    (b"\x1A\x45\xDF\xA3", "video/webm"),
];
40
41pub(crate) fn parse_resource_urls(
43 url_base: &Url,
44 page: &str,
45) -> Vec<ResourceUrl> {
46 let document = parse_html().one(page);
47
48 let mut resource_urls = Vec::new();
50
51 for element in document.select("img").unwrap() {
52 let node = element.as_node();
53 if let NodeData::Element(data) = node.data() {
54 let attr = data.attributes.borrow();
55 if let Some(u) = attr.get("src") {
56 if let Ok(u) = url_base.join(u) {
57 resource_urls.push(ResourceUrl::Image(u));
58 }
59 }
60 }
61 }
62
63 for element in document.select("link").unwrap() {
64 let node = element.as_node();
65 if let NodeData::Element(data) = node.data() {
66 let attr = data.attributes.borrow();
67 if Some("stylesheet") == attr.get("rel") {
68 if let Some(u) = attr.get("href") {
69 if let Ok(u) = url_base.join(u) {
70 resource_urls.push(ResourceUrl::Css(u));
71 }
72 }
73 }
74 }
75 }
76
77 for element in document.select("script").unwrap() {
78 let node = element.as_node();
79 if let NodeData::Element(data) = node.data() {
80 let attr = data.attributes.borrow();
81 if let Some(u) = attr.get("src") {
82 if let Ok(u) = url_base.join(u) {
83 resource_urls.push(ResourceUrl::Javascript(u));
84 }
85 }
86 }
87 }
88
89 resource_urls.sort();
91 resource_urls.dedup();
92
93 resource_urls
94}
95
/// The URL of an external resource referenced by a page, tagged with the kind
/// of resource it points to.
#[derive(Debug, PartialEq, Eq)]
pub enum ResourceUrl {
    /// URL of a script (taken from the `src` of a `<script>` element by
    /// `parse_resource_urls`).
    Javascript(Url),
    /// URL of a stylesheet (taken from the `href` of a
    /// `<link rel="stylesheet">` element by `parse_resource_urls`).
    Css(Url),
    /// URL of an image (taken from the `src` of an `<img>` element by
    /// `parse_resource_urls`).
    Image(Url),
}
106
107impl ResourceUrl {
108 pub fn url(&self) -> &Url {
110 use ResourceUrl::*;
111 match self {
112 Javascript(u) => &u,
113 Css(u) => &u,
114 Image(u) => &u,
115 }
116 }
117}
118
119impl PartialOrd for ResourceUrl {
120 fn partial_cmp(&self, rhs: &ResourceUrl) -> Option<std::cmp::Ordering> {
121 Some(self.url().cmp(rhs.url()))
122 }
123}
124
125impl Ord for ResourceUrl {
126 fn cmp(&self, rhs: &ResourceUrl) -> std::cmp::Ordering {
127 self.url().cmp(rhs.url())
128 }
129}
130
/// Lookup table from a resource's [`Url`] to the corresponding [`Resource`].
pub type ResourceMap = HashMap<Url, Resource>;
134
/// The content of an external resource, tagged by kind.
///
/// The variants parallel those of [`ResourceUrl`].
#[derive(Debug, PartialEq, Eq)]
pub enum Resource {
    /// A script, held as a string (presumably its source text -- confirm
    /// against the code that builds this value).
    Javascript(String),
    /// A stylesheet, held as a string (presumably its source text -- confirm
    /// against the code that builds this value).
    Css(String),
    /// An image: raw bytes plus a MIME type.
    Image(ImageResource),
}
146
/// Binary image content together with its MIME type.
#[derive(Debug, PartialEq, Eq)]
pub struct ImageResource {
    /// The image bytes; base64-encoded by `to_data_uri`.
    pub data: Bytes,
    /// MIME type string, inserted verbatim into the URI built by
    /// `to_data_uri` (e.g. `image/png`).
    pub mimetype: String,
}
155
156impl ImageResource {
157 pub fn to_data_uri(&self) -> String {
160 let encoded = base64::encode(&self.data);
161 format!("data:{};base64,{}", self.mimetype, encoded)
162 }
163}
164
165pub(crate) fn mimetype_from_response(data: &[u8], url: &Url) -> String {
167 for item in MAGIC.iter() {
168 if data.starts_with(item.0) {
169 return item.1.to_string();
170 }
171 }
172
173 if url.path().to_lowercase().ends_with(".svg") {
174 return "image/svg+xml".to_string();
175 }
176
177 "".to_string()
178}
179
#[cfg(test)]
mod test {
    use super::*;

    /// Base URL shared by the tests that do not care about the page location.
    fn u() -> Url {
        Url::parse("http://example.com").unwrap()
    }

    /// A PNG image must be rendered as a base64 `data:` URI with the right
    /// header and payload.
    #[test]
    fn test_image_resouce_base_64() {
        let img = ImageResource {
            data: Bytes::from(
                include_bytes!(
                    "../dynamic_tests/resources/rustacean-flat-happy.png"
                )
                .to_vec(),
            ),
            mimetype: "image/png".to_string(),
        };

        let data_uri = img.to_data_uri();

        // `iVBORw0KGgo` is the base64 encoding of the 8-byte PNG signature,
        // so this pins both the URI header and the start of the payload.
        assert!(data_uri.starts_with("data:image/png;base64,iVBORw0KGgo"));
        assert!(data_uri.ends_with("Q/hkoEnAH1wAAAABJRU5ErkJggg=="));
    }

    /// `<img src>` is reported as an image URL.
    #[test]
    fn test_image_tags() {
        let html = r#"
        <!DOCTYPE html>
        <html>
            <head></head>
            <body>
                <div id="content">
                    <img src="/images/fun.png" />
                </div>
            </body>
        </html>
        "#;

        let resource_urls = parse_resource_urls(&u(), html);

        assert_eq!(resource_urls.len(), 1);
        assert_eq!(
            resource_urls[0],
            ResourceUrl::Image(
                Url::parse("http://example.com/images/fun.png").unwrap()
            )
        );
    }

    /// Only `<link rel="stylesheet">` is reported; other `rel` values are
    /// ignored.
    #[test]
    fn test_css_tags() {
        let html = r#"
        <!DOCTYPE html>
        <html>
            <head>
                <link rel="stylesheet" type="text/css" href="/style.css" />
                <link rel="something_else" href="NOT_ALLOWED" />
            </head>
            <body>
                <div id="content">
                </div>
            </body>
        </html>
        "#;

        let resource_urls = parse_resource_urls(&u(), html);

        assert_eq!(resource_urls.len(), 1);
        assert_eq!(
            resource_urls[0],
            ResourceUrl::Css(
                Url::parse("http://example.com/style.css").unwrap()
            )
        );
    }

    /// `<script src>` is reported as a JavaScript URL.
    #[test]
    fn test_script_tags() {
        let html = r#"
        <!DOCTYPE html>
        <html>
            <head>
                <script language="javascript" src="/js.js"></script>
            </head>
            <body>
                <div id="content">
                </div>
            </body>
        </html>
        "#;

        let resource_urls = parse_resource_urls(&u(), html);

        assert_eq!(resource_urls.len(), 1);
        assert_eq!(
            resource_urls[0],
            ResourceUrl::Javascript(
                Url::parse("http://example.com/js.js").unwrap()
            )
        );
    }

    /// Resources are found at any nesting depth and returned sorted.
    #[test]
    fn test_deep_nesting() {
        let html = r#"
        <!DOCTYPE html>
        <html>
            <head>
                <script language="javascript" src="/js.js"></script>
                <link rel="stylesheet" href="1.css" type="text/css" />
            </head>
            <body>
                <div id="content">
                    <div><div><div>
                        <img src="1.png" />
                    </div></div>
                        <script src="2.js"></script>
                    </div>
                    <div><div>
                        <img src="2.tiff" />
                    </div></div>
                </div>
            </body>
        </html>
        "#;

        let resource_urls = parse_resource_urls(&u(), html);

        let mut test_urls = vec![
            ResourceUrl::Javascript(
                Url::parse("http://example.com/js.js").unwrap(),
            ),
            ResourceUrl::Css(Url::parse("http://example.com/1.css").unwrap()),
            ResourceUrl::Image(Url::parse("http://example.com/1.png").unwrap()),
            ResourceUrl::Javascript(
                Url::parse("http://example.com/2.js").unwrap(),
            ),
            ResourceUrl::Image(
                Url::parse("http://example.com/2.tiff").unwrap(),
            ),
        ];
        test_urls.sort();

        assert_eq!(resource_urls.len(), 5);
        assert_eq!(resource_urls, test_urls);
    }

    /// Relative, root-relative and absolute references are all resolved
    /// against the base URL.
    #[test]
    fn test_relative_paths() {
        let html = r#"
        <!DOCTYPE html>
        <html>
            <head></head>
            <body>
                <div id="content">
                    <img src="../../images/fun.png" />
                    <img src="/absolute_path.jpg" />
                    <img src="https://www.rust-lang.org/static/images/rust-logo-blk.svg" />
                </div>
            </body>
        </html>
        "#;

        let u = Url::parse("http://example.com/one/two/three/four/").unwrap();
        let resource_urls = parse_resource_urls(&u, html);
        let mut test_urls = vec![
            ResourceUrl::Image(
                Url::parse("http://example.com/one/two/images/fun.png")
                    .unwrap(),
            ),
            ResourceUrl::Image(
                Url::parse("http://example.com/absolute_path.jpg").unwrap(),
            ),
            ResourceUrl::Image(
                Url::parse(
                    "https://www.rust-lang.org/static/images/rust-logo-blk.svg",
                )
                .unwrap(),
            ),
        ];
        test_urls.sort();

        assert_eq!(resource_urls.len(), 3);
        assert_eq!(resource_urls, test_urls);
    }

    /// Upper-case tag and attribute names are handled like lower-case ones.
    #[test]
    fn test_upper_case_tags() {
        let html = r#"
        <HTML>
            <HEAD>
                <SCRIPT LANGUAGE="javascript" SRC="/js.js"></SCRIPT>
            </HEAD>
            <BODY>
                <DIV ID="content">
                </DIV>
            </BODY>
        </HTML>
        "#;

        let resource_urls = parse_resource_urls(&u(), html);

        assert_eq!(resource_urls.len(), 1);
        assert_eq!(
            resource_urls[0],
            ResourceUrl::Javascript(
                Url::parse("http://example.com/js.js").unwrap()
            )
        );
    }

    /// Unclosed elements must not prevent resources from being found.
    #[test]
    fn test_malformed_html() {
        let html = r#"
        <!DOCTYPE html>
        <html>
            <head>
                <script language="javascript" src="/js.js"></script>
            </head>
            <body>
                <div id="content">
                    <p>Closing paragraphs is for losers
                    <p><img src="a.jpg">
                </div>
            </body>
        </html>
        "#;

        let resource_urls = parse_resource_urls(&u(), html);
        let mut test_urls = vec![
            ResourceUrl::Javascript(
                Url::parse("http://example.com/js.js").unwrap(),
            ),
            ResourceUrl::Image(Url::parse("http://example.com/a.jpg").unwrap()),
        ];
        test_urls.sort();

        assert_eq!(resource_urls.len(), 2);
        assert_eq!(resource_urls, test_urls);
    }

    /// MIME types are detected for a PNG and an SVG fixture.
    #[test]
    fn test_mimetype_detection() {
        let data: &[u8] = include_bytes!(
            "../dynamic_tests/resources/rustacean-flat-happy.png"
        );
        let url = Url::parse("http://example.com/ferris.png").unwrap();
        let mimetype = mimetype_from_response(data, &url);
        assert_eq!(mimetype, "image/png");

        let data: &[u8] =
            include_bytes!("../dynamic_tests/resources/rust-logo-blk.svg");
        let url = Url::parse("http://example.com/rust.svg").unwrap();
        let mimetype = mimetype_from_response(data, &url);
        assert_eq!(mimetype, "image/svg+xml");
    }
}