web_archive/
parsing.rs

1// Copyright 2020 David Young
2//
3// Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
4// http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
5// http://opensource.org/licenses/MIT>, at your option. This file may not be
6// copied, modified, or distributed except according to those terms.
7
8//! Module for the core parsing functionality
9
10use bytes::Bytes;
11use kuchiki::traits::TendrilSink;
12use kuchiki::{parse_html, NodeData};
13use std::collections::HashMap;
14use url::Url;
15
// https://github.com/Y2Z/monolith/blob/fa71f6a42c94df4c48d01819922afe1248eabad5/src/utils.rs#L13
//
// Table of `(leading byte signature, mimetype)` pairs. It is scanned in order
// by `mimetype_from_response`, and the first entry that matches wins.
//
// NOTE(review): the `.` bytes in the RIFF/ftyp/moov signatures appear to be
// placeholders for "any byte" (e.g. the 4-byte RIFF chunk size). A lookup
// that compares them literally only matches data holding real `.` bytes at
// those offsets -- confirm the matcher treats them as wildcards.
// NOTE(review): `\xFF\x0E` / `\xFF\x0F` look unusual for an MPEG audio frame
// sync (normally `\xFF` followed by a byte >= `\xE0`) -- verify upstream.
const MAGIC: [(&[u8], &str); 18] = [
    // Image
    (b"GIF87a", "image/gif"),
    (b"GIF89a", "image/gif"),
    (b"\xFF\xD8\xFF", "image/jpeg"),
    (b"\x89PNG\x0D\x0A\x1A\x0A", "image/png"),
    (b"<svg ", "image/svg+xml"),
    (b"RIFF....WEBPVP8 ", "image/webp"),
    (b"\x00\x00\x01\x00", "image/x-icon"),
    // Audio
    (b"ID3", "audio/mpeg"),
    (b"\xFF\x0E", "audio/mpeg"),
    (b"\xFF\x0F", "audio/mpeg"),
    (b"OggS", "audio/ogg"),
    (b"RIFF....WAVEfmt ", "audio/wav"),
    (b"fLaC", "audio/x-flac"),
    // Video
    (b"RIFF....AVI LIST", "video/avi"),
    (b"....ftyp", "video/mp4"),
    (b"\x00\x00\x01\x0B", "video/mpeg"),
    (b"....moov", "video/quicktime"),
    (b"\x1A\x45\xDF\xA3", "video/webm"),
];
40
41/// Search image, style, and script resources and store their URIs
42pub(crate) fn parse_resource_urls(
43    url_base: &Url,
44    page: &str,
45) -> Vec<ResourceUrl> {
46    let document = parse_html().one(page);
47
48    // Collect resource URLs for each element type
49    let mut resource_urls = Vec::new();
50
51    for element in document.select("img").unwrap() {
52        let node = element.as_node();
53        if let NodeData::Element(data) = node.data() {
54            let attr = data.attributes.borrow();
55            if let Some(u) = attr.get("src") {
56                if let Ok(u) = url_base.join(u) {
57                    resource_urls.push(ResourceUrl::Image(u));
58                }
59            }
60        }
61    }
62
63    for element in document.select("link").unwrap() {
64        let node = element.as_node();
65        if let NodeData::Element(data) = node.data() {
66            let attr = data.attributes.borrow();
67            if Some("stylesheet") == attr.get("rel") {
68                if let Some(u) = attr.get("href") {
69                    if let Ok(u) = url_base.join(u) {
70                        resource_urls.push(ResourceUrl::Css(u));
71                    }
72                }
73            }
74        }
75    }
76
77    for element in document.select("script").unwrap() {
78        let node = element.as_node();
79        if let NodeData::Element(data) = node.data() {
80            let attr = data.attributes.borrow();
81            if let Some(u) = attr.get("src") {
82                if let Ok(u) = url_base.join(u) {
83                    resource_urls.push(ResourceUrl::Javascript(u));
84                }
85            }
86        }
87    }
88
89    // Dedup the URLs to avoid fetching the same one twice
90    resource_urls.sort();
91    resource_urls.dedup();
92
93    resource_urls
94}
95
/// Tag the resource URLs with the type of resource they correspond to
///
/// Each variant wraps the absolute [`Url`] of one resource; the variant
/// records which kind of element the URL was referenced from.
#[derive(Debug, PartialEq, Eq)]
pub enum ResourceUrl {
    /// Javascript files
    Javascript(Url),
    /// CSS files
    Css(Url),
    /// Image files
    Image(Url),
}
106
107impl ResourceUrl {
108    /// Returns a reference to the inner [`Url`]
109    pub fn url(&self) -> &Url {
110        use ResourceUrl::*;
111        match self {
112            Javascript(u) => &u,
113            Css(u) => &u,
114            Image(u) => &u,
115        }
116    }
117}
118
119impl PartialOrd for ResourceUrl {
120    fn partial_cmp(&self, rhs: &ResourceUrl) -> Option<std::cmp::Ordering> {
121        Some(self.url().cmp(rhs.url()))
122    }
123}
124
125impl Ord for ResourceUrl {
126    fn cmp(&self, rhs: &ResourceUrl) -> std::cmp::Ordering {
127        self.url().cmp(rhs.url())
128    }
129}
130
/// Type alias for a [`HashMap`] mapping between resource URLs
/// and the downloaded file contents
pub type ResourceMap = HashMap<Url, Resource>;
134
/// Generic resource type
///
/// Holds the downloaded contents of one resource; used as the value type of
/// [`ResourceMap`].
#[derive(Debug, PartialEq, Eq)]
pub enum Resource {
    /// Javascript is stored as a String
    Javascript(String),
    /// Stylesheets are stored as a String
    Css(String),
    /// Images are stored as an [`ImageResource`] so that the mimetype
    /// is kept alongside the raw image data
    Image(ImageResource),
}
146
/// Data type representing an image
///
/// Pairs the raw bytes with their mimetype, which is everything
/// [`ImageResource::to_data_uri`] needs to build a `data:` URI.
#[derive(Debug, PartialEq, Eq)]
pub struct ImageResource {
    /// Raw image data
    pub data: Bytes,
    /// Mime type of the image, e.g. `image/png`
    pub mimetype: String,
}
155
156impl ImageResource {
157    /// Encode the image data as base 64 and embed it into a `data:` URI,
158    /// e.g. `...`.
159    pub fn to_data_uri(&self) -> String {
160        let encoded = base64::encode(&self.data);
161        format!("data:{};base64,{}", self.mimetype, encoded)
162    }
163}
164
165// https://github.com/Y2Z/monolith/blob/fa71f6a42c94df4c48d01819922afe1248eabad5/src/utils.rs#L44
166pub(crate) fn mimetype_from_response(data: &[u8], url: &Url) -> String {
167    for item in MAGIC.iter() {
168        if data.starts_with(item.0) {
169            return item.1.to_string();
170        }
171    }
172
173    if url.path().to_lowercase().ends_with(".svg") {
174        return "image/svg+xml".to_string();
175    }
176
177    "".to_string()
178}
179
180#[cfg(test)]
181mod test {
182    use super::*;
183
184    fn u() -> Url {
185        Url::parse("http://example.com").unwrap()
186    }
187
188    #[test]
189    fn test_image_resouce_base_64() {
190        let img = ImageResource {
191            data: Bytes::from(
192                include_bytes!(
193                    "../dynamic_tests/resources/rustacean-flat-happy.png"
194                )
195                .to_vec(),
196            ),
197            mimetype: "image/png".to_string(),
198        };
199
200        let data_uri = img.to_data_uri();
201
202        // base64 < dynamic_tests/resources/rustacean-flat-happy.png
203        assert!(data_uri
204            .starts_with(""));
205        assert!(data_uri.ends_with("Q/hkoEnAH1wAAAABJRU5ErkJggg=="));
206    }
207
208    #[test]
209    fn test_image_tags() {
210        let html = r#"
211        <!DOCTYPE html>
212        <html>
213            <head></head>
214            <body>
215                <div id="content">
216                    <img src="/images/fun.png" />
217                </div>
218            </body>
219        </html>
220        "#;
221
222        let resource_urls = parse_resource_urls(&u(), &html);
223
224        assert_eq!(resource_urls.len(), 1);
225        assert_eq!(
226            resource_urls[0],
227            ResourceUrl::Image(
228                Url::parse("http://example.com/images/fun.png").unwrap()
229            )
230        );
231    }
232
233    #[test]
234    fn test_css_tags() {
235        let html = r#"
236        <!DOCTYPE html>
237        <html>
238            <head>
239                <link rel="stylesheet" type="text/css" href="/style.css" />
240                <link rel="something_else" href="NOT_ALLOWED" />
241            </head>
242            <body>
243                <div id="content">
244                </div>
245            </body>
246        </html>
247        "#;
248
249        let resource_urls = parse_resource_urls(&u(), &html);
250
251        assert_eq!(resource_urls.len(), 1);
252        assert_eq!(
253            resource_urls[0],
254            ResourceUrl::Css(
255                Url::parse("http://example.com/style.css").unwrap()
256            )
257        );
258    }
259
260    #[test]
261    fn test_script_tags() {
262        let html = r#"
263        <!DOCTYPE html>
264        <html>
265            <head>
266                <script language="javascript" src="/js.js"></script>
267            </head>
268            <body>
269                <div id="content">
270                </div>
271            </body>
272        </html>
273        "#;
274
275        let resource_urls = parse_resource_urls(&u(), &html);
276
277        assert_eq!(resource_urls.len(), 1);
278        assert_eq!(
279            resource_urls[0],
280            ResourceUrl::Javascript(
281                Url::parse("http://example.com/js.js").unwrap()
282            )
283        );
284    }
285
286    #[test]
287    fn test_deep_nesting() {
288        let html = r#"
289        <!DOCTYPE html>
290        <html>
291            <head>
292                <script language="javascript" src="/js.js"></script>
293                <link rel="stylesheet" href="1.css" type="text/css" />
294            </head>
295            <body>
296                <div id="content">
297                    <div><div><div>
298                            <img src="1.png" />
299                        </div></div>
300                        <script src="2.js"></script>
301                    </div>
302                    <div><div>
303                        <img src="2.tiff" />
304                    </div></div>
305                </div>
306            </body>
307        </html>
308        "#;
309
310        let resource_urls = parse_resource_urls(&u(), &html);
311
312        let mut test_urls = vec![
313            ResourceUrl::Javascript(
314                Url::parse("http://example.com/js.js").unwrap(),
315            ),
316            ResourceUrl::Css(Url::parse("http://example.com/1.css").unwrap()),
317            ResourceUrl::Image(Url::parse("http://example.com/1.png").unwrap()),
318            ResourceUrl::Javascript(
319                Url::parse("http://example.com/2.js").unwrap(),
320            ),
321            ResourceUrl::Image(
322                Url::parse("http://example.com/2.tiff").unwrap(),
323            ),
324        ];
325        test_urls.sort();
326
327        assert_eq!(resource_urls.len(), 5);
328        assert_eq!(resource_urls, test_urls,);
329    }
330
331    #[test]
332    fn test_relative_paths() {
333        let html = r#"
334        <!DOCTYPE html>
335        <html>
336            <head></head>
337            <body>
338                <div id="content">
339                    <img src="../../images/fun.png" />
340                    <img src="/absolute_path.jpg" />
341        <img src="https://www.rust-lang.org/static/images/rust-logo-blk.svg" />
342                </div>
343            </body>
344        </html>
345        "#;
346
347        let u = Url::parse("http://example.com/one/two/three/four/").unwrap();
348        let resource_urls = parse_resource_urls(&u, &html);
349        let mut test_urls = vec![
350            ResourceUrl::Image(
351                Url::parse("http://example.com/one/two/images/fun.png")
352                    .unwrap(),
353            ),
354            ResourceUrl::Image(
355                Url::parse("http://example.com/absolute_path.jpg").unwrap(),
356            ),
357            ResourceUrl::Image(
358                Url::parse(
359                    "https://www.rust-lang.org/static/images/rust-logo-blk.svg",
360                )
361                .unwrap(),
362            ),
363        ];
364        test_urls.sort();
365
366        assert_eq!(resource_urls.len(), 3);
367        assert_eq!(resource_urls, test_urls);
368    }
369
370    #[test]
371    fn test_upper_case_tags() {
372        let html = r#"
373        <HTML>
374            <HEAD>
375                <SCRIPT LANGUAGE="javascript" SRC="/js.js"></SCRIPT>
376            </HEAD>
377            <BODY>
378                <DIV ID="content">
379                </DIV>
380            </BODY>
381        </HTML>
382        "#;
383
384        let resource_urls = parse_resource_urls(&u(), &html);
385
386        assert_eq!(resource_urls.len(), 1);
387        assert_eq!(
388            resource_urls[0],
389            ResourceUrl::Javascript(
390                Url::parse("http://example.com/js.js").unwrap()
391            )
392        );
393    }
394
395    #[test]
396    fn test_malformed_html() {
397        let html = r#"
398        <!DOCTYPE html>
399        <html>
400            <head>
401                <script language="javascript" src="/js.js"></script>
402            </head>
403            <body>
404                <div id="content">
405                    <p>Closing paragraphs is for losers
406                    <p><img src="a.jpg">
407                </div>
408            </body>
409        </html>
410        "#;
411
412        let resource_urls = parse_resource_urls(&u(), &html);
413        let mut test_urls = vec![
414            ResourceUrl::Javascript(
415                Url::parse("http://example.com/js.js").unwrap(),
416            ),
417            ResourceUrl::Image(Url::parse("http://example.com/a.jpg").unwrap()),
418        ];
419        test_urls.sort();
420
421        assert_eq!(resource_urls.len(), 2);
422        assert_eq!(resource_urls, test_urls);
423    }
424
425    #[test]
426    fn test_mimetype_detection() {
427        let data: &[u8] = include_bytes!(
428            "../dynamic_tests/resources/rustacean-flat-happy.png"
429        );
430        let url = Url::parse("http://example.com/ferris.png").unwrap();
431        let mimetype = mimetype_from_response(&data, &url);
432        assert_eq!(mimetype, "image/png");
433
434        let data: &[u8] =
435            include_bytes!("../dynamic_tests/resources/rust-logo-blk.svg");
436        let url = Url::parse("http://example.com/rust.svg").unwrap();
437        let mimetype = mimetype_from_response(&data, &url);
438        assert_eq!(mimetype, "image/svg+xml");
439    }
440}