web_archive/
blocking.rs

1// Copyright 2020 David Young
2//
3// Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
4// http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
5// http://opensource.org/licenses/MIT>, at your option. This file may not be
6// copied, modified, or distributed except according to those terms.
7
8//! ### Blocking
9//!
10//! This is the blocking API
11//!
12//! ```no_run
13//! use web_archive::blocking;
14//!
15//! // Fetch page and all its resources
16//! let archive = blocking::archive("http://example.com", Default::default())
17//!     .unwrap();
18//!
19//! // Embed the resources into the page
20//! let page = archive.embed_resources();
21//! println!("{}", page);
22//!
23//! ```
24
25use crate::error::Error;
26use crate::page_archive::PageArchive;
27use crate::parsing::{
28    mimetype_from_response, parse_resource_urls, ImageResource, Resource,
29    ResourceMap, ResourceUrl,
30};
31use crate::ArchiveOptions;
32use reqwest::StatusCode;
33use std::convert::TryInto;
34use std::fmt::Display;
35use url::Url;
36
37/// The blocking archive function.
38///
39/// Takes in a URL and attempts to download the page and its resources.
40/// Network errors get wrapped in [`Error`] and returned as the `Err`
41/// case.
42pub fn archive<U>(url: U, options: ArchiveOptions) -> Result<PageArchive, Error>
43where
44    U: TryInto<Url>,
45    <U as TryInto<Url>>::Error: Display,
46{
47    let url: Url = url
48        .try_into()
49        .map_err(|e| Error::ParseError(format!("{}", e)))?;
50
51    // Initialise client
52    let client = reqwest::blocking::Client::builder()
53        .use_native_tls()
54        .danger_accept_invalid_certs(options.accept_invalid_certificates)
55        .danger_accept_invalid_hostnames(options.accept_invalid_certificates)
56        .build()?;
57
58    // Fetch the page contents
59    let content = client.get(url.clone()).send()?.text()?;
60
61    // Determine the resources that the page needs
62    let resource_urls = parse_resource_urls(&url, &content);
63    let mut resource_map = ResourceMap::new();
64
65    // Download them
66    for resource_url in resource_urls {
67        use ResourceUrl::*;
68
69        let response = client.get(resource_url.url().clone()).send()?;
70        if response.status() != StatusCode::OK {
71            // Skip any errors
72            println!("Code: {}", response.status());
73            continue;
74        }
75        match resource_url {
76            Image(u) => {
77                let data = response.bytes()?;
78                let mimetype = mimetype_from_response(&data, &u);
79                resource_map.insert(
80                    u,
81                    Resource::Image(ImageResource { data, mimetype }),
82                );
83            }
84            Css(u) => {
85                resource_map.insert(u, Resource::Css(response.text()?));
86            }
87            Javascript(u) => {
88                resource_map.insert(u, Resource::Javascript(response.text()?));
89            }
90        }
91    }
92
93    Ok(PageArchive {
94        url,
95        content,
96        resource_map,
97    })
98}
99
#[cfg(test)]
mod tests {
    use super::*;

    /// A string that cannot be converted into a `Url` must be rejected
    /// with `Error::ParseError`.
    #[test]
    fn parse_invalid_url_blocking() {
        let res = archive("this~is~not~a~url", Default::default());
        assert!(res.is_err());

        match res {
            // Okay, it's a parse error
            Err(Error::ParseError(_)) => {}
            _ => panic!("Expected parse error"),
        }
    }
}
117}