web_archive/
lib.rs

1#![warn(missing_docs)]
2#![forbid(unsafe_code)]
3
//! The purpose of this crate is to download a web page, then download
//! its linked image, JavaScript, and CSS resources and embed them in
//! the HTML.
7//!
8//! Both async and blocking APIs are provided, making use of `reqwest`'s
9//! support for both. The blocking APIs are enabled with the `blocking`
10// Copyright 2021 David Young
11//
12// Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
13// http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
14// http://opensource.org/licenses/MIT>, at your option. This file may not be
15// copied, modified, or distributed except according to those terms.
16
17//! feature.
18//!
19//! ## Examples
20//!
21//! ### Async
22//!
23//! ```no_run
24//! use web_archive::archive;
25//!
26//! # async fn archive_async() {
27//! // Fetch page and all its resources
28//! let archive = archive("http://example.com", Default::default())
29//!     .await
30//!     .unwrap();
31//!
32//! // Embed the resources into the page
33//! let page = archive.embed_resources();
34//! println!("{}", page);
35//! # }
36//!
37//! ```
38//!
39//! ### Blocking
40//!
41//! ```no_run
42//! use web_archive::blocking;
43//!
44//! // Fetch page and all its resources
45//! let archive =
46//!     blocking::archive("http://example.com", Default::default()).unwrap();
47//!
48//! // Embed the resources into the page
49//! let page = archive.embed_resources();
50//! println!("{}", page);
51//!
52//! ```
53//!
54//! ### Ignore certificate errors (dangerous!)
55//!
56//! ```no_run
57//! use web_archive::{archive, ArchiveOptions};
58//!
59//! # async fn archive_async() {
60//! // Fetch page and all its resources
61//! let archive_options = ArchiveOptions {
62//!     accept_invalid_certificates: true,
63//!     ..Default::default()
64//! };
65//! let archive = archive("http://example.com", archive_options)
66//!     .await
67//!     .unwrap();
68//!
69//! // Embed the resources into the page
70//! let page = archive.embed_resources();
71//! println!("{}", page);
72//! # }
73//!
74//! ```
75
76pub use error::Error;
77pub use page_archive::PageArchive;
78use parsing::{mimetype_from_response, parse_resource_urls};
79pub use parsing::{ImageResource, Resource, ResourceMap, ResourceUrl};
80use reqwest::StatusCode;
81use std::convert::TryInto;
82use std::fmt::Display;
83use url::Url;
84
85pub mod error;
86pub mod page_archive;
87pub mod parsing;
88
89#[cfg(feature = "blocking")]
90pub mod blocking;
91
92/// The async archive function.
93///
94/// Takes in a URL and attempts to download the page and its resources.
95/// Network errors get wrapped in [`Error`] and returned as the `Err`
96/// case.
97pub async fn archive<U>(
98    url: U,
99    options: ArchiveOptions,
100) -> Result<PageArchive, Error>
101where
102    U: TryInto<Url>,
103    <U as TryInto<Url>>::Error: Display,
104{
105    let url: Url = url
106        .try_into()
107        .map_err(|e| Error::ParseError(format!("{}", e)))?;
108
109    // Initialise client
110    let client = reqwest::Client::builder()
111        .use_native_tls()
112        .danger_accept_invalid_certs(options.accept_invalid_certificates)
113        .danger_accept_invalid_hostnames(options.accept_invalid_certificates)
114        .build()?;
115
116    // Fetch the page contents
117    let content = client.get(url.clone()).send().await?.text().await?;
118
119    // Determine the resources that the page needs
120    let resource_urls = parse_resource_urls(&url, &content);
121
122    // Download them
123    let mut resource_map = ResourceMap::new();
124    for resource_url in resource_urls {
125        use ResourceUrl::*;
126
127        let response = client.get(resource_url.url().clone()).send().await?;
128        if response.status() != StatusCode::OK {
129            // Skip any errors
130            continue;
131        }
132        match resource_url {
133            Image(u) => {
134                // Get mimetype of image
135                let data = response.bytes().await?;
136                let mimetype = mimetype_from_response(&data, &u);
137                resource_map.insert(
138                    u,
139                    Resource::Image(ImageResource { data, mimetype }),
140                );
141            }
142            Css(u) => {
143                resource_map.insert(u, Resource::Css(response.text().await?));
144            }
145            Javascript(u) => {
146                resource_map
147                    .insert(u, Resource::Javascript(response.text().await?));
148            }
149        }
150    }
151
152    Ok(PageArchive {
153        url,
154        content,
155        resource_map,
156    })
157}
158
/// Configuration options to control aspects of the archiving behaviour.
///
/// Every option has a default, so struct update syntax can be used to
/// override only the options of interest:
/// `ArchiveOptions { accept_invalid_certificates: true, ..Default::default() }`.
// `Default` is derived rather than hand-written: the derived value for a
// `bool` field is `false`, which is the documented default below.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ArchiveOptions {
    /// Accept invalid certificates or certificates that do not match
    /// the requested hostname. For example, performing an HTTPS request
    /// against an IP address will more than likely result in a hostname
    /// mismatch.
    ///
    /// Corresponds to [`reqwest::ClientBuilder::danger_accept_invalid_certs`]
    /// and [`reqwest::ClientBuilder::danger_accept_invalid_hostnames`].
    ///
    /// Default: `false`
    pub accept_invalid_certificates: bool,
}
180
#[cfg(test)]
mod tests {
    use super::*;
    use tokio_test::block_on;

    /// A string that cannot be converted into a `Url` must make `archive`
    /// fail with `Error::ParseError` rather than any other error variant.
    #[test]
    fn parse_invalid_url_async() {
        let outcome =
            block_on(archive("this~is~not~a~url", Default::default()));

        // It has to be an error at all...
        assert!(outcome.is_err());

        // ...and specifically the parse-error variant.
        assert!(
            matches!(outcome, Err(Error::ParseError(_))),
            "Expected parse error"
        );
    }
}