web_archive/lib.rs
1#![warn(missing_docs)]
2#![forbid(unsafe_code)]
3
4//! The purpose of this crate is to download a web page, then download
5//! its linked image, Javascript, and CSS resources and embed them in
6//! the HTML.
7//!
8//! Both async and blocking APIs are provided, making use of `reqwest`'s
//! support for both. The blocking APIs are enabled with the `blocking`
//! feature.

// Copyright 2021 David Young
//
// Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
// http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
// http://opensource.org/licenses/MIT>, at your option. This file may not be
// copied, modified, or distributed except according to those terms.

18//!
19//! ## Examples
20//!
21//! ### Async
22//!
23//! ```no_run
24//! use web_archive::archive;
25//!
26//! # async fn archive_async() {
27//! // Fetch page and all its resources
28//! let archive = archive("http://example.com", Default::default())
29//! .await
30//! .unwrap();
31//!
32//! // Embed the resources into the page
33//! let page = archive.embed_resources();
34//! println!("{}", page);
35//! # }
36//!
37//! ```
38//!
39//! ### Blocking
40//!
41//! ```no_run
42//! use web_archive::blocking;
43//!
44//! // Fetch page and all its resources
45//! let archive =
46//! blocking::archive("http://example.com", Default::default()).unwrap();
47//!
48//! // Embed the resources into the page
49//! let page = archive.embed_resources();
50//! println!("{}", page);
51//!
52//! ```
53//!
54//! ### Ignore certificate errors (dangerous!)
55//!
56//! ```no_run
57//! use web_archive::{archive, ArchiveOptions};
58//!
59//! # async fn archive_async() {
60//! // Fetch page and all its resources
61//! let archive_options = ArchiveOptions {
62//! accept_invalid_certificates: true,
63//! ..Default::default()
64//! };
65//! let archive = archive("http://example.com", archive_options)
66//! .await
67//! .unwrap();
68//!
69//! // Embed the resources into the page
70//! let page = archive.embed_resources();
71//! println!("{}", page);
72//! # }
73//!
74//! ```
75
76pub use error::Error;
77pub use page_archive::PageArchive;
78use parsing::{mimetype_from_response, parse_resource_urls};
79pub use parsing::{ImageResource, Resource, ResourceMap, ResourceUrl};
80use reqwest::StatusCode;
81use std::convert::TryInto;
82use std::fmt::Display;
83use url::Url;
84
85pub mod error;
86pub mod page_archive;
87pub mod parsing;
88
89#[cfg(feature = "blocking")]
90pub mod blocking;
91
92/// The async archive function.
93///
94/// Takes in a URL and attempts to download the page and its resources.
95/// Network errors get wrapped in [`Error`] and returned as the `Err`
96/// case.
97pub async fn archive<U>(
98 url: U,
99 options: ArchiveOptions,
100) -> Result<PageArchive, Error>
101where
102 U: TryInto<Url>,
103 <U as TryInto<Url>>::Error: Display,
104{
105 let url: Url = url
106 .try_into()
107 .map_err(|e| Error::ParseError(format!("{}", e)))?;
108
109 // Initialise client
110 let client = reqwest::Client::builder()
111 .use_native_tls()
112 .danger_accept_invalid_certs(options.accept_invalid_certificates)
113 .danger_accept_invalid_hostnames(options.accept_invalid_certificates)
114 .build()?;
115
116 // Fetch the page contents
117 let content = client.get(url.clone()).send().await?.text().await?;
118
119 // Determine the resources that the page needs
120 let resource_urls = parse_resource_urls(&url, &content);
121
122 // Download them
123 let mut resource_map = ResourceMap::new();
124 for resource_url in resource_urls {
125 use ResourceUrl::*;
126
127 let response = client.get(resource_url.url().clone()).send().await?;
128 if response.status() != StatusCode::OK {
129 // Skip any errors
130 continue;
131 }
132 match resource_url {
133 Image(u) => {
134 // Get mimetype of image
135 let data = response.bytes().await?;
136 let mimetype = mimetype_from_response(&data, &u);
137 resource_map.insert(
138 u,
139 Resource::Image(ImageResource { data, mimetype }),
140 );
141 }
142 Css(u) => {
143 resource_map.insert(u, Resource::Css(response.text().await?));
144 }
145 Javascript(u) => {
146 resource_map
147 .insert(u, Resource::Javascript(response.text().await?));
148 }
149 }
150 }
151
152 Ok(PageArchive {
153 url,
154 content,
155 resource_map,
156 })
157}
158
/// Configuration options to control aspects of the archiving behaviour.
///
/// Every option has a default value, so a value can be created with
/// [`Default::default`] and individual fields overridden using struct
/// update syntax (`ArchiveOptions { field: value, ..Default::default() }`).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ArchiveOptions {
    /// Accept invalid certificates or certificates that do not match
    /// the requested hostname. For example, performing an HTTPS request
    /// against an IP address will more than likely result in a hostname
    /// mismatch.
    ///
    /// Corresponds to [`reqwest::ClientBuilder::danger_accept_invalid_certs`]
    /// and [`reqwest::ClientBuilder::danger_accept_invalid_hostnames`].
    ///
    /// Default: `false`
    pub accept_invalid_certificates: bool,
}
180
#[cfg(test)]
mod tests {
    use super::*;
    use tokio_test::block_on;

    /// A string that cannot be converted into a URL must be rejected with
    /// `Error::ParseError` rather than any other error variant.
    #[test]
    fn parse_invalid_url_async() {
        let u = "this~is~not~a~url";

        // Drive the async API to completion on the current thread.
        let res = block_on(archive(u, Default::default()));
        assert!(res.is_err());

        // Being an error is not enough: it has to be the parse variant.
        match res {
            Err(Error::ParseError(_)) => {}
            _ => panic!("Expected parse error"),
        }
    }
}