1pub mod archivableurl;
2pub mod errors;
3
4pub use crate::archivableurl::ArchivableUrl;
5pub use crate::errors::Error;
6use chrono::{NaiveDateTime, TimeDelta, Utc};
7use reqwest_middleware::{ClientBuilder, ClientWithMiddleware};
8use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
9use serde::Deserialize;
10use url::Url;
11
/// Default maximum number of retries handed to the exponential-backoff retry policy
/// by `ClientConfig::default()`.
const DEFAULT_MAX_REQUEST_RETRIES: u32 = 10;

/// Default snapshot age limit, in days: `ClientConfig::default()` computes its
/// "recent archive" cutoff as the current UTC time minus this many days.
const DEFAULT_ARCHIVE_THRESHOLD_DAYS: i64 = 30;

/// Default `User-Agent` string used by `ClientConfig::default()`.
const DEFAULT_USER_AGENT: &str =
    "Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:40.0) Gecko/20100101 Firefox/40.0";

/// Default endpoint for archive requests; the URL to archive is appended directly to it.
pub const WAYBACK_MACHINE_ARCHIVE_ENDPOINT: &str = "https://web.archive.org/save/";
/// Default endpoint for snapshot lookups; the URL to check is appended directly after the
/// trailing `url=` query parameter.
pub const WAYBACK_MACHINE_CHECK_ENDPOINT: &str =
    "https://web.archive.org/cdx/search/cdx?fl=timestamp&limit=-1&output=json&url=";
28
/// JSON body returned by the check endpoint, deserialized as a list of rows of strings.
///
/// `check_recent_archive_exists` only accepts a response with exactly two rows whose second
/// row holds a single entry, and parses that entry as a `%Y%m%d%H%M%S` timestamp; the first
/// row is ignored.
#[derive(Debug, Deserialize)]
struct WaybackCheckResponse(Vec<Vec<String>>);
31
/// Settings used to build a `WaybackMachineClient`.
pub struct ClientConfig {
    /// Base URL for archive requests; the URL to archive is appended to it.
    archive_endpoint: String,
    /// Base URL for snapshot lookups; the URL to check is appended to it.
    check_endpoint: String,
    /// Retry policy installed on the HTTP client's retry middleware.
    retry_policy: ExponentialBackoff,
    /// Cutoff (naive UTC) computed once when the configuration is constructed; only snapshots
    /// strictly newer than this count as recent. Nothing in this file refreshes it afterwards.
    archive_threshold_timestamp: NaiveDateTime,
    /// `User-Agent` value configured on the HTTP client.
    user_agent: String,
}
40
/// Successful outcome of `WaybackMachineClient::archive_url`.
///
/// `Debug` is derived so that `Result<ArchiveResult, _>` can be used with `unwrap_err`,
/// `expect_err` and `{:?}` formatting; `Clone`, `PartialEq` and `Eq` let callers and tests
/// copy and compare outcomes directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ArchiveResult {
    /// An archive request was sent; carries the final URL of the archive endpoint's response.
    Archived(String),
    /// A snapshot newer than the configured threshold already exists.
    RecentArchiveExists,
}
46
47impl ClientConfig {
48 pub fn new(
50 archive_endpoint: String,
51 check_endpoint: String,
52 max_request_retries: u32,
53 archive_threshold_days: i64,
54 user_agent: String,
55 ) -> Self {
56 ClientConfig {
57 archive_endpoint: Url::parse(&archive_endpoint)
58 .unwrap_or_else(|_| panic!("Invalid archive_endpoint URL: {}", archive_endpoint))
59 .to_string(),
60 check_endpoint: Url::parse(&check_endpoint)
61 .unwrap_or_else(|_| panic!("Invalid check_endpoint URL: {}", check_endpoint))
62 .to_string(),
63 retry_policy: ExponentialBackoff::builder().build_with_max_retries(max_request_retries),
64 archive_threshold_timestamp: (Utc::now()
65 - TimeDelta::try_days(archive_threshold_days).unwrap())
66 .naive_utc(),
67 user_agent,
68 }
69 }
70}
71impl Default for ClientConfig {
72 fn default() -> Self {
74 ClientConfig {
75 archive_endpoint: WAYBACK_MACHINE_ARCHIVE_ENDPOINT.into(),
76 check_endpoint: WAYBACK_MACHINE_CHECK_ENDPOINT.into(),
77 retry_policy: ExponentialBackoff::builder()
78 .build_with_max_retries(DEFAULT_MAX_REQUEST_RETRIES),
79 archive_threshold_timestamp: (Utc::now()
80 - TimeDelta::try_days(DEFAULT_ARCHIVE_THRESHOLD_DAYS).unwrap())
81 .naive_utc(),
82 user_agent: DEFAULT_USER_AGENT.into(),
83 }
84 }
85}
86
/// Client that checks for existing snapshots and submits archive requests over HTTP.
pub struct WaybackMachineClient {
    /// HTTP client wrapped with the retry middleware built from the configured retry policy.
    http_client: ClientWithMiddleware,
    /// Endpoints, recent-archive cutoff and user agent this client was created with.
    client_config: ClientConfig,
}
92
93impl WaybackMachineClient {
94 pub fn new(client_config: ClientConfig) -> Self {
96 let http_client = ClientBuilder::new(
97 reqwest::Client::builder()
98 .user_agent(client_config.user_agent.clone())
99 .build()
100 .unwrap(),
101 )
102 .with(RetryTransientMiddleware::new_with_policy(
103 client_config.retry_policy,
104 ))
105 .build();
106 WaybackMachineClient {
107 http_client,
108 client_config,
109 }
110 }
111
112 async fn check_recent_archive_exists(&self, url: &str) -> Result<(), Error> {
122 let to_check = ArchivableUrl::parse(url)?;
123 let response = self
124 .http_client
125 .get(format!("{}{}", self.client_config.check_endpoint, to_check))
126 .send()
127 .await
128 .map_err(|err| Error::CannotCheckArchive(err.to_string()))?
129 .json::<WaybackCheckResponse>()
130 .await
131 .map_err(|e| Error::CannotCheckArchive(e.to_string()))?;
132
133 match &response.0[..] {
134 [_, timestamp] if timestamp.len() == 1 => {
135 let snapshot_timestamp =
136 NaiveDateTime::parse_from_str(×tamp[0], "%Y%m%d%H%M%S")?;
137 if snapshot_timestamp > self.client_config.archive_threshold_timestamp {
138 Ok(())
139 } else {
140 Err(Error::NoRecentArchive(url.to_string()))
141 }
142 }
143 _ => Err(Error::NoRecentArchive(url.to_string())),
144 }
145 }
146
147 pub async fn archive_url(&self, url: &str) -> Result<ArchiveResult, Error> {
173 let to_archive = ArchivableUrl::parse(url)?;
174 let to_check = self
177 .http_client
178 .get(to_archive.as_str())
179 .send()
180 .await
181 .map_or(Ok(to_archive.clone()), |response| {
182 ArchivableUrl::parse(response.url().as_str())
183 })?
184 .url
185 .clone();
186
187 if self
188 .check_recent_archive_exists(to_check.as_str())
189 .await
190 .is_ok()
191 {
192 return Ok(ArchiveResult::RecentArchiveExists);
193 }
194
195 let response = self
196 .http_client
197 .get(format!(
198 "{}{}",
199 self.client_config.archive_endpoint, to_archive
200 ))
201 .send()
202 .await?;
203 if !response.status().is_success() {
204 if self.check_recent_archive_exists(url).await.is_err() {
206 return Err(Error::CannotArchive(
207 response.status().to_string(),
208 url.to_string(),
209 ));
210 }
211 }
212 Ok(ArchiveResult::Archived(response.url().to_string()))
213 }
214}
215
#[cfg(test)]
mod tests {
    use super::*;
    use mockito::ServerGuard;
    use serde_json::{json, Value};

    const ARCHIVE_ROOT_PATH: &str = "/save/";
    const CHECK_ROOT_PATH: &str = "/cdx/search/cdx?fl=timestamp&limit=-1&output=json&url=";
    const MAX_REQUEST_RETRIES: u32 = 3;
    /// Fixed snapshot timestamp that is always older than the 30-day threshold used below.
    const STALE_SNAPSHOT_TIMESTAMP: &str = "20230227054528";

    /// Starts a mock server and a client whose endpoints both point at it.
    async fn mock_server() -> (ServerGuard, WaybackMachineClient) {
        let server = mockito::Server::new_async().await;
        let client_config = ClientConfig::new(
            format!("{}{}", server.url(), ARCHIVE_ROOT_PATH),
            format!("{}{}", server.url(), CHECK_ROOT_PATH),
            MAX_REQUEST_RETRIES,
            30,
            "TestUserAgent".to_string(),
        );
        let wayback_client = WaybackMachineClient::new(client_config);
        (server, wayback_client)
    }

    /// Path and query the client requests when checking `url`.
    fn check_path(url: &str) -> String {
        format!("{}{}", CHECK_ROOT_PATH, url)
    }

    /// Path the client requests when archiving `url`.
    fn archive_path(url: &str) -> String {
        format!("{}{}", ARCHIVE_ROOT_PATH, url)
    }

    /// Check-endpoint body in the shape `WaybackCheckResponse` deserializes:
    /// a first row followed by a single-timestamp row.
    fn check_response(timestamp: &str) -> Value {
        json!([["timestamp"], [timestamp]])
    }

    /// Formats the UTC instant `days` days ago as a `%Y%m%d%H%M%S` timestamp.
    fn timestamp_days_ago(days: i64) -> String {
        (Utc::now() - TimeDelta::try_days(days).unwrap())
            .format("%Y%m%d%H%M%S")
            .to_string()
    }

    // NOTE: `archive_url` first requests the target URL itself, so the two `archive_url` tests
    // using the mock server also issue a request to `https://example.com/`. A failure of that
    // request is tolerated by the client.

    #[tokio::test]
    async fn test_archive_url_success() {
        let to_archive = "https://example.com/";
        let (mut server, wayback_client) = mock_server().await;

        // Only a stale snapshot exists, so the client must go on to archive the URL.
        let check_mock = server
            .mock("GET", check_path(to_archive).as_str())
            .with_status(200)
            .with_body(check_response(STALE_SNAPSHOT_TIMESTAMP).to_string())
            .create_async()
            .await;
        let archive_mock = server
            .mock("GET", archive_path(to_archive).as_str())
            .with_status(200)
            .create_async()
            .await;

        assert!(wayback_client.archive_url(to_archive).await.is_ok());
        check_mock.assert_async().await;
        archive_mock.assert_async().await;
    }

    #[tokio::test]
    async fn test_archive_url_no_scheme() {
        let to_archive = "example.com";
        let wayback_client = WaybackMachineClient::new(ClientConfig::default());

        assert_eq!(
            wayback_client.archive_url(to_archive).await.err().unwrap(),
            Error::InvalidUrl(to_archive.to_string())
        );
    }

    #[tokio::test]
    async fn test_archive_url_local_url() {
        let to_archive = "http://localhost/page";
        let wayback_client = WaybackMachineClient::new(ClientConfig::default());

        assert_eq!(
            wayback_client.archive_url(to_archive).await.err().unwrap(),
            Error::InvalidUrl(to_archive.to_string())
        );
    }

    #[tokio::test]
    async fn test_archive_url_failure() {
        let to_archive = "https://example.com/";
        let (mut server, wayback_client) = mock_server().await;

        // First check: only a stale snapshot exists.
        let first_check_mock = server
            .mock("GET", check_path(to_archive).as_str())
            .with_status(200)
            .with_body(check_response(STALE_SNAPSHOT_TIMESTAMP).to_string())
            .create_async()
            .await;
        // The archive request keeps failing and is retried by the middleware.
        let archive_mock = server
            .mock("GET", archive_path(to_archive).as_str())
            .with_status(520)
            .expect_at_least(MAX_REQUEST_RETRIES as usize)
            .create_async()
            .await;
        // Second check after the failed archive request: still only a stale snapshot.
        let second_check_mock = server
            .mock("GET", check_path(to_archive).as_str())
            .with_status(200)
            .with_body(check_response(STALE_SNAPSHOT_TIMESTAMP).to_string())
            .create_async()
            .await;

        assert!(wayback_client.archive_url(to_archive).await.is_err());
        first_check_mock.assert_async().await;
        archive_mock.assert_async().await;
        second_check_mock.assert_async().await;
    }

    #[tokio::test]
    async fn test_check_recent_archive_exists_success() {
        let to_archive = "https://example.com/";
        let snapshot_timestamp = timestamp_days_ago(1);
        let (mut server, wayback_client) = mock_server().await;

        let mock = server
            .mock("GET", check_path(to_archive).as_str())
            .with_status(200)
            .with_body(check_response(&snapshot_timestamp).to_string())
            .create_async()
            .await;

        assert!(wayback_client
            .check_recent_archive_exists(to_archive)
            .await
            .is_ok());
        mock.assert_async().await;
    }

    #[tokio::test]
    async fn test_check_recent_archive_exists_old_snapshot() {
        let to_archive = "https://example.com/";
        let snapshot_timestamp = timestamp_days_ago(100);
        let (mut server, wayback_client) = mock_server().await;

        let mock = server
            .mock("GET", check_path(to_archive).as_str())
            .with_status(200)
            .with_body(check_response(&snapshot_timestamp).to_string())
            .create_async()
            .await;

        assert!(wayback_client
            .check_recent_archive_exists(to_archive)
            .await
            .is_err());
        mock.assert_async().await;
    }

    #[tokio::test]
    async fn test_check_recent_archive_exists_no_snapshot() {
        let to_archive = "https://example.com/";
        let (mut server, wayback_client) = mock_server().await;

        // An empty list means the endpoint knows no snapshot for the URL.
        let snapshot: Value = json!([]);
        let mock = server
            .mock("GET", check_path(to_archive).as_str())
            .with_status(200)
            .with_body(snapshot.to_string())
            .create_async()
            .await;

        assert!(wayback_client
            .check_recent_archive_exists(to_archive)
            .await
            .is_err());
        mock.assert_async().await;
    }
}
403}