waybackmachine_client/
lib.rs

1pub mod archivableurl;
2pub mod errors;
3
4pub use crate::archivableurl::ArchivableUrl;
5pub use crate::errors::Error;
6use chrono::{NaiveDateTime, TimeDelta, Utc};
7use reqwest_middleware::{ClientBuilder, ClientWithMiddleware};
8use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
9use serde::Deserialize;
10use url::Url;
11
/// Maximum number of retry attempts allowed for a single request.
const DEFAULT_MAX_REQUEST_RETRIES: u32 = 10;

/// Default age limit, in days, below which an existing archive counts as recent.
/// URLs whose latest archive is older than this limit are archived again.
const DEFAULT_ARCHIVE_THRESHOLD_DAYS: i64 = 30;

/// Default `User-Agent` value sent with every request.
const DEFAULT_USER_AGENT: &str =
    "Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:40.0) Gecko/20100101 Firefox/40.0";

/// Endpoint of the Wayback Machine archiving service; the URL to archive is appended to it.
pub const WAYBACK_MACHINE_ARCHIVE_ENDPOINT: &str = "https://web.archive.org/save/";
/// Endpoint used to look up whether the Wayback Machine already holds an archive;
/// the URL to look up is appended to it.
pub const WAYBACK_MACHINE_CHECK_ENDPOINT: &str =
    "https://web.archive.org/cdx/search/cdx?fl=timestamp&limit=-1&output=json&url=";
28
29#[derive(Debug, Deserialize)]
30struct WaybackCheckResponse(Vec<Vec<String>>);
31
32/// Configuration for the Wayback Machine client
33pub struct ClientConfig {
34    archive_endpoint: String,
35    check_endpoint: String,
36    retry_policy: ExponentialBackoff,
37    archive_threshold_timestamp: NaiveDateTime,
38    user_agent: String,
39}
40
/// Outcome of an archive request that did not fail.
///
/// The standard comparison, cloning and debug-formatting traits are derived so that callers
/// can inspect, compare and log results (for instance through `Result::unwrap_err` or
/// `assert_eq!`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ArchiveResult {
    /// An archive request was sent; holds the final URL of that request's response.
    Archived(String),
    /// A sufficiently recent archive already existed, so no archive request was sent.
    RecentArchiveExists,
}
46
47impl ClientConfig {
48    /// Constructs a new `ClientConfig` with custom retry policy and user agent
49    pub fn new(
50        archive_endpoint: String,
51        check_endpoint: String,
52        max_request_retries: u32,
53        archive_threshold_days: i64,
54        user_agent: String,
55    ) -> Self {
56        ClientConfig {
57            archive_endpoint: Url::parse(&archive_endpoint)
58                .unwrap_or_else(|_| panic!("Invalid archive_endpoint URL: {}", archive_endpoint))
59                .to_string(),
60            check_endpoint: Url::parse(&check_endpoint)
61                .unwrap_or_else(|_| panic!("Invalid check_endpoint URL: {}", check_endpoint))
62                .to_string(),
63            retry_policy: ExponentialBackoff::builder().build_with_max_retries(max_request_retries),
64            archive_threshold_timestamp: (Utc::now()
65                - TimeDelta::try_days(archive_threshold_days).unwrap())
66            .naive_utc(),
67            user_agent,
68        }
69    }
70}
71impl Default for ClientConfig {
72    /// Constructs a default `ClientConfig` with default retry policy and user agent
73    fn default() -> Self {
74        ClientConfig {
75            archive_endpoint: WAYBACK_MACHINE_ARCHIVE_ENDPOINT.into(),
76            check_endpoint: WAYBACK_MACHINE_CHECK_ENDPOINT.into(),
77            retry_policy: ExponentialBackoff::builder()
78                .build_with_max_retries(DEFAULT_MAX_REQUEST_RETRIES),
79            archive_threshold_timestamp: (Utc::now()
80                - TimeDelta::try_days(DEFAULT_ARCHIVE_THRESHOLD_DAYS).unwrap())
81            .naive_utc(),
82            user_agent: DEFAULT_USER_AGENT.into(),
83        }
84    }
85}
86
87/// Wayback Machine client for archiving URLs
88pub struct WaybackMachineClient {
89    http_client: ClientWithMiddleware,
90    client_config: ClientConfig,
91}
92
93impl WaybackMachineClient {
94    /// Constructs a new `WaybackMachineClient` with the given configuration
95    pub fn new(client_config: ClientConfig) -> Self {
96        let http_client = ClientBuilder::new(
97            reqwest::Client::builder()
98                .user_agent(client_config.user_agent.clone())
99                .build()
100                .unwrap(),
101        )
102        .with(RetryTransientMiddleware::new_with_policy(
103            client_config.retry_policy,
104        ))
105        .build();
106        WaybackMachineClient {
107            http_client,
108            client_config,
109        }
110    }
111
112    /// Checks if a recent archive exists for the given URL.
113    ///
114    /// If an archive exists, and it is newer than the configured archive threshold,
115    /// the function returns Ok(()), indicating that the URL is considered recently archived.
116    /// If no recent archive is found or the found archive is older than the threshold,
117    /// it returns Err(Error::NoRecentArchive).
118    ///
119    /// https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server
120    ///
121    async fn check_recent_archive_exists(&self, url: &str) -> Result<(), Error> {
122        let to_check = ArchivableUrl::parse(url)?;
123        let response = self
124            .http_client
125            .get(format!("{}{}", self.client_config.check_endpoint, to_check))
126            .send()
127            .await
128            .map_err(|err| Error::CannotCheckArchive(err.to_string()))?
129            .json::<WaybackCheckResponse>()
130            .await
131            .map_err(|e| Error::CannotCheckArchive(e.to_string()))?;
132
133        match &response.0[..] {
134            [_, timestamp] if timestamp.len() == 1 => {
135                let snapshot_timestamp =
136                    NaiveDateTime::parse_from_str(&timestamp[0], "%Y%m%d%H%M%S")?;
137                if snapshot_timestamp > self.client_config.archive_threshold_timestamp {
138                    Ok(())
139                } else {
140                    Err(Error::NoRecentArchive(url.to_string()))
141                }
142            }
143            _ => Err(Error::NoRecentArchive(url.to_string())),
144        }
145    }
146
147    /// Checks if a recent Wayback Machine archive exists for the given URL
148    /// and archives it if necessary.
149    ///
150    /// This function first checks if a recent archive exists for the URL by calling
151    /// `check_recent_archive_exists`. If an archive does not exist or is older than the
152    /// configured archive threshold, it proceeds to archive the URL.
153    ///
154    /// It returns an `ArchiveResult::Archived` if it archives the URL,
155    /// or an `ArchiveResult::RecentArchiveExists` if a recent archive already exists.
156    ///
157    /// # Errors
158    ///
159    /// This method fails if the `url` provided is not well formatted
160    /// of if there was an error while sending the request.
161    ///
162    /// # Example
163    /// ```
164    /// use waybackmachine_client::{ClientConfig, Error, WaybackMachineClient};
165    ///
166    /// # async fn run() -> Result<(), Error> {
167    /// let wayback_client = WaybackMachineClient::new(ClientConfig::default());
168    /// wayback_client.archive_url("https://www.openbookpublishers.com/").await?;
169    /// # Ok(())
170    /// # }
171    /// ```
172    pub async fn archive_url(&self, url: &str) -> Result<ArchiveResult, Error> {
173        let to_archive = ArchivableUrl::parse(url)?;
174        // get the latest location in case of a redirect
175        // check that the latest location is actually archivable
176        let to_check = self
177            .http_client
178            .get(to_archive.as_str())
179            .send()
180            .await
181            .map_or(Ok(to_archive.clone()), |response| {
182                ArchivableUrl::parse(response.url().as_str())
183            })?
184            .url
185            .clone();
186
187        if self
188            .check_recent_archive_exists(to_check.as_str())
189            .await
190            .is_ok()
191        {
192            return Ok(ArchiveResult::RecentArchiveExists);
193        }
194
195        let response = self
196            .http_client
197            .get(format!(
198                "{}{}",
199                self.client_config.archive_endpoint, to_archive
200            ))
201            .send()
202            .await?;
203        if !response.status().is_success() {
204            // check just in case the request returns a false negative
205            if self.check_recent_archive_exists(url).await.is_err() {
206                return Err(Error::CannotArchive(
207                    response.status().to_string(),
208                    url.to_string(),
209                ));
210            }
211        }
212        Ok(ArchiveResult::Archived(response.url().to_string()))
213    }
214}
215
#[cfg(test)]
mod tests {
    use super::*;
    use mockito::ServerGuard;
    use serde_json::json;

    const ARCHIVE_ROOT_PATH: &str = "/save/";
    const CHECK_ROOT_PATH: &str = "/cdx/search/cdx?fl=timestamp&limit=-1&output=json&url=";
    const MAX_REQUEST_RETRIES: u32 = 3;
    /// Snapshot timestamp that is always older than the 30-day threshold used in the tests.
    const STALE_SNAPSHOT_TIMESTAMP: &str = "20230227054528";

    /// Starts a mock server and a client whose endpoints both point at it.
    async fn mock_server() -> (ServerGuard, WaybackMachineClient) {
        let server = mockito::Server::new_async().await;
        let client_config = ClientConfig::new(
            format!("{}{}", server.url(), ARCHIVE_ROOT_PATH),
            format!("{}{}", server.url(), CHECK_ROOT_PATH),
            MAX_REQUEST_RETRIES,
            30,
            "TestUserAgent".to_string(),
        );
        let wayback_client = WaybackMachineClient::new(client_config);
        (server, wayback_client)
    }

    /// Path (with query) the client requests to check `url`.
    fn check_path(url: &str) -> String {
        format!("{}{}", CHECK_ROOT_PATH, url)
    }

    /// Path the client requests to archive `url`.
    fn archive_path(url: &str) -> String {
        format!("{}{}", ARCHIVE_ROOT_PATH, url)
    }

    /// Check-endpoint body in the shape `WaybackCheckResponse` expects: a header row
    /// followed by a single timestamp row.
    fn cdx_body(timestamp: &str) -> String {
        json!([["timestamp"], [timestamp]]).to_string()
    }

    /// Timestamp of a snapshot taken `days` days ago, in the `%Y%m%d%H%M%S` format.
    fn timestamp_days_ago(days: i64) -> String {
        (Utc::now() - TimeDelta::try_days(days).unwrap())
            .format("%Y%m%d%H%M%S")
            .to_string()
    }

    // Note: `archive_url` first requests the target URL itself to resolve redirects, so
    // the two `archive_url` tests against the mock server also issue a request to
    // `https://example.com/`; a failure of that request is tolerated by the client.

    #[tokio::test]
    async fn test_archive_url_success() {
        let to_archive = "https://example.com/";
        let (mut server, wayback_client) = mock_server().await;

        // only a stale snapshot exists, so the client must go on and archive the URL
        let check_mock = server
            .mock("GET", check_path(to_archive).as_str())
            .with_status(200)
            .with_body(cdx_body(STALE_SNAPSHOT_TIMESTAMP))
            .create_async()
            .await;
        let archive_mock = server
            .mock("GET", archive_path(to_archive).as_str())
            .with_status(200)
            .create_async()
            .await;

        assert!(wayback_client.archive_url(to_archive).await.is_ok());
        check_mock.assert_async().await;
        archive_mock.assert_async().await;
    }

    #[tokio::test]
    async fn test_archive_url_no_scheme() {
        let to_archive = "example.com";
        let wayback_client = WaybackMachineClient::new(ClientConfig::default());

        assert_eq!(
            wayback_client.archive_url(to_archive).await.err().unwrap(),
            Error::InvalidUrl(to_archive.to_string())
        );
    }

    #[tokio::test]
    async fn test_archive_url_local_url() {
        let to_archive = "http://localhost/page";
        let wayback_client = WaybackMachineClient::new(ClientConfig::default());

        assert_eq!(
            wayback_client.archive_url(to_archive).await.err().unwrap(),
            Error::InvalidUrl(to_archive.to_string())
        );
    }

    #[tokio::test]
    async fn test_archive_url_failure() {
        let to_archive = "https://example.com/";
        let (mut server, wayback_client) = mock_server().await;

        let first_check_mock = server
            .mock("GET", check_path(to_archive).as_str())
            .with_status(200)
            .with_body(cdx_body(STALE_SNAPSHOT_TIMESTAMP))
            .create_async()
            .await;
        let archive_mock = server
            .mock("GET", archive_path(to_archive).as_str())
            .with_status(520)
            .expect_at_least(MAX_REQUEST_RETRIES as usize)
            .create_async()
            .await;
        // checking if it actually was archived after receiving an archiving error
        let second_check_mock = server
            .mock("GET", check_path(to_archive).as_str())
            .with_status(200)
            .with_body(cdx_body(STALE_SNAPSHOT_TIMESTAMP))
            .create_async()
            .await;

        assert!(wayback_client.archive_url(to_archive).await.is_err());
        first_check_mock.assert_async().await;
        archive_mock.assert_async().await;
        second_check_mock.assert_async().await;
    }

    #[tokio::test]
    async fn test_check_recent_archive_exists_success() {
        let to_archive = "https://example.com/";
        let (mut server, wayback_client) = mock_server().await;

        let mock = server
            .mock("GET", check_path(to_archive).as_str())
            .with_status(200)
            .with_body(cdx_body(&timestamp_days_ago(1)))
            .create_async()
            .await;

        assert!(wayback_client
            .check_recent_archive_exists(to_archive)
            .await
            .is_ok());
        mock.assert_async().await;
    }

    #[tokio::test]
    async fn test_check_recent_archive_exists_old_snapshot() {
        let to_archive = "https://example.com/";
        let (mut server, wayback_client) = mock_server().await;

        let mock = server
            .mock("GET", check_path(to_archive).as_str())
            .with_status(200)
            .with_body(cdx_body(&timestamp_days_ago(100)))
            .create_async()
            .await;

        assert!(wayback_client
            .check_recent_archive_exists(to_archive)
            .await
            .is_err());
        mock.assert_async().await;
    }

    #[tokio::test]
    async fn test_check_recent_archive_exists_no_snapshot() {
        let to_archive = "https://example.com/";
        let (mut server, wayback_client) = mock_server().await;

        // an empty result set means the URL has never been archived
        let mock = server
            .mock("GET", check_path(to_archive).as_str())
            .with_status(200)
            .with_body(json!([]).to_string())
            .create_async()
            .await;

        assert!(wayback_client
            .check_recent_archive_exists(to_archive)
            .await
            .is_err());
        mock.assert_async().await;
    }
}
403}