wayback_archiver/
lib.rs

1use chrono::{Duration, NaiveDateTime, TimeZone, Utc};
2use lazy_static::lazy_static;
3use regex::Regex;
4use serde::{Deserialize, Serialize};
5use std::collections::HashMap;
6
7pub async fn archive_url(url: &str) -> Result<ArchivingResult, ArchiveError> {
8    // Check to see if there's an existing archive of the requested URL.
9    let latest_snapshot = fetch_latest_snapshot(url).await;
10    if let Ok(ref snapshot) = latest_snapshot {
11        // Only accept the existing snapshot if it was made recently.
12        if (Utc::now() - Duration::days(90)).naive_utc() < snapshot.last_archived {
13            return latest_snapshot;
14        }
15    }
16
17    // Request a new snapshot of the URL.
18    let resp = reqwest::get(format!("https://web.archive.org/save/{}", url))
19        .await
20        .map_err(|err| ArchiveError::Unknown(err.to_string()))?;
21    let archive_url: Result<String, ArchiveError> = match resp.status().as_u16() {
22        // Return the redirected URL (which is the archive snapshot URL).
23        200 => Ok(resp.url().clone().to_string()),
24        404 => {
25            // Sometimes, the snapshot URL returns a 404, even though the archival was successful.
26            // Probably due to a race condition in the Wayback machine; these URLs do (eventually) exist.
27            if resp.url().path().starts_with("/web") {
28                Ok(resp.url().to_string())
29            } else {
30                Err(ArchiveError::Unknown(format!(
31                    "Unexpected HTTP 404 at {:#?}",
32                    resp.url().to_string()
33                )))
34            }
35        }
36        509 => Err(ArchiveError::BandwidthExceeded),
37        // There may be more status codes that indicate archive failure, but these were the most common.
38        403 | 520 | 523 => Err(ArchiveError::UnableToArchive),
39        _ => {
40            dbg!(&resp);
41            Err(ArchiveError::Unknown(format!(
42                "Got status {}: {:#?}",
43                resp.status(),
44                resp
45            )))
46        }
47    };
48    let result = archive_url.and_then(|url| {
49        Ok(ArchivingResult {
50            last_archived: timestamp_from_archive_url(&url)?,
51            url: Some(url),
52            existing_snapshot: false,
53        })
54    });
55    match result {
56        Err(ArchiveError::UnableToArchive) => {
57            // If we weren't able to archive the URL, but a valid (if old) snapshot exists,
58            // then return that older snapshot.
59            latest_snapshot.map_err(|_| ArchiveError::UnableToArchive)
60        }
61        _ => result,
62    }
63}
64
65fn timestamp_from_archive_url(url: &str) -> Result<NaiveDateTime, ArchiveError> {
66    lazy_static! {
67        static ref RE: Regex = Regex::new(r"/web/(\d+)/").unwrap();
68    }
69    let timestamp_url_component = RE
70        .captures(url)
71        .and_then(|cap| cap.get(1).map(|ts_str| ts_str.as_str()))
72        .ok_or_else(|| ArchiveError::ParseError("unable to extract timestamp from url".into()))?;
73    parse_wayback_timestamp(timestamp_url_component)
74}
75
76async fn fetch_latest_snapshot(url: &str) -> Result<ArchivingResult, ArchiveError> {
77    let resp = reqwest::get(format!("http://archive.org/wayback/available?url={}", url))
78        .await
79        .map_err(|err| ArchiveError::Unknown(err.to_string()))?
80        .json::<WaybackAvailabilityResponse>()
81        .await
82        .map_err(|err| ArchiveError::ParseError(err.to_string()))?;
83
84    if let Some(snapshots) = resp.archived_snapshots {
85        if let Some((_, latest)) = snapshots
86            .iter()
87            .max_by_key(|(_, snapshot)| &snapshot.timestamp)
88        {
89            return Ok(ArchivingResult {
90                existing_snapshot: true,
91                last_archived: parse_wayback_timestamp(&latest.timestamp)?,
92                url: Some(latest.url.clone()),
93            });
94        }
95    }
96    Err(ArchiveError::NoExistingSnapshot)
97}
98
99fn parse_wayback_timestamp(ts: &str) -> Result<NaiveDateTime, ArchiveError> {
100    let naive_utc = NaiveDateTime::parse_from_str(ts, "%Y%m%d%H%M%S")
101        .map_err(|err| ArchiveError::ParseError(err.to_string()))?;
102    Ok(Utc.from_utc_datetime(&naive_utc).naive_local())
103}
104
/// JSON body of a Wayback availability lookup, as deserialized by
/// `fetch_latest_snapshot`.
#[derive(Deserialize, Debug)]
struct WaybackAvailabilityResponse {
    /// Top-level `url` field of the response; required for deserialization but
    /// not read by the visible code.
    // NOTE(review): presumably echoes the queried URL — confirm against the API docs.
    url: String,
    /// Snapshots listed in the response, keyed by a string label. `None` when
    /// the field is absent; the map may also be present but empty.
    archived_snapshots: Option<HashMap<String, WaybackSnapshot>>,
}
110
/// One snapshot entry inside a [`WaybackAvailabilityResponse`].
#[derive(Deserialize, Debug)]
struct WaybackSnapshot {
    /// Status string reported for the snapshot; not read by the visible code.
    // NOTE(review): presumably the HTTP status of the captured page — confirm.
    status: String,
    /// Availability flag reported for the snapshot; not read by the visible code.
    available: bool,
    /// Address of the snapshot; copied into `ArchivingResult::url`.
    url: String,
    /// Capture time as a digit string, parsed with the `%Y%m%d%H%M%S` layout
    /// by `parse_wayback_timestamp`.
    timestamp: String,
}
118
/// Outcome of an archiving request: where the snapshot lives and when it was taken.
#[derive(Deserialize, Serialize, Debug)]
pub struct ArchivingResult {
    /// Snapshot URL. The functions visible in this file always set it to `Some`.
    pub url: Option<String>,
    /// Capture time of the snapshot, parsed from its Wayback timestamp.
    pub last_archived: NaiveDateTime,
    /// `true` when the result refers to a snapshot that already existed,
    /// `false` when a new capture was just requested. Excluded from
    /// (de)serialization via `#[serde(skip)]`.
    #[serde(skip)]
    pub existing_snapshot: bool,
}
126
/// Failure modes reported by the archiving functions in this file.
#[derive(Debug, PartialEq)]
pub enum ArchiveError {
    /// The save endpoint answered HTTP 509.
    BandwidthExceeded,
    /// The save endpoint answered with a status treated as an archive failure
    /// (403, 520 or 523).
    UnableToArchive,
    /// The availability lookup listed no snapshot for the URL.
    NoExistingSnapshot,
    /// A response body or timestamp could not be parsed; carries the detail message.
    ParseError(String),
    /// Any other failure (transport error, unexpected status); carries the detail message.
    Unknown(String),
}

impl std::fmt::Display for ArchiveError {
    /// Writes the human-readable message for the error.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Variants with a payload are formatted and returned right away;
        // the remaining ones map to a fixed message written at the end.
        let fixed_message = match self {
            ArchiveError::ParseError(detail) => return write!(f, "Parse error: {}", detail),
            ArchiveError::Unknown(detail) => return write!(f, "Unknown error: {}", detail),
            ArchiveError::BandwidthExceeded => "Bandwidth Exceeded",
            ArchiveError::UnableToArchive => "Wayback Machine unable to archive this URL",
            ArchiveError::NoExistingSnapshot => "No existing snapshots",
        };
        f.write_str(fixed_message)
    }
}

impl std::error::Error for ArchiveError {}