1use chrono::{Duration, NaiveDateTime, TimeZone, Utc};
2use lazy_static::lazy_static;
3use regex::Regex;
4use serde::{Deserialize, Serialize};
5use std::collections::HashMap;
6
7pub async fn archive_url(url: &str) -> Result<ArchivingResult, ArchiveError> {
8 let latest_snapshot = fetch_latest_snapshot(url).await;
10 if let Ok(ref snapshot) = latest_snapshot {
11 if (Utc::now() - Duration::days(90)).naive_utc() < snapshot.last_archived {
13 return latest_snapshot;
14 }
15 }
16
17 let resp = reqwest::get(format!("https://web.archive.org/save/{}", url))
19 .await
20 .map_err(|err| ArchiveError::Unknown(err.to_string()))?;
21 let archive_url: Result<String, ArchiveError> = match resp.status().as_u16() {
22 200 => Ok(resp.url().clone().to_string()),
24 404 => {
25 if resp.url().path().starts_with("/web") {
28 Ok(resp.url().to_string())
29 } else {
30 Err(ArchiveError::Unknown(format!(
31 "Unexpected HTTP 404 at {:#?}",
32 resp.url().to_string()
33 )))
34 }
35 }
36 509 => Err(ArchiveError::BandwidthExceeded),
37 403 | 520 | 523 => Err(ArchiveError::UnableToArchive),
39 _ => {
40 dbg!(&resp);
41 Err(ArchiveError::Unknown(format!(
42 "Got status {}: {:#?}",
43 resp.status(),
44 resp
45 )))
46 }
47 };
48 let result = archive_url.and_then(|url| {
49 Ok(ArchivingResult {
50 last_archived: timestamp_from_archive_url(&url)?,
51 url: Some(url),
52 existing_snapshot: false,
53 })
54 });
55 match result {
56 Err(ArchiveError::UnableToArchive) => {
57 latest_snapshot.map_err(|_| ArchiveError::UnableToArchive)
60 }
61 _ => result,
62 }
63}
64
65fn timestamp_from_archive_url(url: &str) -> Result<NaiveDateTime, ArchiveError> {
66 lazy_static! {
67 static ref RE: Regex = Regex::new(r"/web/(\d+)/").unwrap();
68 }
69 let timestamp_url_component = RE
70 .captures(url)
71 .and_then(|cap| cap.get(1).map(|ts_str| ts_str.as_str()))
72 .ok_or_else(|| ArchiveError::ParseError("unable to extract timestamp from url".into()))?;
73 parse_wayback_timestamp(timestamp_url_component)
74}
75
76async fn fetch_latest_snapshot(url: &str) -> Result<ArchivingResult, ArchiveError> {
77 let resp = reqwest::get(format!("http://archive.org/wayback/available?url={}", url))
78 .await
79 .map_err(|err| ArchiveError::Unknown(err.to_string()))?
80 .json::<WaybackAvailabilityResponse>()
81 .await
82 .map_err(|err| ArchiveError::ParseError(err.to_string()))?;
83
84 if let Some(snapshots) = resp.archived_snapshots {
85 if let Some((_, latest)) = snapshots
86 .iter()
87 .max_by_key(|(_, snapshot)| &snapshot.timestamp)
88 {
89 return Ok(ArchivingResult {
90 existing_snapshot: true,
91 last_archived: parse_wayback_timestamp(&latest.timestamp)?,
92 url: Some(latest.url.clone()),
93 });
94 }
95 }
96 Err(ArchiveError::NoExistingSnapshot)
97}
98
99fn parse_wayback_timestamp(ts: &str) -> Result<NaiveDateTime, ArchiveError> {
100 let naive_utc = NaiveDateTime::parse_from_str(ts, "%Y%m%d%H%M%S")
101 .map_err(|err| ArchiveError::ParseError(err.to_string()))?;
102 Ok(Utc.from_utc_datetime(&naive_utc).naive_local())
103}
104
/// JSON body deserialized from the `archive.org/wayback/available` lookup
/// performed by `fetch_latest_snapshot`.
#[derive(Deserialize, Debug)]
struct WaybackAvailabilityResponse {
    // Not read anywhere in the visible code; presumably echoes the queried
    // URL — TODO confirm against the API.
    url: String,
    // Snapshot entries keyed by a string label; `None` when the response
    // carries no `archived_snapshots` value.
    archived_snapshots: Option<HashMap<String, WaybackSnapshot>>,
}
110
/// A single snapshot entry inside [`WaybackAvailabilityResponse`].
#[derive(Deserialize, Debug)]
struct WaybackSnapshot {
    // Not read in the visible code; presumably the HTTP status recorded for
    // the capture — TODO confirm.
    status: String,
    // Not read in the visible code; NOTE(review): snapshots are used without
    // checking this flag — confirm that is intended.
    available: bool,
    // Location of the snapshot; copied into `ArchivingResult::url`.
    url: String,
    // Capture time as text; parsed with the `%Y%m%d%H%M%S` format and also
    // compared as a string to pick the latest snapshot.
    timestamp: String,
}
118
/// Outcome of an archiving attempt: where the snapshot lives and when it was taken.
#[derive(Deserialize, Serialize, Debug)]
pub struct ArchivingResult {
    /// URL of the snapshot. The code in this file always sets it to `Some`.
    pub url: Option<String>,
    /// Capture time of the snapshot, parsed from its Wayback timestamp.
    pub last_archived: NaiveDateTime,
    /// `true` when the result came from the existing-snapshot lookup, `false`
    /// when a new capture was requested. Excluded from (de)serialization.
    #[serde(skip)]
    pub existing_snapshot: bool,
}
126
/// Failure modes reported by the archiving functions in this module.
///
/// `Eq` is derived alongside `PartialEq` because every payload is a `String`;
/// `Clone` lets callers keep or duplicate an error value.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ArchiveError {
    /// The save endpoint answered HTTP 509.
    BandwidthExceeded,
    /// The save endpoint answered HTTP 403, 520 or 523.
    UnableToArchive,
    /// The availability lookup returned no snapshot for the URL.
    NoExistingSnapshot,
    /// A response body, snapshot URL or timestamp could not be parsed; carries
    /// the underlying message.
    ParseError(String),
    /// Transport failure or an unexpected HTTP status; carries a description.
    Unknown(String),
}
135
136impl std::fmt::Display for ArchiveError {
137 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
138 match self {
139 ArchiveError::BandwidthExceeded => write!(f, "Bandwidth Exceeded"),
140 ArchiveError::UnableToArchive => {
141 write!(f, "Wayback Machine unable to archive this URL")
142 }
143 ArchiveError::NoExistingSnapshot => write!(f, "No existing snapshots"),
144 ArchiveError::ParseError(err) => write!(f, "Parse error: {}", err),
145 ArchiveError::Unknown(err) => write!(f, "Unknown error: {}", err),
146 }
147 }
148}
149
// Empty impl: relies on the trait's default methods together with the `Debug`
// and `Display` implementations of `ArchiveError`.
impl std::error::Error for ArchiveError {}