use once_cell::sync::Lazy;
use serde::de::Error as SerdeError;
use serde::{Deserialize, Deserializer};
use http::StatusCode;
use reqwest::{Error, Url};
static WAYBACK_URL: Lazy<Url> =
Lazy::new(|| Url::parse("https://archive.org/wayback/available").unwrap());
pub(crate) async fn get_wayback_link(url: &Url) -> Result<Option<Url>, Error> {
let mut archive_url: Url = WAYBACK_URL.clone();
archive_url.set_query(Some(&format!("url={url}")));
let response = reqwest::get(archive_url)
.await?
.json::<InternetArchiveResponse>()
.await?;
Ok(response
.archived_snapshots
.closest
.map(|closest| closest.url))
}
#[derive(Debug, Deserialize, Eq, PartialEq)]
pub(crate) struct InternetArchiveResponse {
pub(crate) url: Url,
pub(crate) archived_snapshots: ArchivedSnapshots,
}
#[derive(Debug, Deserialize, Eq, PartialEq)]
pub(crate) struct ArchivedSnapshots {
pub(crate) closest: Option<Closest>,
}
#[derive(Debug, Deserialize, Eq, PartialEq)]
pub(crate) struct Closest {
#[serde(deserialize_with = "from_string")]
pub(crate) status: StatusCode,
pub(crate) available: bool,
pub(crate) url: Url,
pub(crate) timestamp: String,
}
fn from_string<'d, D>(deserializer: D) -> Result<StatusCode, D::Error>
where
D: Deserializer<'d>,
{
let value: &str = Deserialize::deserialize(deserializer)?;
let result = value
.parse::<u16>()
.map_err(|e| D::Error::custom(e.to_string()))?;
StatusCode::from_u16(result).map_err(|e| D::Error::custom(e.to_string()))
}
#[cfg(test)]
mod tests {
use crate::archive::wayback::get_wayback_link;
use reqwest::{Error, Url};
use std::{error::Error as StdError, time::Duration};
use tokio::time::sleep;
#[tokio::test]
#[ignore]
async fn wayback_suggestion() -> Result<(), Box<dyn StdError>> {
let target_url = "https://example.com".parse::<Url>()?;
let expected_ending = (target_url.host_str().ok_or("Invalid target URL")?).to_string();
for _ in 0..3 {
match get_wayback_link(&target_url).await {
Ok(Some(suggested_url)) => {
let host = suggested_url
.host_str()
.ok_or("Suggestion doesn't have a host")?;
assert_eq!(host, "web.archive.org");
let archived_url = suggested_url
.path()
.trim_start_matches("/web/")
.split_once('/')
.map(|x| x.1)
.ok_or("Failed to extract archived URL from Wayback suggestion")?;
if !archived_url
.trim_end_matches('/')
.ends_with(&expected_ending)
{
return Err(format!(
"Expected suggestion '{archived_url}' to end with '{expected_ending}'"
)
.into());
}
return Ok(());
}
Ok(None) => {
sleep(Duration::from_secs(1)).await;
}
Err(e) => {
return Err(format!("Error retrieving Wayback link: {e}").into());
}
}
}
Err("Did not get a valid Wayback Machine suggestion after multiple attempts.".into())
}
#[tokio::test]
async fn wayback_suggestion_unknown_url() -> Result<(), Error> {
let url = &"https://github.com/mre/idiomatic-rust-doesnt-exist-man"
.try_into()
.unwrap();
let response = get_wayback_link(url).await?;
assert_eq!(response, None);
Ok(())
}
}