//! Client for the Save Page Now API of the Wayback Machine.
//!
//! This could probably be factored into its own crate at some point, but for
//! now it's internal to ebg.
//!
//! This uses the Save Page Now (SPN2) API, which is documented at:
//!
//! <https://docs.google.com/document/d/1Nsv52MvSjbLb2PCpHlat0gkzw0EvtSgpKHu4mk0MnrA/edit>
//!
//! For now this does not aim to be a complete implementation of the API, just
//! enough to support the features EBG needs.
use std::collections::HashMap;
use miette::Diagnostic;
use serde::Deserialize;
use thiserror::Error;
use tracing::debug;
use url::Url;
/// Entry point for talking to the Wayback Machine's Save Page Now API.
///
/// Holds the account credentials together with the HTTP client that every
/// request is sent through.
pub struct Wayback {
    /// Access key half of the credentials; sent in the `Authorization` header.
    access_key: String,
    /// Secret key half of the credentials; sent in the `Authorization` header.
    secret_key: String,
    /// HTTP client used for all API requests.
    client: reqwest::Client,
}
impl Wayback {
    /// Creates a new [`Wayback`] client using the given credentials.
    ///
    /// Credentials can be obtained from <https://archive.org/account/s3.php>.
    pub fn with_credentials(access_key: impl ToString, secret_key: impl ToString) -> Self {
        Self {
            access_key: access_key.to_string(),
            secret_key: secret_key.to_string(),
            client: reqwest::Client::new(),
        }
    }

    /// Starts a POST request to `endpoint` with the headers shared by every
    /// API call: the `LOW access:secret` authorization header and a request
    /// for a JSON response.
    fn authorized_post(&self, endpoint: &str) -> reqwest::RequestBuilder {
        self.client
            .post(endpoint)
            .header(
                "Authorization",
                format!("LOW {}:{}", self.access_key, self.secret_key),
            )
            .header("Accept", "application/json")
    }

    /// Begins a new job to save the given page.
    ///
    /// On success, returns a [`Job`] object that can be used to check the
    /// status with [`Wayback::job_status`].
    ///
    /// # Errors
    ///
    /// Returns [`Error::HttpError`] if the request cannot be sent, the server
    /// answers with a 4xx/5xx status, or the response body cannot be decoded
    /// into a [`Job`].
    pub async fn begin_save_page(&self, url: &Url) -> Result<Job, Error> {
        debug!(?url, "beginning save page job");
        let job: Job = self
            .authorized_post("https://web.archive.org/save")
            .form(&[("url", url.as_str())])
            .send()
            .await?
            // Report 4xx/5xx responses as HTTP failures rather than letting
            // them show up later as a misleading JSON decode error.
            .error_for_status()?
            .json()
            .await?;
        debug!(?job, "save page job started");
        Ok(job)
    }

    /// Fetches the current status of a job previously started with
    /// [`Wayback::begin_save_page`].
    ///
    /// # Errors
    ///
    /// Returns [`Error::HttpError`] if the request cannot be sent, the server
    /// answers with a 4xx/5xx status, or the response body cannot be decoded
    /// into a [`Status`].
    pub async fn job_status(&self, job: &Job) -> Result<Status, Error> {
        debug!(job_id = %job.job_id, "checking save page job status");
        let status: Status = self
            .authorized_post("https://web.archive.org/save/status")
            .form(&[("job_id", job.job_id.as_str())])
            .send()
            .await?
            // Same reasoning as in `begin_save_page`: fail on the HTTP status
            // before attempting to decode the body.
            .error_for_status()?
            .json()
            .await?;
        Ok(status)
    }
}
#[derive(Debug, Error, Diagnostic)]
pub enum Error {
#[error("HTTP error while communicating with Wayback Machine Save Page Now API")]
HttpError(
#[from]
#[source]
reqwest::Error,
),
}
/// A save job, deserialized from the response to a save request.
///
/// Only `job_id` is required; the other fields are `None` when the response
/// does not include them.
#[derive(Debug, Deserialize)]
pub struct Job {
    /// The `url` field of the response, if present.
    url: Option<Url>,
    /// Identifier of the job; sent back to the API when asking for its status.
    job_id: String,
    /// The `message` field of the response, if present.
    message: Option<String>,
}
impl Job {
    /// Returns the identifier of this job as a string slice.
    pub fn job_id(&self) -> &str {
        &self.job_id
    }
}
/// The state of a save job, deserialized from a status response.
///
/// Only `job_id` and `status` are required; every other field is `None` when
/// the response does not include it.
#[derive(Debug, Deserialize)]
pub struct Status {
    /// Named counters from the response, keyed by counter name.
    counters: Option<HashMap<String, u32>>,
    /// The `delay_wb_availability` flag of the response.
    // NOTE(review): presumably signals that the capture may take a while to
    // become available in the Wayback Machine; confirm against the API docs.
    delay_wb_availability: Option<bool>,
    /// The `duration_sec` field of the response (presumably seconds spent on
    /// the capture; confirm against the API docs).
    duration_sec: Option<f64>,
    /// The `http_status` field of the response.
    http_status: Option<u16>,
    /// Identifier of the job this status belongs to.
    job_id: String,
    /// The URL that was submitted for archiving.
    original_url: Option<Url>,
    /// The `outlinks` list of the response.
    outlinks: Option<Vec<Url>>,
    /// The `resources` list of the response.
    resources: Option<Vec<Url>>,
    /// Job state string; `"pending"` and `"success"` are the values the helper
    /// methods on this type distinguish.
    status: String,
    /// Capture timestamp; used as the timestamp path segment of the wayback URL.
    timestamp: Option<String>,
}
impl Status {
    /// Returns the raw status string reported for the job.
    pub fn status(&self) -> &str {
        &self.status
    }

    /// Reports whether the job has finished, successfully or not.
    ///
    /// Any status other than `"pending"` counts as finished.
    pub fn is_complete(&self) -> bool {
        !matches!(self.status.as_str(), "pending")
    }

    /// Reports whether the job finished with status `"success"`.
    pub fn is_success(&self) -> bool {
        matches!(self.status.as_str(), "success")
    }

    /// Builds the URL of the archived copy of the page.
    ///
    /// The result has the form
    /// `https://web.archive.org/web/{timestamp}/{original_url}`. Returns
    /// `None` unless the job succeeded and both the timestamp and the original
    /// URL are known, or if the assembled string does not parse as a URL.
    pub fn wayback_url(&self) -> Option<Url> {
        match (self.is_success(), &self.timestamp, &self.original_url) {
            (true, Some(captured_at), Some(page)) => {
                let assembled = format!("https://web.archive.org/web/{}/{}", captured_at, page);
                Url::parse(&assembled).ok()
            }
            _ => None,
        }
    }

    /// Returns the capture timestamp, if the response carried one.
    pub fn timestamp(&self) -> Option<&str> {
        self.timestamp.as_deref()
    }

    /// Returns the URL that was submitted for archiving, if the response
    /// carried one.
    pub fn original_url(&self) -> Option<&Url> {
        self.original_url.as_ref()
    }
}
#[cfg(test)]
mod test {
    use miette::IntoDiagnostic;

    use super::{Job, Status};

    /// `Job` deserializes from both response shapes seen when submitting a
    /// page: with and without the optional `message` field.
    #[test]
    fn deserialize_jobs() -> miette::Result<()> {
        // Successful job submission: no `message` field.
        let fresh: Job = serde_json::from_str(
            r#"{"url":"https://theincredibleholk.org/about/","job_id":"spn2-ffa890ab71a52c6ca87389d4f214becedbaa275a"}"#,
        )
        .into_diagnostic()?;
        assert_eq!(
            fresh.job_id(),
            "spn2-ffa890ab71a52c6ca87389d4f214becedbaa275a"
        );
        assert_eq!(
            fresh.url.as_ref().map(|u| u.as_str()),
            Some("https://theincredibleholk.org/about/")
        );
        assert!(fresh.message.is_none());

        // A page that has already been submitted: the response adds a `message`.
        let repeated: Job = serde_json::from_str(
            r#"{"url":"https://theincredibleholk.org/about/","job_id":"spn2-ffa890ab71a52c6ca87389d4f214becedbaa275a","message":"The same snapshot had been made 1 second ago. You can make new capture of this URL after 1 hour."}"#,
        )
        .into_diagnostic()?;
        assert_eq!(
            repeated.job_id(),
            "spn2-ffa890ab71a52c6ca87389d4f214becedbaa275a"
        );
        assert_eq!(
            repeated.message.as_deref(),
            Some("The same snapshot had been made 1 second ago. You can make new capture of this URL after 1 hour.")
        );
        Ok(())
    }

    /// `Status` deserializes from a complete status response for a finished
    /// job.
    #[test]
    fn deserialize_job_status() -> miette::Result<()> {
        // Example status response from the API. It is pretty-printed so that
        // line breaks fall only between JSON tokens, never inside a string.
        let raw_json = r#"{
            "counters": {"embeds": 8, "outlinks": 47},
            "delay_wb_availability": true,
            "duration_sec": 12.72,
            "http_status": 200,
            "job_id": "spn2-c093004522eaa435107c0d9ee8aac46a17199841",
            "original_url": "https://theincredibleholk.org/",
            "outlinks": [
                "https://github.com/eholk",
                "https://mastodon.social/@theincredibleholk",
                "https://theincredibleholk.org/blog/2023/07/11/how-to-elect-rust-project-directors/",
                "https://theincredibleholk.org/blog/2023/01/25/hello-from-erics-blog-generator/",
                "https://rust-lang.zulipchat.com/#narrow/stream/213817-t-lang/topic/Where.20to.20talk.20about.20.60try.20.7B.7D.60.2C.20.60yeet.60.2C.20etc.3F",
                "https://theincredibleholk.org/blog/2023/11/08/cancellation-async-state-machines/",
                "https://theincredibleholk.org/office-hours/",
                "https://ryanlevick.com/",
                "https://smallcultfollowing.com/babysteps/blog/2023/02/01/async-trait-send-bounds-part-1-intro/",
                "https://github.com/rust-lang/rust/pull/118457",
                "https://veykril.github.io/about/",
                "https://github.com/eholk/ebg",
                "https://theincredibleholk.org/blog/2023/06/20/rust-leadership-council/",
                "https://theincredibleholk.org/blog/2023/12/15/rethinking-rusts-function-declaration-syntax/",
                "https://theincredibleholk.org/blog/2023/06/23/an-exercise-on-culture/",
                "https://theincredibleholk.org/atom.xml",
                "https://yaah.dev/",
                "https://github.com/rust-lang/rust/pull/118420",
                "https://theincredibleholk.org/about/",
                "https://github.com/rust-lang/wg-async/issues/297",
                "https://www.prdaily.com/how-microsoft-manages-culture-change/",
                "https://theincredibleholk.org/blog/2023/11/14/a-mechanism-for-async-cancellation/",
                "https://theincredibleholk.org/blog/2023/02/16/lightweight-predictable-async-send-bounds/",
                "https://theincredibleholk.org/blog/2023/01/24/who-makes-the-boxes/",
                "http://www.apache.org/licenses/LICENSE-2.0",
                "https://smallcultfollowing.com/babysteps/blog/2023/02/13/return-type-notation-send-bounds-part-2/",
                "https://github.com/orgs/rust-lang/projects/28/views/1",
                "https://theincredibleholk.org/blog/2023/02/13/inferred-async-send-bounds/",
                "https://doc.rust-lang.org/std/ops/trait.Try.html#impl-Try-for-Option%3CT%3E",
                "http://creativecommons.org/licenses/by-nc/4.0/",
                "https://theincredibleholk.org/blog/archives/",
                "https://theincredibleholk.org/papers/"
            ],
            "resources": [
                "https://theincredibleholk.org/",
                "https://theincredibleholk.org/assets/main.css",
                "https://theincredibleholk.org/images/cc-by-nc-4.0-88x31.png",
                "https://theincredibleholk.org/assets/fonts/BerkeleyMono-Bold.woff2",
                "https://theincredibleholk.org/assets/fonts/BerkeleyMono-Regular.woff2",
                "https://theincredibleholk.org/assets/fonts/D-DIN.otf",
                "https://theincredibleholk.org/assets/fonts/BerkeleyMono-Italic.woff2",
                "https://theincredibleholk.org/assets/fonts/D-DIN-Italic.otf"
            ],
            "status": "success",
            "timestamp": "20240104034229"
        }"#;
        let status: Status = serde_json::from_str(raw_json).into_diagnostic()?;
        assert!(status.is_success());
        assert_eq!(
            status.job_id,
            "spn2-c093004522eaa435107c0d9ee8aac46a17199841"
        );
        assert_eq!(status.http_status, Some(200));
        assert_eq!(
            status.original_url().map(|u| u.as_str()),
            Some("https://theincredibleholk.org/")
        );
        assert_eq!(status.outlinks.as_ref().map(Vec::len), Some(32));
        assert_eq!(status.resources.as_ref().map(Vec::len), Some(8));
        assert_eq!(status.timestamp(), Some("20240104034229"));
        Ok(())
    }

    /// The helper methods agree with the fields of a successful status.
    #[test]
    fn status_helper_methods() -> miette::Result<()> {
        let raw_json = r#"{"counters":{"embeds":8,"outlinks":47},"delay_wb_availability":true,"duration_sec":12.72,"http_status":200,"job_id":"spn2-test","original_url":"https://example.com/","status":"success","timestamp":"20240104034229"}"#;
        let status: Status = serde_json::from_str(raw_json).into_diagnostic()?;
        assert!(status.is_complete());
        assert!(status.is_success());
        assert_eq!(status.timestamp(), Some("20240104034229"));
        let wayback_url = status.wayback_url().expect("should have wayback URL");
        assert_eq!(
            wayback_url.as_str(),
            "https://web.archive.org/web/20240104034229/https://example.com/"
        );
        Ok(())
    }

    /// A pending status is neither complete nor successful and has no
    /// wayback URL.
    #[test]
    fn status_pending() -> miette::Result<()> {
        let raw_json = r#"{"job_id":"spn2-test","status":"pending"}"#;
        let status: Status = serde_json::from_str(raw_json).into_diagnostic()?;
        assert!(!status.is_complete());
        assert!(!status.is_success());
        assert!(status.wayback_url().is_none());
        Ok(())
    }
}