#[macro_use]
extern crate log;
#[cfg(feature = "with-serde")]
use serde::{Deserialize, Serialize};
use chrono::offset::TimeZone;
use chrono::DateTime;
use futures::{stream, StreamExt};
use reqwest::{header, IntoUrl};
use std::fmt;
use std::rc::Rc;
/// Errors produced by this crate.
#[derive(Debug)]
pub enum Error {
/// A failure reported by `reqwest` (wrapped through the `From` impl, so `?`
/// on a reqwest call yields this variant).
Reqwest(reqwest::Error),
/// No submit token could be obtained: either the token page body could not
/// be read or no token was found in it.
MissingToken,
/// The capture request completed but no archive url could be extracted from
/// the response. Carries the url that was being archived.
MissingUrl(String),
/// The response body started with `<h1>Server Error</h1>`. Carries the url
/// that was being archived.
ServerError(String),
}
impl From<reqwest::Error> for Error {
fn from(err: reqwest::Error) -> Self {
Error::Reqwest(err)
}
}
// Marker impl: only the trait's default methods are used, so no `source()`
// override is provided here.
impl std::error::Error for Error {}
impl fmt::Display for Error {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
Error::MissingToken => write!(f, "Missing required token."),
Error::Reqwest(err) => err.fmt(f),
Error::MissingUrl(url) => write!(f, "Missing archiveis url after archiving {}", url),
Error::ServerError(url) => write!(f, "Encountered server error for {}", url),
}
}
}
/// Shorthand for a `std::result::Result` whose error type is this crate's [`Error`].
pub type Result<T> = ::std::result::Result<T, Error>;
/// Outcome of a successful capture request.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
pub struct Archived {
/// String form of the parsed url that was submitted for archiving.
pub target_url: String,
/// Url of the archived copy, taken from the `Refresh` response header or,
/// failing that, from the `og:url` meta tag of the response body.
pub archived_url: String,
/// Time parsed from the `Date` response header. `None` when the header is
/// absent or unparseable, and always `None` when the archive url was read
/// from the response body instead of the `Refresh` header.
pub time_stamp: Option<DateTime<chrono::Utc>>,
/// The submit token that was sent with the capture request.
pub submit_token: String,
}
/// Client that issues token and capture requests through a `reqwest::Client`.
pub struct ArchiveClient {
// Reference-counted handle to the HTTP client; `new` configures it with the
// user agent as a default header.
client: Rc<reqwest::Client>,
}
impl ArchiveClient {
pub fn new<T: ToString>(user_agent: T) -> Self {
let mut headers = header::HeaderMap::with_capacity(1);
headers.insert(
header::USER_AGENT,
user_agent
.to_string()
.parse()
.expect("Failed to parse user agent."),
);
let client = reqwest::ClientBuilder::default()
.default_headers(headers)
.build()
.expect("Failed to create reqwest client");
ArchiveClient {
client: Rc::new(client),
}
}
pub async fn capture_all<U: IntoUrl>(self, links: Vec<U>) -> Result<Vec<Result<Archived>>> {
let token = self.get_unique_token().await?;
Ok(stream::iter(
links
.into_iter()
.map(|url| async { self.capture_with_token(url, token.clone()).await }),
)
.buffer_unordered(10)
.collect::<Vec<_>>()
.await)
}
pub async fn capture<U: IntoUrl>(&self, url: U) -> Result<Archived> {
self.capture_with_token(url, self.get_unique_token().await?)
.await
}
pub async fn capture_with_token<U: IntoUrl, T: ToString>(
&self,
url: U,
submit_token: T,
) -> Result<Archived> {
let target_url = url.into_url()?;
let submit_token = submit_token.to_string();
let body: String = url::form_urlencoded::Serializer::new(String::new())
.append_pair("url", target_url.as_str())
.append_pair("anyway", "1")
.append_pair("submitid", &submit_token)
.finish();
let resp = self
.client
.post(target_url.clone())
.body(body)
.send()
.await?;
if let Some(archived_url) = resp.headers().get("Refresh").and_then(|x| {
x.to_str()
.ok()
.and_then(|x| x.split('=').nth(1).map(str::to_string))
}) {
let time_stamp = resp.headers().get("Date").and_then(|x| {
x.to_str()
.ok()
.and_then(|x| chrono::Utc.datetime_from_str(x, "%a, %e %b %Y %T GMT").ok())
});
let archived = Archived {
target_url: target_url.to_string(),
archived_url,
time_stamp,
submit_token: submit_token.to_string(),
};
debug!(
"Archived target url {} at {}",
archived.target_url, archived.archived_url
);
return Ok(archived);
} else {
if let Ok(html) = resp.text().await {
if html.starts_with("<h1>Server Error</h1>") {
error!("Server Error while archiving {}", target_url);
return Err(Error::ServerError(target_url.to_string()));
}
let archived_url =
html.splitn(2, "<meta property=\"og:url\"")
.nth(1)
.and_then(|x| {
x.splitn(2, "content=\"")
.nth(1)
.and_then(|id| id.splitn(2, '\"').next().map(str::to_owned))
});
if let Some(archived_url) = archived_url {
let archived = Archived {
target_url: target_url.to_string(),
archived_url,
time_stamp: None,
submit_token: submit_token.to_string(),
};
debug!(
"Archived target url {} at {}",
archived.target_url, archived.archived_url
);
return Ok(archived);
}
}
error!("Failed to archive {}", target_url);
return Err(Error::MissingUrl(target_url.into_string()));
}
}
pub async fn get_unique_token(&self) -> Result<String> {
let html = self
.client
.get("http://archive.is/")
.send()
.await?
.text()
.await
.map_err(|_| Error::MissingToken)?;
html.rsplitn(2, "name=\"submitid")
.next()
.and_then(|x| {
x.splitn(2, "value=\"")
.nth(1)
.and_then(|token| token.splitn(2, '\"').next().map(str::to_string))
})
.ok_or(Error::MissingToken)
}
}
impl Default for ArchiveClient {
fn default() -> Self {
ArchiveClient::new("archiveis-rs")
}
}
#[cfg(test)]
mod tests {
    /// The token is the `value` attribute that follows the `submitid` field.
    #[test]
    fn extract_unique_token() {
        let markup = r###"type="hidden" name="submitid" value="1yPA39C6QcM84Dzspl+7s28rrAFOnliPMCiJtoP+OlTKmd5kJd21G4ucgTkx0mnZ"/>"###;
        let expected = "1yPA39C6QcM84Dzspl+7s28rrAFOnliPMCiJtoP+OlTKmd5kJd21G4ucgTkx0mnZ";

        // Walk the sections from the back and take the first one that
        // contains a quoted `value` attribute.
        let found = markup
            .rsplitn(2, "name=\"submitid")
            .find_map(|section| {
                section
                    .splitn(2, "value=\"")
                    .nth(1)
                    .and_then(|rest| rest.splitn(2, '"').next())
            });

        assert_eq!(found, Some(expected));
    }
}