use anyhow::Result;
use serde_json::Value;
use std::future::Future;
use std::pin::Pin;
use super::Provider;
/// URL provider backed by the Internet Archive Wayback Machine CDX search API.
///
/// Holds the per-provider HTTP settings used when `fetch_urls` builds its
/// client. Instances start from `WaybackMachineProvider::new()` and are then
/// adjusted through the `with_*` setters of the `Provider` trait.
#[derive(Clone)]
pub struct WaybackMachineProvider {
// When true, the CDX query targets `*.{domain}` instead of `{domain}`.
include_subdomains: bool,
// Optional proxy URL applied to all requests of the HTTP client.
proxy: Option<String>,
// Optional proxy credentials in `username:password` form (basic auth).
proxy_auth: Option<String>,
// Request timeout in seconds.
timeout: u64,
// Number of retries after the first attempt (total attempts = retries + 1).
retries: u32,
// When true, a User-Agent is picked from a built-in list of browser strings.
random_agent: bool,
// When true, invalid TLS certificates are accepted.
insecure: bool,
// Stored via `with_parallel`; not read anywhere else in this file.
parallel: u32,
// Stored via `with_rate_limit`; not read anywhere else in this file.
rate_limit: Option<f32>,
}
impl WaybackMachineProvider {
pub fn new() -> Self {
WaybackMachineProvider {
include_subdomains: false,
proxy: None,
proxy_auth: None,
timeout: 30,
retries: 3,
random_agent: false,
insecure: false,
parallel: 5,
rate_limit: None,
}
}
}
impl Provider for WaybackMachineProvider {
fn clone_box(&self) -> Box<dyn Provider> {
Box::new(self.clone())
}
fn fetch_urls<'a>(
&'a self,
domain: &'a str,
) -> Pin<Box<dyn Future<Output = Result<Vec<String>>> + Send + 'a>> {
Box::pin(async move {
let url = if self.include_subdomains {
format!(
"https://web.archive.org/cdx/search/cdx?url=*.{}/*&output=json&fl=original",
domain
)
} else {
format!(
"https://web.archive.org/cdx/search/cdx?url={}/*&output=json&fl=original",
domain
)
};
let mut client_builder =
reqwest::Client::builder().timeout(std::time::Duration::from_secs(self.timeout));
if self.insecure {
client_builder = client_builder.danger_accept_invalid_certs(true);
}
if self.random_agent {
let user_agents = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Safari/605.1.15",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/115.0",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
"Mozilla/5.0 (iPhone; CPU iPhone OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Mobile/15E148 Safari/604.1",
];
let random_index = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap()
.as_secs() as usize
% user_agents.len();
client_builder = client_builder.user_agent(user_agents[random_index]);
}
if let Some(proxy_url) = &self.proxy {
let mut proxy = reqwest::Proxy::all(proxy_url)?;
if let Some(auth) = &self.proxy_auth {
proxy = proxy.basic_auth(
auth.split(':').next().unwrap_or(""),
auth.split(':').nth(1).unwrap_or(""),
);
}
client_builder = client_builder.proxy(proxy);
}
let client = client_builder.build()?;
let mut last_error = None;
let mut attempt = 0;
while attempt <= self.retries {
if attempt > 0 {
tokio::time::sleep(std::time::Duration::from_millis(500 * attempt as u64))
.await;
}
match client.get(&url).send().await {
Ok(response) => {
if !response.status().is_success() {
attempt += 1;
last_error = Some(anyhow::anyhow!("HTTP error: {}", response.status()));
continue;
}
match response.text().await {
Ok(text) => {
if text.trim().is_empty() {
return Ok(Vec::new());
}
match serde_json::from_str::<Value>(&text) {
Ok(json_data) => {
let mut urls = Vec::new();
if let Value::Array(arrays) = json_data {
for (i, array) in arrays.iter().enumerate() {
if i == 0 {
continue;
}
if let Value::Array(elements) = array {
if let Some(Value::String(url)) =
elements.first()
{
urls.push(url.clone());
}
}
}
}
urls.sort();
urls.dedup();
return Ok(urls);
}
Err(e) => {
attempt += 1;
last_error = Some(e.into());
continue;
}
}
}
Err(e) => {
attempt += 1;
last_error = Some(e.into());
continue;
}
}
}
Err(e) => {
attempt += 1;
last_error = Some(e.into());
continue;
}
}
}
if let Some(e) = last_error {
Err(anyhow::anyhow!(
"Failed after {} attempts: {}",
self.retries + 1,
e
))
} else {
Err(anyhow::anyhow!(
"Failed after {} attempts",
self.retries + 1
))
}
})
}
fn with_subdomains(&mut self, include: bool) {
self.include_subdomains = include;
}
fn with_proxy(&mut self, proxy: Option<String>) {
self.proxy = proxy;
}
fn with_proxy_auth(&mut self, auth: Option<String>) {
self.proxy_auth = auth;
}
fn with_timeout(&mut self, seconds: u64) {
self.timeout = seconds;
}
fn with_retries(&mut self, count: u32) {
self.retries = count;
}
fn with_random_agent(&mut self, enabled: bool) {
self.random_agent = enabled;
}
fn with_insecure(&mut self, enabled: bool) {
self.insecure = enabled;
}
fn with_parallel(&mut self, parallel: u32) {
self.parallel = parallel;
}
fn with_rate_limit(&mut self, rate_limit: Option<f32>) {
self.rate_limit = rate_limit;
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Rebuilds the CDX query URL the same way `fetch_urls` does.
    ///
    /// This mirrors the production branching instead of calling it, so the
    /// URL tests below only pin the expected string shapes.
    fn mirrored_query_url(provider: &WaybackMachineProvider, domain: &str) -> String {
        match provider.include_subdomains {
            true => format!(
                "https://web.archive.org/cdx/search/cdx?url=*.{}/*&output=json&fl=original",
                domain
            ),
            false => format!(
                "https://web.archive.org/cdx/search/cdx?url={}/*&output=json&fl=original",
                domain
            ),
        }
    }

    /// `new()` yields the documented defaults for every field.
    #[test]
    fn test_new_provider() {
        // Exhaustive destructuring: adding a field forces this test to be updated.
        let WaybackMachineProvider {
            include_subdomains,
            proxy,
            proxy_auth,
            timeout,
            retries,
            random_agent,
            insecure,
            parallel,
            rate_limit,
        } = WaybackMachineProvider::new();

        assert!(!include_subdomains);
        assert!(proxy.is_none());
        assert!(proxy_auth.is_none());
        assert_eq!(timeout, 30);
        assert_eq!(retries, 3);
        assert!(!random_agent);
        assert!(!insecure);
        assert_eq!(parallel, 5);
        assert!(rate_limit.is_none());
    }

    /// The subdomain flag is stored by its setter.
    #[test]
    fn test_with_subdomains() {
        let mut subject = WaybackMachineProvider::new();
        subject.with_subdomains(true);
        assert!(subject.include_subdomains);
    }

    /// The proxy URL is stored verbatim.
    #[test]
    fn test_with_proxy() {
        let mut subject = WaybackMachineProvider::new();
        subject.with_proxy(Some("http://proxy.example.com:8080".to_string()));
        assert_eq!(
            subject.proxy.as_deref(),
            Some("http://proxy.example.com:8080")
        );
    }

    /// The proxy credentials are stored verbatim.
    #[test]
    fn test_with_proxy_auth() {
        let mut subject = WaybackMachineProvider::new();
        subject.with_proxy_auth(Some("user:pass".to_string()));
        assert_eq!(subject.proxy_auth.as_deref(), Some("user:pass"));
    }

    /// The timeout is stored by its setter.
    #[test]
    fn test_with_timeout() {
        let mut subject = WaybackMachineProvider::new();
        subject.with_timeout(60);
        assert_eq!(subject.timeout, 60);
    }

    /// The retry count is stored by its setter.
    #[test]
    fn test_with_retries() {
        let mut subject = WaybackMachineProvider::new();
        subject.with_retries(5);
        assert_eq!(subject.retries, 5);
    }

    /// The random-agent flag is stored by its setter.
    #[test]
    fn test_with_random_agent() {
        let mut subject = WaybackMachineProvider::new();
        subject.with_random_agent(true);
        assert!(subject.random_agent);
    }

    /// The insecure flag is stored by its setter.
    #[test]
    fn test_with_insecure() {
        let mut subject = WaybackMachineProvider::new();
        subject.with_insecure(true);
        assert!(subject.insecure);
    }

    /// The parallelism value is stored by its setter.
    #[test]
    fn test_with_parallel() {
        let mut subject = WaybackMachineProvider::new();
        subject.with_parallel(10);
        assert_eq!(subject.parallel, 10);
    }

    /// The rate limit is stored by its setter.
    #[test]
    fn test_with_rate_limit() {
        let mut subject = WaybackMachineProvider::new();
        subject.with_rate_limit(Some(2.5));
        assert_eq!(subject.rate_limit, Some(2.5));
    }

    /// `clone_box` produces a boxed trait object without panicking.
    #[test]
    fn test_clone_box() {
        let _boxed: Box<dyn Provider> = WaybackMachineProvider::new().clone_box();
    }

    /// With default settings the query targets the bare domain.
    #[tokio::test]
    async fn test_fetch_urls_builds_correct_url_without_subdomains() {
        let subject = WaybackMachineProvider::new();
        let domain = "test-domain-that-does-not-exist-xyz.example";

        let wanted = format!(
            "https://web.archive.org/cdx/search/cdx?url={}/*&output=json&fl=original",
            domain
        );

        assert_eq!(mirrored_query_url(&subject, domain), wanted);
    }

    /// With subdomains enabled the query targets `*.{domain}`.
    #[tokio::test]
    async fn test_fetch_urls_builds_correct_url_with_subdomains() {
        let mut subject = WaybackMachineProvider::new();
        subject.with_subdomains(true);
        let domain = "test-domain-that-does-not-exist-xyz.example";

        let wanted = format!(
            "https://web.archive.org/cdx/search/cdx?url=*.{}/*&output=json&fl=original",
            domain
        );

        assert_eq!(mirrored_query_url(&subject, domain), wanted);
    }
}