1use chrono::{Duration, Utc};
2use reqwest::{Client, Error};
3use std::collections::HashSet;
4use tokio_retry::strategy::ExponentialBackoff;
5use tokio_retry::Retry;
6
/// URL of the Wayback Machine CDX search endpoint queried by `fetch_cdx_page`.
///
/// Declared `const` (rather than `static`) to match `ARCHIVE_WEB_ENDPOINT`;
/// nothing relies on the value having a single fixed address.
const ARCHIVE_CDX_ENDPOINT: &str = "https://web.archive.org/cdx/search/cdx";
8
/// Resume key for continuing a paginated CDX query; `None` when no key is available.
9type CDXResumeKey = Option<String>;
/// Return type of `fetch_cdx`: the set of URL strings from one page together with
/// the optional resume key for the next page.
10type FetchCDXResult = anyhow::Result<(HashSet<String>, CDXResumeKey)>;
11
/// Base URL that `create_archive_url` prepends when building archive links.
12const ARCHIVE_WEB_ENDPOINT: &str = "https://web.archive.org/web";
13
14pub fn create_archive_url(url: &str) -> String {
15 let date = Utc::now();
17 format!(
18 "{}/{}000000id_/{}",
19 ARCHIVE_WEB_ENDPOINT,
20 date.format("%Y%m%d"),
21 url
22 )
23}
24
25pub async fn fetch_cdx(
26 client: &Client,
27 prefix: &str,
28 limit: usize,
29 resume_key: Option<String>,
30) -> FetchCDXResult {
31 let last_year = Utc::now() - Duration::weeks(52);
32 let last_year = last_year.format("%Y").to_string();
33
34 let mut params: Vec<(String, String)> = vec![
37 ("matchType".into(), "prefix".into()),
39 ("filter".into(), "statuscode:200".into()),
41 ("filter".into(), "mimetype:text/html".into()),
43 ("collapse".into(), "urlkey".into()),
45 ("showResumeKey".into(), "true".into()),
47 ("limit".into(), limit.to_string()),
48 ("fl".into(), "original".into()),
50 ("from".into(), last_year),
52 ("url".into(), prefix.into()),
53 ];
54
55 if let Some(resume) = resume_key {
56 params.push(("resumeKey".into(), resume));
57 }
58
59 let response = fetch_cdx_page(client, params).await?;
60
61 let mut urls = HashSet::new();
62 let mut resume_key = None;
63
64 for url in response.split('\n') {
65 if url.is_empty() {
66 continue;
67 }
68
69 if urls.len() >= limit {
71 resume_key = Some(url.to_string());
72 } else {
73 urls.insert(url.into());
74 }
75 }
76
77 Ok((urls, resume_key))
78}
79
80async fn fetch_cdx_page(
81 client: &Client,
82 params: Vec<(String, String)>,
83) -> anyhow::Result<String, Error> {
84 let retry_strat = ExponentialBackoff::from_millis(1000).take(3);
85 Retry::spawn(retry_strat, || async {
87 let req = client.get(ARCHIVE_CDX_ENDPOINT).query(¶ms);
88 let resp = req.send().await;
89 match resp {
90 Ok(resp) => resp.text().await,
91 Err(err) => Err(err),
92 }
93 })
94 .await
95}