libnetrunner/cdx.rs

use chrono::{Duration, Utc};
use reqwest::{Client, Error};
use std::collections::HashSet;
use tokio_retry::strategy::ExponentialBackoff;
use tokio_retry::Retry;

static ARCHIVE_CDX_ENDPOINT: &str = "https://web.archive.org/cdx/search/cdx";

type CDXResumeKey = Option<String>;
type FetchCDXResult = anyhow::Result<(HashSet<String>, CDXResumeKey)>;

const ARCHIVE_WEB_ENDPOINT: &str = "https://web.archive.org/web";
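
/// Builds a Wayback Machine URL pointing at the most recent capture of `url`.
/// The `id_` flag requests the raw archived page without the archive toolbar.
///
/// Illustrative example (the timestamp portion depends on the current date and
/// the input URL is a placeholder):
/// ```ignore
/// let archived = create_archive_url("https://example.com/page");
/// // e.g. "https://web.archive.org/web/20240115000000id_/https://example.com/page"
/// assert!(archived.starts_with("https://web.archive.org/web/"));
/// ```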
pub fn create_archive_url(url: &str) -> String {
    // Always try to grab the latest archived crawl
    let date = Utc::now();
    format!(
        "{}/{}000000id_/{}",
        ARCHIVE_WEB_ENDPOINT,
        date.format("%Y%m%d"),
        url
    )
}
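
/// Queries the Wayback Machine CDX API for pages captured under `prefix`,
/// returning up to `limit` original URLs plus an optional resume key that can
/// be passed back in to fetch the next page of results.
///
/// Illustrative call sketch (the prefix and limit are placeholders; assumes an
/// async context that can propagate `anyhow::Error`):
/// ```ignore
/// let client = reqwest::Client::new();
/// let (urls, resume) = fetch_cdx(&client, "example.com/docs/", 100, None).await?;
/// if let Some(key) = resume {
///     let (more_urls, _) = fetch_cdx(&client, "example.com/docs/", 100, Some(key)).await?;
/// }
/// ```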
pub async fn fetch_cdx(
    client: &Client,
    prefix: &str,
    limit: usize,
    resume_key: Option<String>,
) -> FetchCDXResult {
    let last_year = Utc::now() - Duration::weeks(52);
    let last_year = last_year.format("%Y").to_string();

    // More docs on parameters here:
    // https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server#filtering
    let mut params: Vec<(String, String)> = vec![
        // TODO: Make this configurable in the lens format?
        ("matchType".into(), "prefix".into()),
        // Only successful pages
        ("filter".into(), "statuscode:200".into()),
        // Only HTML docs
        ("filter".into(), "mimetype:text/html".into()),
        // Remove dupes
        ("collapse".into(), "urlkey".into()),
        // If there are too many URLs, this lets us paginate
        ("showResumeKey".into(), "true".into()),
        ("limit".into(), limit.to_string()),
        // Only care about the original URL crawled
        ("fl".into(), "original".into()),
        // Only look at captures from within the last year.
        ("from".into(), last_year),
        ("url".into(), prefix.into()),
    ];

    if let Some(resume) = resume_key {
        params.push(("resumeKey".into(), resume));
    }

    let response = fetch_cdx_page(client, params).await?;

    let mut urls = HashSet::new();
    let mut resume_key = None;

    for url in response.split('\n') {
        if url.is_empty() {
            continue;
        }

        // Any line after the first `limit` URLs is the resume key.
        if urls.len() >= limit {
            resume_key = Some(url.to_string());
        } else {
            urls.insert(url.into());
        }
    }

    Ok((urls, resume_key))
}
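
/// Fetches a single page of raw CDX results for the given query parameters,
/// retrying with exponential backoff when the request or body read fails
/// (e.g. if we're hitting the endpoint too fast). Returns the newline-delimited body.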
async fn fetch_cdx_page(
    client: &Client,
    params: Vec<(String, String)>,
) -> Result<String, Error> {
    let retry_strat = ExponentialBackoff::from_millis(1000).take(3);
    // If we're hitting the CDX endpoint too fast, wait a little bit before retrying.
    Retry::spawn(retry_strat, || async {
        let req = client.get(ARCHIVE_CDX_ENDPOINT).query(&params);
        let resp = req.send().await;
        match resp {
            Ok(resp) => resp.text().await,
            Err(err) => Err(err),
        }
    })
    .await
}
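
// Lightweight test sketch for the archive URL format; it asserts only the
// date-independent parts of the output, and the input URL is a placeholder.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_create_archive_url() {
        let url = create_archive_url("https://example.com");
        assert!(url.starts_with(ARCHIVE_WEB_ENDPOINT));
        assert!(url.ends_with("000000id_/https://example.com"));
    }
}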