Skip to main content

webfetch/
fetch.rs

1use reqwest::header::CONTENT_TYPE;
2use reqwest::{redirect::Policy, Client};
3use std::time::Duration;
4
5const USER_AGENT: &str = concat!("webfetch/", env!("CARGO_PKG_VERSION"));
6const MAX_ATTEMPTS: u32 = 3;
7
/// Outcome of a successful HTTP fetch.
///
/// Bundles the response body with the URL the request finally resolved to
/// (after any redirects were followed) and the response's `Content-Type`
/// header, when one was present.
#[derive(Debug, Clone)]
pub struct FetchedPage {
    /// Response body decoded as text.
    pub body: String,
    /// URL of the final response, i.e. after redirects.
    pub final_url: String,
    /// Raw `Content-Type` header value; `None` when the header is missing or
    /// its value cannot be represented as a string.
    pub content_type: Option<String>,
}
15
16fn build_client(timeout_secs: u64) -> anyhow::Result<Client> {
17    Ok(Client::builder()
18        .timeout(Duration::from_secs(timeout_secs))
19        .redirect(Policy::limited(5))
20        .user_agent(USER_AGENT)
21        .gzip(true)
22        .brotli(true)
23        .build()?)
24}
25
26/// One request attempt. The bool in the error reports whether the failure is
27/// transient (worth retrying): connection/timeout errors, 5xx, and 429.
28async fn attempt(client: &Client, url: &str) -> Result<FetchedPage, (anyhow::Error, bool)> {
29    let resp = match client.get(url).send().await {
30        Ok(r) => r,
31        Err(e) => {
32            let transient = e.is_timeout() || e.is_connect() || e.is_request();
33            return Err((e.into(), transient));
34        }
35    };
36
37    let status = resp.status();
38    let resp = match resp.error_for_status() {
39        Ok(r) => r,
40        Err(e) => {
41            let transient = status.is_server_error() || status.as_u16() == 429;
42            return Err((e.into(), transient));
43        }
44    };
45
46    let final_url = resp.url().to_string();
47    let content_type = resp
48        .headers()
49        .get(CONTENT_TYPE)
50        .and_then(|v| v.to_str().ok())
51        .map(|s| s.to_string());
52
53    match resp.text().await {
54        Ok(body) => Ok(FetchedPage {
55            body,
56            final_url,
57            content_type,
58        }),
59        Err(e) => {
60            let transient = e.is_timeout();
61            Err((e.into(), transient))
62        }
63    }
64}
65
66/// Fetch a URL, following redirects, retrying transient failures with
67/// exponential backoff (200ms, 400ms).
68pub async fn fetch_page(url: &str, timeout_secs: u64) -> anyhow::Result<FetchedPage> {
69    let client = build_client(timeout_secs)?;
70
71    let mut delay = Duration::from_millis(200);
72    for attempt_no in 1..=MAX_ATTEMPTS {
73        match attempt(&client, url).await {
74            Ok(page) => return Ok(page),
75            Err((err, transient)) => {
76                if attempt_no == MAX_ATTEMPTS || !transient {
77                    return Err(err);
78                }
79                tokio::time::sleep(delay).await;
80                delay *= 2;
81            }
82        }
83    }
84    unreachable!("loop returns on the final attempt")
85}