cloudscraper_rs/challenges/solvers/
rate_limit.rs

//! Handler for Cloudflare rate limiting responses (error code 1015, served with HTTP 429).
//!
//! Recommends adaptive delays based on the `Retry-After` header and page content when
//! 1015 responses are encountered.
5
6use std::time::Duration;
7
8use chrono::{DateTime, Utc};
9use once_cell::sync::Lazy;
10use rand::Rng;
11use regex::{Regex, RegexBuilder};
12use thiserror::Error;
13
14use crate::challenges::core::{ChallengeResponse, is_cloudflare_response};
15
16use super::{ChallengeSolver, FailureRecorder, MitigationPlan};
17
/// Lower bound, in seconds, of the fallback delay window used by `RateLimitHandler::new`.
const DEFAULT_DELAY_MIN_SECS: f32 = 60.0;
/// Upper bound, in seconds, of the fallback delay window used by `RateLimitHandler::new`.
const DEFAULT_DELAY_MAX_SECS: f32 = 180.0;
20
/// Advises backoff windows for Cloudflare 1015 (rate limit) responses.
///
/// The two stored durations bound the randomized fallback delay that is used
/// when neither the response headers nor the page body yield a usable delay.
#[derive(Debug, Clone)]
pub struct RateLimitHandler {
    /// Lower bound of the randomized fallback delay.
    delay_min: Duration,
    /// Upper bound of the randomized fallback delay.
    delay_max: Duration,
}
26
27impl RateLimitHandler {
28    pub fn new() -> Self {
29        Self {
30            delay_min: Duration::from_secs_f32(DEFAULT_DELAY_MIN_SECS),
31            delay_max: Duration::from_secs_f32(DEFAULT_DELAY_MAX_SECS),
32        }
33    }
34
35    pub fn with_delay_range(mut self, min: Duration, max: Duration) -> Self {
36        self.delay_min = min;
37        self.delay_max = if max < min { min } else { max };
38        self
39    }
40
41    pub fn is_rate_limited(response: &ChallengeResponse<'_>) -> bool {
42        is_cloudflare_response(response)
43            && response.status == 429
44            && RATE_LIMIT_RE.is_match(response.body)
45    }
46
47    pub fn plan(
48        &self,
49        response: &ChallengeResponse<'_>,
50        state_recorder: Option<&dyn FailureRecorder>,
51    ) -> Result<MitigationPlan, RateLimitError> {
52        if !Self::is_rate_limited(response) {
53            return Err(RateLimitError::NotRateLimited);
54        }
55
56        if let Some(recorder) = state_recorder
57            && let Some(domain) = response.url.host_str()
58        {
59            recorder.record_failure(domain, "cf_rate_limit");
60        }
61
62        let (delay, source) = self.determine_delay(response);
63        let mut plan = MitigationPlan::retry_after(delay, "rate_limit");
64        plan.metadata.insert("delay_source".into(), source);
65        plan.metadata.insert("trigger".into(), "cf_1015".into());
66
67        Ok(plan)
68    }
69
70    fn determine_delay(&self, response: &ChallengeResponse<'_>) -> (Duration, String) {
71        if let Some(delay) = self.retry_after_header(response) {
72            return (delay, "header".into());
73        }
74
75        if let Some(delay) = self.delay_from_body(response.body) {
76            return (delay, "body".into());
77        }
78
79        (self.random_delay(), "default".into())
80    }
81
82    fn retry_after_header(&self, response: &ChallengeResponse<'_>) -> Option<Duration> {
83        use http::header::RETRY_AFTER;
84
85        let raw = response.headers.get(RETRY_AFTER)?.to_str().ok()?;
86        if let Ok(seconds) = raw.trim().parse::<f64>()
87            && seconds.is_finite()
88            && seconds >= 0.0
89        {
90            return Some(Duration::from_secs_f64(seconds));
91        }
92
93        if let Ok(date) = DateTime::parse_from_rfc2822(raw.trim())
94            .or_else(|_| DateTime::parse_from_rfc3339(raw.trim()))
95            && let Ok(duration) = (date.with_timezone(&Utc) - Utc::now()).to_std()
96        {
97            return Some(duration);
98        }
99
100        None
101    }
102
103    fn delay_from_body(&self, body: &str) -> Option<Duration> {
104        let caps = RATE_LIMIT_DELAY_RE.captures(body)?;
105        let amount: u64 = caps.get(1)?.as_str().parse().ok()?;
106        let unit = caps.get(2)?.as_str().to_lowercase();
107        let multiplier = match unit.as_str() {
108            "second" | "seconds" => 1,
109            "minute" | "minutes" => 60,
110            "hour" | "hours" => 3600,
111            _ => 1,
112        };
113        Some(Duration::from_secs(amount * multiplier))
114    }
115
116    fn random_delay(&self) -> Duration {
117        if self.delay_max <= self.delay_min {
118            return self.delay_min;
119        }
120        let mut rng = rand::thread_rng();
121        let min = self.delay_min.as_secs_f32();
122        let max = self.delay_max.as_secs_f32();
123        Duration::from_secs_f32(rng.gen_range(min..max))
124    }
125}
126
127impl Default for RateLimitHandler {
128    fn default() -> Self {
129        Self::new()
130    }
131}
132
133impl ChallengeSolver for RateLimitHandler {
134    fn name(&self) -> &'static str {
135        "rate_limit"
136    }
137}
138
139#[derive(Debug, Error)]
140pub enum RateLimitError {
141    #[error("response is not a Cloudflare rate limit challenge")]
142    NotRateLimited,
143}
144
145static RATE_LIMIT_RE: Lazy<Regex> = Lazy::new(|| {
146    RegexBuilder::new(
147        r#"(<span[^>]*class=['"]cf-error-code['"]>1015<|rate limited|You are being rate limited)"#,
148    )
149    .case_insensitive(true)
150    .dot_matches_new_line(true)
151    .build()
152    .expect("invalid rate limit regex")
153});
154
155static RATE_LIMIT_DELAY_RE: Lazy<Regex> = Lazy::new(|| {
156    RegexBuilder::new(r#"(\d+)\s*(second|seconds|minute|minutes|hour|hours)"#)
157        .case_insensitive(true)
158        .build()
159        .expect("invalid delay regex")
160});
161
#[cfg(test)]
mod tests {
    use super::*;
    use http::{
        HeaderMap, HeaderValue, Method,
        header::{HeaderName, RETRY_AFTER, SERVER},
    };
    use url::Url;

    /// Owns the data that a borrowed `ChallengeResponse` points into.
    struct ResponseFixture {
        url: Url,
        headers: HeaderMap,
        method: Method,
        body: String,
        status: u16,
    }

    impl ResponseFixture {
        /// Builds a header-less GET fixture for a fixed example URL.
        fn new(body: &str, status: u16) -> Self {
            let url = Url::parse("https://example.com/rate-limited").unwrap();
            Self {
                url,
                headers: HeaderMap::new(),
                method: Method::GET,
                body: body.to_owned(),
                status,
            }
        }

        /// Sets (or replaces) a single response header.
        fn insert_header(&mut self, name: HeaderName, value: HeaderValue) {
            self.headers.insert(name, value);
        }

        /// Borrows the fixture as the response view the handler consumes.
        fn response(&self) -> ChallengeResponse<'_> {
            let Self {
                url,
                headers,
                method,
                body,
                status,
            } = self;
            ChallengeResponse {
                url,
                status: *status,
                headers,
                body,
                request_method: method,
            }
        }
    }

    /// Builds a 429 fixture that already carries a `Server: cloudflare` header.
    fn cloudflare_429(body: &str) -> ResponseFixture {
        let mut fixture = ResponseFixture::new(body, 429);
        fixture.insert_header(SERVER, HeaderValue::from_static("cloudflare"));
        fixture
    }

    #[test]
    fn detects_rate_limit() {
        let fixture =
            cloudflare_429("<span class='cf-error-code'>1015</span>You are being rate limited");
        assert!(RateLimitHandler::is_rate_limited(&fixture.response()));
    }

    #[test]
    fn plan_uses_retry_after_header() {
        let mut fixture = cloudflare_429("<span class='cf-error-code'>1015</span> Rate limited");
        fixture.insert_header(RETRY_AFTER, HeaderValue::from_static("120"));

        let plan = RateLimitHandler::new()
            .plan(&fixture.response(), None)
            .expect("plan");

        assert!(plan.should_retry);
        assert_eq!(plan.wait.unwrap(), Duration::from_secs(120));
        let expected_source = "header".to_string();
        assert_eq!(plan.metadata.get("delay_source"), Some(&expected_source));
    }

    #[test]
    fn plan_extracts_delay_from_body() {
        let fixture = cloudflare_429(
            "<span class='cf-error-code'>1015</span> Please wait 10 minutes before retrying",
        );

        let plan = RateLimitHandler::new()
            .plan(&fixture.response(), None)
            .expect("plan");

        assert!(plan.wait.unwrap() >= Duration::from_secs(600));
        let expected_source = "body".to_string();
        assert_eq!(plan.metadata.get("delay_source"), Some(&expected_source));
    }
}
246}