cloudscraper_rs/challenges/detectors/
mod.rs

1//! Challenge detection module.
2//!
3//! Provides pattern-based identification of Cloudflare challenges along with
4//! adaptive learning hooks.
5
6use once_cell::sync::Lazy;
7use regex::Regex;
8use std::collections::{HashMap, VecDeque};
9use std::time::SystemTime;
10
11use crate::challenges::core::{ChallengeResponse, is_cloudflare_response};
12
/// High level challenge categories supported by the detector.
///
/// Every built-in signature in `KNOWN_PATTERNS` is tagged with one of these
/// categories, and the category of the winning signature is reported in
/// `ChallengeDetection::challenge_type`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ChallengeType {
    /// Category of the built-in "Cloudflare IUAM v1" signature.
    JavaScriptV1,
    /// Category of the built-in "Cloudflare IUAM v2" signature.
    JavaScriptV2,
    /// Category of the built-in "Cloudflare Managed Challenge v3" signature.
    ManagedV3,
    /// Category of the built-in "Cloudflare Turnstile" signature.
    Turnstile,
    /// Category of the built-in "Cloudflare Rate Limit" signature.
    RateLimit,
    /// Category of the built-in "Cloudflare Access Denied" signature.
    AccessDenied,
    /// Category of the built-in "Cloudflare Bot Management" signature.
    BotManagement,
    /// Not used by any built-in signature in this module.
    // NOTE(review): presumably a fallback for callers or adaptive patterns — confirm.
    Unknown,
}
25
/// Recommended response strategy for a detected challenge.
///
/// The detector only reports the strategy attached to the winning signature;
/// nothing in this module executes it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ResponseStrategy {
    /// Attached to the built-in "Cloudflare IUAM v1" signature.
    JsExecution,
    /// Attached to the built-in "Cloudflare IUAM v2" signature.
    AdvancedJsExecution,
    /// Attached to the built-in "Cloudflare Managed Challenge v3" signature.
    BrowserSimulation,
    /// Attached to the built-in "Cloudflare Turnstile" signature.
    CaptchaSolving,
    /// Attached to the built-in "Cloudflare Rate Limit" signature.
    DelayRetry,
    /// Attached to the built-in "Cloudflare Access Denied" signature.
    ProxyRotation,
    /// Attached to the built-in "Cloudflare Bot Management" signature.
    EnhancedEvasion,
    /// Not attached to any built-in signature in this module.
    // NOTE(review): presumably means "no action required" — confirm with callers.
    None,
}
38
39/// Utility to extract a normalized domain from Cloudflare responses.
40fn response_domain(response: &ChallengeResponse<'_>) -> Option<String> {
41    response.url.host_str().map(|host| host.to_lowercase())
42}
43
44/// Pattern definition used to match responses against known challenge
45/// signatures.
46#[derive(Debug, Clone)]
47struct ChallengePattern {
48    id: String,
49    name: String,
50    challenge_type: ChallengeType,
51    response_strategy: ResponseStrategy,
52    base_confidence: f32,
53    patterns: Vec<Regex>,
54    adaptive: bool,
55}
56
57impl ChallengePattern {
58    fn new(
59        id: impl Into<String>,
60        name: impl Into<String>,
61        challenge_type: ChallengeType,
62        response_strategy: ResponseStrategy,
63        base_confidence: f32,
64        raw_patterns: &[&str],
65    ) -> Self {
66        let patterns = raw_patterns
67            .iter()
68            .map(|pattern| build_regex(pattern))
69            .collect();
70
71        Self {
72            id: id.into(),
73            name: name.into(),
74            challenge_type,
75            response_strategy,
76            base_confidence,
77            patterns,
78            adaptive: false,
79        }
80    }
81
82    fn into_adaptive(mut self) -> Self {
83        self.adaptive = true;
84        self
85    }
86}
87
/// Static list of known challenge signatures.
///
/// The list is built lazily on first access, and `ChallengeDetector::new`
/// clones it for each detector. Every expression is compiled by
/// `build_regex`, so matching is case-insensitive and `.` also matches
/// newlines. The numeric argument of each entry is its base confidence, i.e.
/// the score reached when all of its expressions match.
static KNOWN_PATTERNS: Lazy<Vec<ChallengePattern>> = Lazy::new(|| {
    vec![
        // "Just a moment..." page: title text, a characteristic variable
        // declaration, a timer that submits a form, and the challenge form
        // carrying a `__cf_chl_f_tk` token.
        ChallengePattern::new(
            "cf_iuam_v1",
            "Cloudflare IUAM v1",
            ChallengeType::JavaScriptV1,
            ResponseStrategy::JsExecution,
            0.95,
            &[
                r#"<title>\s*Just a moment\.\.\.\s*</title>"#,
                r"var s,t,o,p,b,r,e,a,k,i,n,g,f,u,l,l,y,h,a,r,d,c,o,r,e",
                r#"setTimeout\(function\(\)\s*\{\s*var.*?\.submit\(\)"#,
                r#"<form[^>]*id="challenge-form"[^>]*action="/[^"]*__cf_chl_f_tk="#,
            ],
        ),
        // Challenge-platform script (`orchestrate/jsch/v1`), the
        // `window._cf_chl_opt` assignment, and the challenge form carrying a
        // `__cf_chl_rt_tk` token.
        ChallengePattern::new(
            "cf_iuam_v2",
            "Cloudflare IUAM v2",
            ChallengeType::JavaScriptV2,
            ResponseStrategy::AdvancedJsExecution,
            0.90,
            &[
                r#"cpo\.src\s*=\s*['"]/cdn-cgi/challenge-platform/.*?orchestrate/jsch/v1"#,
                r"window\._cf_chl_opt\s*=",
                r#"<form[^>]*id="challenge-form"[^>]*action="/[^"]*__cf_chl_rt_tk="#,
            ],
        ),
        // Challenge-platform script (`orchestrate/captcha/v1` or
        // `orchestrate/managed/v1`), the `window._cf_chl_ctx` assignment, a
        // hexadecimal `data-ray` attribute, and the browser-verification div.
        ChallengePattern::new(
            "cf_managed_v3",
            "Cloudflare Managed Challenge v3",
            ChallengeType::ManagedV3,
            ResponseStrategy::BrowserSimulation,
            0.92,
            &[
                r#"cpo\.src\s*=\s*['"]/cdn-cgi/challenge-platform/.*?orchestrate/(?:captcha|managed)/v1"#,
                r"window\._cf_chl_ctx\s*=",
                r#"data-ray="[A-Fa-f0-9]+""#,
                r#"<div[^>]*class="cf-browser-verification"#,
            ],
        ),
        // Turnstile widget: container class, a 40-character alphanumeric
        // site key, the Turnstile API script, and the response field name.
        ChallengePattern::new(
            "cf_turnstile",
            "Cloudflare Turnstile",
            ChallengeType::Turnstile,
            ResponseStrategy::CaptchaSolving,
            0.98,
            &[
                r#"class="cf-turnstile""#,
                r#"data-sitekey="[0-9A-Za-z]{40}""#,
                r#"src="https://challenges\.cloudflare\.com/turnstile/v0/api\.js"#,
                r"cf-turnstile-response",
            ],
        ),
        // Error page showing code 1015 and rate-limit wording.
        ChallengePattern::new(
            "cf_rate_limit",
            "Cloudflare Rate Limit",
            ChallengeType::RateLimit,
            ResponseStrategy::DelayRetry,
            0.99,
            &[
                r#"<span[^>]*class="cf-error-code">1015<"#,
                r"You are being rate limited",
                r#"<title>\s*Rate Limited\s*</title>"#,
            ],
        ),
        // Error page showing code 1020 and access-denied wording.
        ChallengePattern::new(
            "cf_access_denied",
            "Cloudflare Access Denied",
            ChallengeType::AccessDenied,
            ResponseStrategy::ProxyRotation,
            0.99,
            &[
                r#"<span[^>]*class="cf-error-code">1020<"#,
                r"Access denied",
                r"The owner of this website has banned your access",
            ],
        ),
        // Error page showing code 1010 and temporary-ban wording.
        ChallengePattern::new(
            "cf_bot_management",
            "Cloudflare Bot Management",
            ChallengeType::BotManagement,
            ResponseStrategy::EnhancedEvasion,
            0.95,
            &[
                r#"<span[^>]*class="cf-error-code">1010<"#,
                r"Bot management",
                r"has banned you temporarily",
            ],
        ),
    ]
});
180
/// Detection output returned to the pipeline.
///
/// Describes the single best-scoring pattern for one response.
#[derive(Debug, Clone)]
pub struct ChallengeDetection {
    /// Identifier of the winning pattern; pass it to
    /// `ChallengeDetector::learn_from_outcome` to record the outcome.
    pub pattern_id: String,
    /// Human-readable name of the winning pattern.
    pub pattern_name: String,
    /// Challenge category of the winning pattern.
    pub challenge_type: ChallengeType,
    /// Strategy attached to the winning pattern.
    pub response_strategy: ResponseStrategy,
    /// Score of the winning pattern; `evaluate_pattern` only reports scores
    /// of at least `0.5` and caps them at `1.0`.
    pub confidence: f32,
    /// Whether the winning pattern was registered at runtime.
    pub is_adaptive: bool,
    /// Status code copied from the inspected response.
    pub status_code: u16,
    /// URL of the inspected response, as a string.
    pub url: String,
    /// Source text of every expression of the winning pattern that matched.
    pub matched_indicators: Vec<String>,
}
194
/// Outcome counters kept per pattern id.
#[derive(Debug, Clone)]
struct PatternStats {
    /// Number of recorded outcomes.
    attempts: u32,
    /// Number of recorded outcomes that were successful.
    successes: u32,
}

impl PatternStats {
    /// Records one outcome; both counters saturate instead of overflowing.
    fn record(&mut self, success: bool) {
        self.attempts = self.attempts.saturating_add(1);
        // `u32::from(bool)` is 1 for `true` and 0 for `false`.
        self.successes = self.successes.saturating_add(u32::from(success));
    }

    /// Fraction of recorded outcomes that succeeded, or `0.0` when nothing
    /// has been recorded yet.
    fn success_rate(&self) -> f32 {
        match self.attempts {
            0 => 0.0,
            attempts => self.successes as f32 / attempts as f32,
        }
    }
}
217
/// Internal history entry stored for each successful detection.
#[derive(Debug, Clone)]
struct DetectionRecord {
    /// Wall-clock time at which the detection was recorded.
    timestamp: SystemTime,
    /// Identifier of the pattern that won the detection.
    pattern_id: String,
    /// Confidence reported for the detection.
    confidence: f32,
    /// URL of the response that was inspected.
    url: String,
}

/// Public view of a recorded challenge detection.
#[derive(Debug, Clone)]
pub struct DetectionLogEntry {
    /// Wall-clock time at which the detection was recorded.
    pub timestamp: SystemTime,
    /// Identifier of the pattern that won the detection.
    pub pattern_id: String,
    /// Confidence reported for the detection.
    pub confidence: f32,
    /// URL of the response that was inspected.
    pub url: String,
}

impl From<&DetectionRecord> for DetectionLogEntry {
    /// Copies every field of the internal record into the public view.
    fn from(record: &DetectionRecord) -> Self {
        let DetectionRecord {
            timestamp,
            pattern_id,
            confidence,
            url,
        } = record;

        Self {
            timestamp: *timestamp,
            pattern_id: pattern_id.clone(),
            confidence: *confidence,
            url: url.clone(),
        }
    }
}
245
/// Pattern-based challenge detector with adaptive learning support.
///
/// Holds its own copy of the built-in signatures, any domain-specific
/// patterns registered at runtime, per-pattern outcome statistics, and a
/// bounded history of past detections.
#[derive(Debug)]
pub struct ChallengeDetector {
    /// Per-detector copy of `KNOWN_PATTERNS`.
    known_patterns: Vec<ChallengePattern>,
    adaptive_patterns: HashMap<String, Vec<ChallengePattern>>, // domain -> patterns
    /// Outcome counters keyed by pattern id.
    stats: HashMap<String, PatternStats>,
    /// Recorded detections, oldest first.
    history: VecDeque<DetectionRecord>,
    /// Length at which the oldest history entry is dropped before a new one
    /// is appended.
    max_history: usize,
}

impl Default for ChallengeDetector {
    /// Equivalent to `ChallengeDetector::new()`.
    fn default() -> Self {
        Self::new()
    }
}
261
262impl ChallengeDetector {
263    pub fn new() -> Self {
264        Self {
265            known_patterns: KNOWN_PATTERNS.clone(),
266            adaptive_patterns: HashMap::new(),
267            stats: HashMap::new(),
268            history: VecDeque::with_capacity(128),
269            max_history: 1000,
270        }
271    }
272
273    /// Detect a challenge in the provided HTTP response context.
274    pub fn detect(&mut self, response: &ChallengeResponse<'_>) -> Option<ChallengeDetection> {
275        if !self.is_cloudflare_challenge(response) {
276            return None;
277        }
278
279        let mut best: Option<(ChallengeDetection, f32)> = None;
280
281        for pattern in &self.known_patterns {
282            if let Some((confidence, matched)) = self.evaluate_pattern(pattern, response)
283                && best
284                    .as_ref()
285                    .is_none_or(|(_, current)| confidence > *current)
286            {
287                best = Some((
288                    ChallengeDetection {
289                        pattern_id: pattern.id.clone(),
290                        pattern_name: pattern.name.clone(),
291                        challenge_type: pattern.challenge_type,
292                        response_strategy: pattern.response_strategy,
293                        confidence,
294                        is_adaptive: pattern.adaptive,
295                        status_code: response.status,
296                        url: response.url.as_str().to_string(),
297                        matched_indicators: matched,
298                    },
299                    confidence,
300                ));
301            }
302        }
303
304        if let Some(domain) = response_domain(response)
305            && let Some(patterns) = self.adaptive_patterns.get(&domain)
306        {
307            for pattern in patterns {
308                if let Some((confidence, matched)) = self.evaluate_pattern(pattern, response)
309                    && best
310                        .as_ref()
311                        .is_none_or(|(_, current)| confidence > *current)
312                {
313                    best = Some((
314                        ChallengeDetection {
315                            pattern_id: pattern.id.clone(),
316                            pattern_name: pattern.name.clone(),
317                            challenge_type: pattern.challenge_type,
318                            response_strategy: pattern.response_strategy,
319                            confidence,
320                            is_adaptive: true,
321                            status_code: response.status,
322                            url: response.url.as_str().to_string(),
323                            matched_indicators: matched,
324                        },
325                        confidence,
326                    ));
327                }
328            }
329        }
330
331        let result = best.map(|(detection, _)| detection);
332
333        if let Some(ref detection) = result {
334            self.record_detection(detection.clone());
335        }
336
337        result
338    }
339
340    fn evaluate_pattern(
341        &self,
342        pattern: &ChallengePattern,
343        response: &ChallengeResponse<'_>,
344    ) -> Option<(f32, Vec<String>)> {
345        let matches: Vec<_> = pattern
346            .patterns
347            .iter()
348            .filter(|regex| regex.is_match(response.body))
349            .map(|regex| regex.as_str().to_string())
350            .collect();
351
352        if matches.is_empty() {
353            return None;
354        }
355
356        let total = pattern.patterns.len() as f32;
357        let mut confidence = (matches.len() as f32 / total) * pattern.base_confidence;
358
359        if let Some(stats) = self.stats.get(&pattern.id) {
360            confidence += stats.success_rate() * 0.1;
361        }
362
363        confidence = confidence.min(1.0);
364
365        if confidence < 0.5 {
366            return None;
367        }
368
369        Some((confidence, matches))
370    }
371
372    fn is_cloudflare_challenge(&self, response: &ChallengeResponse<'_>) -> bool {
373        is_cloudflare_response(response) && matches!(response.status, 403 | 429 | 503)
374    }
375
376    fn record_detection(&mut self, detection: ChallengeDetection) {
377        if self.history.len() == self.max_history {
378            self.history.pop_front();
379        }
380        self.history.push_back(DetectionRecord {
381            timestamp: SystemTime::now(),
382            pattern_id: detection.pattern_id,
383            confidence: detection.confidence,
384            url: detection.url,
385        });
386    }
387
388    /// Iterate over historical detections (oldest -> newest).
389    pub fn detection_history(&self) -> impl Iterator<Item = DetectionLogEntry> + '_ {
390        self.history.iter().map(DetectionLogEntry::from)
391    }
392
393    /// Update success metrics for a pattern to influence future confidence scores.
394    pub fn learn_from_outcome(&mut self, pattern_id: &str, success: bool) {
395        let entry = self
396            .stats
397            .entry(pattern_id.to_string())
398            .or_insert(PatternStats {
399                attempts: 0,
400                successes: 0,
401            });
402        entry.record(success);
403    }
404
405    /// Register an adaptive, domain-specific pattern discovered at runtime.
406    pub fn add_adaptive_pattern(
407        &mut self,
408        domain: &str,
409        pattern_name: &str,
410        raw_patterns: Vec<&str>,
411        challenge_type: ChallengeType,
412        response_strategy: ResponseStrategy,
413    ) {
414        let pattern = ChallengePattern::new(
415            format!("adaptive_{}_{}", domain, raw_patterns.len()),
416            pattern_name,
417            challenge_type,
418            response_strategy,
419            0.8,
420            &raw_patterns,
421        )
422        .into_adaptive();
423
424        self.adaptive_patterns
425            .entry(domain.to_lowercase())
426            .or_default()
427            .push(pattern);
428    }
429}
430
431fn build_regex(pattern: &str) -> Regex {
432    regex::RegexBuilder::new(pattern)
433        .case_insensitive(true)
434        .multi_line(true)
435        .dot_matches_new_line(true)
436        .build()
437        .unwrap_or_else(|err| panic!("invalid challenge detection regex `{}`: {}", pattern, err))
438}
439
#[cfg(test)]
mod tests {
    use super::*;
    use http::header::SERVER;
    use http::{HeaderMap, Method};
    use url::Url;

    /// Owns the data that a borrowed `ChallengeResponse` points into.
    struct ResponseFixture {
        url: Url,
        headers: HeaderMap,
        method: Method,
        body: String,
        status: u16,
    }

    impl ResponseFixture {
        /// Builds a GET response for `https://example.com/` with a
        /// `Server: cloudflare` header and the given body and status.
        fn new(body: &str, status: u16) -> Self {
            let mut headers = HeaderMap::new();
            headers.insert(SERVER, "cloudflare".parse().unwrap());
            Self {
                url: Url::parse("https://example.com/").unwrap(),
                headers,
                method: Method::GET,
                body: body.to_string(),
                status,
            }
        }

        /// Borrows the fixture as the response view consumed by the detector.
        fn response(&self) -> ChallengeResponse<'_> {
            ChallengeResponse {
                url: &self.url,
                status: self.status,
                headers: &self.headers,
                body: &self.body,
                request_method: &self.method,
            }
        }
    }

    /// A 403 page embedding the Turnstile widget, site key and API script is
    /// classified as Turnstile with the captcha-solving strategy.
    #[test]
    fn detects_turnstile() {
        let html = r#"
			<html><head><title>Test</title></head>
			<body>
				<div class="cf-turnstile" data-sitekey="0123456789ABCDEFGHIJ0123456789ABCDEFGHIJ"></div>
				<script src="https://challenges.cloudflare.com/turnstile/v0/api.js"></script>
			</body>
			</html>
		"#;

        let mut detector = ChallengeDetector::new();
        let fixture = ResponseFixture::new(html, 403);
        let response = fixture.response();
        let detection = detector.detect(&response).expect("should detect");

        assert_eq!(detection.challenge_type, ChallengeType::Turnstile);
        assert_eq!(
            detection.response_strategy,
            ResponseStrategy::CaptchaSolving
        );
    }
}
499        );
500    }
501}