cloudscraper_rs/challenges/solvers/
turnstile.rs

//! Solver for Cloudflare Turnstile captcha challenges.
//!
//! Detects the Turnstile widget, delegates solving to a configurable captcha
//! provider, and prepares the submission payload consumed by the shared
//! executor.
//!
//! This solver supports multiple methods for extracting the site key:
//! - Primary: `data-sitekey` attribute in HTML
//! - Fallback 1: `cFPWv` property in `window._cf_chl_opt`
//! - Fallback 2: `sitekey` property in script tag JSON
//!
//! Includes randomized delays (1-5s by default) to mimic browser behavior.
13
14use std::collections::HashMap;
15use std::sync::Arc;
16use std::time::Duration;
17
18use html_escape::decode_html_entities;
19use once_cell::sync::Lazy;
20use rand::Rng;
21use regex::{Regex, RegexBuilder};
22use thiserror::Error;
23
24use crate::challenges::core::{
25    ChallengeExecutionError, ChallengeHttpClient, ChallengeHttpResponse, ChallengeResponse,
26    ChallengeSubmission, OriginalRequest, execute_challenge_submission, is_cloudflare_response,
27    origin_from_url,
28};
29use crate::external_deps::captcha::{CaptchaError, CaptchaProvider, CaptchaTask};
30
31use super::ChallengeSolver;
32
33const DEFAULT_DELAY_MIN_SECS: f32 = 1.0;
34const DEFAULT_DELAY_MAX_SECS: f32 = 5.0;
35
36/// Solver capable of handling Cloudflare Turnstile challenges.
37pub struct TurnstileSolver {
38    delay_min: Duration,
39    delay_max: Duration,
40    captcha_provider: Option<Arc<dyn CaptchaProvider>>,
41}
42
43impl TurnstileSolver {
44    /// Create a solver with the default random delay and no captcha provider.
45    pub fn new() -> Self {
46        Self {
47            delay_min: Duration::from_secs_f32(DEFAULT_DELAY_MIN_SECS),
48            delay_max: Duration::from_secs_f32(DEFAULT_DELAY_MAX_SECS),
49            captcha_provider: None,
50        }
51    }
52
53    /// Configure a custom delay range used before posting the solution.
54    pub fn with_delay_range(mut self, min: Duration, max: Duration) -> Self {
55        self.delay_min = min;
56        self.delay_max = if max < min { min } else { max };
57        self
58    }
59
60    /// Attach a captcha provider used to solve Turnstile tokens.
61    pub fn with_captcha_provider(mut self, provider: Arc<dyn CaptchaProvider>) -> Self {
62        self.captcha_provider = Some(provider);
63        self
64    }
65
66    /// Replace or set the captcha provider after construction.
67    pub fn set_captcha_provider(&mut self, provider: Arc<dyn CaptchaProvider>) {
68        self.captcha_provider = Some(provider);
69    }
70
71    /// Remove the configured captcha provider.
72    pub fn clear_captcha_provider(&mut self) {
73        self.captcha_provider = None;
74    }
75
76    /// Returns `true` when the response resembles a Turnstile challenge page.
77    pub fn is_turnstile_challenge(response: &ChallengeResponse<'_>) -> bool {
78        is_cloudflare_response(response)
79            && matches!(response.status, 403 | 429 | 503)
80            && (TURNSTILE_WIDGET_RE.is_match(response.body)
81                || TURNSTILE_SCRIPT_RE.is_match(response.body)
82                || TURNSTILE_SITEKEY_RE.is_match(response.body))
83    }
84
85    /// Solve the Turnstile page and return the planned challenge submission.
86    pub async fn solve(
87        &self,
88        response: &ChallengeResponse<'_>,
89    ) -> Result<ChallengeSubmission, TurnstileError> {
90        if !Self::is_turnstile_challenge(response) {
91            return Err(TurnstileError::NotTurnstileChallenge);
92        }
93
94        let provider = self
95            .captcha_provider
96            .as_ref()
97            .ok_or(TurnstileError::CaptchaProviderMissing)?;
98
99        let info = Self::extract_turnstile_info(response)?;
100        let task =
101            CaptchaTask::new(info.site_key.clone(), response.url.clone()).with_action("turnstile");
102        let solution = provider
103            .solve(&task)
104            .await
105            .map_err(TurnstileError::Captcha)?;
106
107        let payload = Self::build_payload(response.body, solution.token);
108        self.build_submission(response, &info.form_action, payload)
109    }
110
111    /// Solve and submit the challenge using the supplied HTTP client.
112    pub async fn solve_and_submit(
113        &self,
114        client: Arc<dyn ChallengeHttpClient>,
115        response: &ChallengeResponse<'_>,
116        original_request: OriginalRequest,
117    ) -> Result<ChallengeHttpResponse, TurnstileError> {
118        let submission = self.solve(response).await?;
119        let result = execute_challenge_submission(client, submission, original_request)
120            .await
121            .map_err(TurnstileError::Submission)?;
122
123        // Check if Cloudflare rejected the Turnstile solution with 403
124        if result.status == 403 {
125            return Err(TurnstileError::ChallengeSolveFailed);
126        }
127
128        Ok(result)
129    }
130
131    fn build_submission(
132        &self,
133        response: &ChallengeResponse<'_>,
134        form_action: &str,
135        mut payload: HashMap<String, String>,
136    ) -> Result<ChallengeSubmission, TurnstileError> {
137        let form_action = decode_html_entities(form_action).into_owned();
138        let target_url = response
139            .url
140            .join(&form_action)
141            .map_err(|err| TurnstileError::InvalidFormAction(form_action.clone(), err))?;
142
143        let mut headers = HashMap::new();
144        headers.insert(
145            "Content-Type".into(),
146            "application/x-www-form-urlencoded".into(),
147        );
148        headers.insert("Referer".into(), response.url.as_str().to_string());
149        headers.insert("Origin".into(), origin_from_url(response.url));
150
151        let wait = self.random_delay();
152        payload.entry("cf-turnstile-response".into()).or_default();
153
154        Ok(ChallengeSubmission::new(
155            http::Method::POST,
156            target_url,
157            payload,
158            headers,
159            wait,
160        ))
161    }
162
163    fn random_delay(&self) -> Duration {
164        if self.delay_max <= self.delay_min {
165            return self.delay_min;
166        }
167        let mut rng = rand::thread_rng();
168        let min = self.delay_min.as_secs_f32();
169        let max = self.delay_max.as_secs_f32();
170        Duration::from_secs_f32(rng.gen_range(min..max))
171    }
172
173    fn extract_turnstile_info(
174        response: &ChallengeResponse<'_>,
175    ) -> Result<TurnstileInfo, TurnstileError> {
176        let body = response.body;
177
178        // Try primary method: data-sitekey attribute
179        let site_key = TURNSTILE_SITEKEY_RE
180            .captures(body)
181            .and_then(|caps| caps.get(1))
182            .map(|m| m.as_str().to_string())
183            // Fallback 1: cFPWv in window._cf_chl_opt
184            .or_else(|| {
185                TURNSTILE_SITEKEY_OPT_RE
186                    .captures(body)
187                    .and_then(|caps| caps.get(1))
188                    .map(|m| m.as_str().to_string())
189            })
190            // Fallback 2: "sitekey": "..." in script tag JSON
191            .or_else(|| {
192                TURNSTILE_SITEKEY_JSON_RE
193                    .captures(body)
194                    .and_then(|caps| caps.get(1))
195                    .map(|m| m.as_str().to_string())
196            })
197            .ok_or(TurnstileError::MissingSiteKey)?;
198
199        let form_action = FORM_ACTION_RE
200            .captures(body)
201            .and_then(|caps| caps.get(1))
202            .map(|m| m.as_str().to_string())
203            .unwrap_or_else(|| response.url.as_str().to_string());
204
205        Ok(TurnstileInfo {
206            site_key,
207            form_action,
208        })
209    }
210
211    fn build_payload(body: &str, token: String) -> HashMap<String, String> {
212        let mut payload = HashMap::new();
213        payload.insert("cf-turnstile-response".into(), token);
214
215        for caps in INPUT_FIELD_RE.captures_iter(body) {
216            if let (Some(name), Some(value)) = (caps.get(1), caps.get(2)) {
217                let key = name.as_str();
218                if key != "cf-turnstile-response" && !payload.contains_key(key) {
219                    payload.insert(key.to_string(), value.as_str().to_string());
220                }
221            }
222        }
223
224        payload
225    }
226}
227
228impl Default for TurnstileSolver {
229    fn default() -> Self {
230        Self::new()
231    }
232}
233
234impl ChallengeSolver for TurnstileSolver {
235    fn name(&self) -> &'static str {
236        "turnstile"
237    }
238}
239
/// Data scraped from a Turnstile challenge page.
struct TurnstileInfo {
    /// Site key handed to the captcha provider.
    site_key: String,
    /// Form action as found in the page (still HTML-encoded), or the response
    /// URL when the page has no form action.
    form_action: String,
}
244
245#[derive(Debug, Error)]
246pub enum TurnstileError {
247    #[error("response is not a Cloudflare Turnstile challenge")]
248    NotTurnstileChallenge,
249    #[error("captcha provider missing for Turnstile challenge")]
250    CaptchaProviderMissing,
251    #[error("missing Turnstile site key")]
252    MissingSiteKey,
253    #[error("invalid form action '{0}': {1}")]
254    InvalidFormAction(String, url::ParseError),
255    #[error("captcha provider error: {0}")]
256    Captcha(#[source] CaptchaError),
257    #[error("failed to solve Cloudflare Turnstile challenge - received 403 status")]
258    ChallengeSolveFailed,
259    #[error("challenge submission failed: {0}")]
260    Submission(#[source] ChallengeExecutionError),
261}
262
263static TURNSTILE_WIDGET_RE: Lazy<Regex> = Lazy::new(|| {
264    RegexBuilder::new(r#"class=['"][^'"]*cf-turnstile[^'"]*['"]"#)
265        .case_insensitive(true)
266        .dot_matches_new_line(true)
267        .build()
268        .expect("invalid turnstile widget regex")
269});
270
271static TURNSTILE_SCRIPT_RE: Lazy<Regex> = Lazy::new(|| {
272    RegexBuilder::new(r#"src=['"]https://challenges\.cloudflare\.com/turnstile/v0/api\.js"#)
273        .case_insensitive(true)
274        .dot_matches_new_line(true)
275        .build()
276        .expect("invalid turnstile script regex")
277});
278
279static TURNSTILE_SITEKEY_RE: Lazy<Regex> = Lazy::new(|| {
280    RegexBuilder::new(r#"data-sitekey=['\"]([0-9A-Za-z_-]{20,50})['\"]"#)
281        .case_insensitive(true)
282        .dot_matches_new_line(true)
283        .build()
284        .expect("invalid turnstile site key regex")
285});
286
287// Alternative patterns for site key extraction (fallbacks)
288static TURNSTILE_SITEKEY_OPT_RE: Lazy<Regex> = Lazy::new(|| {
289    RegexBuilder::new(r#"cFPWv\s?:\s?['\"]([^'\"]+)['\"]"#)
290        .case_insensitive(true)
291        .dot_matches_new_line(true)
292        .build()
293        .expect("invalid turnstile opt sitekey regex")
294});
295
296static TURNSTILE_SITEKEY_JSON_RE: Lazy<Regex> = Lazy::new(|| {
297    RegexBuilder::new(r#"['\"]sitekey['\"]\s*:\s*['\"]([^'\"]+)['\"]"#)
298        .case_insensitive(true)
299        .dot_matches_new_line(true)
300        .build()
301        .expect("invalid turnstile json sitekey regex")
302});
303
304static FORM_ACTION_RE: Lazy<Regex> = Lazy::new(|| {
305    RegexBuilder::new(r#"<form[^>]*action=['"]([^'"]+)['"]"#)
306        .case_insensitive(true)
307        .dot_matches_new_line(true)
308        .build()
309        .expect("invalid turnstile form action regex")
310});
311
312static INPUT_FIELD_RE: Lazy<Regex> = Lazy::new(|| {
313    RegexBuilder::new(r#"<input[^>]*name=['"]([^'"]+)['"][^>]*value=['"]([^'"]*)['"]"#)
314        .case_insensitive(true)
315        .dot_matches_new_line(true)
316        .build()
317        .expect("invalid input field regex")
318});
319
#[cfg(test)]
mod tests {
    use super::*;
    use async_trait::async_trait;
    use http::{HeaderMap, Method, header::SERVER};
    use url::Url;

    use crate::external_deps::captcha::{CaptchaResult, CaptchaSolution};

    /// Owns the data that a borrowed `ChallengeResponse` points into.
    struct ResponseFixture {
        url: Url,
        headers: HeaderMap,
        method: Method,
        body: String,
        status: u16,
    }

    impl ResponseFixture {
        /// Fixture for a GET of `https://example.com/turnstile` served with a
        /// `Server: cloudflare` header.
        fn new(body: &str, status: u16) -> Self {
            let mut headers = HeaderMap::new();
            headers.insert(SERVER, "cloudflare".parse().unwrap());
            Self {
                url: Url::parse("https://example.com/turnstile").unwrap(),
                headers,
                method: Method::GET,
                body: body.to_string(),
                status,
            }
        }

        /// Borrowed view handed to the solver.
        fn response(&self) -> ChallengeResponse<'_> {
            ChallengeResponse {
                url: &self.url,
                status: self.status,
                headers: &self.headers,
                body: &self.body,
                request_method: &self.method,
            }
        }
    }

    /// Provider that always answers with the token `turnstile-token`.
    struct StubCaptchaProvider;

    #[async_trait]
    impl CaptchaProvider for StubCaptchaProvider {
        fn name(&self) -> &'static str {
            "stub"
        }

        async fn solve(&self, _task: &CaptchaTask) -> CaptchaResult {
            Ok(CaptchaSolution::new("turnstile-token"))
        }
    }

    /// Challenge page with a widget, the API script and a form; the form only
    /// gets an `action` attribute when `with_form_action` is set.
    fn sample_html(with_form_action: bool) -> String {
        let form_attr = match with_form_action {
            true => r#"action="/submit/turnstile""#,
            false => "",
        };

        format!(
            r#"
            <html>
              <body>
                <form id="challenge-form" {form_attr} method="POST">
                  <input type="hidden" name="foo" value="bar" />
                  <input type="hidden" name="cf-turnstile-response" value="existing" />
                </form>
                <div class="cf-turnstile" data-sitekey="ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890abcd"></div>
                <script src="https://challenges.cloudflare.com/turnstile/v0/api.js"></script>
              </body>
            </html>
        "#
        )
    }

    /// Default solver wired to the stub provider.
    fn stub_solver() -> TurnstileSolver {
        TurnstileSolver::new().with_captcha_provider(Arc::new(StubCaptchaProvider))
    }

    /// Run site-key extraction on `html` served as a 403 and return the key.
    fn extracted_site_key(html: &str) -> String {
        let fixture = ResponseFixture::new(html, 403);
        TurnstileSolver::extract_turnstile_info(&fixture.response())
            .expect("site key should be extracted")
            .site_key
    }

    #[tokio::test]
    async fn solve_turnstile_builds_submission() {
        let html = sample_html(true);
        let fixture = ResponseFixture::new(&html, 403);
        let solver = stub_solver();
        assert!(TurnstileSolver::is_turnstile_challenge(&fixture.response()));

        let submission = solver
            .solve(&fixture.response())
            .await
            .expect("should solve");

        assert_eq!(submission.method, Method::POST);
        assert_eq!(
            submission.url.as_str(),
            "https://example.com/submit/turnstile"
        );
        // The solved token replaces the value already present in the form.
        assert_eq!(
            submission.form_fields.get("cf-turnstile-response"),
            Some(&"turnstile-token".to_string())
        );
        assert_eq!(submission.form_fields.get("foo"), Some(&"bar".to_string()));
        assert!(submission.wait >= Duration::from_secs(1));
        assert!(submission.wait <= Duration::from_secs(5));
    }

    #[tokio::test]
    async fn solve_uses_current_url_when_form_absent() {
        let html = sample_html(false);
        let fixture = ResponseFixture::new(&html, 403);

        let submission = stub_solver()
            .solve(&fixture.response())
            .await
            .expect("should solve");

        assert_eq!(submission.url.as_str(), "https://example.com/turnstile");
    }

    #[tokio::test]
    async fn solve_requires_provider() {
        let html = sample_html(true);
        let fixture = ResponseFixture::new(&html, 403);

        let err = TurnstileSolver::new()
            .solve(&fixture.response())
            .await
            .expect_err("should fail");

        assert!(matches!(err, TurnstileError::CaptchaProviderMissing));
    }

    #[test]
    fn extracts_sitekey_from_opt_fallback() {
        let html = r#"
            <html>
              <body>
                <script>
                  window._cf_chl_opt = {
                    cFPWv: "alternative_sitekey_from_opt_12345678"
                  };
                </script>
                <div class="cf-turnstile"></div>
              </body>
            </html>
        "#;

        assert_eq!(
            extracted_site_key(html),
            "alternative_sitekey_from_opt_12345678"
        );
    }

    #[test]
    fn extracts_sitekey_from_json_fallback() {
        let html = r#"
            <html>
              <body>
                <script>
                  var config = {
                    "sitekey": "json_sitekey_fallback_987654321"
                  };
                </script>
                <div class="cf-turnstile"></div>
              </body>
            </html>
        "#;

        assert_eq!(extracted_site_key(html), "json_sitekey_fallback_987654321");
    }

    #[test]
    fn sitekey_primary_takes_precedence() {
        let html = r#"
            <html>
              <body>
                <div class="cf-turnstile" data-sitekey="primary_sitekey_12345"></div>
                <script>window._cf_chl_opt = { cFPWv: "fallback_key" };</script>
              </body>
            </html>
        "#;

        assert_eq!(extracted_site_key(html), "primary_sitekey_12345");
    }
}