cloudscraper_rs/challenges/solvers/
turnstile.rs1use std::collections::HashMap;
15use std::sync::Arc;
16use std::time::Duration;
17
18use html_escape::decode_html_entities;
19use once_cell::sync::Lazy;
20use rand::Rng;
21use regex::{Regex, RegexBuilder};
22use thiserror::Error;
23
24use crate::challenges::core::{
25 ChallengeExecutionError, ChallengeHttpClient, ChallengeHttpResponse, ChallengeResponse,
26 ChallengeSubmission, OriginalRequest, execute_challenge_submission, is_cloudflare_response,
27 origin_from_url,
28};
29use crate::external_deps::captcha::{CaptchaError, CaptchaProvider, CaptchaTask};
30
31use super::ChallengeSolver;
32
/// Lower bound, in seconds, of the default randomized wait attached to a submission.
const DEFAULT_DELAY_MIN_SECS: f32 = 1.0;
/// Upper bound, in seconds, of the default randomized wait attached to a submission.
const DEFAULT_DELAY_MAX_SECS: f32 = 5.0;
35
36pub struct TurnstileSolver {
38 delay_min: Duration,
39 delay_max: Duration,
40 captcha_provider: Option<Arc<dyn CaptchaProvider>>,
41}
42
43impl TurnstileSolver {
44 pub fn new() -> Self {
46 Self {
47 delay_min: Duration::from_secs_f32(DEFAULT_DELAY_MIN_SECS),
48 delay_max: Duration::from_secs_f32(DEFAULT_DELAY_MAX_SECS),
49 captcha_provider: None,
50 }
51 }
52
53 pub fn with_delay_range(mut self, min: Duration, max: Duration) -> Self {
55 self.delay_min = min;
56 self.delay_max = if max < min { min } else { max };
57 self
58 }
59
60 pub fn with_captcha_provider(mut self, provider: Arc<dyn CaptchaProvider>) -> Self {
62 self.captcha_provider = Some(provider);
63 self
64 }
65
66 pub fn set_captcha_provider(&mut self, provider: Arc<dyn CaptchaProvider>) {
68 self.captcha_provider = Some(provider);
69 }
70
71 pub fn clear_captcha_provider(&mut self) {
73 self.captcha_provider = None;
74 }
75
76 pub fn is_turnstile_challenge(response: &ChallengeResponse<'_>) -> bool {
78 is_cloudflare_response(response)
79 && matches!(response.status, 403 | 429 | 503)
80 && (TURNSTILE_WIDGET_RE.is_match(response.body)
81 || TURNSTILE_SCRIPT_RE.is_match(response.body)
82 || TURNSTILE_SITEKEY_RE.is_match(response.body))
83 }
84
85 pub async fn solve(
87 &self,
88 response: &ChallengeResponse<'_>,
89 ) -> Result<ChallengeSubmission, TurnstileError> {
90 if !Self::is_turnstile_challenge(response) {
91 return Err(TurnstileError::NotTurnstileChallenge);
92 }
93
94 let provider = self
95 .captcha_provider
96 .as_ref()
97 .ok_or(TurnstileError::CaptchaProviderMissing)?;
98
99 let info = Self::extract_turnstile_info(response)?;
100 let task =
101 CaptchaTask::new(info.site_key.clone(), response.url.clone()).with_action("turnstile");
102 let solution = provider
103 .solve(&task)
104 .await
105 .map_err(TurnstileError::Captcha)?;
106
107 let payload = Self::build_payload(response.body, solution.token);
108 self.build_submission(response, &info.form_action, payload)
109 }
110
111 pub async fn solve_and_submit(
113 &self,
114 client: Arc<dyn ChallengeHttpClient>,
115 response: &ChallengeResponse<'_>,
116 original_request: OriginalRequest,
117 ) -> Result<ChallengeHttpResponse, TurnstileError> {
118 let submission = self.solve(response).await?;
119 let result = execute_challenge_submission(client, submission, original_request)
120 .await
121 .map_err(TurnstileError::Submission)?;
122
123 if result.status == 403 {
125 return Err(TurnstileError::ChallengeSolveFailed);
126 }
127
128 Ok(result)
129 }
130
131 fn build_submission(
132 &self,
133 response: &ChallengeResponse<'_>,
134 form_action: &str,
135 mut payload: HashMap<String, String>,
136 ) -> Result<ChallengeSubmission, TurnstileError> {
137 let form_action = decode_html_entities(form_action).into_owned();
138 let target_url = response
139 .url
140 .join(&form_action)
141 .map_err(|err| TurnstileError::InvalidFormAction(form_action.clone(), err))?;
142
143 let mut headers = HashMap::new();
144 headers.insert(
145 "Content-Type".into(),
146 "application/x-www-form-urlencoded".into(),
147 );
148 headers.insert("Referer".into(), response.url.as_str().to_string());
149 headers.insert("Origin".into(), origin_from_url(response.url));
150
151 let wait = self.random_delay();
152 payload.entry("cf-turnstile-response".into()).or_default();
153
154 Ok(ChallengeSubmission::new(
155 http::Method::POST,
156 target_url,
157 payload,
158 headers,
159 wait,
160 ))
161 }
162
163 fn random_delay(&self) -> Duration {
164 if self.delay_max <= self.delay_min {
165 return self.delay_min;
166 }
167 let mut rng = rand::thread_rng();
168 let min = self.delay_min.as_secs_f32();
169 let max = self.delay_max.as_secs_f32();
170 Duration::from_secs_f32(rng.gen_range(min..max))
171 }
172
173 fn extract_turnstile_info(
174 response: &ChallengeResponse<'_>,
175 ) -> Result<TurnstileInfo, TurnstileError> {
176 let body = response.body;
177
178 let site_key = TURNSTILE_SITEKEY_RE
180 .captures(body)
181 .and_then(|caps| caps.get(1))
182 .map(|m| m.as_str().to_string())
183 .or_else(|| {
185 TURNSTILE_SITEKEY_OPT_RE
186 .captures(body)
187 .and_then(|caps| caps.get(1))
188 .map(|m| m.as_str().to_string())
189 })
190 .or_else(|| {
192 TURNSTILE_SITEKEY_JSON_RE
193 .captures(body)
194 .and_then(|caps| caps.get(1))
195 .map(|m| m.as_str().to_string())
196 })
197 .ok_or(TurnstileError::MissingSiteKey)?;
198
199 let form_action = FORM_ACTION_RE
200 .captures(body)
201 .and_then(|caps| caps.get(1))
202 .map(|m| m.as_str().to_string())
203 .unwrap_or_else(|| response.url.as_str().to_string());
204
205 Ok(TurnstileInfo {
206 site_key,
207 form_action,
208 })
209 }
210
211 fn build_payload(body: &str, token: String) -> HashMap<String, String> {
212 let mut payload = HashMap::new();
213 payload.insert("cf-turnstile-response".into(), token);
214
215 for caps in INPUT_FIELD_RE.captures_iter(body) {
216 if let (Some(name), Some(value)) = (caps.get(1), caps.get(2)) {
217 let key = name.as_str();
218 if key != "cf-turnstile-response" && !payload.contains_key(key) {
219 payload.insert(key.to_string(), value.as_str().to_string());
220 }
221 }
222 }
223
224 payload
225 }
226}
227
228impl Default for TurnstileSolver {
229 fn default() -> Self {
230 Self::new()
231 }
232}
233
234impl ChallengeSolver for TurnstileSolver {
235 fn name(&self) -> &'static str {
236 "turnstile"
237 }
238}
239
/// Values pulled out of a Turnstile challenge page.
struct TurnstileInfo {
    /// Site key handed to the captcha provider.
    site_key: String,
    /// Raw form action (or the response URL when the page has none) to submit to.
    form_action: String,
}
244
245#[derive(Debug, Error)]
246pub enum TurnstileError {
247 #[error("response is not a Cloudflare Turnstile challenge")]
248 NotTurnstileChallenge,
249 #[error("captcha provider missing for Turnstile challenge")]
250 CaptchaProviderMissing,
251 #[error("missing Turnstile site key")]
252 MissingSiteKey,
253 #[error("invalid form action '{0}': {1}")]
254 InvalidFormAction(String, url::ParseError),
255 #[error("captcha provider error: {0}")]
256 Captcha(#[source] CaptchaError),
257 #[error("failed to solve Cloudflare Turnstile challenge - received 403 status")]
258 ChallengeSolveFailed,
259 #[error("challenge submission failed: {0}")]
260 Submission(#[source] ChallengeExecutionError),
261}
262
263static TURNSTILE_WIDGET_RE: Lazy<Regex> = Lazy::new(|| {
264 RegexBuilder::new(r#"class=['"][^'"]*cf-turnstile[^'"]*['"]"#)
265 .case_insensitive(true)
266 .dot_matches_new_line(true)
267 .build()
268 .expect("invalid turnstile widget regex")
269});
270
271static TURNSTILE_SCRIPT_RE: Lazy<Regex> = Lazy::new(|| {
272 RegexBuilder::new(r#"src=['"]https://challenges\.cloudflare\.com/turnstile/v0/api\.js"#)
273 .case_insensitive(true)
274 .dot_matches_new_line(true)
275 .build()
276 .expect("invalid turnstile script regex")
277});
278
279static TURNSTILE_SITEKEY_RE: Lazy<Regex> = Lazy::new(|| {
280 RegexBuilder::new(r#"data-sitekey=['\"]([0-9A-Za-z_-]{20,50})['\"]"#)
281 .case_insensitive(true)
282 .dot_matches_new_line(true)
283 .build()
284 .expect("invalid turnstile site key regex")
285});
286
287static TURNSTILE_SITEKEY_OPT_RE: Lazy<Regex> = Lazy::new(|| {
289 RegexBuilder::new(r#"cFPWv\s?:\s?['\"]([^'\"]+)['\"]"#)
290 .case_insensitive(true)
291 .dot_matches_new_line(true)
292 .build()
293 .expect("invalid turnstile opt sitekey regex")
294});
295
296static TURNSTILE_SITEKEY_JSON_RE: Lazy<Regex> = Lazy::new(|| {
297 RegexBuilder::new(r#"['\"]sitekey['\"]\s*:\s*['\"]([^'\"]+)['\"]"#)
298 .case_insensitive(true)
299 .dot_matches_new_line(true)
300 .build()
301 .expect("invalid turnstile json sitekey regex")
302});
303
304static FORM_ACTION_RE: Lazy<Regex> = Lazy::new(|| {
305 RegexBuilder::new(r#"<form[^>]*action=['"]([^'"]+)['"]"#)
306 .case_insensitive(true)
307 .dot_matches_new_line(true)
308 .build()
309 .expect("invalid turnstile form action regex")
310});
311
312static INPUT_FIELD_RE: Lazy<Regex> = Lazy::new(|| {
313 RegexBuilder::new(r#"<input[^>]*name=['"]([^'"]+)['"][^>]*value=['"]([^'"]*)['"]"#)
314 .case_insensitive(true)
315 .dot_matches_new_line(true)
316 .build()
317 .expect("invalid input field regex")
318});
319
#[cfg(test)]
mod tests {
    use super::*;
    use async_trait::async_trait;
    use http::{HeaderMap, Method, header::SERVER};
    use url::Url;

    use crate::external_deps::captcha::{CaptchaResult, CaptchaSolution};

    /// Owns the data that a borrowed [`ChallengeResponse`] points into.
    struct ResponseFixture {
        url: Url,
        headers: HeaderMap,
        method: Method,
        body: String,
        status: u16,
    }

    impl ResponseFixture {
        /// Fixture for a GET to `https://example.com/turnstile` answered by a
        /// `cloudflare` server with the given body and status.
        fn new(body: &str, status: u16) -> Self {
            let mut headers = HeaderMap::new();
            headers.insert(SERVER, "cloudflare".parse().unwrap());

            let url = Url::parse("https://example.com/turnstile").unwrap();
            Self {
                url,
                headers,
                method: Method::GET,
                body: body.to_string(),
                status,
            }
        }

        /// Borrowed view of the fixture in the shape the solver consumes.
        fn response(&self) -> ChallengeResponse<'_> {
            ChallengeResponse {
                url: &self.url,
                status: self.status,
                headers: &self.headers,
                body: &self.body,
                request_method: &self.method,
            }
        }
    }

    /// Provider that always answers with the same token.
    struct StubCaptchaProvider;

    #[async_trait]
    impl CaptchaProvider for StubCaptchaProvider {
        fn name(&self) -> &'static str {
            "stub"
        }

        async fn solve(&self, _task: &CaptchaTask) -> CaptchaResult {
            Ok(CaptchaSolution::new("turnstile-token"))
        }
    }

    /// Challenge page with a widget, the API script and a form; the form's
    /// `action` attribute is only present when `with_form_action` is set.
    fn sample_html(with_form_action: bool) -> String {
        let form_attr = with_form_action
            .then(|| r#"action="/submit/turnstile""#)
            .unwrap_or_default();

        format!(
            r#"
            <html>
              <body>
                <form id="challenge-form" {form_attr} method="POST">
                  <input type="hidden" name="foo" value="bar" />
                  <input type="hidden" name="cf-turnstile-response" value="existing" />
                </form>
                <div class="cf-turnstile" data-sitekey="ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890abcd"></div>
                <script src="https://challenges.cloudflare.com/turnstile/v0/api.js"></script>
              </body>
            </html>
            "#
        )
    }

    /// Solver wired to the stub provider.
    fn stub_solver() -> TurnstileSolver {
        TurnstileSolver::new().with_captcha_provider(Arc::new(StubCaptchaProvider))
    }

    /// Runs site-key extraction on a 403 response carrying `html`.
    fn extracted_site_key(html: &str) -> String {
        let fixture = ResponseFixture::new(html, 403);
        TurnstileSolver::extract_turnstile_info(&fixture.response())
            .expect("site key should be extracted")
            .site_key
    }

    #[tokio::test]
    async fn solve_turnstile_builds_submission() {
        let fixture = ResponseFixture::new(&sample_html(true), 403);
        assert!(TurnstileSolver::is_turnstile_challenge(&fixture.response()));

        let submission = stub_solver()
            .solve(&fixture.response())
            .await
            .expect("should solve");

        assert_eq!(submission.method, Method::POST);
        assert_eq!(
            submission.url.as_str(),
            "https://example.com/submit/turnstile"
        );

        let fields = &submission.form_fields;
        assert_eq!(
            fields.get("cf-turnstile-response").map(String::as_str),
            Some("turnstile-token")
        );
        assert_eq!(fields.get("foo").map(String::as_str), Some("bar"));

        let allowed_wait = Duration::from_secs(1)..=Duration::from_secs(5);
        assert!(allowed_wait.contains(&submission.wait));
    }

    #[tokio::test]
    async fn solve_uses_current_url_when_form_absent() {
        let fixture = ResponseFixture::new(&sample_html(false), 403);
        let submission = stub_solver()
            .solve(&fixture.response())
            .await
            .expect("should solve");
        assert_eq!(submission.url.as_str(), "https://example.com/turnstile");
    }

    #[tokio::test]
    async fn solve_requires_provider() {
        let fixture = ResponseFixture::new(&sample_html(true), 403);
        let outcome = TurnstileSolver::new().solve(&fixture.response()).await;
        let err = outcome.expect_err("should fail");
        assert!(matches!(err, TurnstileError::CaptchaProviderMissing));
    }

    #[test]
    fn extracts_sitekey_from_opt_fallback() {
        let html = r#"
            <html>
              <body>
                <script>
                  window._cf_chl_opt = {
                    cFPWv: "alternative_sitekey_from_opt_12345678"
                  };
                </script>
                <div class="cf-turnstile"></div>
              </body>
            </html>
        "#;
        assert_eq!(
            extracted_site_key(html),
            "alternative_sitekey_from_opt_12345678"
        );
    }

    #[test]
    fn extracts_sitekey_from_json_fallback() {
        let html = r#"
            <html>
              <body>
                <script>
                  var config = {
                    "sitekey": "json_sitekey_fallback_987654321"
                  };
                </script>
                <div class="cf-turnstile"></div>
              </body>
            </html>
        "#;
        assert_eq!(extracted_site_key(html), "json_sitekey_fallback_987654321");
    }

    #[test]
    fn sitekey_primary_takes_precedence() {
        let html = r#"
            <html>
              <body>
                <div class="cf-turnstile" data-sitekey="primary_sitekey_12345"></div>
                <script>window._cf_chl_opt = { cFPWv: "fallback_key" };</script>
              </body>
            </html>
        "#;
        assert_eq!(extracted_site_key(html), "primary_sitekey_12345");
    }
}