// cloudscraper_rs/challenges/solvers/javascript_v2.rs
use std::collections::HashMap;
12use std::sync::Arc;
13use std::time::Duration;
14
15use html_escape::decode_html_entities;
16use once_cell::sync::Lazy;
17use rand::Rng;
18use regex::{Regex, RegexBuilder};
19use serde::Deserialize;
20use thiserror::Error;
21
22use crate::challenges::core::{
23 ChallengeExecutionError, ChallengeHttpClient, ChallengeHttpResponse, ChallengeResponse,
24 ChallengeSubmission, OriginalRequest, execute_challenge_submission, is_cloudflare_response,
25 origin_from_url,
26};
27use crate::external_deps::captcha::{CaptchaError, CaptchaProvider, CaptchaTask};
28
/// Lower bound, in seconds, of the delay range installed by `JavascriptV2Solver::new`.
const DEFAULT_DELAY_MIN_SECS: f32 = 1.0;
/// Upper bound, in seconds, of the delay range installed by `JavascriptV2Solver::new`.
const DEFAULT_DELAY_MAX_SECS: f32 = 5.0;
33
34pub struct JavascriptV2Solver {
36 delay_min: Duration,
37 delay_max: Duration,
38 captcha_provider: Option<Arc<dyn CaptchaProvider>>, }
40
41impl JavascriptV2Solver {
42 pub fn new() -> Self {
44 Self {
45 delay_min: Duration::from_secs_f32(DEFAULT_DELAY_MIN_SECS),
46 delay_max: Duration::from_secs_f32(DEFAULT_DELAY_MAX_SECS),
47 captcha_provider: None,
48 }
49 }
50
51 pub fn with_delay_range(mut self, min: Duration, max: Duration) -> Self {
53 self.delay_min = min;
54 self.delay_max = if max < min { min } else { max };
55 self
56 }
57
58 pub fn with_captcha_provider(mut self, provider: Arc<dyn CaptchaProvider>) -> Self {
60 self.captcha_provider = Some(provider);
61 self
62 }
63
64 pub fn set_captcha_provider(&mut self, provider: Arc<dyn CaptchaProvider>) {
66 self.captcha_provider = Some(provider);
67 }
68
69 pub fn clear_captcha_provider(&mut self) {
71 self.captcha_provider = None;
72 }
73
74 pub fn is_js_challenge(response: &ChallengeResponse<'_>) -> bool {
76 is_cloudflare_response(response)
77 && matches!(response.status, 403 | 429 | 503)
78 && JS_CHALLENGE_RE.is_match(response.body)
79 }
80
81 pub fn is_captcha_challenge(response: &ChallengeResponse<'_>) -> bool {
83 is_cloudflare_response(response)
84 && response.status == 403
85 && CAPTCHA_CHALLENGE_RE.is_match(response.body)
86 }
87
88 pub fn solve(
90 &self,
91 response: &ChallengeResponse<'_>,
92 ) -> Result<ChallengeSubmission, JavascriptV2Error> {
93 if !Self::is_js_challenge(response) {
94 return Err(JavascriptV2Error::NotV2Challenge);
95 }
96
97 let info = Self::extract_challenge_info(response.body)?;
98 let payload = Self::generate_payload(response.body, &info.options)?;
99 self.build_submission(response, &info.form_action, payload)
100 }
101
102 pub async fn solve_with_captcha(
104 &self,
105 response: &ChallengeResponse<'_>,
106 ) -> Result<ChallengeSubmission, JavascriptV2Error> {
107 if !Self::is_captcha_challenge(response) {
108 return Err(JavascriptV2Error::NotCaptchaChallenge);
109 }
110
111 let provider = self
112 .captcha_provider
113 .as_ref()
114 .ok_or(JavascriptV2Error::CaptchaProviderMissing)?;
115
116 let info = Self::extract_challenge_info(response.body)?;
117 let mut payload = Self::generate_payload(response.body, &info.options)?;
118
119 let site_key = Self::extract_site_key(response.body)
120 .ok_or(JavascriptV2Error::MissingToken("data-sitekey"))?;
121
122 let mut task = CaptchaTask::new(site_key, response.url.clone());
123 if let Some(cv_id) = info.options.cv_id.as_ref() {
125 task = task.insert_metadata("cv_id", cv_id.clone());
126 }
127
128 let solution = provider
129 .solve(&task)
130 .await
131 .map_err(JavascriptV2Error::Captcha)?;
132 payload.insert("h-captcha-response".into(), solution.token);
133 for (key, value) in solution.metadata {
134 payload.insert(key, value);
135 }
136
137 self.build_submission(response, &info.form_action, payload)
138 }
139
140 pub async fn solve_and_submit(
142 &self,
143 client: Arc<dyn ChallengeHttpClient>,
144 response: &ChallengeResponse<'_>,
145 original_request: OriginalRequest,
146 ) -> Result<ChallengeHttpResponse, JavascriptV2Error> {
147 let submission = if Self::is_captcha_challenge(response) {
148 self.solve_with_captcha(response).await?
149 } else {
150 self.solve(response)?
151 };
152
153 let result = execute_challenge_submission(client, submission, original_request)
154 .await
155 .map_err(JavascriptV2Error::Submission)?;
156
157 if result.status == 403 {
159 return Err(JavascriptV2Error::ChallengeSolveFailed);
160 }
161
162 Ok(result)
163 }
164
165 fn build_submission(
166 &self,
167 response: &ChallengeResponse<'_>,
168 form_action: &str,
169 mut payload: HashMap<String, String>,
170 ) -> Result<ChallengeSubmission, JavascriptV2Error> {
171 let action = decode_html_entities(form_action).into_owned();
172 let target_url = response
173 .url
174 .join(&action)
175 .map_err(|err| JavascriptV2Error::InvalidFormAction(action.clone(), err))?;
176
177 payload
179 .entry("cf_ch_verify".into())
180 .or_insert_with(|| "plat".into());
181 payload.entry("vc".into()).or_default();
182 payload.entry("captcha_vc".into()).or_default();
183 payload
184 .entry("cf_captcha_kind".into())
185 .or_insert_with(|| "h".into());
186 payload.entry("h-captcha-response".into()).or_default();
187
188 let mut headers = HashMap::new();
189 headers.insert(
190 "Content-Type".into(),
191 "application/x-www-form-urlencoded".into(),
192 );
193 headers.insert("Referer".into(), response.url.as_str().to_string());
194 headers.insert("Origin".into(), origin_from_url(response.url));
195
196 let wait = self.random_delay();
197 let submission =
198 ChallengeSubmission::new(http::Method::POST, target_url, payload, headers, wait);
199 Ok(submission)
200 }
201
202 fn random_delay(&self) -> Duration {
203 if self.delay_max <= self.delay_min {
204 return self.delay_min;
205 }
206 let mut rng = rand::thread_rng();
207 let min = self.delay_min.as_secs_f32();
208 let max = self.delay_max.as_secs_f32();
209 let secs = rng.gen_range(min..=max);
210 Duration::from_secs_f32(secs)
211 }
212
213 fn extract_challenge_info(body: &str) -> Result<ChallengeInfo, JavascriptV2Error> {
214 let options = Self::extract_challenge_options(body)?;
215 let form_action = Self::extract_form_action(body)?;
216 Ok(ChallengeInfo {
217 options,
218 form_action,
219 })
220 }
221
222 fn extract_challenge_options(body: &str) -> Result<ChallengeOptions, JavascriptV2Error> {
223 let captures = CHL_OPT_RE
224 .captures(body)
225 .and_then(|caps| caps.get(1))
226 .ok_or(JavascriptV2Error::ChallengeDataMissing)?;
227 let json = captures.as_str();
228 let options: ChallengeOptions = serde_json::from_str(json)?;
229 Ok(options)
230 }
231
232 fn extract_form_action(body: &str) -> Result<String, JavascriptV2Error> {
233 let action = FORM_ACTION_RE
234 .captures(body)
235 .and_then(|caps| caps.get(1))
236 .map(|m| m.as_str().to_string())
237 .ok_or(JavascriptV2Error::FormActionMissing)?;
238 Ok(action)
239 }
240
241 fn generate_payload(
242 body: &str,
243 options: &ChallengeOptions,
244 ) -> Result<HashMap<String, String>, JavascriptV2Error> {
245 let r_token = R_TOKEN_RE
246 .captures(body)
247 .and_then(|caps| caps.get(1))
248 .map(|m| m.as_str().to_string())
249 .ok_or(JavascriptV2Error::MissingToken("r"))?;
250
251 let mut payload = HashMap::new();
252 payload.insert("r".into(), r_token);
253 if let Some(cv_id) = options.cv_id.as_ref() {
254 payload.insert("cv_chal_id".into(), cv_id.clone());
255 }
256 if let Some(page_data) = options.chl_page_data.as_ref() {
257 payload.insert("cf_chl_page_data".into(), page_data.clone());
258 }
259 Ok(payload)
260 }
261
262 fn extract_site_key(body: &str) -> Option<String> {
263 SITE_KEY_RE
264 .captures(body)
265 .and_then(|caps| caps.get(1))
266 .map(|m| m.as_str().to_string())
267 }
268}
269
270impl Default for JavascriptV2Solver {
271 fn default() -> Self {
272 Self::new()
273 }
274}
275
276impl super::ChallengeSolver for JavascriptV2Solver {
277 fn name(&self) -> &'static str {
278 "javascript_v2"
279 }
280}
281
282#[derive(Debug, Deserialize)]
283struct ChallengeOptions {
284 #[serde(rename = "cvId")]
285 cv_id: Option<String>,
286 #[serde(rename = "chlPageData")]
287 chl_page_data: Option<String>,
288 #[serde(flatten)]
289 _extra: serde_json::Value,
290}
291
292struct ChallengeInfo {
293 options: ChallengeOptions,
294 form_action: String,
295}
296
297#[derive(Debug, Error)]
298pub enum JavascriptV2Error {
299 #[error("response is not a Cloudflare v2 challenge")]
300 NotV2Challenge,
301 #[error("response is not a Cloudflare v2 captcha challenge")]
302 NotCaptchaChallenge,
303 #[error("required challenge data missing")]
304 ChallengeDataMissing,
305 #[error("challenge form action missing")]
306 FormActionMissing,
307 #[error("missing token '{0}' in challenge page")]
308 MissingToken(&'static str),
309 #[error("challenge data could not be parsed: {0}")]
310 ChallengeDataParse(#[from] serde_json::Error),
311 #[error("invalid form action '{0}': {1}")]
312 InvalidFormAction(String, url::ParseError),
313 #[error("captcha provider not configured")]
314 CaptchaProviderMissing,
315 #[error("captcha solving failed: {0}")]
316 Captcha(#[source] CaptchaError),
317 #[error("failed to solve Cloudflare v2 challenge - received 403 status")]
318 ChallengeSolveFailed,
319 #[error("challenge submission failed: {0}")]
320 Submission(#[source] ChallengeExecutionError),
321}
322
323static JS_CHALLENGE_RE: Lazy<Regex> = Lazy::new(|| {
325 RegexBuilder::new(r#"cpo\.src\s*=\s*['"]/cdn-cgi/challenge-platform/\S+orchestrate/jsch/v1"#)
326 .case_insensitive(true)
327 .dot_matches_new_line(true)
328 .build()
329 .expect("invalid JS challenge regex")
330});
331
332static CAPTCHA_CHALLENGE_RE: Lazy<Regex> = Lazy::new(|| {
333 RegexBuilder::new(
334 r#"cpo\.src\s*=\s*['"]/cdn-cgi/challenge-platform/\S+orchestrate/(captcha|managed)/v1"#,
335 )
336 .case_insensitive(true)
337 .dot_matches_new_line(true)
338 .build()
339 .expect("invalid captcha challenge regex")
340});
341
342static CHL_OPT_RE: Lazy<Regex> = Lazy::new(|| {
343 RegexBuilder::new(r#"window\._cf_chl_opt=\(?(\{.*?\})\)?;"#)
344 .dot_matches_new_line(true)
345 .build()
346 .expect("invalid _cf_chl_opt regex")
347});
348
349static FORM_ACTION_RE: Lazy<Regex> = Lazy::new(|| {
350 RegexBuilder::new(r#"<form[^>]+id=['"]challenge-form['"][^>]*action=['"]([^'"]+)['"]"#)
351 .case_insensitive(true)
352 .dot_matches_new_line(true)
353 .build()
354 .expect("invalid form action regex")
355});
356
357static R_TOKEN_RE: Lazy<Regex> = Lazy::new(|| {
358 RegexBuilder::new(r#"name=['"]r['"]\s+value=['"]([^'"]+)['"]"#)
359 .case_insensitive(true)
360 .dot_matches_new_line(true)
361 .build()
362 .expect("invalid r token regex")
363});
364
365static SITE_KEY_RE: Lazy<Regex> = Lazy::new(|| {
366 RegexBuilder::new(r#"data-sitekey=['"]([^'"]+)['"]"#)
367 .case_insensitive(true)
368 .dot_matches_new_line(true)
369 .build()
370 .expect("invalid site key regex")
371});
372
373#[cfg(test)]
374mod tests {
375 use super::*;
376 use async_trait::async_trait;
377 use http::{HeaderMap, Method, header::SERVER};
378 use url::Url;
379
380 use crate::external_deps::captcha::{CaptchaResult, CaptchaSolution};
381
382 struct ResponseFixture {
383 url: Url,
384 headers: HeaderMap,
385 method: Method,
386 body: String,
387 status: u16,
388 }
389
390 impl ResponseFixture {
391 fn new(body: &str, status: u16) -> Self {
392 let mut headers = HeaderMap::new();
393 headers.insert(SERVER, "cloudflare".parse().unwrap());
394 Self {
395 url: Url::parse("https://example.com/").unwrap(),
396 headers,
397 method: Method::GET,
398 body: body.to_string(),
399 status,
400 }
401 }
402
403 fn response(&self) -> ChallengeResponse<'_> {
404 ChallengeResponse {
405 url: &self.url,
406 status: self.status,
407 headers: &self.headers,
408 body: &self.body,
409 request_method: &self.method,
410 }
411 }
412 }
413
414 struct StubCaptchaProvider;
415
416 #[async_trait]
417 impl CaptchaProvider for StubCaptchaProvider {
418 fn name(&self) -> &'static str {
419 "stub"
420 }
421
422 async fn solve(&self, _task: &CaptchaTask) -> CaptchaResult {
423 Ok(CaptchaSolution::new("captcha-token"))
424 }
425 }
426
/// Renders a minimal challenge page for the tests.
///
/// With `include_captcha` set, the orchestrate script path is the
/// `captcha/v1` one and a `data-sitekey` element is appended; otherwise the
/// `jsch/v1` path is used and no site key is present.
fn sample_html(include_captcha: bool) -> String {
    let (orchestrate_path, captcha_snippet) = if include_captcha {
        (
            "/cdn-cgi/challenge-platform/h/b/orchestrate/captcha/v1",
            "<div class='cf-turnstile' data-sitekey='site-key-123'></div>",
        )
    } else {
        ("/cdn-cgi/challenge-platform/h/b/orchestrate/jsch/v1", "")
    };

    format!(
        r#"
        <html>
            <head>
                <script>window._cf_chl_opt=({{"cvId":"cv123","chlPageData":"page-data"}});</script>
            </head>
            <body>
                <script>var cpo={{}};cpo.src="{orchestrate_path}";</script>
                <form id="challenge-form" action="/cdn-cgi/challenge-platform/h/b/orchestrate/form" method="POST">
                    <input type="hidden" name="r" value="token-r"/>
                </form>
                {captcha_snippet}
            </body>
        </html>
        "#
    )
}
456
/// A JavaScript challenge page yields a POST submission with the expected
/// target URL, form fields, wait range and headers.
#[test]
fn solve_builds_submission() {
    let fixture = ResponseFixture::new(&sample_html(false), 403);
    let response = fixture.response();
    assert!(JavascriptV2Solver::is_js_challenge(&response));

    let submission = JavascriptV2Solver::new()
        .solve(&response)
        .expect("should solve");

    assert_eq!(submission.method, Method::POST);
    assert_eq!(
        submission.url.as_str(),
        "https://example.com/cdn-cgi/challenge-platform/h/b/orchestrate/form"
    );

    let field = |name: &str| submission.form_fields.get(name).map(String::as_str);
    assert_eq!(field("r"), Some("token-r"));
    assert_eq!(field("cv_chal_id"), Some("cv123"));

    let default_range = Duration::from_secs(1)..=Duration::from_secs(5);
    assert!(default_range.contains(&submission.wait));

    let header = |name: &str| submission.headers.get(name).map(String::as_str);
    assert_eq!(
        header("Content-Type"),
        Some("application/x-www-form-urlencoded")
    );
    assert_eq!(header("Referer"), Some("https://example.com/"));
}
489
490 #[tokio::test]
491 async fn solve_with_captcha_uses_provider() {
492 let html = sample_html(true);
493 let fixture = ResponseFixture::new(&html, 403);
494 let solver = JavascriptV2Solver::new().with_captcha_provider(Arc::new(StubCaptchaProvider));
495 let submission = solver
496 .solve_with_captcha(&fixture.response())
497 .await
498 .expect("captcha challenge solved");
499 assert_eq!(
500 submission.form_fields.get("h-captcha-response"),
501 Some(&"captcha-token".to_string())
502 );
503 }
504
505 #[tokio::test]
506 async fn solve_with_captcha_requires_provider() {
507 let html = sample_html(true);
508 let fixture = ResponseFixture::new(&html, 403);
509 let solver = JavascriptV2Solver::new();
510 let err = solver
511 .solve_with_captcha(&fixture.response())
512 .await
513 .expect_err("missing provider should fail");
514 matches!(err, JavascriptV2Error::CaptchaProviderMissing);
515 }
516
/// `CHL_OPT_RE` matches the options assignment both with and without the
/// surrounding parentheses.
#[test]
fn challenge_opt_regex_handles_optional_parens() {
    let html_variants = [
        // Object wrapped in parentheses.
        r#"
        <script>window._cf_chl_opt=({"cvId":"test123","chlPageData":"data"});</script>
        "#,
        // Bare object.
        r#"
        <script>window._cf_chl_opt={"cvId":"test123","chlPageData":"data"};</script>
        "#,
    ];

    for html in html_variants {
        assert!(CHL_OPT_RE.is_match(html));
    }
}
531}