cloudscraper_rs/challenges/solvers/
javascript_v2.rs1use std::collections::HashMap;
8use std::sync::Arc;
9use std::time::Duration;
10
11use html_escape::decode_html_entities;
12use once_cell::sync::Lazy;
13use rand::Rng;
14use regex::{Regex, RegexBuilder};
15use serde::Deserialize;
16use thiserror::Error;
17
18use crate::challenges::core::{
19 ChallengeExecutionError, ChallengeHttpClient, ChallengeHttpResponse, ChallengeResponse,
20 ChallengeSubmission, OriginalRequest, execute_challenge_submission, is_cloudflare_response,
21 origin_from_url,
22};
23use crate::external_deps::captcha::{CaptchaError, CaptchaProvider, CaptchaTask};
24
/// Lower bound, in seconds, of the delay window a solver starts with.
const DEFAULT_DELAY_MIN_SECS: f32 = 1.0;

/// Upper bound, in seconds, of the delay window a solver starts with.
const DEFAULT_DELAY_MAX_SECS: f32 = 5.0;
29
30pub struct JavascriptV2Solver {
32 delay_min: Duration,
33 delay_max: Duration,
34 captcha_provider: Option<Arc<dyn CaptchaProvider>>, }
36
37impl JavascriptV2Solver {
38 pub fn new() -> Self {
40 Self {
41 delay_min: Duration::from_secs_f32(DEFAULT_DELAY_MIN_SECS),
42 delay_max: Duration::from_secs_f32(DEFAULT_DELAY_MAX_SECS),
43 captcha_provider: None,
44 }
45 }
46
47 pub fn with_delay_range(mut self, min: Duration, max: Duration) -> Self {
49 self.delay_min = min;
50 self.delay_max = if max < min { min } else { max };
51 self
52 }
53
54 pub fn with_captcha_provider(mut self, provider: Arc<dyn CaptchaProvider>) -> Self {
56 self.captcha_provider = Some(provider);
57 self
58 }
59
60 pub fn set_captcha_provider(&mut self, provider: Arc<dyn CaptchaProvider>) {
62 self.captcha_provider = Some(provider);
63 }
64
65 pub fn clear_captcha_provider(&mut self) {
67 self.captcha_provider = None;
68 }
69
70 pub fn is_js_challenge(response: &ChallengeResponse<'_>) -> bool {
72 is_cloudflare_response(response)
73 && matches!(response.status, 403 | 429 | 503)
74 && JS_CHALLENGE_RE.is_match(response.body)
75 }
76
77 pub fn is_captcha_challenge(response: &ChallengeResponse<'_>) -> bool {
79 is_cloudflare_response(response)
80 && response.status == 403
81 && CAPTCHA_CHALLENGE_RE.is_match(response.body)
82 }
83
84 pub fn solve(
86 &self,
87 response: &ChallengeResponse<'_>,
88 ) -> Result<ChallengeSubmission, JavascriptV2Error> {
89 if !Self::is_js_challenge(response) {
90 return Err(JavascriptV2Error::NotV2Challenge);
91 }
92
93 let info = Self::extract_challenge_info(response.body)?;
94 let payload = Self::generate_payload(response.body, &info.options)?;
95 self.build_submission(response, &info.form_action, payload)
96 }
97
98 pub async fn solve_with_captcha(
100 &self,
101 response: &ChallengeResponse<'_>,
102 ) -> Result<ChallengeSubmission, JavascriptV2Error> {
103 if !Self::is_captcha_challenge(response) {
104 return Err(JavascriptV2Error::NotCaptchaChallenge);
105 }
106
107 let provider = self
108 .captcha_provider
109 .as_ref()
110 .ok_or(JavascriptV2Error::CaptchaProviderMissing)?;
111
112 let info = Self::extract_challenge_info(response.body)?;
113 let mut payload = Self::generate_payload(response.body, &info.options)?;
114
115 let site_key = Self::extract_site_key(response.body)
116 .ok_or(JavascriptV2Error::MissingToken("data-sitekey"))?;
117
118 let mut task = CaptchaTask::new(site_key, response.url.clone());
119 if let Some(cv_id) = info.options.cv_id.as_ref() {
121 task = task.insert_metadata("cv_id", cv_id.clone());
122 }
123
124 let solution = provider
125 .solve(&task)
126 .await
127 .map_err(JavascriptV2Error::Captcha)?;
128 payload.insert("h-captcha-response".into(), solution.token);
129 for (key, value) in solution.metadata {
130 payload.insert(key, value);
131 }
132
133 self.build_submission(response, &info.form_action, payload)
134 }
135
136 pub async fn solve_and_submit(
138 &self,
139 client: Arc<dyn ChallengeHttpClient>,
140 response: &ChallengeResponse<'_>,
141 original_request: OriginalRequest,
142 ) -> Result<ChallengeHttpResponse, JavascriptV2Error> {
143 let submission = if Self::is_captcha_challenge(response) {
144 self.solve_with_captcha(response).await?
145 } else {
146 self.solve(response)?
147 };
148
149 execute_challenge_submission(client, submission, original_request)
150 .await
151 .map_err(JavascriptV2Error::Submission)
152 }
153
154 fn build_submission(
155 &self,
156 response: &ChallengeResponse<'_>,
157 form_action: &str,
158 mut payload: HashMap<String, String>,
159 ) -> Result<ChallengeSubmission, JavascriptV2Error> {
160 let action = decode_html_entities(form_action).into_owned();
161 let target_url = response
162 .url
163 .join(&action)
164 .map_err(|err| JavascriptV2Error::InvalidFormAction(action.clone(), err))?;
165
166 payload
168 .entry("cf_ch_verify".into())
169 .or_insert_with(|| "plat".into());
170 payload.entry("vc".into()).or_default();
171 payload.entry("captcha_vc".into()).or_default();
172 payload
173 .entry("cf_captcha_kind".into())
174 .or_insert_with(|| "h".into());
175 payload.entry("h-captcha-response".into()).or_default();
176
177 let mut headers = HashMap::new();
178 headers.insert(
179 "Content-Type".into(),
180 "application/x-www-form-urlencoded".into(),
181 );
182 headers.insert("Referer".into(), response.url.as_str().to_string());
183 headers.insert("Origin".into(), origin_from_url(response.url));
184
185 let wait = self.random_delay();
186 let submission =
187 ChallengeSubmission::new(http::Method::POST, target_url, payload, headers, wait);
188 Ok(submission)
189 }
190
191 fn random_delay(&self) -> Duration {
192 if self.delay_max <= self.delay_min {
193 return self.delay_min;
194 }
195 let mut rng = rand::thread_rng();
196 let min = self.delay_min.as_secs_f32();
197 let max = self.delay_max.as_secs_f32();
198 let secs = rng.gen_range(min..=max);
199 Duration::from_secs_f32(secs)
200 }
201
202 fn extract_challenge_info(body: &str) -> Result<ChallengeInfo, JavascriptV2Error> {
203 let options = Self::extract_challenge_options(body)?;
204 let form_action = Self::extract_form_action(body)?;
205 Ok(ChallengeInfo {
206 options,
207 form_action,
208 })
209 }
210
211 fn extract_challenge_options(body: &str) -> Result<ChallengeOptions, JavascriptV2Error> {
212 let captures = CHL_OPT_RE
213 .captures(body)
214 .and_then(|caps| caps.get(1))
215 .ok_or(JavascriptV2Error::ChallengeDataMissing)?;
216 let json = captures.as_str();
217 let options: ChallengeOptions = serde_json::from_str(json)?;
218 Ok(options)
219 }
220
221 fn extract_form_action(body: &str) -> Result<String, JavascriptV2Error> {
222 let action = FORM_ACTION_RE
223 .captures(body)
224 .and_then(|caps| caps.get(1))
225 .map(|m| m.as_str().to_string())
226 .ok_or(JavascriptV2Error::FormActionMissing)?;
227 Ok(action)
228 }
229
230 fn generate_payload(
231 body: &str,
232 options: &ChallengeOptions,
233 ) -> Result<HashMap<String, String>, JavascriptV2Error> {
234 let r_token = R_TOKEN_RE
235 .captures(body)
236 .and_then(|caps| caps.get(1))
237 .map(|m| m.as_str().to_string())
238 .ok_or(JavascriptV2Error::MissingToken("r"))?;
239
240 let mut payload = HashMap::new();
241 payload.insert("r".into(), r_token);
242 if let Some(cv_id) = options.cv_id.as_ref() {
243 payload.insert("cv_chal_id".into(), cv_id.clone());
244 }
245 if let Some(page_data) = options.chl_page_data.as_ref() {
246 payload.insert("cf_chl_page_data".into(), page_data.clone());
247 }
248 Ok(payload)
249 }
250
251 fn extract_site_key(body: &str) -> Option<String> {
252 SITE_KEY_RE
253 .captures(body)
254 .and_then(|caps| caps.get(1))
255 .map(|m| m.as_str().to_string())
256 }
257}
258
259impl Default for JavascriptV2Solver {
260 fn default() -> Self {
261 Self::new()
262 }
263}
264
265impl super::ChallengeSolver for JavascriptV2Solver {
266 fn name(&self) -> &'static str {
267 "javascript_v2"
268 }
269}
270
271#[derive(Debug, Deserialize)]
272struct ChallengeOptions {
273 #[serde(rename = "cvId")]
274 cv_id: Option<String>,
275 #[serde(rename = "chlPageData")]
276 chl_page_data: Option<String>,
277 #[serde(flatten)]
278 _extra: serde_json::Value,
279}
280
281struct ChallengeInfo {
282 options: ChallengeOptions,
283 form_action: String,
284}
285
286#[derive(Debug, Error)]
287pub enum JavascriptV2Error {
288 #[error("response is not a Cloudflare v2 challenge")]
289 NotV2Challenge,
290 #[error("response is not a Cloudflare v2 captcha challenge")]
291 NotCaptchaChallenge,
292 #[error("required challenge data missing")]
293 ChallengeDataMissing,
294 #[error("challenge form action missing")]
295 FormActionMissing,
296 #[error("missing token '{0}' in challenge page")]
297 MissingToken(&'static str),
298 #[error("challenge data could not be parsed: {0}")]
299 ChallengeDataParse(#[from] serde_json::Error),
300 #[error("invalid form action '{0}': {1}")]
301 InvalidFormAction(String, url::ParseError),
302 #[error("captcha provider not configured")]
303 CaptchaProviderMissing,
304 #[error("captcha solving failed: {0}")]
305 Captcha(#[source] CaptchaError),
306 #[error("challenge submission failed: {0}")]
307 Submission(#[source] ChallengeExecutionError),
308}
309
310static JS_CHALLENGE_RE: Lazy<Regex> = Lazy::new(|| {
312 RegexBuilder::new(r#"cpo\.src\s*=\s*['"]/cdn-cgi/challenge-platform/\S+orchestrate/jsch/v1"#)
313 .case_insensitive(true)
314 .dot_matches_new_line(true)
315 .build()
316 .expect("invalid JS challenge regex")
317});
318
319static CAPTCHA_CHALLENGE_RE: Lazy<Regex> = Lazy::new(|| {
320 RegexBuilder::new(
321 r#"cpo\.src\s*=\s*['"]/cdn-cgi/challenge-platform/\S+orchestrate/(captcha|managed)/v1"#,
322 )
323 .case_insensitive(true)
324 .dot_matches_new_line(true)
325 .build()
326 .expect("invalid captcha challenge regex")
327});
328
329static CHL_OPT_RE: Lazy<Regex> = Lazy::new(|| {
330 RegexBuilder::new(r#"window\._cf_chl_opt=\((\{[^;]+\})\);"#)
331 .dot_matches_new_line(true)
332 .build()
333 .expect("invalid _cf_chl_opt regex")
334});
335
336static FORM_ACTION_RE: Lazy<Regex> = Lazy::new(|| {
337 RegexBuilder::new(r#"<form[^>]+id=['"]challenge-form['"][^>]*action=['"]([^'"]+)['"]"#)
338 .case_insensitive(true)
339 .dot_matches_new_line(true)
340 .build()
341 .expect("invalid form action regex")
342});
343
344static R_TOKEN_RE: Lazy<Regex> = Lazy::new(|| {
345 RegexBuilder::new(r#"name=['"]r['"]\s+value=['"]([^'"]+)['"]"#)
346 .case_insensitive(true)
347 .dot_matches_new_line(true)
348 .build()
349 .expect("invalid r token regex")
350});
351
352static SITE_KEY_RE: Lazy<Regex> = Lazy::new(|| {
353 RegexBuilder::new(r#"data-sitekey=['"]([^'"]+)['"]"#)
354 .case_insensitive(true)
355 .dot_matches_new_line(true)
356 .build()
357 .expect("invalid site key regex")
358});
359
#[cfg(test)]
mod tests {
    use super::*;
    use async_trait::async_trait;
    use http::{HeaderMap, Method, header::SERVER};
    use url::Url;

    use crate::external_deps::captcha::{CaptchaResult, CaptchaSolution};

    /// Owns the data that a borrowed `ChallengeResponse` points at.
    struct ResponseFixture {
        url: Url,
        headers: HeaderMap,
        method: Method,
        body: String,
        status: u16,
    }

    impl ResponseFixture {
        /// Builds a GET response from `https://example.com/` with a `Server: cloudflare`
        /// header, the given body and status.
        fn new(body: &str, status: u16) -> Self {
            let mut headers = HeaderMap::new();
            headers.insert(SERVER, "cloudflare".parse().unwrap());
            Self {
                url: Url::parse("https://example.com/").unwrap(),
                headers,
                method: Method::GET,
                body: body.to_string(),
                status,
            }
        }

        /// Borrows the fixture as the `ChallengeResponse` the solver consumes.
        fn response(&self) -> ChallengeResponse<'_> {
            ChallengeResponse {
                url: &self.url,
                status: self.status,
                headers: &self.headers,
                body: &self.body,
                request_method: &self.method,
            }
        }
    }

    /// Captcha provider that always succeeds with the token `captcha-token`.
    struct StubCaptchaProvider;

    #[async_trait]
    impl CaptchaProvider for StubCaptchaProvider {
        fn name(&self) -> &'static str {
            "stub"
        }

        async fn solve(&self, _task: &CaptchaTask) -> CaptchaResult {
            Ok(CaptchaSolution::new("captcha-token"))
        }
    }

    /// Renders a minimal challenge page; with `include_captcha` the orchestrate path targets
    /// the captcha endpoint and a `data-sitekey` element is added.
    fn sample_html(include_captcha: bool) -> String {
        let orchestrate_path = if include_captcha {
            "/cdn-cgi/challenge-platform/h/b/orchestrate/captcha/v1"
        } else {
            "/cdn-cgi/challenge-platform/h/b/orchestrate/jsch/v1"
        };
        let captcha_snippet = if include_captcha {
            "<div class='cf-turnstile' data-sitekey='site-key-123'></div>"
        } else {
            ""
        };

        format!(
            r#"
            <html>
              <head>
                <script>window._cf_chl_opt=({{"cvId":"cv123","chlPageData":"page-data"}});</script>
              </head>
              <body>
                <script>var cpo={{}};cpo.src="{orchestrate_path}";</script>
                <form id="challenge-form" action="/cdn-cgi/challenge-platform/h/b/orchestrate/form" method="POST">
                  <input type="hidden" name="r" value="token-r"/>
                </form>
                {captcha_snippet}
              </body>
            </html>
        "#
        )
    }

    #[test]
    fn solve_builds_submission() {
        let html = sample_html(false);
        let fixture = ResponseFixture::new(&html, 403);
        let solver = JavascriptV2Solver::new();
        assert!(JavascriptV2Solver::is_js_challenge(&fixture.response()));

        let submission = solver.solve(&fixture.response()).expect("should solve");
        assert_eq!(submission.method, Method::POST);
        assert_eq!(
            submission.url.as_str(),
            "https://example.com/cdn-cgi/challenge-platform/h/b/orchestrate/form"
        );
        assert_eq!(
            submission.form_fields.get("r"),
            Some(&"token-r".to_string())
        );
        assert_eq!(
            submission.form_fields.get("cv_chal_id"),
            Some(&"cv123".to_string())
        );
        assert!(submission.wait >= Duration::from_secs(1));
        assert!(submission.wait <= Duration::from_secs(5));
        assert_eq!(
            submission.headers.get("Content-Type"),
            Some(&"application/x-www-form-urlencoded".to_string())
        );
        assert_eq!(
            submission.headers.get("Referer"),
            Some(&"https://example.com/".to_string())
        );
    }

    #[tokio::test]
    async fn solve_with_captcha_uses_provider() {
        let html = sample_html(true);
        let fixture = ResponseFixture::new(&html, 403);
        let solver = JavascriptV2Solver::new().with_captcha_provider(Arc::new(StubCaptchaProvider));
        let submission = solver
            .solve_with_captcha(&fixture.response())
            .await
            .expect("captcha challenge solved");
        assert_eq!(
            submission.form_fields.get("h-captcha-response"),
            Some(&"captcha-token".to_string())
        );
    }

    #[tokio::test]
    async fn solve_with_captcha_requires_provider() {
        let html = sample_html(true);
        let fixture = ResponseFixture::new(&html, 403);
        let solver = JavascriptV2Solver::new();
        let err = solver
            .solve_with_captcha(&fixture.response())
            .await
            .expect_err("missing provider should fail");
        // `matches!` only yields a bool; it must be asserted or the check is a no-op.
        assert!(
            matches!(err, JavascriptV2Error::CaptchaProviderMissing),
            "expected CaptchaProviderMissing, got {:?}",
            err
        );
    }
}