1use std::collections::HashMap;
9use std::sync::Arc;
10use std::time::{Duration, Instant};
11
12use bytes::Bytes;
13use http::{HeaderMap, HeaderName, HeaderValue, Method};
14use thiserror::Error;
15use tokio::sync::Mutex;
16use tokio::time::sleep;
17use url::Url;
18
19use crate::challenges::core::{
20 ChallengeExecutionError, ChallengeHttpClient, ChallengeResponse, ChallengeSubmission,
21 OriginalRequest, ReqwestChallengeHttpClient, execute_challenge_submission,
22};
23use crate::challenges::detectors::ChallengeDetection;
24use crate::challenges::pipeline::{
25 ChallengePipeline, ChallengePipelineResult, PipelineContext, PipelineError, UnsupportedReason,
26};
27use crate::challenges::solvers::access_denied::ProxyPool;
28use crate::challenges::solvers::{
29 MitigationPlan, TlsProfileManager, access_denied::AccessDeniedHandler,
30 bot_management::BotManagementHandler, javascript_v1::JavascriptV1Solver,
31 javascript_v2::JavascriptV2Solver, managed_v3::ManagedV3Solver, rate_limit::RateLimitHandler,
32 turnstile::TurnstileSolver,
33};
34use crate::challenges::user_agents::{
35 UserAgentError, UserAgentOptions, UserAgentProfile, get_user_agent_profile,
36};
37use crate::external_deps::captcha::CaptchaProvider;
38use crate::external_deps::interpreters::{BoaJavascriptInterpreter, JavascriptInterpreter};
39use crate::modules::adaptive_timing::{
40 AdaptiveTimingStrategy, BehaviorProfile, DefaultAdaptiveTiming, RequestKind, TimingOutcome,
41 TimingRequest,
42};
43use crate::modules::anti_detection::{
44 AntiDetectionContext, AntiDetectionStrategy, DefaultAntiDetection,
45};
46use crate::modules::events::{
47 ChallengeEvent, EventDispatcher, LoggingHandler, MetricsHandler, PostResponseEvent,
48 PreRequestEvent, RetryEvent, ScraperEvent,
49};
50use crate::modules::metrics::MetricsCollector;
51use crate::modules::ml::{FeatureVector, MLOptimizer};
52use crate::modules::performance::PerformanceMonitor;
53use crate::modules::proxy::{ProxyConfig, ProxyManager};
54use crate::modules::spoofing::{ConsistencyLevel, FingerprintGenerator};
55use crate::modules::state::StateManager;
56use crate::modules::tls::{DefaultTLSManager, TLSConfig};
57
/// Convenience alias used by every fallible API in this module.
pub type CloudScraperResult<T> = Result<T, CloudScraperError>;

/// Every failure mode surfaced by [`CloudScraper`] and its helpers.
#[derive(Debug, Error)]
pub enum CloudScraperError {
    /// Transport-level failure reported by `reqwest` (connect, TLS, body read…).
    #[error("http error: {0}")]
    Http(#[from] reqwest::Error),
    /// The supplied URL string could not be parsed.
    #[error("url parse error: {0}")]
    Url(#[from] url::ParseError),
    /// A user-agent profile could not be resolved from the configured options.
    #[error("user-agent initialisation failed: {0}")]
    UserAgent(#[from] UserAgentError),
    /// Replaying a solved challenge submission failed.
    #[error("challenge execution failed: {0}")]
    ChallengeExecution(#[from] ChallengeExecutionError),
    /// The challenge pipeline itself reported an error.
    #[error("challenge pipeline error: {0}")]
    Pipeline(#[from] PipelineError),
    /// A challenge was detected but no solver supports it.
    #[error("unsupported challenge ({0})")]
    Unsupported(UnsupportedReason),
    /// Response body was requested as text but is not valid UTF-8.
    #[error("utf8 conversion failed: {0}")]
    Utf8(#[from] std::string::FromUtf8Error),
    /// A header name/value could not be converted between representations.
    #[error("header conversion failed: {0}")]
    InvalidHeader(String),
    /// A mitigation plan was produced but the retry budget is spent.
    /// Boxed to keep the error enum small (see clippy `result_large_err`).
    #[error("mitigation required but retries exhausted: {0:?}")]
    Mitigation(Box<MitigationPlan>),
    /// Challenge handling was aborted for the given reason.
    #[error("challenge handling aborted: {0}")]
    Aborted(String),
}
85
/// Final response handed back to callers after all challenge handling.
#[derive(Debug, Clone)]
pub struct ScraperResponse {
    // HTTP status of the final (post-challenge) response.
    status: u16,
    headers: HeaderMap,
    // Full body, already read into memory.
    body: Bytes,
    // URL the response was actually served from (after redirects).
    url: Url,
}
94
95impl ScraperResponse {
96 fn new(status: u16, headers: HeaderMap, body: Bytes, url: Url) -> Self {
97 Self {
98 status,
99 headers,
100 body,
101 url,
102 }
103 }
104
105 pub fn status(&self) -> u16 {
107 self.status
108 }
109
110 pub fn url(&self) -> &Url {
112 &self.url
113 }
114
115 pub fn headers(&self) -> &HeaderMap {
117 &self.headers
118 }
119
120 pub async fn text(&self) -> CloudScraperResult<String> {
122 Ok(String::from_utf8(self.body.to_vec())?)
123 }
124
125 pub async fn bytes(&self) -> Bytes {
127 self.body.clone()
128 }
129}
130
/// Tunable knobs for [`CloudScraper`]; build one fluently via
/// [`CloudScraperBuilder`]. All `enable_*` switches default to `true`.
#[derive(Clone)]
pub struct CloudScraperConfig {
    /// Options used to select the user-agent profile and its base headers.
    pub user_agent: UserAgentOptions,
    /// When set, sent as the `content-type` header on every request.
    pub content_type: Option<String>,
    /// Proxy endpoints; when non-empty a `ProxyManager` rotates through them.
    pub proxies: Vec<String>,
    /// Settings handed to the `ProxyManager` when proxies are configured.
    pub proxy_config: ProxyConfig,
    pub enable_metrics: bool,
    pub enable_performance_monitoring: bool,
    pub enable_tls_fingerprinting: bool,
    pub enable_anti_detection: bool,
    pub enable_spoofing: bool,
    pub enable_adaptive_timing: bool,
    pub enable_ml_optimization: bool,
    /// Pacing persona used by the adaptive-timing module.
    pub behavior_profile: BehaviorProfile,
    /// Consistency level for generated fingerprints (default: per-domain).
    pub spoofing_consistency: ConsistencyLevel,
    /// External CAPTCHA solving service, if any.
    pub captcha_provider: Option<Arc<dyn CaptchaProvider>>,
    /// JS interpreter override; defaults to the bundled Boa interpreter.
    pub interpreter: Option<Arc<dyn JavascriptInterpreter>>,
    pub tls_config: TLSConfig,
    /// Upper bound on challenge/mitigation attempts per `request` call.
    /// The builder clamps this to >= 1; direct construction does not.
    pub max_challenge_attempts: usize,
}
152
153impl Default for CloudScraperConfig {
154 fn default() -> Self {
155 Self {
156 user_agent: UserAgentOptions::default(),
157 content_type: None,
158 proxies: Vec::new(),
159 proxy_config: ProxyConfig::default(),
160 enable_metrics: true,
161 enable_performance_monitoring: true,
162 enable_tls_fingerprinting: true,
163 enable_anti_detection: true,
164 enable_spoofing: true,
165 enable_adaptive_timing: true,
166 enable_ml_optimization: true,
167 behavior_profile: BehaviorProfile::Casual,
168 spoofing_consistency: ConsistencyLevel::Domain,
169 captcha_provider: None,
170 interpreter: None,
171 tls_config: TLSConfig::default(),
172 max_challenge_attempts: 3,
173 }
174 }
175}
176
/// Fluent builder for [`CloudScraper`]; finish with [`CloudScraperBuilder::build`].
pub struct CloudScraperBuilder {
    // Accumulated configuration, starting from `CloudScraperConfig::default()`.
    config: CloudScraperConfig,
}
181
182impl CloudScraperBuilder {
183 pub fn new() -> Self {
184 Self {
185 config: CloudScraperConfig::default(),
186 }
187 }
188
189 pub fn with_user_agent_options(mut self, options: UserAgentOptions) -> Self {
190 self.config.user_agent = options;
191 self
192 }
193
194 pub fn with_content_type(mut self, content_type: String) -> Self {
195 self.config.content_type = Some(content_type);
196 self
197 }
198
199 pub fn with_proxies<I, S>(mut self, proxies: I) -> Self
200 where
201 I: IntoIterator<Item = S>,
202 S: Into<String>,
203 {
204 self.config.proxies = proxies.into_iter().map(Into::into).collect();
205 self
206 }
207
208 pub fn with_proxy_config(mut self, config: ProxyConfig) -> Self {
209 self.config.proxy_config = config;
210 self
211 }
212
213 pub fn with_captcha_provider(mut self, provider: Arc<dyn CaptchaProvider>) -> Self {
214 self.config.captcha_provider = Some(provider);
215 self
216 }
217
218 pub fn with_interpreter(mut self, interpreter: Arc<dyn JavascriptInterpreter>) -> Self {
219 self.config.interpreter = Some(interpreter);
220 self
221 }
222
223 pub fn disable_metrics(mut self) -> Self {
224 self.config.enable_metrics = false;
225 self
226 }
227
228 pub fn disable_performance_monitoring(mut self) -> Self {
229 self.config.enable_performance_monitoring = false;
230 self
231 }
232
233 pub fn disable_tls_fingerprinting(mut self) -> Self {
234 self.config.enable_tls_fingerprinting = false;
235 self
236 }
237
238 pub fn disable_anti_detection(mut self) -> Self {
239 self.config.enable_anti_detection = false;
240 self
241 }
242
243 pub fn disable_spoofing(mut self) -> Self {
244 self.config.enable_spoofing = false;
245 self
246 }
247
248 pub fn disable_adaptive_timing(mut self) -> Self {
249 self.config.enable_adaptive_timing = false;
250 self
251 }
252
253 pub fn disable_ml_optimization(mut self) -> Self {
254 self.config.enable_ml_optimization = false;
255 self
256 }
257
258 pub fn with_behavior_profile(mut self, profile: BehaviorProfile) -> Self {
259 self.config.behavior_profile = profile;
260 self
261 }
262
263 pub fn with_spoofing_consistency(mut self, level: ConsistencyLevel) -> Self {
264 self.config.spoofing_consistency = level;
265 self
266 }
267
268 pub fn with_tls_config(mut self, config: TLSConfig) -> Self {
269 self.config.tls_config = config;
270 self
271 }
272
273 pub fn with_max_challenge_attempts(mut self, attempts: usize) -> Self {
274 self.config.max_challenge_attempts = attempts.max(1);
275 self
276 }
277
278 pub fn build(self) -> CloudScraperResult<CloudScraper> {
279 CloudScraper::with_config(self.config)
280 }
281}
282
283impl Default for CloudScraperBuilder {
284 fn default() -> Self {
285 Self::new()
286 }
287}
288
/// All mutable subsystems of a [`CloudScraper`], guarded together behind a
/// single async mutex. Each `Option` field is `Some` only when the matching
/// `enable_*` flag was set (or, for the proxy manager, proxies were supplied)
/// at construction time.
struct CloudScraperInner {
    pipeline: ChallengePipeline,
    proxy_manager: Option<ProxyManager>,
    // Proxy endpoint selected for the in-flight request, if any.
    current_proxy: Option<String>,
    tls_manager: Option<DefaultTLSManager>,
    fingerprint: Option<FingerprintGenerator>,
    anti_detection: Option<DefaultAntiDetection>,
    adaptive_timing: Option<DefaultAdaptiveTiming>,
    performance_monitor: Option<PerformanceMonitor>,
    ml_optimizer: Option<MLOptimizer>,
}
301
302impl CloudScraperInner {
303 fn new(pipeline: ChallengePipeline) -> Self {
304 Self {
305 pipeline,
306 proxy_manager: None,
307 current_proxy: None,
308 tls_manager: None,
309 fingerprint: None,
310 anti_detection: None,
311 adaptive_timing: None,
312 performance_monitor: None,
313 ml_optimizer: None,
314 }
315 }
316}
317
/// Cache of lazily built `reqwest` clients, one per proxy endpoint, each
/// constructed with a cookie store and the shared default headers.
struct ClientPool {
    // Default headers applied to every client built by the pool.
    base_headers: reqwest::header::HeaderMap,
    // Keyed by proxy URL; the `None` key is the direct (proxyless) client.
    clients: Mutex<HashMap<Option<String>, reqwest::Client>>,
}
323
324impl ClientPool {
325 fn new(base_headers: reqwest::header::HeaderMap) -> Self {
326 Self {
327 base_headers,
328 clients: Mutex::new(HashMap::new()),
329 }
330 }
331
332 async fn client(&self, proxy: Option<&str>) -> CloudScraperResult<reqwest::Client> {
333 let mut guard = self.clients.lock().await;
334 let key = proxy.map(|p| p.to_string());
335 if let Some(client) = guard.get(&key) {
336 return Ok(client.clone());
337 }
338
339 let mut builder = reqwest::Client::builder()
340 .cookie_store(true)
341 .default_headers(self.base_headers.clone());
342
343 if let Some(endpoint) = proxy {
344 builder = builder.proxy(reqwest::Proxy::all(endpoint)?);
345 }
346
347 let client = builder.build()?;
348 guard.insert(key.clone(), client.clone());
349 Ok(client)
350 }
351}
352
/// High-level scraping client that transparently detects and handles
/// anti-bot challenges. Construct via [`CloudScraper::new`] or
/// [`CloudScraper::builder`].
pub struct CloudScraper {
    config: CloudScraperConfig,
    // Base request headers derived from the chosen user-agent profile.
    base_headers_http: HeaderMap,
    client_pool: Arc<ClientPool>,
    // HTTP client used when replaying challenge submissions.
    challenge_client: Arc<dyn ChallengeHttpClient>,
    // Per-domain success/failure bookkeeping and sticky headers.
    state: StateManager,
    metrics: Option<MetricsCollector>,
    events: Arc<EventDispatcher>,
    // All mutable subsystems, serialized behind one async lock.
    inner: Mutex<CloudScraperInner>,
}
364
365impl CloudScraper {
366 pub fn new() -> CloudScraperResult<Self> {
368 CloudScraper::with_config(CloudScraperConfig::default())
369 }
370
371 pub fn builder() -> CloudScraperBuilder {
373 CloudScraperBuilder::new()
374 }
375
    /// Wires up the challenge pipeline, optional subsystems and event
    /// handlers described by `config`.
    ///
    /// # Errors
    /// Fails when the user-agent profile cannot be resolved, when its headers
    /// cannot be converted, or when the challenge HTTP client cannot be built.
    fn with_config(config: CloudScraperConfig) -> CloudScraperResult<Self> {
        // Resolve the browser identity first: it provides the base header set
        // used by every client in the pool.
        let profile = get_user_agent_profile(config.user_agent.clone())?;
        let base_headers_http = to_http_headers(&profile)?;
        let base_headers_reqwest = to_reqwest_headers(&base_headers_http)?;

        let mut pipeline = ChallengePipeline::default();
        // Fall back to the bundled Boa interpreter when none was supplied.
        let interpreter: Arc<dyn JavascriptInterpreter> = config
            .interpreter
            .clone()
            .unwrap_or_else(|| Arc::new(BoaJavascriptInterpreter::new()));

        // Only the v2 and Turnstile solvers can use an external CAPTCHA service.
        let mut js_v2 = JavascriptV2Solver::new();
        let mut turnstile = TurnstileSolver::new();
        if let Some(provider) = &config.captcha_provider {
            js_v2 = js_v2.with_captcha_provider(provider.clone());
            turnstile = turnstile.with_captcha_provider(provider.clone());
        }

        pipeline = pipeline
            .with_javascript_v1(JavascriptV1Solver::new(interpreter.clone()))
            .with_javascript_v2(js_v2)
            .with_managed_v3(ManagedV3Solver::new(interpreter))
            .with_turnstile(turnstile)
            .with_rate_limit(RateLimitHandler::new())
            .with_access_denied(AccessDeniedHandler::new())
            .with_bot_management(BotManagementHandler::new());

        let mut inner = CloudScraperInner::new(pipeline);

        // Each optional subsystem is attached only when its flag is enabled
        // (or, for the proxy manager, when endpoints were supplied).
        if !config.proxies.is_empty() {
            let mut manager = ProxyManager::new(config.proxy_config.clone());
            manager.load(config.proxies.iter().cloned());
            inner.proxy_manager = Some(manager);
        }

        if config.enable_tls_fingerprinting {
            inner.tls_manager = Some(DefaultTLSManager::new(config.tls_config.clone()));
        }

        if config.enable_spoofing {
            let mut generator = FingerprintGenerator::default();
            generator = generator.with_consistency(config.spoofing_consistency);
            inner.fingerprint = Some(generator);
        }

        if config.enable_anti_detection {
            inner.anti_detection = Some(DefaultAntiDetection::new(Default::default()));
        }

        if config.enable_adaptive_timing {
            let mut timing = DefaultAdaptiveTiming::new();
            timing.set_behavior_profile(config.behavior_profile);
            inner.adaptive_timing = Some(timing);
        }

        if config.enable_performance_monitoring {
            inner.performance_monitor = Some(PerformanceMonitor::new(Default::default()));
        }

        if config.enable_ml_optimization {
            inner.ml_optimizer = Some(MLOptimizer::default());
        }

        let client_pool = Arc::new(ClientPool::new(base_headers_reqwest));
        let challenge_client = Arc::new(ReqwestChallengeHttpClient::new()?);
        let state = StateManager::new();
        let metrics = config.enable_metrics.then(MetricsCollector::new);

        // Logging is always on; the metrics handler piggybacks on the same
        // collector exposed to record_outcome.
        let mut events = EventDispatcher::new();
        events.register_handler(Arc::new(LoggingHandler));
        if let Some(ref collector) = metrics {
            events.register_handler(Arc::new(MetricsHandler::new(collector.clone())));
        }

        Ok(Self {
            config,
            base_headers_http,
            client_pool,
            challenge_client,
            state,
            metrics,
            events: Arc::new(events),
            inner: Mutex::new(inner),
        })
    }
461
462 pub async fn get(&self, url: &str) -> CloudScraperResult<ScraperResponse> {
464 let url = Url::parse(url)?;
465 self.request(Method::GET, url, None).await
466 }
467
    /// Sends `method` `url` (with optional `body`), runs the challenge
    /// pipeline on the response, and retries with the suggested mitigation up
    /// to `config.max_challenge_attempts` times.
    ///
    /// # Errors
    /// Surfaces transport errors, header-conversion failures, pipeline
    /// errors, unsupported challenges, and `Mitigation` once the retry
    /// budget is exhausted.
    pub async fn request(
        &self,
        method: Method,
        url: Url,
        body: Option<Vec<u8>>,
    ) -> CloudScraperResult<ScraperResponse> {
        // Proxy forced by a previous mitigation plan; consumed on the next pass.
        let mut forced_proxy: Option<String> = None;
        let mut attempt = 0usize;

        loop {
            attempt += 1;

            // Per-attempt headers, anti-detection context, proxy and pacing delay.
            let (mut headers_http, anti_ctx, proxy, mut delay) = self
                .prepare_request(
                    &method,
                    &url,
                    body.as_ref().map(|b| b.len()).unwrap_or(0),
                    forced_proxy.take(),
                )
                .await?;

            if let Some(ref ct) = self.config.content_type {
                headers_http.insert(
                    HeaderName::from_static("content-type"),
                    HeaderValue::from_str(ct)
                        .map_err(|_| CloudScraperError::InvalidHeader("content-type".into()))?,
                );
            }

            // The anti-detection context may ask for a longer pause than the
            // adaptive-timing module computed; honour the larger of the two.
            if let Some(hint) = anti_ctx.delay_hint()
                && hint > delay
            {
                delay = hint;
            }

            self.events
                .dispatch(ScraperEvent::PreRequest(PreRequestEvent {
                    url: url.clone(),
                    method: method.clone(),
                    headers: headers_http.clone(),
                    timestamp: chrono::Utc::now(),
                }));

            let client = self.client_pool.client(proxy.as_deref()).await?;

            if delay > Duration::from_millis(0) {
                sleep(delay).await;
            }

            let req_headers = to_reqwest_headers(&headers_http)?;
            let mut builder = client
                .request(method.clone(), url.clone())
                .headers(req_headers);
            if let Some(ref body) = body {
                builder = builder.body(body.clone());
            }

            let started = Instant::now();
            let resp = builder.send().await?;
            let latency = started.elapsed();

            let final_url = resp.url().clone();
            let status = resp.status().as_u16();
            let headers_raw = resp.headers().clone();
            let body_bytes = resp.bytes().await?.to_vec();
            // Lossy decode: the detectors work on text even for odd encodings.
            let body_text = String::from_utf8_lossy(&body_bytes).to_string();

            let http_headers = reqwest_to_http(&headers_raw)?;
            let challenge_response = ChallengeResponse {
                url: &final_url,
                status,
                headers: &http_headers,
                body: &body_text,
                request_method: &method,
            };

            self.events
                .dispatch(ScraperEvent::PostResponse(PostResponseEvent {
                    url: final_url.clone(),
                    method: method.clone(),
                    status,
                    latency,
                    timestamp: chrono::Utc::now(),
                }));

            // Run the pipeline while holding the inner lock; the destructuring
            // borrow lets several subsystems be lent out mutably at once.
            let result = {
                let mut guard = self.inner.lock().await;
                let CloudScraperInner {
                    pipeline,
                    proxy_manager,
                    current_proxy,
                    tls_manager,
                    fingerprint,
                    ..
                } = &mut *guard;

                pipeline
                    .evaluate(
                        &challenge_response,
                        PipelineContext {
                            proxy_pool: proxy_manager.as_mut().map(|pm| pm as &mut dyn ProxyPool),
                            current_proxy: current_proxy.as_deref(),
                            failure_recorder: Some(&self.state),
                            fingerprint_manager: fingerprint.as_mut().map(|fp| {
                                fp as &mut dyn crate::challenges::solvers::FingerprintManager
                            }),
                            tls_manager: tls_manager
                                .as_mut()
                                .map(|tls| tls as &mut dyn TlsProfileManager),
                        },
                    )
                    .await
            };

            match result {
                // Clean response: record and return it as-is.
                ChallengePipelineResult::NoChallenge => {
                    self.record_outcome(true, status, latency, delay, &final_url)
                        .await;
                    let response = ScraperResponse::new(
                        status,
                        http_headers.clone(),
                        Bytes::from(body_bytes),
                        final_url,
                    );
                    return Ok(response);
                }
                // A solver produced a submission: replay it and return the result.
                ChallengePipelineResult::Submission {
                    detection,
                    submission,
                } => {
                    let (response, challenge_latency) = self
                        .handle_submission(
                            submission,
                            detection,
                            &method,
                            &url,
                            headers_http.clone(),
                            body.clone(),
                        )
                        .await?;
                    self.record_outcome(
                        response.status() < 500,
                        response.status(),
                        latency + challenge_latency,
                        delay,
                        response.url(),
                    )
                    .await;
                    return Ok(response);
                }
                // Not solvable directly, but the pipeline suggests a mitigation
                // (wait and/or proxy switch) before retrying.
                ChallengePipelineResult::Mitigation { detection, plan } => {
                    self.record_outcome(false, status, latency, delay, &final_url)
                        .await;
                    self.events
                        .dispatch(ScraperEvent::Challenge(ChallengeEvent {
                            domain: detection.url.clone(),
                            challenge_type: format!("{:?}", detection.challenge_type),
                            success: false,
                            metadata: vec![
                                ("reason".into(), plan.reason.clone()),
                                ("pattern".into(), detection.pattern_id.clone()),
                            ],
                            timestamp: chrono::Utc::now(),
                        }));

                    if let Some(wait) = plan.wait {
                        sleep(wait).await;
                    }

                    if let Some(ref proxy_hint) = plan.new_proxy {
                        forced_proxy = Some(proxy_hint.clone());
                    }

                    let should_retry =
                        plan.should_retry && attempt < self.config.max_challenge_attempts;
                    if should_retry {
                        self.events.dispatch(ScraperEvent::Retry(RetryEvent {
                            domain: detection.url,
                            attempt: (attempt + 1) as u32,
                            reason: plan.reason.clone(),
                            scheduled_after: plan.wait.unwrap_or_default(),
                            timestamp: chrono::Utc::now(),
                        }));
                        continue;
                    } else {
                        return Err(CloudScraperError::Mitigation(Box::new(plan)));
                    }
                }
                // Challenge recognised but no solver supports it: give up.
                ChallengePipelineResult::Unsupported { detection, reason } => {
                    self.record_outcome(false, status, latency, delay, &final_url)
                        .await;
                    self.events
                        .dispatch(ScraperEvent::Challenge(ChallengeEvent {
                            domain: detection.url,
                            challenge_type: detection.pattern_name,
                            success: false,
                            metadata: vec![("reason".into(), reason.to_string())],
                            timestamp: chrono::Utc::now(),
                        }));
                    return Err(CloudScraperError::Unsupported(reason));
                }
                // Pipeline itself errored out: propagate after telemetry.
                ChallengePipelineResult::Failed { detection, error } => {
                    self.record_outcome(false, status, latency, delay, &final_url)
                        .await;
                    self.events
                        .dispatch(ScraperEvent::Error(crate::modules::events::ErrorEvent {
                            domain: detection.url,
                            error: error.to_string(),
                            timestamp: chrono::Utc::now(),
                        }));
                    return Err(CloudScraperError::Pipeline(error));
                }
            }
        }
    }
684
    /// Replays a solver-produced `submission` against the origin, records the
    /// outcome with the pipeline, and emits challenge/response events.
    ///
    /// Returns the final response together with the time the whole
    /// submission round-trip took.
    ///
    /// # Errors
    /// Propagates [`ChallengeExecutionError`] when the submission fails —
    /// but only after the failure has been recorded with the pipeline.
    async fn handle_submission(
        &self,
        submission: ChallengeSubmission,
        detection: ChallengeDetection,
        method: &Method,
        url: &Url,
        headers: HeaderMap,
        body: Option<Vec<u8>>,
    ) -> CloudScraperResult<(ScraperResponse, Duration)> {
        // The original request is re-sent once the challenge clears.
        let original = OriginalRequest::new(method.clone(), url.clone())
            .with_headers(headers)
            .with_body(body);

        let started = Instant::now();
        let result =
            execute_challenge_submission(self.challenge_client.clone(), submission, original).await;
        let challenge_latency = started.elapsed();

        // Tell the pipeline whether this pattern's solver worked, success or
        // not, before the `?` below can bail out.
        let success = result.is_ok();
        {
            let mut guard = self.inner.lock().await;
            guard
                .pipeline
                .record_outcome(&detection.pattern_id, success);
        }

        let final_response = result?;
        let response = ScraperResponse::new(
            final_response.status,
            final_response.headers.clone(),
            Bytes::from(final_response.body.clone()),
            final_response.url.clone(),
        );

        self.events
            .dispatch(ScraperEvent::Challenge(ChallengeEvent {
                domain: detection.url,
                challenge_type: detection.pattern_name,
                success,
                metadata: vec![
                    ("pattern".into(), detection.pattern_id),
                    ("status".into(), final_response.status.to_string()),
                ],
                timestamp: chrono::Utc::now(),
            }));

        self.events
            .dispatch(ScraperEvent::PostResponse(PostResponseEvent {
                url: response.url().clone(),
                method: method.clone(),
                status: response.status(),
                latency: challenge_latency,
                timestamp: chrono::Utc::now(),
            }));

        Ok((response, challenge_latency))
    }
742
    /// Feeds one request outcome into every enabled learning/telemetry
    /// subsystem: per-domain state, the metrics collector, adaptive timing,
    /// anti-detection, the performance monitor and the ML optimizer.
    ///
    /// `delay` is the pacing delay that was applied before the request was
    /// sent; `latency` is the measured round-trip time.
    async fn record_outcome(
        &self,
        success: bool,
        status: u16,
        latency: Duration,
        delay: Duration,
        url: &Url,
    ) {
        // Empty domain for host-less URLs; the subsystems accept it as a key.
        let domain = url.host_str().unwrap_or_default();
        if success {
            self.state.record_success(domain);
        } else {
            self.state
                .record_failure(domain, format!("status_{status}"));
        }

        if let Some(ref collector) = self.metrics {
            collector.record_response(domain, status, latency);
        }

        // Remaining subsystems live behind the inner lock.
        let mut guard = self.inner.lock().await;
        if let Some(timing) = guard.adaptive_timing.as_mut() {
            let outcome = TimingOutcome {
                success,
                response_time: latency,
                applied_delay: delay,
            };
            timing.record_outcome(domain, &outcome);
        }

        if let Some(anti) = guard.anti_detection.as_mut() {
            anti.record_response(domain, status, latency);
        }

        // Only surface the performance report when it carries alerts.
        if let Some(perf) = guard.performance_monitor.as_mut()
            && let Some(report) = perf.record(domain, latency, success)
            && !report.alerts.is_empty()
        {
            log::warn!("performance alerts: {:#?}", report.alerts);
        }

        if let Some(ml) = guard.ml_optimizer.as_mut() {
            let mut features = FeatureVector::new();
            features.insert("latency".into(), latency.as_secs_f64());
            features.insert("delay".into(), delay.as_secs_f64());
            ml.record_attempt(domain, features, success, Some(delay.as_secs_f64()));
        }
    }
791
    /// Assembles everything needed before a request is sent: the outgoing
    /// headers (base + per-domain sticky + spoofed + anti-detection tweaks),
    /// the anti-detection context, the proxy to use, and the pacing delay.
    ///
    /// `forced_proxy` (from a mitigation plan) takes precedence over the
    /// rotating proxy manager.
    ///
    /// # Errors
    /// Fails when a sticky or generated header cannot be converted.
    async fn prepare_request(
        &self,
        method: &Method,
        url: &Url,
        body_size: usize,
        forced_proxy: Option<String>,
    ) -> CloudScraperResult<(HeaderMap, AntiDetectionContext, Option<String>, Duration)> {
        let mut headers = self.base_headers_http.clone();
        // Layer in any sticky headers remembered for this domain.
        if let Some(state) = self.state.get(url.host_str().unwrap_or("")) {
            for (name, value) in state.sticky_headers {
                let header_name = HeaderName::from_bytes(name.as_bytes())
                    .map_err(|_| CloudScraperError::InvalidHeader(name.clone()))?;
                let header_value = HeaderValue::from_str(&value)
                    .map_err(|_| CloudScraperError::InvalidHeader(name.clone()))?;
                headers.insert(header_name, header_value);
            }
        }

        let mut anti_ctx =
            AntiDetectionContext::new(url.clone(), method.clone()).with_headers(headers.clone());
        anti_ctx.set_body_size(body_size);

        let mut proxy = forced_proxy;
        let mut delay = Duration::from_millis(0);

        {
            let mut guard = self.inner.lock().await;

            // Spoofing: override user-agent / accept-language per domain.
            if let Some(ref mut generator) = guard.fingerprint
                && let Some(domain) = url.host_str()
            {
                let fp = generator.generate_for(domain);
                anti_ctx.set_user_agent(fp.user_agent.clone());
                headers.insert(
                    HeaderName::from_static("user-agent"),
                    HeaderValue::from_str(&fp.user_agent)
                        .map_err(|_| CloudScraperError::InvalidHeader("user-agent".into()))?,
                );
                headers.insert(
                    HeaderName::from_static("accept-language"),
                    HeaderValue::from_str(&fp.accept_language)
                        .map_err(|_| CloudScraperError::InvalidHeader("accept-language".into()))?,
                );
            }

            // Anti-detection may rewrite the header set wholesale.
            if let Some(ref mut anti) = guard.anti_detection {
                anti.prepare_request(url.host_str().unwrap_or(""), &mut anti_ctx);
                headers = anti_ctx.headers.clone();
            }

            // A forced proxy wins; otherwise rotate via the manager.
            // Either way, remember the choice for the pipeline context.
            if proxy.is_none() {
                let next = guard.proxy_manager.as_mut().and_then(|pm| pm.next_proxy());
                guard.current_proxy = next.clone();
                proxy = next;
            } else {
                guard.current_proxy = proxy.clone();
            }

            if let Some(ref mut timing) = guard.adaptive_timing {
                let request = TimingRequest::new(request_kind(method), body_size);
                delay = timing.calculate_delay(url.host_str().unwrap_or(""), &request);
            }
        }

        Ok((headers, anti_ctx, proxy, delay))
    }
858}
859
860fn request_kind(method: &Method) -> RequestKind {
861 match *method {
862 Method::GET => RequestKind::Get,
863 Method::POST => RequestKind::Post,
864 Method::PUT => RequestKind::Put,
865 Method::PATCH => RequestKind::Patch,
866 Method::DELETE => RequestKind::Delete,
867 Method::HEAD => RequestKind::Head,
868 Method::OPTIONS => RequestKind::Options,
869 _ => RequestKind::Other,
870 }
871}
872
873fn to_http_headers(profile: &UserAgentProfile) -> CloudScraperResult<HeaderMap> {
874 let mut headers = HeaderMap::new();
875 for (name, value) in &profile.headers {
876 let header_name = HeaderName::from_bytes(name.as_bytes())
877 .map_err(|_| CloudScraperError::InvalidHeader(name.clone()))?;
878 let header_value = HeaderValue::from_str(value)
879 .map_err(|_| CloudScraperError::InvalidHeader(name.clone()))?;
880 headers.insert(header_name, header_value);
881 }
882 Ok(headers)
883}
884
885fn to_reqwest_headers(headers: &HeaderMap) -> CloudScraperResult<reqwest::header::HeaderMap> {
886 let mut map = reqwest::header::HeaderMap::new();
887 for (name, value) in headers.iter() {
888 let header_name = reqwest::header::HeaderName::from_bytes(name.as_str().as_bytes())
889 .map_err(|_| CloudScraperError::InvalidHeader(name.to_string()))?;
890 let header_value = reqwest::header::HeaderValue::from_bytes(value.as_bytes())
891 .map_err(|_| CloudScraperError::InvalidHeader(name.to_string()))?;
892 map.insert(header_name, header_value);
893 }
894 Ok(map)
895}
896
897fn reqwest_to_http(headers: &reqwest::header::HeaderMap) -> CloudScraperResult<HeaderMap> {
898 let mut map = HeaderMap::new();
899 for (name, value) in headers.iter() {
900 let header_name = HeaderName::from_bytes(name.as_str().as_bytes())
901 .map_err(|_| CloudScraperError::InvalidHeader(name.to_string()))?;
902 let header_value = HeaderValue::from_bytes(value.as_bytes())
903 .map_err(|_| CloudScraperError::InvalidHeader(name.to_string()))?;
904 map.insert(header_name, header_value);
905 }
906 Ok(map)
907}