1use std::collections::{HashMap, HashSet};
30use std::time::Duration;
31
32use tracing::{debug, error, info, warn};
33
34use crate::config::{BrowserConfig, FetchParams, ResolvedFetchParams, StealthConfig, WaitState};
35use crate::engine::{build_launch_options, launch_playwright};
36use crate::error::{BrowserError, Result};
37use crate::intercept::should_block_request;
38use crate::page_pool::PagePool;
39use crate::response_factory;
40
41use scrapling_fetch::Response;
42
43async fn setup_page(
48 page: &playwright_rs::Page,
49 timeout_ms: f64,
50 extra_headers: &HashMap<String, String>,
51 disable_resources: bool,
52 blocked_domains: &HashSet<String>,
53) -> Result<()> {
54 page.set_default_timeout(timeout_ms).await;
55 page.set_default_navigation_timeout(timeout_ms).await;
56
57 if !extra_headers.is_empty() {
58 page.set_extra_http_headers(extra_headers.clone()).await?;
59 }
60
61 if disable_resources || !blocked_domains.is_empty() {
62 let disable = disable_resources;
63 let domains = blocked_domains.clone();
64
65 page.route("**/*", move |route| {
66 let domains = domains.clone();
67 async move {
68 let request = route.request();
69 let resource_type = request.resource_type();
70 let req_url = request.url();
71
72 if should_block_request(resource_type, req_url, disable, &domains) {
73 route.abort(Some("blockedbyclient")).await
74 } else {
75 route.continue_(None).await
76 }
77 }
78 })
79 .await?;
80 }
81
82 Ok(())
83}
84
85async fn wait_for_stability(
86 page: &playwright_rs::Page,
87 load_dom: bool,
88 network_idle: bool,
89) -> Result<()> {
90 let _ = page
91 .wait_for_load_state(Some(playwright_rs::WaitUntil::Load))
92 .await;
93
94 if load_dom {
95 let _ = page
96 .wait_for_load_state(Some(playwright_rs::WaitUntil::DomContentLoaded))
97 .await;
98 }
99
100 if network_idle {
101 let _ = page
102 .wait_for_load_state(Some(playwright_rs::WaitUntil::NetworkIdle))
103 .await;
104 }
105
106 Ok(())
107}
108
109async fn wait_for_selector(
110 page: &playwright_rs::Page,
111 selector: &str,
112 _state: WaitState,
113 timeout_ms: f64,
114) -> Result<()> {
115 let locator = page.locator(selector).await;
116 let wait_opts = playwright_rs::protocol::WaitForOptions::builder()
117 .timeout(timeout_ms)
118 .build();
119 locator
120 .wait_for(Some(wait_opts))
121 .await
122 .map_err(|e| BrowserError::Timeout(format!("wait_for_selector({selector}): {e}")))?;
123 Ok(())
124}
125
126async fn initialize_context(
127 context: &playwright_rs::BrowserContext,
128 config: &BrowserConfig,
129) -> Result<()> {
130 if let Some(ref script_path) = config.init_script {
131 let script = std::fs::read_to_string(script_path)
132 .map_err(|e| BrowserError::Config(format!("failed to read init_script: {e}")))?;
133 context.add_init_script(&script).await?;
134 }
135
136 if !config.cookies.is_empty() {
137 let pw_cookies: Vec<playwright_rs::protocol::Cookie> = config
138 .cookies
139 .iter()
140 .map(|c| playwright_rs::protocol::Cookie {
141 name: c.name.clone(),
142 value: c.value.clone(),
143 domain: c.domain.clone().unwrap_or_default(),
144 path: c.path.clone().unwrap_or_else(|| "/".into()),
145 expires: -1.0,
146 http_only: false,
147 secure: false,
148 same_site: None,
149 })
150 .collect();
151 context.add_cookies(&pw_cookies).await?;
152 }
153
154 Ok(())
155}
156
157fn make_goto_options(timeout_ms: f64, network_idle: bool) -> playwright_rs::GotoOptions {
158 let mut opts =
159 playwright_rs::GotoOptions::new().timeout(Duration::from_millis(timeout_ms as u64));
160 if network_idle {
161 opts = opts.wait_until(playwright_rs::WaitUntil::NetworkIdle);
162 }
163 opts
164}
165
166pub struct DynamicSession {
180 config: BrowserConfig,
181 playwright: Option<playwright_rs::Playwright>,
182 browser: Option<playwright_rs::Browser>,
183 context: Option<playwright_rs::BrowserContext>,
184 page_pool: PagePool,
185 is_alive: bool,
186}
187
188impl DynamicSession {
189 pub fn new(mut config: BrowserConfig) -> Result<Self> {
193 config.validate()?;
194 let max_pages = config.max_pages;
195 Ok(Self {
196 config,
197 playwright: None,
198 browser: None,
199 context: None,
200 page_pool: PagePool::new(max_pages),
201 is_alive: false,
202 })
203 }
204
205 pub async fn start(&mut self) -> Result<()> {
212 let pw = launch_playwright().await?;
213 let chromium = pw.chromium();
214 let launch_opts = build_launch_options(&self.config, false, &[]);
215
216 if let Some(ref cdp_url) = self.config.cdp_url {
217 let browser = chromium.connect_over_cdp(cdp_url, None).await?;
218 if !self.config.has_proxy_rotator() {
219 let ctx = browser.new_context().await?;
220 initialize_context(&ctx, &self.config).await?;
221 self.context = Some(ctx);
222 }
223 self.browser = Some(browser);
224 } else if self.config.has_proxy_rotator() {
225 let browser = chromium.launch_with_options(launch_opts).await?;
226 self.browser = Some(browser);
227 } else {
228 let browser = chromium.launch_with_options(launch_opts).await?;
229 let ctx = browser.new_context().await?;
230 initialize_context(&ctx, &self.config).await?;
231 self.context = Some(ctx);
232 self.browser = Some(browser);
233 }
234
235 self.playwright = Some(pw);
236 self.is_alive = true;
237 info!("DynamicSession started");
238 Ok(())
239 }
240
241 pub async fn fetch(&self, url: &str, params: Option<FetchParams>) -> Result<Response> {
247 if !self.is_alive {
248 return Err(BrowserError::Config("session not started".into()));
249 }
250
251 let resolved = params.unwrap_or_default().merge_with_config(&self.config);
252 let mut last_error = None;
253
254 for attempt in 0..self.config.retries {
255 match self.do_fetch(url, &resolved).await {
256 Ok(response) => return Ok(response),
257 Err(e) => {
258 if attempt < self.config.retries - 1 {
259 warn!(attempt = attempt + 1, error = %e, "fetch failed, retrying");
260 tokio::time::sleep(Duration::from_secs_f64(self.config.retry_delay_secs))
261 .await;
262 } else {
263 error!(attempts = self.config.retries, "all retries exhausted");
264 }
265 last_error = Some(e);
266 }
267 }
268 }
269
270 Err(last_error.unwrap_or(BrowserError::Other("unknown error".into())))
271 }
272
273 async fn do_fetch(&self, url: &str, params: &ResolvedFetchParams) -> Result<Response> {
274 let page = self.get_page().await?;
275
276 setup_page(
277 &page,
278 params.timeout_ms,
279 ¶ms.extra_headers,
280 params.disable_resources,
281 ¶ms.blocked_domains,
282 )
283 .await?;
284
285 if let Some(ref cb) = self.config.page_setup {
286 cb(page.clone()).await?;
287 }
288
289 let goto_opts = make_goto_options(params.timeout_ms, params.network_idle);
290
291 debug!(url = %url, "navigating");
292 let nav_response = page.goto(url, Some(goto_opts)).await?;
293
294 wait_for_stability(&page, params.load_dom, params.network_idle).await?;
295
296 if let Some(ref cb) = self.config.page_action {
297 cb(page.clone()).await?;
298 }
299
300 if let Some(ref selector) = params.wait_selector {
301 wait_for_selector(
302 &page,
303 selector,
304 params.wait_selector_state,
305 params.timeout_ms,
306 )
307 .await?;
308 }
309
310 if params.wait_ms > 0 {
311 tokio::time::sleep(Duration::from_millis(params.wait_ms)).await;
312 }
313
314 let response = response_factory::from_browser_page(
315 &page,
316 nav_response.as_ref(),
317 nav_response.as_ref(),
318 HashMap::new(),
319 Vec::new(),
320 )
321 .await?;
322
323 page.close().await?;
324 info!(status = response.status, url = url, "fetch complete");
325 Ok(response)
326 }
327
328 async fn get_page(&self) -> Result<playwright_rs::Page> {
329 if let Some(ref ctx) = self.context {
330 ctx.new_page().await.map_err(Into::into)
331 } else if let Some(ref browser) = self.browser {
332 let ctx = browser.new_context().await?;
333 ctx.new_page().await.map_err(Into::into)
334 } else {
335 Err(BrowserError::Config("no browser available".into()))
336 }
337 }
338
339 pub async fn close(&mut self) -> Result<()> {
345 if let Some(ctx) = self.context.take() {
346 let _ = ctx.close().await;
347 }
348 if let Some(browser) = self.browser.take() {
349 let _ = browser.close().await;
350 }
351 self.playwright = None;
352 self.is_alive = false;
353 info!("DynamicSession closed");
354 Ok(())
355 }
356
357 pub fn is_alive(&self) -> bool {
361 self.is_alive
362 }
363
364 pub fn pool_stats(&self) -> crate::page_pool::PoolStats {
367 self.page_pool.stats()
368 }
369}
370
371pub struct StealthySession {
384 config: StealthConfig,
385 playwright: Option<playwright_rs::Playwright>,
386 browser: Option<playwright_rs::Browser>,
387 context: Option<playwright_rs::BrowserContext>,
388 page_pool: PagePool,
389 is_alive: bool,
390}
391
392impl StealthySession {
393 pub fn new(mut config: StealthConfig) -> Result<Self> {
397 config.validate()?;
398 let max_pages = config.base.max_pages;
399 Ok(Self {
400 config,
401 playwright: None,
402 browser: None,
403 context: None,
404 page_pool: PagePool::new(max_pages),
405 is_alive: false,
406 })
407 }
408
409 pub async fn start(&mut self) -> Result<()> {
416 let pw = launch_playwright().await?;
417 let chromium = pw.chromium();
418 let extra = self.config.extra_stealth_args();
419 let launch_opts = build_launch_options(&self.config.base, true, &extra);
420
421 if let Some(ref cdp_url) = self.config.base.cdp_url {
422 let browser = chromium.connect_over_cdp(cdp_url, None).await?;
423 if !self.config.base.has_proxy_rotator() {
424 let ctx = browser.new_context().await?;
425 initialize_context(&ctx, &self.config.base).await?;
426 self.context = Some(ctx);
427 }
428 self.browser = Some(browser);
429 } else if self.config.base.has_proxy_rotator() {
430 let browser = chromium.launch_with_options(launch_opts).await?;
431 self.browser = Some(browser);
432 } else {
433 let browser = chromium.launch_with_options(launch_opts).await?;
434 let ctx = browser.new_context().await?;
435 initialize_context(&ctx, &self.config.base).await?;
436 self.context = Some(ctx);
437 self.browser = Some(browser);
438 }
439
440 self.playwright = Some(pw);
441 self.is_alive = true;
442 info!("StealthySession started");
443 Ok(())
444 }
445
446 pub async fn fetch(&self, url: &str, params: Option<FetchParams>) -> Result<Response> {
452 if !self.is_alive {
453 return Err(BrowserError::Config("session not started".into()));
454 }
455
456 let mut resolved = params
457 .unwrap_or_default()
458 .merge_with_config(&self.config.base);
459 if self.config.solve_cloudflare {
460 resolved.solve_cloudflare = true;
461 }
462
463 let mut last_error = None;
464
465 for attempt in 0..self.config.base.retries {
466 match self.do_fetch(url, &resolved).await {
467 Ok(response) => return Ok(response),
468 Err(e) => {
469 if attempt < self.config.base.retries - 1 {
470 warn!(attempt = attempt + 1, error = %e, "stealth fetch failed, retrying");
471 tokio::time::sleep(Duration::from_secs_f64(
472 self.config.base.retry_delay_secs,
473 ))
474 .await;
475 }
476 last_error = Some(e);
477 }
478 }
479 }
480
481 Err(last_error.unwrap_or(BrowserError::Other("unknown error".into())))
482 }
483
484 async fn do_fetch(&self, url: &str, params: &ResolvedFetchParams) -> Result<Response> {
485 let page = self.get_page().await?;
486
487 setup_page(
488 &page,
489 params.timeout_ms,
490 ¶ms.extra_headers,
491 params.disable_resources,
492 ¶ms.blocked_domains,
493 )
494 .await?;
495
496 if let Some(ref cb) = self.config.base.page_setup {
497 cb(page.clone()).await?;
498 }
499
500 let goto_opts = make_goto_options(params.timeout_ms, params.network_idle);
501
502 debug!(url = %url, "stealth navigating");
503 let nav_response = page.goto(url, Some(goto_opts)).await?;
504
505 wait_for_stability(&page, params.load_dom, params.network_idle).await?;
506
507 if params.solve_cloudflare {
508 self.cloudflare_solver(&page).await?;
509 }
510
511 if let Some(ref cb) = self.config.base.page_action {
512 cb(page.clone()).await?;
513 }
514
515 if let Some(ref selector) = params.wait_selector {
516 wait_for_selector(
517 &page,
518 selector,
519 params.wait_selector_state,
520 params.timeout_ms,
521 )
522 .await?;
523 }
524
525 if params.wait_ms > 0 {
526 tokio::time::sleep(Duration::from_millis(params.wait_ms)).await;
527 }
528
529 let response = response_factory::from_browser_page(
530 &page,
531 nav_response.as_ref(),
532 nav_response.as_ref(),
533 HashMap::new(),
534 Vec::new(),
535 )
536 .await?;
537
538 page.close().await?;
539 info!(
540 status = response.status,
541 url = url,
542 "stealth fetch complete"
543 );
544 Ok(response)
545 }
546
547 async fn get_page(&self) -> Result<playwright_rs::Page> {
548 if let Some(ref ctx) = self.context {
549 ctx.new_page().await.map_err(Into::into)
550 } else if let Some(ref browser) = self.browser {
551 let ctx = browser.new_context().await?;
552 ctx.new_page().await.map_err(Into::into)
553 } else {
554 Err(BrowserError::Config("no browser available".into()))
555 }
556 }
557
558 async fn cloudflare_solver(&self, page: &playwright_rs::Page) -> Result<()> {
565 let _ = page
566 .wait_for_load_state(Some(playwright_rs::WaitUntil::NetworkIdle))
567 .await;
568 tokio::time::sleep(Duration::from_secs(2)).await;
569
570 let content = page
571 .content()
572 .await
573 .map_err(|e| BrowserError::Navigation(format!("cloudflare solver: {e}")))?;
574
575 let Some(challenge) = detect_cloudflare_challenge(&content) else {
576 debug!("no Cloudflare challenge detected");
577 return Ok(());
578 };
579
580 info!(challenge = %challenge, "Cloudflare challenge detected");
581
582 match challenge.as_str() {
583 "non-interactive" => {
584 for _ in 0..30 {
585 tokio::time::sleep(Duration::from_secs(2)).await;
586 let title = page.title().await.unwrap_or_default();
587 if !title.contains("Just a moment") {
588 debug!("Cloudflare non-interactive challenge resolved");
589 return Ok(());
590 }
591 }
592 warn!("Cloudflare non-interactive challenge did not resolve");
593 }
594 "managed" | "interactive" | "embedded" => {
595 let selectors = [
596 "iframe[src*='challenges.cloudflare.com']",
597 "#turnstile-wrapper iframe",
598 ".cf-turnstile iframe",
599 ];
600
601 for _ in 0..10 {
602 for selector in &selectors {
603 let locator = page.locator(selector).await;
604 if let Ok(count) = locator.count().await {
605 if count > 0 {
606 debug!(selector, "found Cloudflare iframe, clicking");
607 let _ = locator.first().click(None).await;
608 tokio::time::sleep(Duration::from_secs(3)).await;
609
610 let new_content = page.content().await.unwrap_or_default();
611 if detect_cloudflare_challenge(&new_content).is_none() {
612 info!("Cloudflare challenge solved");
613 return Ok(());
614 }
615 }
616 }
617 }
618 tokio::time::sleep(Duration::from_secs(2)).await;
619 }
620 warn!("Cloudflare challenge could not be solved after retries");
621 }
622 _ => {}
623 }
624
625 Ok(())
626 }
627
628 pub async fn close(&mut self) -> Result<()> {
634 if let Some(ctx) = self.context.take() {
635 let _ = ctx.close().await;
636 }
637 if let Some(browser) = self.browser.take() {
638 let _ = browser.close().await;
639 }
640 self.playwright = None;
641 self.is_alive = false;
642 info!("StealthySession closed");
643 Ok(())
644 }
645
646 pub fn is_alive(&self) -> bool {
650 self.is_alive
651 }
652
653 pub fn pool_stats(&self) -> crate::page_pool::PoolStats {
656 self.page_pool.stats()
657 }
658}
659
/// Classifies the Cloudflare challenge present in a page's HTML, if any.
///
/// The content is searched for a fixed list of markers, in order, and the
/// label of the first marker found is returned:
///
/// | marker                                   | label               |
/// |------------------------------------------|---------------------|
/// | `cType: 'non-interactive'`               | `"non-interactive"` |
/// | `cType: 'managed'`                       | `"managed"`         |
/// | `cType: 'interactive'`                   | `"interactive"`     |
/// | `challenges.cloudflare.com/turnstile/v`  | `"embedded"`        |
///
/// Returns `None` when no marker occurs in `content`.
fn detect_cloudflare_challenge(content: &str) -> Option<String> {
    // (substring to look for, label to report); earlier entries win.
    const MARKERS: [(&str, &str); 4] = [
        ("cType: 'non-interactive'", "non-interactive"),
        ("cType: 'managed'", "managed"),
        ("cType: 'interactive'", "interactive"),
        ("challenges.cloudflare.com/turnstile/v", "embedded"),
    ];

    MARKERS
        .iter()
        .find(|(needle, _)| content.contains(*needle))
        .map(|&(_, label)| label.to_owned())
}