1pub mod blocklist;
37pub mod breaker;
38#[cfg(feature = "auto-browser")]
39pub mod browser;
40#[cfg(feature = "cdp")]
41pub mod browser_pool;
42#[cfg(feature = "camoufox")]
43pub mod camoufox;
44#[cfg(feature = "cdp")]
45pub mod cdp;
46#[cfg(feature = "cdp")]
47pub mod cdp_conn;
48pub mod detector;
49#[cfg(feature = "cdp")]
50pub mod health_telemetry;
51pub mod host_limiter;
52pub mod http_only;
53pub mod preference;
54pub mod traits;
55
56use crate::breaker::{
57 AttemptContext, BreakerOutcome, BreakerRegistry, Permit, ProbeGuard, classify_outcome,
58};
59use crate::preference::HostPreferences;
60use crw_core::config::{BUILTIN_UA_POOL, RendererConfig, RendererMode, StealthConfig};
61use crw_core::error::{CrwError, CrwResult};
62use crw_core::metrics::metrics;
63use crw_core::types::{
64 FailoverErrorKind, FetchResult, RenderDecision, RendererKind, resolve_render_js,
65};
66use std::collections::HashMap;
67use std::sync::Arc;
68use std::time::Duration;
69use traits::PageFetcher;
70
tokio::task_local! {
    /// Task-local slot holding an optional country string.
    ///
    /// NOTE(review): nothing in the visible part of this file reads this slot;
    /// presumably it carries a per-request proxy/egress country — confirm
    /// against the code that scopes and reads it.
    pub static REQUEST_COUNTRY: Option<String>;
}
79
tokio::task_local! {
    /// Task-local slot holding the proxy entry selected for the current task, if any.
    ///
    /// Read by `FallbackRenderer::http_fetcher_for_request` to pick a
    /// proxy-specific HTTP client, and by `FallbackRenderer::fetch_with_js` to
    /// drop the `lightpanda` renderer when a proxy is active.
    pub static REQUEST_PROXY: Option<Arc<crw_core::ProxyEntry>>;
}
88
/// Screenshot options for a single request, carried through the
/// [`REQUEST_SCREENSHOT`] task-local.
#[derive(Debug, Clone, Copy)]
pub struct ScreenshotReq {
    /// NOTE(review): presumably requests a full-page capture rather than the
    /// viewport only; the consumer of this flag is not in this part of the file.
    pub full_page: bool,
}
99
tokio::task_local! {
    /// Task-local slot holding the screenshot request for the current task, if any.
    ///
    /// Read through [`screenshot_requested`] and [`current_screenshot_req`].
    pub static REQUEST_SCREENSHOT: Option<ScreenshotReq>;
}
108
109pub fn screenshot_requested() -> bool {
112 REQUEST_SCREENSHOT
113 .try_with(|s| s.is_some())
114 .unwrap_or(false)
115}
116
117pub fn current_screenshot_req() -> Option<ScreenshotReq> {
119 REQUEST_SCREENSHOT.try_with(|s| *s).ok().flatten()
120}
121
122fn renderer_kind_for(name: &str) -> Option<RendererKind> {
126 match name {
127 "http" | "http_only_fallback" => Some(RendererKind::Http),
128 "lightpanda" => Some(RendererKind::Lightpanda),
129 "chrome" => Some(RendererKind::Chrome),
130 "chrome_proxy" => Some(RendererKind::ChromeProxy),
131 "camoufox" => Some(RendererKind::Camoufox),
132 _ => None,
133 }
134}
135
136fn classify_renderer_error(err: &CrwError) -> FailoverErrorKind {
146 match err {
147 CrwError::Timeout(_) => FailoverErrorKind::LightpandaTimeout,
148 CrwError::TargetUnreachable(_) => FailoverErrorKind::NetworkError,
149 CrwError::HttpError(_) => FailoverErrorKind::NetworkError,
150 CrwError::RendererError(_) => FailoverErrorKind::LightpandaCrash,
153 _ => FailoverErrorKind::Other,
154 }
155}
156
157fn tier_timeouts_from(
160 config: &RendererConfig,
161) -> std::collections::HashMap<RendererKind, std::time::Duration> {
162 let mut m = std::collections::HashMap::new();
163 m.insert(
164 RendererKind::Http,
165 std::time::Duration::from_millis(config.http_timeout()),
166 );
167 m.insert(
168 RendererKind::Lightpanda,
169 std::time::Duration::from_millis(config.lightpanda_timeout()),
170 );
171 m.insert(
172 RendererKind::Chrome,
173 std::time::Duration::from_millis(config.chrome_timeout()),
174 );
175 m.insert(
176 RendererKind::ChromeProxy,
177 std::time::Duration::from_millis(config.chrome_proxy_timeout()),
178 );
179 m.insert(
184 RendererKind::Camoufox,
185 std::time::Duration::from_millis(config.camoufox_timeout()),
186 );
187 m
188}
189
190fn credit_for(kind: RendererKind) -> u32 {
193 match kind {
194 RendererKind::Http => 1,
195 RendererKind::Lightpanda => 1,
196 RendererKind::Chrome => 2,
197 RendererKind::ChromeProxy => 2,
200 RendererKind::Camoufox => 3,
206 }
207}
208
209fn stamp_http_decision(result: &mut FetchResult, requested_renderer: Option<&str>) {
213 if result.render_decision.is_some() {
214 return;
215 }
216 let kind = RendererKind::Http;
217 result.credit_cost = credit_for(kind);
218 result.render_decision = Some(match requested_renderer {
219 Some("http") => RenderDecision::UserPinned { renderer: kind },
220 _ => RenderDecision::AutoDefault { chosen: kind },
221 });
222 metrics()
224 .render_route_decision_total
225 .with_label_values(&[kind.as_str(), "success"])
226 .inc();
227}
228
229fn host_of(url: &str) -> String {
231 url::Url::parse(url)
232 .ok()
233 .and_then(|u| u.host_str().map(|h| h.to_string()))
234 .unwrap_or_default()
235}
236
237fn pick_ua<'a>(default_ua: &'a str, stealth: &'a StealthConfig) -> String {
239 if stealth.enabled {
240 let pool: &[&str] = if stealth.user_agents.is_empty() {
241 BUILTIN_UA_POOL
242 } else {
243 return stealth.user_agents[rand::random_range(0..stealth.user_agents.len())].clone();
245 };
246 pool[rand::random_range(0..pool.len())].to_string()
247 } else {
248 default_ua.to_string()
249 }
250}
251
/// Classification of one JS-rendered fetch result, as produced by
/// `FallbackRenderer::classify_js_attempt`.
// NOTE(review): `dead_code` is allowed because several fields (e.g. `text_len`,
// `failed_render`, `antibot`) are not read in the visible part of this file;
// confirm whether later code or logging still needs them.
#[allow(dead_code)]
struct JsAttemptClass {
    /// Value returned by `html_body_text_len` for the result's HTML.
    text_len: usize,
    /// `detector::looks_like_loading_placeholder` verdict for the HTML.
    is_placeholder: bool,
    /// `detector::looks_like_failed_render` verdict for the HTML, if any.
    failed_render: Option<detector::FailedRenderReason>,
    /// `detector::looks_like_generic_bot_wall` verdict for the HTML.
    is_bot_wall: bool,
    /// `detector::looks_like_vendor_block` verdict for the HTML, if any.
    vendor_block: Option<&'static str>,
    /// True when the status code is one of
    /// 401/403/404/405/406/410/412/429/451/500/503.
    is_status_blocked: bool,
    /// Antibot classification of the response; the "none" result when antibot
    /// classification is disabled in the configuration.
    antibot: crw_extract::antibot::AntibotResult,
    /// True when `escalate_in_failover` is enabled and the antibot signal
    /// reports a block.
    antibot_blocked: bool,
    /// True for status 401/403/429/503 or 520..=530, a bot wall, a vendor
    /// block, or a blocked antibot signal (regardless of `escalate_in_failover`).
    hard_block: bool,
    /// True when the text is at least `MIN_RENDERED_TEXT_LEN` long and none of
    /// the placeholder / failed-render / bot-wall / vendor-block /
    /// blocked-status / antibot-blocked conditions hold.
    acceptable: bool,
}
270
/// Result of a hedged Lightpanda + Chrome attempt (see `FallbackRenderer::try_hedge`).
enum HedgeOutcome {
    /// One of the two renderers produced a result classified as acceptable.
    Accepted(FetchResult),
    /// Neither result was acceptable. Carries the successful fetch with the
    /// longest HTML, plus a flag that is true when any classified attempt was
    /// a hard block.
    Thin(FetchResult, bool),
}
280
/// Fetcher that combines a plain HTTP client with zero or more JS renderers and
/// decides per request which of them serves the page.
pub struct FallbackRenderer {
    /// Default HTTP fetcher, used when no per-request proxy is set.
    http: Arc<dyn PageFetcher>,
    /// JS-capable renderers in the order they were registered by `new`
    /// (lightpanda, playwright, chrome, chrome_proxy, camoufox — each only
    /// when configured).
    js_renderers: Vec<Arc<dyn PageFetcher>>,
    /// Configured default for `render_js`, combined with the per-request value
    /// via `resolve_render_js`.
    render_js_default: Option<bool>,
    /// When true, `fetch` logs a timing line under the `latency_breakdown` target.
    latency_breakdown: bool,
    /// Copied from the configuration. NOTE(review): not read in the visible
    /// part of this file.
    auto_egress_escalation: bool,
    /// Copied from the configuration. NOTE(review): presumably gates
    /// `try_hedge`; the call site is not in the visible part of this file.
    chrome_hedge: bool,
    /// Semaphore created with `max(pool_size / 2, 1)` permits. NOTE(review):
    /// presumably bounds concurrent hedges; not acquired in the visible code.
    hedge_sem: Arc<tokio::sync::Semaphore>,
    /// Per-host preference store; successes and failures are recorded into it.
    preferences: Arc<HostPreferences>,
    /// Per-host, per-renderer circuit breakers.
    breakers: Arc<BreakerRegistry>,
    /// Timeout per renderer tier, built by `tier_timeouts_from`.
    tier_timeouts: std::collections::HashMap<RendererKind, std::time::Duration>,
    /// Rate passed to `host_limiter::acquire`; `new` sets 0.0 and
    /// `with_host_limits` overrides it.
    requests_per_second: f64,
    /// Concurrency cap passed to `host_limiter::acquire`; `new` sets 1 and
    /// `with_host_limits` overrides it.
    per_host_max_concurrent: u32,
    /// Antibot settings; `enabled` and `escalate_in_failover` are consulted
    /// when classifying JS results.
    antibot: crw_core::config::AntibotConfig,
    /// Optional proxy rotator installed by `with_proxy_rotator`.
    proxy_rotator: Option<Arc<crw_core::ProxyRotator>>,
    /// User-Agent used when building per-proxy HTTP fetchers.
    http_ua: String,
    /// Stealth-header flag used when building per-proxy HTTP fetchers.
    http_inject_stealth: bool,
    /// HTTP timeout in milliseconds used when building per-proxy HTTP fetchers.
    http_timeout_ms: u64,
    /// Cache of per-proxy HTTP fetchers keyed by the proxy entry's raw string.
    proxy_client_cache: std::sync::Mutex<std::collections::HashMap<String, Arc<dyn PageFetcher>>>,
    /// Handle to the chrome renderer's browser-context pool, when one exists;
    /// used by `shutdown_chrome_pool`.
    #[cfg(feature = "cdp")]
    chrome_pool: Option<Arc<browser_pool::BrowserContextPool<cdp_conn::CdpConnection>>>,
    /// Whether the camoufox renderer is kept in the renderer list when no
    /// specific renderer is requested.
    #[cfg(feature = "camoufox")]
    camoufox_in_auto: bool,
}
342
343impl std::fmt::Debug for FallbackRenderer {
344 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
345 f.debug_struct("FallbackRenderer")
346 .field("http", &self.http.name())
347 .field(
348 "js_renderers",
349 &self
350 .js_renderers
351 .iter()
352 .map(|r| r.name())
353 .collect::<Vec<_>>(),
354 )
355 .field("render_js_default", &self.render_js_default)
356 .finish()
357 }
358}
359
360impl FallbackRenderer {
    /// Builds a `FallbackRenderer` from configuration.
    ///
    /// * `config` — renderer configuration (mode, per-tier endpoints, timeouts, pool settings).
    /// * `user_agent` — default User-Agent; may be replaced by a stealth pick (see `pick_ua`).
    /// * `proxy` — optional proxy string for the default HTTP fetcher; validated up front.
    /// * `stealth` — stealth settings (UA rotation, header injection).
    ///
    /// # Errors
    ///
    /// Returns `CrwError::ConfigError` when the proxy string does not parse,
    /// when the selected mode needs a cargo feature this build lacks, or when
    /// the selected mode names a renderer whose endpoint is not configured.
    pub fn new(
        config: &RendererConfig,
        user_agent: &str,
        proxy: Option<&str>,
        stealth: &StealthConfig,
    ) -> CrwResult<Self> {
        let effective_ua = pick_ua(user_agent, stealth);
        let inject_headers = stealth.enabled && stealth.inject_headers;
        let http_timeout_ms = config.http_timeout();
        // Validate the proxy string before building the client so a malformed
        // value is reported as a configuration error.
        if let Some(p) = proxy {
            crw_core::ProxyEntry::parse(p).map_err(CrwError::ConfigError)?;
        }
        let http = Arc::new(http_only::HttpFetcher::with_timeout(
            &effective_ua,
            proxy,
            inject_headers,
            std::time::Duration::from_millis(http_timeout_ms),
        )) as Arc<dyn PageFetcher>;

        // Builds without the `cdp` feature cannot honour the CDP-backed modes.
        #[cfg(not(feature = "cdp"))]
        if matches!(
            config.mode,
            RendererMode::Lightpanda | RendererMode::Chrome | RendererMode::Playwright
        ) {
            return Err(CrwError::ConfigError(format!(
                "renderer.mode = {:?} requires the 'cdp' feature, but this build was \
                 compiled without it. Rebuild with --features cdp or set mode = \"auto\"/\"none\".",
                config.mode
            )));
        }

        // Likewise for camoufox mode without the `camoufox` feature.
        #[cfg(not(feature = "camoufox"))]
        if matches!(config.mode, RendererMode::Camoufox) {
            return Err(CrwError::ConfigError(
                "renderer.mode = \"camoufox\" requires the 'camoufox' feature, but this build \
                 was compiled without it. Rebuild with --features camoufox or set mode = \
                 \"auto\"/\"none\"."
                    .into(),
            ));
        }

        // `mut` is unused when neither renderer feature is compiled in.
        #[allow(unused_mut)]
        let mut js_renderers: Vec<Arc<dyn PageFetcher>> = Vec::new();

        // mode = none: HTTP only, no JS renderers are registered at all.
        if matches!(config.mode, RendererMode::None) {
            if config.render_js_default == Some(true) {
                tracing::warn!(
                    "render_js_default=true has no effect with mode=none; \
                     requests will fall back to HTTP via http_only_fallback"
                );
            }
            return Ok(Self {
                http,
                js_renderers,
                render_js_default: config.render_js_default,
                latency_breakdown: config.latency_breakdown,
                auto_egress_escalation: config.auto_egress_escalation,
                chrome_hedge: config.chrome_hedge,
                // Half the pool size, but never fewer than one permit.
                hedge_sem: Arc::new(tokio::sync::Semaphore::new((config.pool_size / 2).max(1))),
                preferences: Arc::new(HostPreferences::with_defaults()),
                breakers: Arc::new(BreakerRegistry::with_defaults()),
                tier_timeouts: tier_timeouts_from(config),
                // Host-limit defaults; `with_host_limits` overrides them.
                requests_per_second: 0.0,
                per_host_max_concurrent: 1,
                antibot: config.antibot.clone(),
                proxy_rotator: None,
                http_ua: effective_ua.clone(),
                http_inject_stealth: inject_headers,
                http_timeout_ms,
                proxy_client_cache: std::sync::Mutex::new(std::collections::HashMap::new()),
                #[cfg(feature = "cdp")]
                chrome_pool: None,
                #[cfg(feature = "camoufox")]
                camoufox_in_auto: false,
            });
        }

        // Filled in below if the chrome renderer ends up with a context pool.
        #[cfg(feature = "cdp")]
        let mut chrome_pool: Option<
            Arc<browser_pool::BrowserContextPool<cdp_conn::CdpConnection>>,
        > = None;

        #[cfg(feature = "cdp")]
        {
            // A tier is wanted in auto mode, or when the mode names it exactly.
            let want = |m: RendererMode| -> bool {
                matches!(config.mode, RendererMode::Auto) || config.mode == m
            };

            if want(RendererMode::Lightpanda) {
                if let Some(lp) = &config.lightpanda {
                    js_renderers.push(Arc::new(
                        cdp::CdpRenderer::new(
                            "lightpanda",
                            &lp.ws_url,
                            config.lightpanda_timeout(),
                            config.pool_size,
                        )
                        .with_user_agent(&effective_ua),
                    ));
                } else if matches!(config.mode, RendererMode::Lightpanda) {
                    // Missing endpoint is only fatal when this tier was selected explicitly.
                    return Err(CrwError::ConfigError(
                        "renderer.mode = \"lightpanda\" but [renderer.lightpanda] ws_url is not \
                         configured"
                            .into(),
                    ));
                }
            }
            if want(RendererMode::Playwright) {
                if let Some(pw) = &config.playwright {
                    // The playwright tier uses the chrome timeout.
                    js_renderers.push(Arc::new(
                        cdp::CdpRenderer::new(
                            "playwright",
                            &pw.ws_url,
                            config.chrome_timeout(),
                            config.pool_size,
                        )
                        .with_user_agent(&effective_ua),
                    ));
                } else if matches!(config.mode, RendererMode::Playwright) {
                    return Err(CrwError::ConfigError(
                        "renderer.mode = \"playwright\" but [renderer.playwright] ws_url is not \
                         configured"
                            .into(),
                    ));
                }
            }
            if want(RendererMode::Chrome) {
                if let Some(ch) = &config.chrome {
                    let blocklist = blocklist::Blocklist::defaults()
                        .with_stylesheets(config.chrome_intercept_stylesheets);
                    let mut renderer = cdp::CdpRenderer::new(
                        "chrome",
                        &ch.ws_url,
                        config.chrome_timeout(),
                        config.pool_size,
                    )
                    .with_user_agent(&effective_ua)
                    .with_nav_budget(config.chrome_nav_budget_ms)
                    .with_challenge_retries(
                        config
                            .chrome_challenge_max_retries
                            .unwrap_or(cdp::CHALLENGE_MAX_RETRIES),
                    )
                    .with_spa_selector_max(
                        config
                            .chrome_spa_selector_max_ms
                            .unwrap_or(cdp::SPA_SELECTOR_MAX_MS),
                    )
                    .with_fast_ready(config.chrome_fast_ready)
                    .with_interception(
                        config.chrome_intercept_resources,
                        blocklist,
                        config.chrome_host_intercept_disable.clone(),
                    );

                    if config.chrome_context_pool_enabled {
                        match config.chrome_backend {
                            crw_core::config::ChromeBackend::Vanilla => {
                                let pcfg = &config.chrome_pool;
                                // Default pool size: half the available
                                // parallelism (assumed 2 when unknown), at least 2.
                                let size = pcfg.size.unwrap_or_else(|| {
                                    let n = std::thread::available_parallelism()
                                        .map(|p| p.get())
                                        .unwrap_or(2);
                                    std::cmp::max(2, n / 2)
                                });
                                renderer = renderer.with_pool(browser_pool::PoolCfg {
                                    size,
                                    recycle_after_navs: pcfg.recycle_after_navs,
                                    idle_timeout: std::time::Duration::from_secs(
                                        pcfg.idle_timeout_secs,
                                    ),
                                    health_check_after: std::time::Duration::from_secs(
                                        pcfg.health_check_secs,
                                    ),
                                    shutdown_drain: std::time::Duration::from_secs(
                                        pcfg.shutdown_drain_secs,
                                    ),
                                    // Fixed per-operation timeouts (not configurable here).
                                    close_target_timeout: std::time::Duration::from_secs(2),
                                    dispose_ctx_timeout: std::time::Duration::from_secs(1),
                                    create_ctx_timeout: std::time::Duration::from_secs(1),
                                });
                                tracing::info!(
                                    pool_size = size,
                                    "chrome browser-context pool enabled"
                                );
                            }
                            crw_core::config::ChromeBackend::Browserless => {
                                // Pool requested but not supported on this backend:
                                // warn and continue without a pool.
                                tracing::warn!(
                                    "chrome_context_pool_enabled = true but \
                                     chrome_backend = browserless — pool unsupported on \
                                     this backend in v1, falling back to legacy path"
                                );
                            }
                        }
                    }
                    // Keep a handle to the pool (if any) for `shutdown_chrome_pool`.
                    chrome_pool = renderer.pool();
                    js_renderers.push(Arc::new(renderer));
                } else if matches!(config.mode, RendererMode::Chrome) {
                    return Err(CrwError::ConfigError(
                        "renderer.mode = \"chrome\" but [renderer.chrome] ws_url is not configured"
                            .into(),
                    ));
                }
                // The chrome_proxy tier is registered only when the chrome tier
                // is wanted and its ws_url is non-blank. It mirrors the chrome
                // renderer's settings but uses the chrome_proxy timeout and no
                // context pool.
                if let Some(cp) = config
                    .chrome_proxy
                    .as_ref()
                    .filter(|c| !c.ws_url.trim().is_empty())
                {
                    let blocklist = blocklist::Blocklist::defaults()
                        .with_stylesheets(config.chrome_intercept_stylesheets);
                    let mut renderer = cdp::CdpRenderer::new(
                        "chrome_proxy",
                        &cp.ws_url,
                        config.chrome_proxy_timeout(),
                        config.pool_size,
                    )
                    .with_user_agent(&effective_ua)
                    .with_nav_budget(config.chrome_nav_budget_ms)
                    .with_challenge_retries(
                        config
                            .chrome_challenge_max_retries
                            .unwrap_or(cdp::CHALLENGE_MAX_RETRIES),
                    )
                    .with_spa_selector_max(
                        config
                            .chrome_spa_selector_max_ms
                            .unwrap_or(cdp::SPA_SELECTOR_MAX_MS),
                    )
                    .with_fast_ready(config.chrome_fast_ready)
                    .with_interception(
                        config.chrome_intercept_resources,
                        blocklist,
                        config.chrome_host_intercept_disable.clone(),
                    );
                    // Proxy auth is attached only when both user and password are set.
                    if let (Some(u), Some(p)) = (&config.proxy_base_user, &config.proxy_base_pass) {
                        renderer = renderer.with_proxy_auth_base(
                            u.clone(),
                            p.clone(),
                            config.proxy_default_country.clone(),
                        );
                    }
                    tracing::info!(
                        ws_url = %cp.ws_url,
                        proxy_auth = config.proxy_base_user.is_some(),
                        default_country = ?config.proxy_default_country,
                        "chrome_proxy tier enabled"
                    );
                    js_renderers.push(Arc::new(renderer));
                }
            }
        }

        // Camoufox is registered whenever a non-blank base_url is configured,
        // independent of the mode; a missing base_url is fatal only in camoufox mode.
        #[cfg(feature = "camoufox")]
        {
            if let Some(cf) = config
                .camoufox
                .as_ref()
                .filter(|c| !c.base_url.trim().is_empty())
            {
                js_renderers.push(Arc::new(camoufox::CamoufoxRenderer::new(
                    "camoufox",
                    &cf.base_url,
                    &cf.api_key,
                    config.camoufox_timeout(),
                )) as Arc<dyn PageFetcher>);
                tracing::info!(
                    base_url = %cf.base_url,
                    include_in_auto = cf.include_in_auto,
                    "camoufox tier enabled"
                );
            } else if matches!(config.mode, RendererMode::Camoufox) {
                return Err(CrwError::ConfigError(
                    "renderer.mode = \"camoufox\" but [renderer.camoufox] base_url is not configured"
                        .into(),
                ));
            }
        }

        // NOTE(review): by its name this starts the health telemetry task at
        // most once per process — confirm in `health_telemetry`.
        #[cfg(feature = "cdp")]
        health_telemetry::spawn_once();

        if config.render_js_default == Some(true) && js_renderers.is_empty() {
            tracing::warn!(
                "render_js_default=true but no JS renderer is available; \
                 requests will fall back to HTTP via http_only_fallback"
            );
        }

        Ok(Self {
            http,
            js_renderers,
            render_js_default: config.render_js_default,
            latency_breakdown: config.latency_breakdown,
            auto_egress_escalation: config.auto_egress_escalation,
            chrome_hedge: config.chrome_hedge,
            // Half the pool size, but never fewer than one permit.
            hedge_sem: Arc::new(tokio::sync::Semaphore::new((config.pool_size / 2).max(1))),
            preferences: Arc::new(HostPreferences::with_defaults()),
            breakers: Arc::new(BreakerRegistry::with_defaults()),
            tier_timeouts: tier_timeouts_from(config),
            // Host-limit defaults; `with_host_limits` overrides them.
            requests_per_second: 0.0,
            per_host_max_concurrent: 1,
            antibot: config.antibot.clone(),
            proxy_rotator: None,
            http_ua: effective_ua.clone(),
            http_inject_stealth: inject_headers,
            http_timeout_ms,
            proxy_client_cache: std::sync::Mutex::new(std::collections::HashMap::new()),
            #[cfg(feature = "cdp")]
            chrome_pool,
            #[cfg(feature = "camoufox")]
            camoufox_in_auto: config.camoufox_in_ladder(),
        })
    }
713
    /// Builder-style setter that installs (or clears, with `None`) the proxy
    /// rotator used by `pick_proxy`.
    ///
    /// The body shown here always returns `Ok`; the `CrwResult` wrapper is
    /// part of the signature callers see.
    pub fn with_proxy_rotator(
        mut self,
        rotator: Option<Arc<crw_core::ProxyRotator>>,
    ) -> CrwResult<Self> {
        self.proxy_rotator = rotator;
        Ok(self)
    }
726
727 fn http_fetcher_for_request(&self) -> CrwResult<Arc<dyn PageFetcher>> {
734 let Some(entry) = REQUEST_PROXY.try_with(|p| p.clone()).ok().flatten() else {
735 return Ok(self.http.clone());
736 };
737 if let Some(f) = self
739 .proxy_client_cache
740 .lock()
741 .unwrap_or_else(|e| e.into_inner())
742 .get(entry.raw())
743 .cloned()
744 {
745 return Ok(f);
746 }
747 let fetcher: Arc<dyn PageFetcher> = Arc::new(http_only::HttpFetcher::with_proxy(
748 &self.http_ua,
749 entry.raw(),
750 self.http_inject_stealth,
751 std::time::Duration::from_millis(self.http_timeout_ms),
752 )?);
753 let mut cache = self
754 .proxy_client_cache
755 .lock()
756 .unwrap_or_else(|e| e.into_inner());
757 if cache.len() >= 512 {
759 cache.clear();
760 }
761 cache.insert(entry.raw().to_string(), fetcher.clone());
762 Ok(fetcher)
763 }
764
765 pub fn pick_proxy(&self, host: Option<&str>) -> Option<Arc<crw_core::ProxyEntry>> {
770 self.proxy_rotator
771 .as_ref()
772 .map(|r| Arc::new(r.pick(host).clone()))
773 }
774
775 pub fn pick_proxy_for_url(&self, url: &str) -> Option<Arc<crw_core::ProxyEntry>> {
779 self.proxy_rotator.as_ref()?;
780 let host = url::Url::parse(url)
781 .ok()
782 .and_then(|u| u.host_str().map(crate::preference::normalize_host));
783 self.pick_proxy(host.as_deref())
784 }
785
786 #[cfg(feature = "cdp")]
790 pub async fn shutdown_chrome_pool(&self, drain: std::time::Duration) {
791 if let Some(pool) = self.chrome_pool.clone() {
792 tracing::info!(
793 drain_secs = drain.as_secs(),
794 "draining chrome browser-context pool"
795 );
796 pool.shutdown(drain).await;
797 }
798 }
799
    /// No-op counterpart of `shutdown_chrome_pool` for builds without the
    /// `cdp` feature, so callers can invoke it unconditionally.
    #[cfg(not(feature = "cdp"))]
    pub async fn shutdown_chrome_pool(&self, _drain: std::time::Duration) {}
803
    /// Builder-style setter for the per-host limits handed to
    /// `host_limiter::acquire` on every fetch.
    ///
    /// * `requests_per_second` — rate value forwarded to the limiter.
    /// * `per_host_max_concurrent` — concurrency value forwarded to the limiter.
    pub fn with_host_limits(
        mut self,
        requests_per_second: f64,
        per_host_max_concurrent: u32,
    ) -> Self {
        self.requests_per_second = requests_per_second;
        self.per_host_max_concurrent = per_host_max_concurrent;
        self
    }
816
    /// Returns a new handle to the shared per-host preference store.
    pub fn preferences(&self) -> Arc<HostPreferences> {
        Arc::clone(&self.preferences)
    }
821
    /// Returns a new handle to the shared circuit-breaker registry.
    pub fn breakers(&self) -> Arc<BreakerRegistry> {
        Arc::clone(&self.breakers)
    }
826
    /// Returns the names of the registered JS renderers, in registration order.
    pub fn js_renderer_names(&self) -> Vec<&str> {
        self.js_renderers.iter().map(|r| r.name()).collect()
    }
832
833 pub async fn fetch(
841 &self,
842 url: &str,
843 headers: &HashMap<String, String>,
844 render_js: Option<bool>,
845 wait_for_ms: Option<u64>,
846 requested_renderer: Option<&str>,
847 deadline: crw_core::Deadline,
848 ) -> CrwResult<FetchResult> {
849 if !self.latency_breakdown {
855 return self
856 .fetch_inner(
857 url,
858 headers,
859 render_js,
860 wait_for_ms,
861 requested_renderer,
862 deadline,
863 )
864 .await;
865 }
866 let t0 = std::time::Instant::now();
867 let out = self
868 .fetch_inner(
869 url,
870 headers,
871 render_js,
872 wait_for_ms,
873 requested_renderer,
874 deadline,
875 )
876 .await;
877 let total_ms = t0.elapsed().as_millis() as u64;
878 match &out {
879 Ok(r) => tracing::info!(
880 target: "latency_breakdown",
881 url,
882 total_ms,
883 rendered_with = r.rendered_with.as_deref().unwrap_or("unknown"),
884 content_len = r.html.len(),
885 "scrape latency breakdown"
886 ),
887 Err(e) => tracing::info!(
888 target: "latency_breakdown",
889 url,
890 total_ms,
891 error = %e,
892 "scrape latency breakdown (error)"
893 ),
894 }
895 out
896 }
897
    /// Core fetch routine: applies the per-host limiter, resolves whether JS
    /// rendering is wanted, and dispatches to HTTP and/or the JS renderers.
    ///
    /// * `render_js` — per-request override, combined with the configured
    ///   default via `resolve_render_js`. `Some(false)` = HTTP only,
    ///   `Some(true)` = JS rendering, `None` = auto-detect from the HTTP response.
    /// * `wait_for_ms` — forwarded to the JS renderers.
    /// * `requested_renderer` — optional renderer name; any value other than
    ///   `"auto"` counts as a hard pin.
    /// * `deadline` — overall budget; also bounds the wait on the host limiter.
    ///
    /// # Errors
    ///
    /// Returns `CrwError::Timeout` when the deadline is exhausted before or
    /// while waiting on the host limiter, `CrwError::RendererError` when the
    /// limiter is closed or a screenshot is requested with no JS renderer, and
    /// otherwise whatever the HTTP fetcher or JS renderers return.
    async fn fetch_inner(
        &self,
        url: &str,
        headers: &HashMap<String, String>,
        render_js: Option<bool>,
        wait_for_ms: Option<u64>,
        requested_renderer: Option<&str>,
        deadline: crw_core::Deadline,
    ) -> CrwResult<FetchResult> {
        // Per-host limiter key; URLs without a parseable host skip the limiter.
        let host_key = url::Url::parse(url)
            .ok()
            .and_then(|u| u.host_str().map(crate::preference::normalize_host));
        // The permit is bound to a named variable so it is held until this
        // function returns.
        let _host_permit = if let Some(key) = host_key.as_deref() {
            let remaining = deadline.remaining();
            if remaining.is_zero() {
                return Err(CrwError::Timeout(
                    deadline.overrun().as_millis().max(1) as u64
                ));
            }
            // Waiting for the permit is itself bounded by the remaining budget.
            match tokio::time::timeout(
                remaining,
                crate::host_limiter::acquire(
                    key,
                    self.requests_per_second,
                    self.per_host_max_concurrent as usize,
                ),
            )
            .await
            {
                Ok(Ok((permit, sleep))) => {
                    // The limiter may ask for an extra pause; fail fast when
                    // that pause would not fit in the remaining budget.
                    if !sleep.is_zero() {
                        let budget = deadline.remaining();
                        if sleep > budget {
                            return Err(CrwError::Timeout(sleep.as_millis().max(1) as u64));
                        }
                        tokio::time::sleep(sleep).await;
                    }
                    Some(permit)
                }
                Ok(Err(_)) => return Err(CrwError::RendererError("host limiter closed".into())),
                Err(_) => {
                    return Err(CrwError::Timeout(
                        deadline.overrun().as_millis().max(1) as u64
                    ));
                }
            }
        } else {
            None
        };

        let mut effective = resolve_render_js(render_js, self.render_js_default);
        // A screenshot request forces the JS path.
        if effective != Some(true) && screenshot_requested() {
            effective = Some(true);
        }
        tracing::debug!(
            url,
            request_render_js = ?render_js,
            default_render_js = ?self.render_js_default,
            effective_render_js = ?effective,
            requested_renderer,
            "FallbackRenderer::fetch dispatching"
        );
        let is_hard_pinned = matches!(requested_renderer, Some(name) if name != "auto");
        match effective {
            // JS explicitly disabled: plain HTTP fetch.
            Some(false) => {
                let mut r = self
                    .http_fetcher_for_request()?
                    .fetch(url, headers, None, deadline)
                    .await?;
                stamp_http_decision(&mut r, requested_renderer);
                Ok(r)
            }
            // JS explicitly enabled.
            Some(true) => {
                // HTTP pre-flight; PDFs are returned straight from it.
                // NOTE(review): an error here is propagated with `?` and the JS
                // renderers are never tried, whereas the auto path below
                // escalates to JS on the same error — confirm this is intended.
                let mut http_result = self
                    .http_fetcher_for_request()?
                    .fetch(url, headers, None, deadline)
                    .await?;
                if http_result.content_type.as_deref() == Some("application/pdf") {
                    stamp_http_decision(&mut http_result, requested_renderer);
                    return Ok(http_result);
                }

                if self.js_renderers.is_empty() {
                    // A screenshot cannot be produced without a JS renderer.
                    if screenshot_requested() {
                        return Err(CrwError::RendererError(
                            "a screenshot was requested but no JS renderer is available; \
                             configure a chrome/chrome_proxy tier"
                                .into(),
                        ));
                    }
                    tracing::warn!(
                        url,
                        "JS rendering requested but no renderer available — falling back to HTTP"
                    );
                    // Return the pre-flight result, labelled as a fallback and
                    // carrying warnings.
                    let mut result = http_result;
                    result.rendered_with = Some("http_only_fallback".to_string());
                    result.warning = Some("JS rendering was requested but no renderer is available. Content was fetched via HTTP only.".to_string());
                    result.warnings.push(
                        "JS rendering requested but no renderer available; HTTP fallback used"
                            .into(),
                    );
                    stamp_http_decision(&mut result, requested_renderer);
                    Ok(result)
                } else {
                    self.fetch_with_js(url, headers, wait_for_ms, requested_renderer, deadline)
                        .await
                }
            }
            // Auto: try HTTP first, escalate to JS only when the response calls for it.
            None => {
                let mut result = match self
                    .http_fetcher_for_request()?
                    .fetch(url, headers, None, deadline)
                    .await
                {
                    Ok(r) => r,
                    // HTTP failed but a JS renderer exists: let it try. If it
                    // fails too, the JS error is the one returned.
                    Err(e) if !self.js_renderers.is_empty() => {
                        tracing::info!(
                            url,
                            error = %e,
                            "HTTP fetch failed, escalating to JS renderer"
                        );
                        return self
                            .fetch_with_js(url, headers, wait_for_ms, requested_renderer, deadline)
                            .await
                            .map_err(|js_err| {
                                tracing::warn!("Both HTTP and JS failed: http={e}, js={js_err}");
                                js_err
                            });
                    }
                    Err(e) => return Err(e),
                };

                // PDFs are never sent to a JS renderer.
                if result.content_type.as_deref() == Some("application/pdf") {
                    stamp_http_decision(&mut result, requested_renderer);
                    return Ok(result);
                }

                // Escalation signals derived from the HTTP response.
                let needs_js = detector::needs_js_rendering(&result.html);
                // NOTE(review): presumably the HTTP fetcher sets this warning
                // on a Cloudflare mitigation header — confirm in `http_only`.
                let cf_header_signal = result.warning.as_deref() == Some("cloudflare_mitigated");
                let is_generic_bot_wall = detector::looks_like_generic_bot_wall(&result.html);
                let is_blocked = cf_header_signal
                    || detector::looks_like_cloudflare_challenge(&result.html)
                    || is_generic_bot_wall;
                // Status codes that trigger escalation to a JS renderer.
                let is_auth_blocked = matches!(
                    result.status_code,
                    401 | 403 | 404 | 405 | 406 | 410 | 412 | 429 | 451 | 500 | 503
                );
                let is_2xx = (200..300).contains(&result.status_code);
                let is_thin_content = is_2xx && detector::looks_like_thin_html(&result.html);

                if !self.js_renderers.is_empty()
                    && (needs_js || is_blocked || is_auth_blocked || is_thin_content)
                {
                    // Log the first matching reason for the escalation.
                    if is_auth_blocked {
                        tracing::info!(
                            url,
                            status_code = result.status_code,
                            "HTTP {} received, escalating to JS renderer",
                            result.status_code
                        );
                    } else if is_blocked {
                        tracing::info!(
                            url,
                            "Anti-bot challenge detected in HTTP response, escalating to JS renderer"
                        );
                        if is_generic_bot_wall {
                            tracing::info!(
                                url,
                                "Generic anti-bot interstitial detected, escalating to JS renderer"
                            );
                        }
                    } else if needs_js {
                        tracing::info!(url, "SPA shell detected, retrying with JS renderer");
                    } else {
                        tracing::info!(
                            url,
                            html_len = result.html.len(),
                            "HTTP 2xx but body is thin, escalating to JS renderer"
                        );
                    }
                    match self
                        .fetch_with_js(url, headers, wait_for_ms, requested_renderer, deadline)
                        .await
                    {
                        Ok(js_result) => Ok(js_result),
                        // A hard-pinned renderer's failure is surfaced as-is,
                        // with no fallback to the HTTP result.
                        Err(e) if is_hard_pinned => {
                            Err(e)
                        }
                        // Otherwise fall back to the HTTP result already in hand.
                        Err(e) => {
                            if is_auth_blocked {
                                tracing::error!(
                                    url,
                                    status_code = result.status_code,
                                    "JS escalation failed for soft-block status; surfacing HTTP shell with warning: {e}"
                                );
                                // Prepend the escalation failure to any existing warning.
                                let warning = format!("js_escalation_failed: {e}");
                                result.warning = Some(match result.warning.take() {
                                    Some(prev) => format!("{warning}; {prev}"),
                                    None => warning,
                                });
                            } else {
                                tracing::warn!(
                                    "JS rendering failed, falling back to HTTP result: {e}"
                                );
                            }
                            stamp_http_decision(&mut result, requested_renderer);
                            Ok(result)
                        }
                    }
                } else {
                    // No escalation needed, or no JS renderer to escalate to.
                    stamp_http_decision(&mut result, requested_renderer);
                    Ok(result)
                }
            }
        }
    }
1165
    /// Minimum body-text length (as measured by `html_body_text_len`) a
    /// JS-rendered result needs before `classify_js_attempt` can call it acceptable.
    const MIN_RENDERED_TEXT_LEN: usize = 50;
1170
1171 fn classify_js_attempt(&self, result: &FetchResult) -> JsAttemptClass {
1176 let text_len = html_body_text_len(&result.html);
1177 let is_placeholder = detector::looks_like_loading_placeholder(&result.html);
1178 let failed_render = detector::looks_like_failed_render(&result.html);
1179 let is_bot_wall = detector::looks_like_generic_bot_wall(&result.html);
1180 let vendor_block = detector::looks_like_vendor_block(&result.html);
1181 let is_status_blocked = matches!(
1182 result.status_code,
1183 401 | 403 | 404 | 405 | 406 | 410 | 412 | 429 | 451 | 500 | 503
1184 );
1185 let antibot = if self.antibot.enabled {
1186 crw_extract::antibot::classify(Some(result.status_code), &result.html)
1187 } else {
1188 crw_extract::antibot::AntibotResult::none()
1189 };
1190 let antibot_blocked = self.antibot.escalate_in_failover && antibot.signal.is_blocked();
1191 let hard_block = matches!(result.status_code, 401 | 403 | 429 | 503)
1193 || (520..=530).contains(&result.status_code)
1194 || is_bot_wall
1195 || vendor_block.is_some()
1196 || antibot.signal.is_blocked();
1197 let acceptable = text_len >= Self::MIN_RENDERED_TEXT_LEN
1198 && !is_placeholder
1199 && failed_render.is_none()
1200 && !is_bot_wall
1201 && vendor_block.is_none()
1202 && !is_status_blocked
1203 && !antibot_blocked;
1204 JsAttemptClass {
1205 text_len,
1206 is_placeholder,
1207 failed_render,
1208 is_bot_wall,
1209 vendor_block,
1210 is_status_blocked,
1211 antibot,
1212 antibot_blocked,
1213 hard_block,
1214 acceptable,
1215 }
1216 }
1217
1218 #[allow(clippy::too_many_arguments)] async fn try_hedge(
1233 &self,
1234 lp: &Arc<dyn PageFetcher>,
1235 chrome: &Arc<dyn PageFetcher>,
1236 url: &str,
1237 headers: &HashMap<String, String>,
1238 wait_for_ms: Option<u64>,
1239 deadline: crw_core::Deadline,
1240 host: &str,
1241 ) -> CrwResult<Option<HedgeOutcome>> {
1242 let (lp_permit, lp_guard) = self
1245 .breakers
1246 .acquire_with_guard(host, RendererKind::Lightpanda)
1247 .await;
1248 if lp_permit == Permit::Rejected {
1249 drop(lp_guard);
1250 return Ok(None);
1251 }
1252 let (ch_permit, ch_guard) = self
1253 .breakers
1254 .acquire_with_guard(host, RendererKind::Chrome)
1255 .await;
1256 if ch_permit == Permit::Rejected {
1257 drop(lp_guard);
1258 drop(ch_guard);
1259 return Ok(None);
1260 }
1261 let mut lp_guard = Some(lp_guard);
1262 let mut ch_guard = Some(ch_guard);
1263
1264 let lp_fut = lp.fetch(url, headers, wait_for_ms, deadline);
1267 let chrome_fut = chrome.fetch(url, headers, wait_for_ms, deadline);
1268 tokio::pin!(lp_fut, chrome_fut);
1269 let (mut lp_done, mut ch_done) = (false, false);
1270 let mut lp_res: Option<CrwResult<FetchResult>> = None;
1271 let mut ch_res: Option<CrwResult<FetchResult>> = None;
1272 while !(lp_done && ch_done) {
1273 tokio::select! {
1274 biased;
1275 r = &mut lp_fut, if !lp_done => {
1276 lp_done = true;
1277 let accept = matches!(&r, Ok(res) if self.classify_js_attempt(res).acceptable);
1278 lp_res = Some(r);
1279 if accept {
1281 break;
1282 }
1283 }
1284 r = &mut chrome_fut, if !ch_done => {
1285 ch_done = true;
1286 let ch_accept = matches!(&r, Ok(res) if self.classify_js_attempt(res).acceptable);
1287 ch_res = Some(r);
1288 if lp_done {
1291 let lp_accept = matches!(&lp_res, Some(Ok(res)) if self.classify_js_attempt(res).acceptable);
1292 if !lp_accept && ch_accept {
1293 break;
1294 }
1295 }
1296 }
1297 }
1298 }
1299 let lp_accept =
1303 matches!(&lp_res, Some(Ok(res)) if self.classify_js_attempt(res).acceptable);
1304 let ch_accept =
1305 matches!(&ch_res, Some(Ok(res)) if self.classify_js_attempt(res).acceptable);
1306
1307 if lp_accept {
1309 let mut r = lp_res.unwrap().unwrap();
1310 self.record_hedge_success(host, RendererKind::Lightpanda, &r, &mut lp_guard)
1311 .await;
1312 r.credit_cost = credit_for(RendererKind::Lightpanda);
1314 r.render_decision = Some(RenderDecision::AutoDefault {
1315 chosen: RendererKind::Lightpanda,
1316 });
1317 return Ok(Some(HedgeOutcome::Accepted(r)));
1318 }
1319 let mut saw_hard_block = false;
1321 if let Some(Ok(res)) = &lp_res {
1322 let cls = self.classify_js_attempt(res);
1323 saw_hard_block |= cls.hard_block;
1324 self.record_hedge_thin(host, RendererKind::Lightpanda, &cls, &mut lp_guard)
1325 .await;
1326 }
1327 if ch_accept {
1328 let mut r = ch_res.unwrap().unwrap();
1329 self.record_hedge_success(host, RendererKind::Chrome, &r, &mut ch_guard)
1330 .await;
1331 r.credit_cost = credit_for(RendererKind::Chrome);
1332 r.render_decision = Some(RenderDecision::Failover {
1333 chain: vec![RendererKind::Lightpanda, RendererKind::Chrome],
1334 reason: FailoverErrorKind::Other,
1335 });
1336 return Ok(Some(HedgeOutcome::Accepted(r)));
1337 }
1338 if let Some(Ok(res)) = &ch_res {
1340 let cls = self.classify_js_attempt(res);
1341 saw_hard_block |= cls.hard_block;
1342 self.record_hedge_thin(host, RendererKind::Chrome, &cls, &mut ch_guard)
1343 .await;
1344 }
1345
1346 let thin = [lp_res, ch_res]
1348 .into_iter()
1349 .flatten()
1350 .filter_map(|r| r.ok())
1351 .max_by_key(|r| r.html.len());
1352 match thin {
1353 Some(r) => Ok(Some(HedgeOutcome::Thin(r, saw_hard_block))),
1354 None => Ok(None),
1357 }
1358 }
1359
1360 async fn record_hedge_success(
1362 &self,
1363 host: &str,
1364 k: RendererKind,
1365 result: &FetchResult,
1366 guard: &mut Option<ProbeGuard>,
1367 ) {
1368 if !host.is_empty() {
1369 let outcome = if result.truncated {
1370 BreakerOutcome::Truncated
1371 } else {
1372 BreakerOutcome::Success
1373 };
1374 self.breakers.record_outcome(host, k, outcome).await;
1375 self.preferences.record_success(host).await;
1376 }
1377 if let Some(g) = guard.take() {
1378 g.disarm();
1379 }
1380 }
1381
1382 async fn record_hedge_thin(
1384 &self,
1385 host: &str,
1386 k: RendererKind,
1387 cls: &JsAttemptClass,
1388 guard: &mut Option<ProbeGuard>,
1389 ) {
1390 if !host.is_empty() {
1391 self.breakers
1392 .record_outcome(host, k, BreakerOutcome::RenderError)
1393 .await;
1394 if k == RendererKind::Lightpanda {
1395 let err_kind = if cls.is_status_blocked || cls.is_bot_wall || cls.antibot_blocked {
1396 FailoverErrorKind::AntibotBlock
1397 } else {
1398 FailoverErrorKind::PlaceholderContent
1399 };
1400 let _ = self.preferences.record_failure(host, &err_kind).await;
1401 }
1402 }
1403 let _ = guard;
1405 }
1406
1407 async fn fetch_with_js(
1408 &self,
1409 url: &str,
1410 headers: &HashMap<String, String>,
1411 wait_for_ms: Option<u64>,
1412 requested_renderer: Option<&str>,
1413 deadline: crw_core::Deadline,
1414 ) -> CrwResult<FetchResult> {
1415 let host = host_of(url);
1416 let is_user_pinned = matches!(requested_renderer, Some(name) if name != "auto");
1417 if let Some(pinned) = requested_renderer
1418 && let Some(kind) = renderer_kind_for(pinned)
1419 {
1420 metrics()
1421 .user_pin_total
1422 .with_label_values(&[kind.as_str()])
1423 .inc();
1424 }
1425
1426 let mut renderers: Vec<&Arc<dyn PageFetcher>> = match requested_renderer {
1435 Some(name) if name != "auto" => self
1436 .js_renderers
1437 .iter()
1438 .filter(|r| r.name() == name)
1439 .collect(),
1440 _ => {
1441 #[cfg(feature = "camoufox")]
1442 {
1443 let in_auto = self.camoufox_in_auto;
1444 self.js_renderers
1445 .iter()
1446 .filter(|r| in_auto || r.name() != "camoufox")
1447 .collect()
1448 }
1449 #[cfg(not(feature = "camoufox"))]
1450 {
1451 self.js_renderers.iter().collect()
1452 }
1453 }
1454 };
1455
1456 let proxy_active = REQUEST_PROXY.try_with(|p| p.is_some()).unwrap_or(false);
1463 if proxy_active {
1464 renderers.retain(|r| r.name() != "lightpanda");
1465 if renderers.is_empty() {
1466 return Err(CrwError::RendererError(
1467 "a proxy is required for this request but the only available JS \
1468 renderer (lightpanda) cannot route through a proxy; configure a \
1469 chrome/chrome_proxy tier to use proxies with JS rendering"
1470 .into(),
1471 ));
1472 }
1473 }
1474
1475 if screenshot_requested() {
1483 renderers.retain(|r| r.name() != "lightpanda" && r.name() != "camoufox");
1484 if renderers.is_empty() {
1485 return Err(CrwError::RendererError(
1486 "a screenshot was requested but no CDP-capable Chrome renderer is \
1487 available; lightpanda and camoufox cannot capture screenshots — \
1488 configure a chrome/chrome_proxy tier"
1489 .into(),
1490 ));
1491 }
1492 }
1493 let auto_egress_arm: Option<Arc<dyn PageFetcher>> =
1502 if self.auto_egress_escalation && !is_user_pinned && !proxy_active {
1503 let arm = self
1504 .js_renderers
1505 .iter()
1506 .find(|r| r.name() == "chrome_proxy")
1507 .cloned();
1508 renderers.retain(|r| r.name() != "chrome_proxy");
1509 arm
1510 } else {
1511 None
1512 };
1513
1514 if !is_user_pinned
1516 && let Some(RendererKind::Chrome) = self.preferences.preferred(&host).await
1517 {
1518 renderers.sort_by_key(|r| match r.name() {
1524 "chrome" => 0,
1525 "chrome_proxy" => 1,
1526 _ => 2,
1527 });
1528 tracing::debug!(host = %host, "host promoted to chrome by preference learner");
1529 }
1530
1531 if renderers.is_empty() {
1532 let available = self.js_renderer_names();
1533 return Err(CrwError::RendererError(format!(
1534 "requested renderer '{}' not in pool [{}]",
1535 requested_renderer.unwrap_or("auto"),
1536 available.join(", ")
1537 )));
1538 }
1539
1540 let mut chain: Vec<RendererKind> = Vec::new();
1543 let mut breaker_skipped: Vec<RendererKind> = Vec::new();
1544 let mut last_error = None;
1545 let mut last_failover_reason: Option<FailoverErrorKind> = None;
1546 let mut thin_result: Option<FetchResult> = None;
1547 let mut saw_hard_block = false;
1552 let renderers_snapshot: Vec<&Arc<dyn PageFetcher>> = renderers.clone();
1557
1558 let mut hedge_done = false;
1564 if self.chrome_hedge
1565 && !is_user_pinned
1566 && !proxy_active
1567 && renderers.first().map(|r| r.name()) == Some("lightpanda")
1568 && renderers.iter().any(|r| r.name() == "chrome")
1569 && let Ok(_permit) = self.hedge_sem.clone().try_acquire_owned()
1570 {
1571 let lp = renderers
1572 .iter()
1573 .find(|r| r.name() == "lightpanda")
1574 .expect("checked above");
1575 let chrome = renderers
1576 .iter()
1577 .find(|r| r.name() == "chrome")
1578 .expect("checked above");
1579 match self
1580 .try_hedge(lp, chrome, url, headers, wait_for_ms, deadline, &host)
1581 .await
1582 {
1583 Ok(Some(HedgeOutcome::Accepted(r))) => return Ok(r),
1584 Ok(Some(HedgeOutcome::Thin(r, hb))) => {
1585 thin_result = Some(r);
1586 saw_hard_block |= hb;
1587 chain.push(RendererKind::Lightpanda);
1588 chain.push(RendererKind::Chrome);
1589 hedge_done = true;
1590 }
1591 Ok(None) => {}
1593 Err(e) => last_error = Some(e),
1594 }
1595 }
1596
1597 for renderer in renderers {
1598 if hedge_done {
1599 break;
1600 }
1601 let kind = renderer_kind_for(renderer.name());
1602
1603 let trackable = kind.filter(|_| !host.is_empty());
1606
1607 let mut probe_guard: Option<ProbeGuard> = None;
1618 if let Some(k) = trackable {
1619 let (permit, guard) = self.breakers.acquire_with_guard(&host, k).await;
1620 if permit == Permit::Rejected {
1621 tracing::info!(
1622 renderer = renderer.name(),
1623 host = %host,
1624 "circuit breaker open, skipping renderer"
1625 );
1626 metrics()
1627 .render_route_decision_total
1628 .with_label_values(&[k.as_str(), "breakerSkipped"])
1629 .inc();
1630 breaker_skipped.push(k);
1631 drop(guard); continue;
1633 }
1634 probe_guard = Some(guard);
1635 }
1636 if let Some(k) = kind {
1637 chain.push(k);
1638 }
1639
1640 let attempt_ctx = {
1643 let remaining = deadline.remaining();
1644 let tier_budget = kind
1645 .and_then(|k| self.tier_timeouts.get(&k).copied())
1646 .unwrap_or(remaining);
1647 AttemptContext::capture(remaining, tier_budget)
1648 };
1649 let attempt_start = std::time::Instant::now();
1656 let attempt_outcome = renderer.fetch(url, headers, wait_for_ms, deadline).await;
1657 if self.latency_breakdown {
1658 let attempt_ms = attempt_start.elapsed().as_millis() as u64;
1659 let tier = renderer.name();
1660 match &attempt_outcome {
1661 Ok(r) => tracing::info!(
1662 target: "latency_breakdown",
1663 url, tier, attempt_ms,
1664 status = r.status_code,
1665 html_len = r.html.len(),
1666 "hedge attempt"
1667 ),
1668 Err(e) => tracing::info!(
1669 target: "latency_breakdown",
1670 url, tier, attempt_ms,
1671 error = %e,
1672 "hedge attempt (error)"
1673 ),
1674 }
1675 }
1676 match attempt_outcome {
1677 Ok(mut result) => {
1678 let text_len = html_body_text_len(&result.html);
1679 let is_placeholder = detector::looks_like_loading_placeholder(&result.html);
1680 let failed_render = detector::looks_like_failed_render(&result.html);
1681 let is_bot_wall = detector::looks_like_generic_bot_wall(&result.html);
1682 let vendor_block = detector::looks_like_vendor_block(&result.html);
1683 let is_status_blocked = matches!(
1687 result.status_code,
1688 401 | 403 | 404 | 405 | 406 | 410 | 412 | 429 | 451 | 500 | 503
1689 );
1690 let antibot = if self.antibot.enabled {
1699 crw_extract::antibot::classify(Some(result.status_code), &result.html)
1700 } else {
1701 crw_extract::antibot::AntibotResult::none()
1702 };
1703 let antibot_blocked =
1704 self.antibot.escalate_in_failover && antibot.signal.is_blocked();
1705 if matches!(result.status_code, 401 | 403 | 429 | 503)
1709 || (520..=530).contains(&result.status_code)
1710 || is_bot_wall
1711 || vendor_block.is_some()
1712 || antibot.signal.is_blocked()
1713 {
1714 saw_hard_block = true;
1715 }
1716 if text_len >= Self::MIN_RENDERED_TEXT_LEN
1717 && !is_placeholder
1718 && failed_render.is_none()
1719 && !is_bot_wall
1720 && vendor_block.is_none()
1721 && !is_status_blocked
1722 && !antibot_blocked
1723 {
1724 let was_promoted = matches!(
1728 self.preferences.preferred(&host).await,
1729 Some(RendererKind::Chrome)
1730 );
1731 if let Some(k) = trackable {
1732 let outcome = if result.truncated {
1735 BreakerOutcome::Truncated
1736 } else {
1737 BreakerOutcome::Success
1738 };
1739 self.breakers.record_outcome(&host, k, outcome).await;
1740 self.preferences.record_success(&host).await;
1741 metrics()
1742 .render_route_decision_total
1743 .with_label_values(&[k.as_str(), "success"])
1744 .inc();
1745 metrics()
1746 .host_preferences_size
1747 .set(self.preferences.size() as i64);
1748 }
1749 if let Some(g) = probe_guard.take() {
1750 g.disarm();
1751 }
1752 if let Some(k) = kind {
1754 result.credit_cost = credit_for(k);
1755 result.render_decision = Some(if is_user_pinned {
1756 RenderDecision::UserPinned { renderer: k }
1757 } else if !breaker_skipped.is_empty() {
1758 RenderDecision::BreakerSkipped {
1759 skipped: breaker_skipped[0],
1760 chosen: k,
1761 }
1762 } else if chain.len() > 1 {
1763 RenderDecision::Failover {
1764 chain: chain.clone(),
1765 reason: last_failover_reason
1766 .clone()
1767 .unwrap_or(FailoverErrorKind::Other),
1768 }
1769 } else if was_promoted && k == RendererKind::Chrome {
1770 RenderDecision::AutoPromoted {
1771 chosen: k,
1772 from: RendererKind::Lightpanda,
1773 reason: "host preference learner".into(),
1774 }
1775 } else {
1776 RenderDecision::AutoDefault { chosen: k }
1777 });
1778 }
1779 return Ok(result);
1780 }
1781 let err_kind = match failed_render {
1784 Some(detector::FailedRenderReason::NextJsClientError) => {
1785 FailoverErrorKind::NextJsClientError
1786 }
1787 Some(detector::FailedRenderReason::ReactMinifiedError) => {
1788 FailoverErrorKind::NextJsClientError
1789 }
1790 Some(detector::FailedRenderReason::EmptyNextRoot) => {
1791 FailoverErrorKind::EmptyNextRoot
1792 }
1793 None if vendor_block.is_some() => FailoverErrorKind::VendorBlock,
1794 None if is_status_blocked => FailoverErrorKind::StatusBlocked,
1795 None if is_placeholder => FailoverErrorKind::PlaceholderContent,
1796 None if is_bot_wall => FailoverErrorKind::PlaceholderContent,
1797 None if antibot_blocked => FailoverErrorKind::AntibotBlock,
1799 None => FailoverErrorKind::PlaceholderContent,
1800 };
1801 last_failover_reason = Some(err_kind.clone());
1802 if let Some(k) = trackable {
1803 let outcome = classify_outcome(false, false, false, &attempt_ctx);
1807 self.breakers.record_outcome(&host, k, outcome).await;
1808 if k == RendererKind::Lightpanda
1809 && let Some(target) =
1810 self.preferences.record_failure(&host, &err_kind).await
1811 {
1812 metrics()
1813 .host_preferences_promotions_total
1814 .with_label_values(&[k.as_str(), target.as_str()])
1815 .inc();
1816 tracing::info!(
1817 host = %host,
1818 "host promoted by preference learner: {} -> {}",
1819 k.as_str(),
1820 target.as_str()
1821 );
1822 }
1823 }
1824 if let Some(g) = probe_guard.take() {
1825 g.disarm();
1826 }
1827 if let Some(vendor) = vendor_block {
1828 metrics()
1829 .vendor_block_total
1830 .with_label_values(&[vendor])
1831 .inc();
1832 tracing::warn!(
1833 renderer = renderer.name(),
1834 url,
1835 vendor,
1836 "vendor anti-bot block detected"
1837 );
1838 }
1839 if antibot.signal.is_blocked() {
1842 metrics()
1843 .antibot_escalation_total
1844 .with_label_values(&[antibot.signal.class_name()])
1845 .inc();
1846 tracing::warn!(
1847 renderer = renderer.name(),
1848 url,
1849 signal = antibot.signal.class_name(),
1850 reason = %antibot.reason,
1851 status_code = result.status_code,
1852 text_len,
1853 escalated = antibot_blocked,
1854 "antibot classifier flagged a block"
1855 );
1856 }
1857 tracing::info!(
1858 renderer = renderer.name(),
1859 text_len,
1860 is_placeholder,
1861 is_bot_wall,
1862 vendor_block,
1863 is_status_blocked,
1864 antibot_signal = antibot.signal.class_name(),
1865 antibot_blocked,
1866 status_code = result.status_code,
1867 failed_render = ?failed_render,
1868 "JS renderer returned thin/placeholder/failed content, trying next renderer"
1869 );
1870 let mut annotated = result;
1877 let attempt_warning = if let Some(reason) = failed_render {
1878 format!(
1879 "{} returned a failed render ({})",
1880 renderer.name(),
1881 reason.as_str()
1882 )
1883 } else if is_placeholder {
1884 format!("{} returned a loading placeholder", renderer.name())
1885 } else if let Some(vendor) = vendor_block {
1886 format!(
1887 "{} returned a vendor anti-bot block ({vendor})",
1888 renderer.name()
1889 )
1890 } else if is_bot_wall {
1891 format!(
1892 "{} returned a generic anti-bot interstitial",
1893 renderer.name()
1894 )
1895 } else if is_status_blocked {
1896 format!(
1897 "{} returned HTTP {} (treated as blocked)",
1898 renderer.name(),
1899 annotated.status_code
1900 )
1901 } else if antibot_blocked {
1902 format!(
1903 "{} returned an anti-bot block ({}: {})",
1904 renderer.name(),
1905 antibot.signal.class_name(),
1906 antibot.reason
1907 )
1908 } else {
1909 format!(
1910 "{} returned thin content (text_len={text_len})",
1911 renderer.name()
1912 )
1913 };
1914 if is_bot_wall || vendor_block.is_some() || is_status_blocked || antibot_blocked
1915 {
1916 let msg = if let Some(v) = vendor_block {
1925 format!("{} returned a vendor anti-bot block ({v})", renderer.name())
1926 } else if is_status_blocked {
1927 format!(
1928 "{} returned HTTP {} (treated as blocked)",
1929 renderer.name(),
1930 annotated.status_code
1931 )
1932 } else if is_bot_wall {
1933 format!(
1934 "{} returned a generic anti-bot interstitial",
1935 renderer.name()
1936 )
1937 } else {
1938 format!(
1939 "{} returned an anti-bot block ({}: {})",
1940 renderer.name(),
1941 antibot.signal.class_name(),
1942 antibot.reason
1943 )
1944 };
1945 last_error = Some(CrwError::RendererError(msg));
1946 }
1947 annotated.warnings.push(attempt_warning.clone());
1948 annotated.warning = Some(match annotated.warning {
1949 Some(prev) => format!("{prev}; {attempt_warning}"),
1950 None => attempt_warning.clone(),
1951 });
1952 thin_result = Some(match thin_result {
1953 None => annotated,
1954 Some(existing) => {
1955 let (mut keeper, dropped) =
1962 if annotated.html.len() > existing.html.len() {
1963 (annotated, existing)
1964 } else {
1965 (existing, annotated)
1966 };
1967 keeper.warnings.push(attempt_warning.clone());
1968 keeper.warning = Some(match keeper.warning {
1969 Some(prev) => format!("{prev}; {attempt_warning}"),
1970 None => attempt_warning,
1971 });
1972 for w in dropped.warnings {
1975 if !keeper.warnings.contains(&w) {
1976 keeper.warnings.push(w);
1977 }
1978 }
1979 keeper
1980 }
1981 });
1982 }
1983 Err(e) => {
1984 tracing::warn!(renderer = renderer.name(), "JS renderer failed: {e}");
1985 let err_kind = classify_renderer_error(&e);
1986 last_failover_reason = Some(err_kind.clone());
1987 if let Some(k) = trackable {
1988 let was_timeout = matches!(e, CrwError::Timeout(_));
1989 let outcome = classify_outcome(false, false, was_timeout, &attempt_ctx);
1990 self.breakers.record_outcome(&host, k, outcome).await;
1991 if k == RendererKind::Lightpanda {
1992 let _ = self.preferences.record_failure(&host, &err_kind).await;
1993 }
1994 }
1995 if let Some(g) = probe_guard.take() {
1996 g.disarm();
1997 }
1998 last_error = Some(e);
1999 continue;
2000 }
2001 }
2002 }
2003 const LEAK_MIN_BUDGET: Duration = Duration::from_millis(500);
2021 if thin_result.is_none()
2022 && !breaker_skipped.is_empty()
2023 && !is_user_pinned
2024 && deadline.remaining() >= LEAK_MIN_BUDGET
2025 {
2026 for renderer in &renderers_snapshot {
2027 let kind = renderer_kind_for(renderer.name());
2028 let trackable = kind.filter(|_| !host.is_empty());
2029 let Some(k) = trackable else { continue };
2030 if !breaker_skipped.contains(&k) {
2031 continue;
2032 }
2033 let permit = self.breakers.try_acquire_host_only(&host, k).await;
2034 if permit == Permit::Rejected {
2035 continue;
2036 }
2037 tracing::info!(
2038 renderer = renderer.name(),
2039 host = %host,
2040 "global breaker open, host clean — leaking through one attempt"
2041 );
2042 metrics()
2043 .render_route_decision_total
2044 .with_label_values(&[k.as_str(), "leakThrough"])
2045 .inc();
2046 let attempt_ctx = {
2047 let remaining = deadline.remaining();
2048 let tier_budget = self.tier_timeouts.get(&k).copied().unwrap_or(remaining);
2049 AttemptContext::capture(remaining, tier_budget)
2050 };
2051 let res = renderer.fetch(url, headers, wait_for_ms, deadline).await;
2052 match res {
2053 Ok(mut result) => {
2054 let text_len = html_body_text_len(&result.html);
2055 let is_placeholder = detector::looks_like_loading_placeholder(&result.html);
2056 let failed_render = detector::looks_like_failed_render(&result.html);
2057 let truncated = result.truncated;
2058 let content_ok = text_len >= Self::MIN_RENDERED_TEXT_LEN
2059 && !is_placeholder
2060 && failed_render.is_none();
2061 let outcome = classify_outcome(content_ok, truncated, false, &attempt_ctx);
2062 self.breakers
2065 .record_scoped_outcome(&host, k, None, Some(outcome))
2066 .await;
2067 if content_ok {
2068 result.credit_cost = credit_for(k);
2069 result.render_decision =
2070 Some(RenderDecision::AutoDefault { chosen: k });
2071 return Ok(result);
2072 }
2073 last_error = Some(CrwError::RendererError(format!(
2076 "leak attempt on {} returned thin content (text_len={text_len})",
2077 renderer.name()
2078 )));
2079 break;
2080 }
2081 Err(e) => {
2082 let was_timeout = matches!(e, CrwError::Timeout(_));
2083 let outcome = classify_outcome(false, false, was_timeout, &attempt_ctx);
2084 self.breakers
2085 .record_scoped_outcome(&host, k, None, Some(outcome))
2086 .await;
2087 last_error = Some(e);
2088 break;
2089 }
2090 }
2091 }
2092 }
2093
2094 if let Some(arm) = auto_egress_arm {
2101 let kind = RendererKind::ChromeProxy;
2102 let tier_budget = self
2103 .tier_timeouts
2104 .get(&kind)
2105 .copied()
2106 .unwrap_or_else(|| std::time::Duration::from_secs(30));
2107 if saw_hard_block && deadline.remaining() >= tier_budget {
2108 chain.push(kind);
2109 let entry = self.pick_proxy_for_url(url);
2110 let attempt = REQUEST_PROXY
2111 .scope(entry, arm.fetch(url, headers, wait_for_ms, deadline))
2112 .await;
2113 match attempt {
2114 Ok(r) => {
2115 let r_text = html_body_text_len(&r.html);
2116 let r_ok = r_text >= Self::MIN_RENDERED_TEXT_LEN
2117 && detector::looks_like_failed_render(&r.html).is_none()
2118 && !detector::looks_like_loading_placeholder(&r.html);
2119 if !host.is_empty() {
2120 let outcome = if r_ok {
2121 BreakerOutcome::Success
2122 } else {
2123 BreakerOutcome::RenderError
2124 };
2125 self.breakers.record_outcome(&host, kind, outcome).await;
2126 }
2127 let better = r_ok
2134 && match &thin_result {
2135 Some(prev) => r.html.len() > prev.html.len(),
2136 None => true,
2137 };
2138 if self.latency_breakdown {
2139 tracing::info!(
2140 target: "latency_breakdown",
2141 url, tier = "chrome_proxy",
2142 ok = r_ok, consumed = better,
2143 "auto_egress fired"
2144 );
2145 }
2146 if better {
2147 thin_result = Some(r);
2148 }
2149 }
2150 Err(e) => {
2151 if !host.is_empty() {
2152 self.breakers
2153 .record_outcome(&host, kind, BreakerOutcome::ConnectionError)
2154 .await;
2155 }
2156 if self.latency_breakdown {
2157 tracing::info!(
2158 target: "latency_breakdown",
2159 url, tier = "chrome_proxy", error = %e,
2160 "auto_egress fired (error)"
2161 );
2162 }
2163 }
2164 }
2165 }
2166 }
2167
2168 if let Some(mut result) = thin_result {
2170 if let Some(last) = chain.last().copied() {
2173 result.credit_cost = credit_for(last);
2174 result.render_decision = Some(RenderDecision::Failover {
2175 chain: chain.clone(),
2176 reason: last_failover_reason
2177 .clone()
2178 .unwrap_or(FailoverErrorKind::Other),
2179 });
2180 }
2181 if is_user_pinned
2186 && chain.len() == 1
2187 && let Some(pinned) = chain.first().copied()
2188 {
2189 let reason = last_failover_reason
2190 .as_ref()
2191 .map(|r| r.as_str())
2192 .unwrap_or("unknown");
2193 let hint = format!(
2194 "Pinned renderer '{}' returned a failed render ({}). Content may be unreliable. Retry with renderer=\"chrome\" or omit the renderer field for auto-failover.",
2195 pinned.as_str(),
2196 reason,
2197 );
2198 result.warnings.push(hint);
2199 }
2200 Ok(result)
2201 } else {
2202 Err(last_error
2203 .unwrap_or_else(|| CrwError::RendererError("No JS renderer available".to_string())))
2204 }
2205 }
2206
2207 pub async fn check_health(&self) -> HashMap<String, bool> {
2209 let mut health = HashMap::new();
2210 health.insert("http".to_string(), self.http.is_available().await);
2211 for r in &self.js_renderers {
2212 health.insert(r.name().to_string(), r.is_available().await);
2213 }
2214 health
2215 }
2216}
2217
/// Counts the text characters inside the `<body>` of `html`.
///
/// Everything between `<` and `>` is treated as markup and ignored. Runs of
/// whitespace count as a single character, and leading whitespace is not
/// counted at all. When the document has no `<body` tag, the whole input is
/// measured.
///
/// The closing `</body>` is searched for only after the end of the opening
/// tag. A `</body>` that appears earlier in the document (for example inside
/// a script in `<head>`) therefore cannot produce an inverted slice range and
/// panic.
fn html_body_text_len(html: &str) -> usize {
    let body = match html.find("<body") {
        Some(tag_start) => {
            // Skip past the `>` that ends the opening tag; if the tag is never
            // closed, measure from the start of the document instead.
            let start = html[tag_start..]
                .find('>')
                .map(|i| tag_start + i + 1)
                .unwrap_or(0);
            // Search from `start` so that `end >= start` always holds.
            let end = html[start..]
                .find("</body>")
                .map(|i| start + i)
                .unwrap_or(html.len());
            &html[start..end]
        }
        None => html,
    };
    let mut in_tag = false;
    let mut text_len = 0;
    // Starts as `true` so leading whitespace is not counted.
    let mut prev_ws = true;
    for ch in body.chars() {
        if ch == '<' {
            in_tag = true;
        } else if ch == '>' {
            in_tag = false;
        } else if !in_tag {
            if ch.is_whitespace() {
                // Collapse a whitespace run into one counted character.
                if !prev_ws {
                    text_len += 1;
                    prev_ws = true;
                }
            } else {
                text_len += 1;
                prev_ws = false;
            }
        }
    }
    text_len
}
2253
2254#[cfg(test)]
2255mod tests {
2256 use super::*;
2257 use crate::breaker::BreakerConfig;
2258 #[cfg(feature = "camoufox")]
2259 use crw_core::config::CamoufoxEndpoint;
2260 #[cfg(feature = "cdp")]
2261 use crw_core::config::CdpEndpoint;
2262 use std::time::Duration;
2263
2264 fn tdl() -> crw_core::Deadline {
2266 crw_core::Deadline::now_plus(Duration::from_secs(60))
2267 }
2268
2269 fn base_cfg(mode: RendererMode) -> RendererConfig {
2270 RendererConfig {
2271 mode,
2272 ..Default::default()
2273 }
2274 }
2275
2276 #[test]
2277 fn new_mode_none_ok_no_js_renderers() {
2278 let cfg = base_cfg(RendererMode::None);
2279 let r = FallbackRenderer::new(&cfg, "crw-test", None, &StealthConfig::default()).unwrap();
2280 assert!(r.js_renderer_names().is_empty());
2281 assert_eq!(r.render_js_default, None);
2282 }
2283
2284 #[test]
2285 fn new_mode_auto_no_endpoints_ok_http_only() {
2286 let cfg = base_cfg(RendererMode::Auto);
2287 let r = FallbackRenderer::new(&cfg, "crw-test", None, &StealthConfig::default()).unwrap();
2288 assert!(r.js_renderer_names().is_empty());
2289 }
2290
2291 #[cfg(feature = "cdp")]
2292 #[test]
2293 fn new_mode_chrome_without_endpoint_errors() {
2294 let cfg = base_cfg(RendererMode::Chrome);
2295 let err =
2296 FallbackRenderer::new(&cfg, "crw-test", None, &StealthConfig::default()).unwrap_err();
2297 let msg = err.to_string().to_lowercase();
2298 assert!(msg.contains("chrome"), "expected chrome in error: {msg}");
2299 assert!(
2300 msg.contains("ws_url") || msg.contains("not configured"),
2301 "expected ws_url hint in error: {msg}"
2302 );
2303 }
2304
2305 #[cfg(feature = "cdp")]
2306 #[test]
2307 fn new_mode_chrome_with_endpoint_ok_only_chrome() {
2308 let cfg = RendererConfig {
2309 mode: RendererMode::Chrome,
2310 chrome: Some(CdpEndpoint {
2311 ws_url: "ws://127.0.0.1:9222/".into(),
2312 }),
2313 lightpanda: Some(CdpEndpoint {
2314 ws_url: "ws://127.0.0.1:9223/".into(),
2315 }),
2316 ..Default::default()
2317 };
2318 let r = FallbackRenderer::new(&cfg, "crw-test", None, &StealthConfig::default()).unwrap();
2319 assert_eq!(r.js_renderer_names(), vec!["chrome"]);
2320 }
2321
2322 #[cfg(feature = "cdp")]
2323 #[test]
2324 fn new_mode_lightpanda_without_endpoint_errors() {
2325 let cfg = base_cfg(RendererMode::Lightpanda);
2326 let err =
2327 FallbackRenderer::new(&cfg, "crw-test", None, &StealthConfig::default()).unwrap_err();
2328 assert!(err.to_string().to_lowercase().contains("lightpanda"));
2329 }
2330
2331 #[cfg(feature = "cdp")]
2332 #[test]
2333 fn new_mode_auto_with_both_endpoints_preserves_order() {
2334 let cfg = RendererConfig {
2335 mode: RendererMode::Auto,
2336 lightpanda: Some(CdpEndpoint {
2337 ws_url: "ws://127.0.0.1:9222/".into(),
2338 }),
2339 chrome: Some(CdpEndpoint {
2340 ws_url: "ws://127.0.0.1:9223/".into(),
2341 }),
2342 ..Default::default()
2343 };
2344 let r = FallbackRenderer::new(&cfg, "crw-test", None, &StealthConfig::default()).unwrap();
2345 assert_eq!(r.js_renderer_names(), vec!["lightpanda", "chrome"]);
2346 }
2347
2348 #[cfg(feature = "cdp")]
2349 #[test]
2350 fn ladder_includes_chrome_proxy_when_configured() {
2351 let cfg = RendererConfig {
2352 mode: RendererMode::Auto,
2353 lightpanda: Some(CdpEndpoint {
2354 ws_url: "ws://127.0.0.1:9222/".into(),
2355 }),
2356 chrome: Some(CdpEndpoint {
2357 ws_url: "ws://127.0.0.1:9223/".into(),
2358 }),
2359 chrome_proxy: Some(CdpEndpoint {
2360 ws_url: "ws://127.0.0.1:9224/".into(),
2361 }),
2362 ..Default::default()
2363 };
2364 let r = FallbackRenderer::new(&cfg, "crw-test", None, &StealthConfig::default()).unwrap();
2365 assert_eq!(
2368 r.js_renderer_names(),
2369 vec!["lightpanda", "chrome", "chrome_proxy"]
2370 );
2371 }
2372
2373 #[cfg(feature = "cdp")]
2374 #[test]
2375 fn ladder_omits_chrome_proxy_when_not_configured() {
2376 let cfg = RendererConfig {
2377 mode: RendererMode::Auto,
2378 chrome: Some(CdpEndpoint {
2379 ws_url: "ws://127.0.0.1:9223/".into(),
2380 }),
2381 chrome_proxy: None,
2382 ..Default::default()
2383 };
2384 let r = FallbackRenderer::new(&cfg, "crw-test", None, &StealthConfig::default()).unwrap();
2385 assert!(!r.js_renderer_names().contains(&"chrome_proxy"));
2386 }
2387
2388 #[cfg(not(feature = "cdp"))]
2389 #[test]
2390 fn new_mode_chrome_errors_without_cdp_feature() {
2391 let cfg = base_cfg(RendererMode::Chrome);
2392 let err =
2393 FallbackRenderer::new(&cfg, "crw-test", None, &StealthConfig::default()).unwrap_err();
2394 let msg = err.to_string().to_lowercase();
2395 assert!(msg.contains("cdp"), "expected cdp in error: {msg}");
2396 }
2397
2398 #[cfg(feature = "camoufox")]
2399 fn camoufox_cfg(mode: RendererMode, include_in_auto: bool) -> RendererConfig {
2400 RendererConfig {
2401 mode,
2402 camoufox: Some(CamoufoxEndpoint {
2403 base_url: "http://127.0.0.1:9377".into(),
2404 api_key: String::new(),
2405 include_in_auto,
2406 }),
2407 ..Default::default()
2408 }
2409 }
2410
2411 #[cfg(feature = "camoufox")]
2415 #[test]
2416 fn camoufox_constructed_for_pin_but_excluded_from_auto() {
2417 let cfg = camoufox_cfg(RendererMode::Auto, false);
2418 let r = FallbackRenderer::new(&cfg, "crw-test", None, &StealthConfig::default()).unwrap();
2419 assert!(
2420 r.js_renderer_names().contains(&"camoufox"),
2421 "configured camoufox must be constructed for pin-reachability"
2422 );
2423 assert!(
2424 !r.camoufox_in_auto,
2425 "include_in_auto=false must keep camoufox out of the auto ladder"
2426 );
2427 }
2428
2429 #[cfg(feature = "camoufox")]
2430 #[test]
2431 fn camoufox_joins_auto_when_include_in_auto_true() {
2432 let cfg = camoufox_cfg(RendererMode::Auto, true);
2433 let r = FallbackRenderer::new(&cfg, "crw-test", None, &StealthConfig::default()).unwrap();
2434 assert!(r.js_renderer_names().contains(&"camoufox"));
2435 assert!(r.camoufox_in_auto);
2436 }
2437
2438 #[cfg(feature = "camoufox")]
2441 #[test]
2442 fn camoufox_pinned_mode_uses_only_camoufox() {
2443 let cfg = camoufox_cfg(RendererMode::Camoufox, false);
2444 let r = FallbackRenderer::new(&cfg, "crw-test", None, &StealthConfig::default()).unwrap();
2445 assert_eq!(r.js_renderer_names(), vec!["camoufox"]);
2446 assert!(r.camoufox_in_auto);
2447 }
2448
2449 #[cfg(feature = "camoufox")]
2450 #[test]
2451 fn camoufox_pinned_mode_without_base_url_errors() {
2452 let cfg = RendererConfig {
2453 mode: RendererMode::Camoufox,
2454 camoufox: Some(CamoufoxEndpoint::default()), ..Default::default()
2456 };
2457 let err =
2458 FallbackRenderer::new(&cfg, "crw-test", None, &StealthConfig::default()).unwrap_err();
2459 assert!(err.to_string().to_lowercase().contains("camoufox"));
2460 }
2461
2462 #[cfg(feature = "camoufox")]
2463 #[test]
2464 fn camoufox_absent_when_not_configured() {
2465 let cfg = base_cfg(RendererMode::Auto);
2466 let r = FallbackRenderer::new(&cfg, "crw-test", None, &StealthConfig::default()).unwrap();
2467 assert!(!r.js_renderer_names().contains(&"camoufox"));
2468 assert!(!r.camoufox_in_auto);
2469 }
2470
2471 #[test]
2472 fn new_render_js_default_stored() {
2473 let cfg = RendererConfig {
2474 mode: RendererMode::None,
2475 render_js_default: Some(true),
2476 ..Default::default()
2477 };
2478 let r = FallbackRenderer::new(&cfg, "crw-test", None, &StealthConfig::default()).unwrap();
2479 assert_eq!(r.render_js_default, Some(true));
2480 }
2481
    /// Test double for a renderer tier: it reports a fixed `name` and answers
    /// every `fetch` according to `behavior`.
    struct MockFetcher {
        // Renderer name reported through `PageFetcher::name` and stamped into
        // `rendered_with` on successful fetches.
        name: &'static str,
        // Canned response returned by `fetch`.
        behavior: MockBehavior,
    }
2487
    /// Canned response a `MockFetcher` produces from `fetch`.
    #[derive(Clone)]
    enum MockBehavior {
        /// Succeed with HTTP 200 and the given HTML.
        Ok(String),
        /// Succeed with the given status code and HTML.
        OkStatus(u16, String),
        /// Fail with `CrwError::RendererError` carrying the given message.
        Err(String),
    }
2494
2495 #[async_trait::async_trait]
2496 impl PageFetcher for MockFetcher {
2497 async fn fetch(
2498 &self,
2499 url: &str,
2500 _headers: &HashMap<String, String>,
2501 _wait_for_ms: Option<u64>,
2502 _deadline: crw_core::Deadline,
2503 ) -> CrwResult<FetchResult> {
2504 let (status, html) = match &self.behavior {
2505 MockBehavior::Ok(html) => (200u16, html.clone()),
2506 MockBehavior::OkStatus(s, html) => (*s, html.clone()),
2507 MockBehavior::Err(msg) => return Err(CrwError::RendererError(msg.clone())),
2508 };
2509 Ok(FetchResult {
2510 url: url.to_string(),
2511 final_url: None,
2512 status_code: status,
2513 html,
2514 content_type: Some("text/html".to_string()),
2515 raw_bytes: None,
2516 rendered_with: Some(self.name.to_string()),
2517 elapsed_ms: 0,
2518 warning: None,
2519 render_decision: None,
2520 credit_cost: 0,
2521 warnings: Vec::new(),
2522 truncated: false,
2523 deadline_exceeded: false,
2524 captured_responses: Vec::new(),
2525 screenshot: None,
2526 })
2527 }
2528
2529 fn name(&self) -> &str {
2530 self.name
2531 }
2532 fn supports_js(&self) -> bool {
2533 true
2534 }
2535 async fn is_available(&self) -> bool {
2536 true
2537 }
2538 }
2539
2540 fn rich_html(marker: &str) -> String {
2541 format!(
2542 "<html><body><article>{}{}</article></body></html>",
2543 marker,
2544 "x".repeat(200)
2545 )
2546 }
2547
2548 fn make_renderer_with_mocks(mocks: Vec<Arc<dyn PageFetcher>>) -> FallbackRenderer {
2549 let cfg = base_cfg(RendererMode::None);
2551 let mut r =
2552 FallbackRenderer::new(&cfg, "crw-test", None, &StealthConfig::default()).unwrap();
2553 r.js_renderers = mocks;
2554 r
2555 }
2556
2557 #[tokio::test]
2558 async fn proxy_active_lightpanda_only_fails_closed() {
2559 let lp = Arc::new(MockFetcher {
2562 name: "lightpanda",
2563 behavior: MockBehavior::Ok(rich_html("LP-")),
2564 }) as Arc<dyn PageFetcher>;
2565 let r = make_renderer_with_mocks(vec![lp]);
2566 let entry = Arc::new(crw_core::ProxyEntry::parse("http://p:8080").unwrap());
2567 let res = REQUEST_PROXY
2570 .scope(Some(entry), async {
2571 r.fetch_with_js(
2572 "https://example.com",
2573 &HashMap::new(),
2574 None,
2575 None,
2576 crw_core::Deadline::from_request_ms(5000),
2577 )
2578 .await
2579 })
2580 .await;
2581 assert!(
2582 res.is_err(),
2583 "lightpanda-only + proxy active must fail closed, got {res:?}"
2584 );
2585 }
2586
2587 #[tokio::test]
2588 async fn proxy_active_prefers_chrome_over_lightpanda() {
2589 let lp = Arc::new(MockFetcher {
2592 name: "lightpanda",
2593 behavior: MockBehavior::Ok(rich_html("LP-")),
2594 }) as Arc<dyn PageFetcher>;
2595 let chrome = Arc::new(MockFetcher {
2596 name: "chrome",
2597 behavior: MockBehavior::Ok(rich_html("CHROME-")),
2598 }) as Arc<dyn PageFetcher>;
2599 let r = make_renderer_with_mocks(vec![lp, chrome]);
2600 let entry = Arc::new(crw_core::ProxyEntry::parse("http://p:8080").unwrap());
2601 let res = REQUEST_PROXY
2602 .scope(Some(entry), async {
2603 r.fetch_with_js(
2604 "https://example.com",
2605 &HashMap::new(),
2606 None,
2607 None,
2608 crw_core::Deadline::from_request_ms(5000),
2609 )
2610 .await
2611 })
2612 .await
2613 .unwrap();
2614 assert_eq!(res.rendered_with.as_deref(), Some("chrome"));
2615 }
2616
2617 #[tokio::test]
2618 async fn fetch_with_pinned_renderer_filters_pool() {
2619 let lp = Arc::new(MockFetcher {
2620 name: "lightpanda",
2621 behavior: MockBehavior::Ok(rich_html("LP-")),
2622 }) as Arc<dyn PageFetcher>;
2623 let chrome = Arc::new(MockFetcher {
2624 name: "chrome",
2625 behavior: MockBehavior::Ok(rich_html("CHROME-")),
2626 }) as Arc<dyn PageFetcher>;
2627 let r = make_renderer_with_mocks(vec![lp, chrome]);
2628
2629 let result = r
2630 .fetch(
2631 "https://example.com",
2632 &HashMap::new(),
2633 Some(true),
2634 None,
2635 Some("chrome"),
2636 tdl(),
2637 )
2638 .await
2639 .unwrap();
2640 assert!(result.html.contains("CHROME-"), "expected chrome output");
2641 assert_eq!(result.rendered_with.as_deref(), Some("chrome"));
2642 }
2643
2644 #[tokio::test]
2645 async fn fetch_with_pinned_renderer_unknown_returns_error() {
2646 let chrome = Arc::new(MockFetcher {
2647 name: "chrome",
2648 behavior: MockBehavior::Ok(rich_html("CHROME-")),
2649 }) as Arc<dyn PageFetcher>;
2650 let r = make_renderer_with_mocks(vec![chrome]);
2651
2652 let err = r
2653 .fetch(
2654 "https://example.com",
2655 &HashMap::new(),
2656 Some(true),
2657 None,
2658 Some("lightpanda"),
2659 tdl(),
2660 )
2661 .await
2662 .unwrap_err();
2663 let msg = err.to_string();
2664 assert!(
2665 msg.contains("lightpanda") && msg.contains("chrome"),
2666 "expected error to name pinned + available: {msg}"
2667 );
2668 }
2669
2670 #[tokio::test]
2671 async fn fetch_with_renderer_auto_uses_full_chain() {
2672 let lp = Arc::new(MockFetcher {
2673 name: "lightpanda",
2674 behavior: MockBehavior::Ok(rich_html("LP-")),
2675 }) as Arc<dyn PageFetcher>;
2676 let chrome = Arc::new(MockFetcher {
2677 name: "chrome",
2678 behavior: MockBehavior::Ok(rich_html("CHROME-")),
2679 }) as Arc<dyn PageFetcher>;
2680 let r = make_renderer_with_mocks(vec![lp, chrome]);
2681
2682 let result = r
2683 .fetch(
2684 "https://example.com",
2685 &HashMap::new(),
2686 Some(true),
2687 None,
2688 Some("auto"),
2689 tdl(),
2690 )
2691 .await
2692 .unwrap();
2693 assert!(result.html.contains("LP-"), "expected lightpanda first");
2695 }
2696
2697 #[tokio::test]
2698 async fn failover_skips_renderer_that_returns_failed_render() {
2699 let bad_lp_html = format!(
2702 "<html><body><div id=\"__next-error-0\">{}</div></body></html>",
2703 "x".repeat(200)
2704 );
2705 let lp = Arc::new(MockFetcher {
2706 name: "lightpanda",
2707 behavior: MockBehavior::Ok(bad_lp_html),
2708 }) as Arc<dyn PageFetcher>;
2709 let chrome = Arc::new(MockFetcher {
2710 name: "chrome",
2711 behavior: MockBehavior::Ok(rich_html("CHROME-OK")),
2712 }) as Arc<dyn PageFetcher>;
2713 let r = make_renderer_with_mocks(vec![lp, chrome]);
2714
2715 let result = r
2716 .fetch(
2717 "https://example.com",
2718 &HashMap::new(),
2719 Some(true),
2720 None,
2721 None,
2722 tdl(),
2723 )
2724 .await
2725 .unwrap();
2726 assert!(result.html.contains("CHROME-OK"));
2727 assert_eq!(result.rendered_with.as_deref(), Some("chrome"));
2728 }
2729
2730 #[tokio::test]
2731 async fn failover_surfaces_warning_when_only_failed_render_available() {
2732 let bad_lp_html = format!(
2736 "<html><body><div id=\"__next-error-0\">{}</div></body></html>",
2737 "x".repeat(200)
2738 );
2739 let lp = Arc::new(MockFetcher {
2740 name: "lightpanda",
2741 behavior: MockBehavior::Ok(bad_lp_html),
2742 }) as Arc<dyn PageFetcher>;
2743 let r = make_renderer_with_mocks(vec![lp]);
2744
2745 let result = r
2746 .fetch(
2747 "https://example.com",
2748 &HashMap::new(),
2749 Some(true),
2750 None,
2751 None,
2752 tdl(),
2753 )
2754 .await
2755 .unwrap();
2756 let warning = result.warning.expect("expected warning to be set");
2757 assert!(
2758 warning.contains("lightpanda") && warning.contains("nextjs_client_error"),
2759 "warning should name renderer + reason: {warning}"
2760 );
2761 }
2762
2763 #[tokio::test]
2764 async fn failover_concats_warnings_across_two_failed_renderers() {
2765 let bad_lp_html = format!(
2769 "<html><body><div id=\"__next-error-0\">{}</div></body></html>",
2770 "x".repeat(200)
2771 );
2772 let bad_chrome_html = format!(
2773 "<html><body><div id=\"__next_error__\">{}</div></body></html>",
2774 "y".repeat(200)
2775 );
2776 let lp = Arc::new(MockFetcher {
2777 name: "lightpanda",
2778 behavior: MockBehavior::Ok(bad_lp_html),
2779 }) as Arc<dyn PageFetcher>;
2780 let chrome = Arc::new(MockFetcher {
2781 name: "chrome",
2782 behavior: MockBehavior::Ok(bad_chrome_html),
2783 }) as Arc<dyn PageFetcher>;
2784 let r = make_renderer_with_mocks(vec![lp, chrome]);
2785
2786 let result = r
2787 .fetch(
2788 "https://example.com",
2789 &HashMap::new(),
2790 Some(true),
2791 None,
2792 None,
2793 tdl(),
2794 )
2795 .await
2796 .unwrap();
2797 let warning = result.warning.expect("expected warning to be set");
2798 assert!(
2799 warning.contains("lightpanda") && warning.contains("chrome"),
2800 "warning should mention both renderers: {warning}"
2801 );
2802 }
2803
2804 #[tokio::test]
2805 async fn fetch_pinned_renderer_failure_propagates() {
2806 let chrome = Arc::new(MockFetcher {
2807 name: "chrome",
2808 behavior: MockBehavior::Err("boom".into()),
2809 }) as Arc<dyn PageFetcher>;
2810 let r = make_renderer_with_mocks(vec![chrome]);
2811
2812 let err = r
2813 .fetch(
2814 "https://example.com",
2815 &HashMap::new(),
2816 Some(true),
2817 None,
2818 Some("chrome"),
2819 tdl(),
2820 )
2821 .await
2822 .unwrap_err();
2823 assert!(err.to_string().contains("boom"));
2824 }
2825
2826 #[tokio::test]
2827 async fn auto_promoted_host_tries_chrome_first() {
2828 let lp = Arc::new(MockFetcher {
2832 name: "lightpanda",
2833 behavior: MockBehavior::Ok(rich_html("LP-")),
2834 }) as Arc<dyn PageFetcher>;
2835 let chrome = Arc::new(MockFetcher {
2836 name: "chrome",
2837 behavior: MockBehavior::Ok(rich_html("CHROME-")),
2838 }) as Arc<dyn PageFetcher>;
2839 let r = make_renderer_with_mocks(vec![lp, chrome]);
2840
2841 for _ in 0..3 {
2843 r.preferences
2844 .record_failure("example.com", &FailoverErrorKind::NextJsClientError)
2845 .await;
2846 }
2847
2848 let result = r
2849 .fetch(
2850 "https://example.com",
2851 &HashMap::new(),
2852 Some(true),
2853 None,
2854 None,
2855 tdl(),
2856 )
2857 .await
2858 .unwrap();
2859 assert!(
2860 result.html.contains("CHROME-"),
2861 "promoted host should hit chrome first, got: {}",
2862 &result.html[..80.min(result.html.len())]
2863 );
2864 assert_eq!(result.credit_cost, 2, "chrome costs 2 credits");
2865 assert!(matches!(
2866 result.render_decision,
2867 Some(RenderDecision::AutoPromoted {
2868 chosen: RendererKind::Chrome,
2869 ..
2870 })
2871 ));
2872 }
2873
2874 #[tokio::test]
2875 async fn breaker_skipped_renderer_falls_through_to_next() {
2876 let lp = Arc::new(MockFetcher {
2879 name: "lightpanda",
2880 behavior: MockBehavior::Err("would fire if reached".into()),
2881 }) as Arc<dyn PageFetcher>;
2882 let chrome = Arc::new(MockFetcher {
2883 name: "chrome",
2884 behavior: MockBehavior::Ok(rich_html("CHROME-OK")),
2885 }) as Arc<dyn PageFetcher>;
2886 let mut r = make_renderer_with_mocks(vec![lp, chrome]);
2887
2888 let breaker_cfg = BreakerConfig {
2894 base_cooldown: Duration::from_secs(300),
2895 max_cooldown: Duration::from_secs(300),
2896 ..BreakerConfig::default()
2897 };
2898 r.breakers = Arc::new(BreakerRegistry::new(breaker_cfg));
2899 for _ in 0..80 {
2900 r.breakers
2901 .record_result("example.com", RendererKind::Lightpanda, false)
2902 .await;
2903 }
2904
2905 let result = r
2906 .fetch(
2907 "https://example.com",
2908 &HashMap::new(),
2909 Some(true),
2910 None,
2911 None,
2912 tdl(),
2913 )
2914 .await
2915 .unwrap();
2916 assert!(result.html.contains("CHROME-OK"));
2917 assert!(matches!(
2918 result.render_decision,
2919 Some(RenderDecision::BreakerSkipped {
2920 skipped: RendererKind::Lightpanda,
2921 chosen: RendererKind::Chrome
2922 })
2923 ));
2924 }
2925
2926 #[tokio::test]
2927 async fn user_pinned_failed_render_emits_warning() {
2928 let bad_html = format!(
2933 "<html><body><div id=\"__next-error-0\">{}</div></body></html>",
2934 "x".repeat(200)
2935 );
2936 let lp = Arc::new(MockFetcher {
2937 name: "lightpanda",
2938 behavior: MockBehavior::Ok(bad_html),
2939 }) as Arc<dyn PageFetcher>;
2940 let chrome = Arc::new(MockFetcher {
2941 name: "chrome",
2942 behavior: MockBehavior::Ok(rich_html("CHROME-")),
2943 }) as Arc<dyn PageFetcher>;
2944 let r = make_renderer_with_mocks(vec![lp, chrome]);
2945
2946 let result = r
2947 .fetch(
2948 "https://example.com",
2949 &HashMap::new(),
2950 Some(true),
2951 None,
2952 Some("lightpanda"),
2953 tdl(),
2954 )
2955 .await
2956 .unwrap();
2957 let pin_hint = result
2958 .warnings
2959 .iter()
2960 .find(|w| w.starts_with("Pinned renderer 'lightpanda'"));
2961 assert!(
2962 pin_hint.is_some(),
2963 "expected pin-failure hint in warnings, got: {:?}",
2964 result.warnings
2965 );
2966 let hint = pin_hint.unwrap();
2967 assert!(
2968 hint.contains("nextJsClientError"),
2969 "hint should name camelCase reason: {hint}"
2970 );
2971 assert!(
2972 hint.contains("renderer=\"chrome\""),
2973 "hint should suggest a fix: {hint}"
2974 );
2975 assert!(matches!(
2977 result.render_decision,
2978 Some(RenderDecision::Failover { ref chain, .. }) if chain.len() == 1
2979 ));
2980 }
2981
2982 #[tokio::test]
2983 async fn user_pinned_decision_records_credit_and_kind() {
2984 let chrome = Arc::new(MockFetcher {
2985 name: "chrome",
2986 behavior: MockBehavior::Ok(rich_html("CHROME-")),
2987 }) as Arc<dyn PageFetcher>;
2988 let r = make_renderer_with_mocks(vec![chrome]);
2989 let result = r
2990 .fetch(
2991 "https://example.com",
2992 &HashMap::new(),
2993 Some(true),
2994 None,
2995 Some("chrome"),
2996 tdl(),
2997 )
2998 .await
2999 .unwrap();
3000 assert_eq!(result.credit_cost, 2);
3001 assert!(matches!(
3002 result.render_decision,
3003 Some(RenderDecision::UserPinned {
3004 renderer: RendererKind::Chrome
3005 })
3006 ));
3007 }
3008
3009 #[tokio::test]
3010 async fn js_tier_escalates_on_403_status() {
3011 let lp = Arc::new(MockFetcher {
3014 name: "lightpanda",
3015 behavior: MockBehavior::OkStatus(403, rich_html("BLOCKED-")),
3016 }) as Arc<dyn PageFetcher>;
3017 let chrome = Arc::new(MockFetcher {
3018 name: "chrome",
3019 behavior: MockBehavior::Ok(rich_html("CHROME-")),
3020 }) as Arc<dyn PageFetcher>;
3021 let r = make_renderer_with_mocks(vec![lp, chrome]);
3022
3023 let result = r
3024 .fetch(
3025 "https://example.com",
3026 &HashMap::new(),
3027 Some(true),
3028 None,
3029 Some("auto"),
3030 tdl(),
3031 )
3032 .await
3033 .unwrap();
3034 assert!(
3035 result.html.contains("CHROME-"),
3036 "expected chrome output after lightpanda 403"
3037 );
3038 assert_eq!(result.status_code, 200);
3039 }
3040
3041 #[tokio::test]
3042 async fn js_tier_escalates_on_vendor_block_with_200() {
3043 let cf_html = format!(
3046 "<html><head><script src=\"/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1\"></script></head><body>{}</body></html>",
3047 "x".repeat(200)
3048 );
3049 let lp = Arc::new(MockFetcher {
3050 name: "lightpanda",
3051 behavior: MockBehavior::Ok(cf_html),
3052 }) as Arc<dyn PageFetcher>;
3053 let chrome = Arc::new(MockFetcher {
3054 name: "chrome",
3055 behavior: MockBehavior::Ok(rich_html("CHROME-")),
3056 }) as Arc<dyn PageFetcher>;
3057 let r = make_renderer_with_mocks(vec![lp, chrome]);
3058
3059 let result = r
3060 .fetch(
3061 "https://example.com",
3062 &HashMap::new(),
3063 Some(true),
3064 None,
3065 Some("auto"),
3066 tdl(),
3067 )
3068 .await
3069 .unwrap();
3070 assert!(
3071 result.html.contains("CHROME-"),
3072 "expected chrome output after lightpanda vendor block"
3073 );
3074 }
3075
3076 #[tokio::test]
3077 async fn js_tier_accepts_200_clean_response() {
3078 let lp = Arc::new(MockFetcher {
3081 name: "lightpanda",
3082 behavior: MockBehavior::Ok(rich_html("LP-CLEAN-")),
3083 }) as Arc<dyn PageFetcher>;
3084 let chrome = Arc::new(MockFetcher {
3085 name: "chrome",
3086 behavior: MockBehavior::Ok(rich_html("CHROME-")),
3087 }) as Arc<dyn PageFetcher>;
3088 let r = make_renderer_with_mocks(vec![lp, chrome]);
3089
3090 let result = r
3091 .fetch(
3092 "https://example.com",
3093 &HashMap::new(),
3094 Some(true),
3095 None,
3096 Some("auto"),
3097 tdl(),
3098 )
3099 .await
3100 .unwrap();
3101 assert!(result.html.contains("LP-CLEAN-"));
3102 assert_eq!(result.status_code, 200);
3103 }
3104
/// Builds an HTML page whose `<article>` holds the sentence "You've been
/// blocked by network security." followed by 200 `x` characters of padding.
fn network_security_block_html() -> String {
    let padding = "x".repeat(200);
    format!(
        "<html><body><article>You've been blocked by network security.{padding}</article></body></html>"
    )
}
3114
3115 #[tokio::test]
3116 async fn js_tier_escalates_to_chrome_proxy_on_antibot_block() {
3117 let lp = Arc::new(MockFetcher {
3120 name: "lightpanda",
3121 behavior: MockBehavior::Ok(network_security_block_html()),
3122 }) as Arc<dyn PageFetcher>;
3123 let chrome = Arc::new(MockFetcher {
3124 name: "chrome",
3125 behavior: MockBehavior::Ok(network_security_block_html()),
3126 }) as Arc<dyn PageFetcher>;
3127 let chrome_proxy = Arc::new(MockFetcher {
3128 name: "chrome_proxy",
3129 behavior: MockBehavior::Ok(rich_html("PROXY-")),
3130 }) as Arc<dyn PageFetcher>;
3131 let r = make_renderer_with_mocks(vec![lp, chrome, chrome_proxy]);
3132
3133 let result = r
3134 .fetch(
3135 "https://example.com",
3136 &HashMap::new(),
3137 Some(true),
3138 None,
3139 Some("auto"),
3140 tdl(),
3141 )
3142 .await
3143 .unwrap();
3144 assert!(
3145 result.html.contains("PROXY-"),
3146 "expected chrome_proxy output after antibot block"
3147 );
3148 assert_eq!(
3149 result.render_decision,
3150 Some(RenderDecision::Failover {
3151 chain: vec![
3152 RendererKind::Lightpanda,
3153 RendererKind::Chrome,
3154 RendererKind::ChromeProxy,
3155 ],
3156 reason: FailoverErrorKind::AntibotBlock,
3157 })
3158 );
3159 }
3160
3161 #[tokio::test]
3162 async fn antibot_block_returns_as_success_when_escalation_disabled() {
3163 let lp = Arc::new(MockFetcher {
3167 name: "lightpanda",
3168 behavior: MockBehavior::Ok(network_security_block_html()),
3169 }) as Arc<dyn PageFetcher>;
3170 let chrome = Arc::new(MockFetcher {
3171 name: "chrome",
3172 behavior: MockBehavior::Ok(rich_html("CHROME-")),
3173 }) as Arc<dyn PageFetcher>;
3174 let mut r = make_renderer_with_mocks(vec![lp, chrome]);
3175 r.antibot.escalate_in_failover = false;
3176
3177 let result = r
3178 .fetch(
3179 "https://example.com",
3180 &HashMap::new(),
3181 Some(true),
3182 None,
3183 Some("auto"),
3184 tdl(),
3185 )
3186 .await
3187 .unwrap();
3188 assert!(
3189 result.html.contains("network security"),
3190 "block page should be returned as-is when escalation is disabled"
3191 );
3192 assert_eq!(result.rendered_with.as_deref(), Some("lightpanda"));
3193 }
3194
3195 #[tokio::test]
3196 async fn promoted_host_escalates_chrome_to_chrome_proxy_not_lightpanda() {
3197 let lp = Arc::new(MockFetcher {
3201 name: "lightpanda",
3202 behavior: MockBehavior::Ok(rich_html("LP-")),
3203 }) as Arc<dyn PageFetcher>;
3204 let chrome = Arc::new(MockFetcher {
3205 name: "chrome",
3206 behavior: MockBehavior::Ok(network_security_block_html()),
3207 }) as Arc<dyn PageFetcher>;
3208 let chrome_proxy = Arc::new(MockFetcher {
3209 name: "chrome_proxy",
3210 behavior: MockBehavior::Ok(rich_html("PROXY-")),
3211 }) as Arc<dyn PageFetcher>;
3212 let r = make_renderer_with_mocks(vec![lp, chrome, chrome_proxy]);
3213
3214 for _ in 0..3 {
3216 r.preferences
3217 .record_failure("example.com", &FailoverErrorKind::NextJsClientError)
3218 .await;
3219 }
3220
3221 let result = r
3222 .fetch(
3223 "https://example.com",
3224 &HashMap::new(),
3225 Some(true),
3226 None,
3227 None,
3228 tdl(),
3229 )
3230 .await
3231 .unwrap();
3232 assert!(
3233 result.html.contains("PROXY-"),
3234 "expected chrome_proxy output"
3235 );
3236 assert_eq!(
3237 result.render_decision,
3238 Some(RenderDecision::Failover {
3239 chain: vec![RendererKind::Chrome, RendererKind::ChromeProxy],
3240 reason: FailoverErrorKind::AntibotBlock,
3241 }),
3242 "chrome must escalate straight to chrome_proxy, skipping lightpanda"
3243 );
3244 }
3245}