1pub mod blocklist;
37pub mod breaker;
38#[cfg(feature = "auto-browser")]
39pub mod browser;
40#[cfg(feature = "cdp")]
41pub mod browser_pool;
42#[cfg(feature = "cdp")]
43pub mod cdp;
44#[cfg(feature = "cdp")]
45pub mod cdp_conn;
46pub mod detector;
47#[cfg(feature = "cdp")]
48pub mod health_telemetry;
49pub mod host_limiter;
50pub mod http_only;
51pub mod preference;
52pub mod traits;
53
54use crate::breaker::{
55 AttemptContext, BreakerOutcome, BreakerRegistry, Permit, ProbeGuard, classify_outcome,
56};
57use crate::preference::HostPreferences;
58use crw_core::config::{BUILTIN_UA_POOL, RendererConfig, RendererMode, StealthConfig};
59use crw_core::error::{CrwError, CrwResult};
60use crw_core::metrics::metrics;
61use crw_core::types::{
62 FailoverErrorKind, FetchResult, RenderDecision, RendererKind, resolve_render_js,
63};
64use std::collections::HashMap;
65use std::sync::Arc;
66use std::time::Duration;
67use traits::PageFetcher;
68
tokio::task_local! {
    /// Task-local, optional country string scoped to the request being served
    /// on the current task.
    ///
    /// NOTE(review): nothing in this part of the file reads the value; it is
    /// presumably consumed by the proxy-backed renderer tier to select a
    /// per-request proxy country — confirm against the CDP renderer.
    pub static REQUEST_COUNTRY: Option<String>;
}
77
78fn renderer_kind_for(name: &str) -> Option<RendererKind> {
82 match name {
83 "http" | "http_only_fallback" => Some(RendererKind::Http),
84 "lightpanda" => Some(RendererKind::Lightpanda),
85 "chrome" => Some(RendererKind::Chrome),
86 "chrome_proxy" => Some(RendererKind::ChromeProxy),
87 _ => None,
88 }
89}
90
91fn classify_renderer_error(err: &CrwError) -> FailoverErrorKind {
101 match err {
102 CrwError::Timeout(_) => FailoverErrorKind::LightpandaTimeout,
103 CrwError::TargetUnreachable(_) => FailoverErrorKind::NetworkError,
104 CrwError::HttpError(_) => FailoverErrorKind::NetworkError,
105 CrwError::RendererError(_) => FailoverErrorKind::LightpandaCrash,
108 _ => FailoverErrorKind::Other,
109 }
110}
111
112fn tier_timeouts_from(
115 config: &RendererConfig,
116) -> std::collections::HashMap<RendererKind, std::time::Duration> {
117 let mut m = std::collections::HashMap::new();
118 m.insert(
119 RendererKind::Http,
120 std::time::Duration::from_millis(config.http_timeout()),
121 );
122 m.insert(
123 RendererKind::Lightpanda,
124 std::time::Duration::from_millis(config.lightpanda_timeout()),
125 );
126 m.insert(
127 RendererKind::Chrome,
128 std::time::Duration::from_millis(config.chrome_timeout()),
129 );
130 m.insert(
131 RendererKind::ChromeProxy,
132 std::time::Duration::from_millis(config.chrome_proxy_timeout()),
133 );
134 m
135}
136
137fn credit_for(kind: RendererKind) -> u32 {
140 match kind {
141 RendererKind::Http => 1,
142 RendererKind::Lightpanda => 1,
143 RendererKind::Chrome => 2,
144 RendererKind::ChromeProxy => 2,
147 }
148}
149
150fn stamp_http_decision(result: &mut FetchResult, requested_renderer: Option<&str>) {
154 if result.render_decision.is_some() {
155 return;
156 }
157 let kind = RendererKind::Http;
158 result.credit_cost = credit_for(kind);
159 result.render_decision = Some(match requested_renderer {
160 Some("http") => RenderDecision::UserPinned { renderer: kind },
161 _ => RenderDecision::AutoDefault { chosen: kind },
162 });
163 metrics()
165 .render_route_decision_total
166 .with_label_values(&[kind.as_str(), "success"])
167 .inc();
168}
169
170fn host_of(url: &str) -> String {
172 url::Url::parse(url)
173 .ok()
174 .and_then(|u| u.host_str().map(|h| h.to_string()))
175 .unwrap_or_default()
176}
177
178fn pick_ua<'a>(default_ua: &'a str, stealth: &'a StealthConfig) -> String {
180 if stealth.enabled {
181 let pool: &[&str] = if stealth.user_agents.is_empty() {
182 BUILTIN_UA_POOL
183 } else {
184 return stealth.user_agents[rand::random_range(0..stealth.user_agents.len())].clone();
186 };
187 pool[rand::random_range(0..pool.len())].to_string()
188 } else {
189 default_ua.to_string()
190 }
191}
192
/// Page fetcher that starts from plain HTTP and can escalate to a chain of
/// JS-capable renderers.
pub struct FallbackRenderer {
    /// Plain HTTP fetcher; `new` builds it from `http_only::HttpFetcher`.
    http: Arc<dyn PageFetcher>,
    /// JS-capable renderers, in the order `new` registered them. Empty when no
    /// renderer is configured or the mode is `none`.
    js_renderers: Vec<Arc<dyn PageFetcher>>,
    /// Config-level default passed to `resolve_render_js` together with the
    /// per-request `render_js` flag.
    render_js_default: Option<bool>,
    /// Per-host renderer preferences, consulted and updated by `fetch_with_js`.
    preferences: Arc<HostPreferences>,
    /// Circuit-breaker registry, queried per (host, renderer kind) before each
    /// JS attempt.
    breakers: Arc<BreakerRegistry>,
    /// Per-renderer-kind timeouts built by `tier_timeouts_from`.
    tier_timeouts: std::collections::HashMap<RendererKind, std::time::Duration>,
    /// Rate passed to `host_limiter::acquire`; `new` sets 0.0 and
    /// `with_host_limits` overrides it.
    requests_per_second: f64,
    /// Per-host concurrency cap passed to `host_limiter::acquire`; `new` sets 1
    /// and `with_host_limits` overrides it.
    per_host_max_concurrent: u32,
    /// Copy of the antibot section of the renderer config.
    antibot: crw_core::config::AntibotConfig,
    /// Handle to the chrome renderer's browser-context pool (if one was
    /// created), used by `shutdown_chrome_pool`.
    #[cfg(feature = "cdp")]
    chrome_pool: Option<Arc<browser_pool::BrowserContextPool<cdp_conn::CdpConnection>>>,
}
222
223impl std::fmt::Debug for FallbackRenderer {
224 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
225 f.debug_struct("FallbackRenderer")
226 .field("http", &self.http.name())
227 .field(
228 "js_renderers",
229 &self
230 .js_renderers
231 .iter()
232 .map(|r| r.name())
233 .collect::<Vec<_>>(),
234 )
235 .field("render_js_default", &self.render_js_default)
236 .finish()
237 }
238}
239
240impl FallbackRenderer {
    /// Builds the renderer stack described by `config`.
    ///
    /// The HTTP fetcher is always created, using the user agent chosen by
    /// `pick_ua` and the configured HTTP timeout. JS renderers are only
    /// registered when the `cdp` feature is compiled in, in this order:
    /// lightpanda, playwright, chrome, chrome_proxy.
    ///
    /// Host limits start at `requests_per_second = 0.0` and
    /// `per_host_max_concurrent = 1`; use `with_host_limits` to change them.
    ///
    /// # Errors
    ///
    /// Returns `CrwError::ConfigError` when
    /// * the mode is `Lightpanda`, `Chrome` or `Playwright` but the build lacks
    ///   the `cdp` feature, or
    /// * the mode names one specific renderer whose config section is absent.
    pub fn new(
        config: &RendererConfig,
        user_agent: &str,
        proxy: Option<&str>,
        stealth: &StealthConfig,
    ) -> CrwResult<Self> {
        let effective_ua = pick_ua(user_agent, stealth);
        // Header injection requires stealth itself to be switched on.
        let inject_headers = stealth.enabled && stealth.inject_headers;
        let http = Arc::new(http_only::HttpFetcher::with_timeout(
            &effective_ua,
            proxy,
            inject_headers,
            std::time::Duration::from_millis(config.http_timeout()),
        )) as Arc<dyn PageFetcher>;

        // Without CDP support an explicitly requested JS renderer can never be
        // honoured, so fail at construction time.
        #[cfg(not(feature = "cdp"))]
        if matches!(
            config.mode,
            RendererMode::Lightpanda | RendererMode::Chrome | RendererMode::Playwright
        ) {
            return Err(CrwError::ConfigError(format!(
                "renderer.mode = {:?} requires the 'cdp' feature, but this build was \
                 compiled without it. Rebuild with --features cdp or set mode = \"auto\"/\"none\".",
                config.mode
            )));
        }

        // Only mutated inside the `cdp`-gated block below, hence the allow.
        #[allow(unused_mut)]
        let mut js_renderers: Vec<Arc<dyn PageFetcher>> = Vec::new();

        // mode = none: HTTP only, no renderer setup at all.
        if matches!(config.mode, RendererMode::None) {
            if config.render_js_default == Some(true) {
                tracing::warn!(
                    "render_js_default=true has no effect with mode=none; \
                     requests will fall back to HTTP via http_only_fallback"
                );
            }
            return Ok(Self {
                http,
                js_renderers,
                render_js_default: config.render_js_default,
                preferences: Arc::new(HostPreferences::with_defaults()),
                breakers: Arc::new(BreakerRegistry::with_defaults()),
                tier_timeouts: tier_timeouts_from(config),
                requests_per_second: 0.0,
                per_host_max_concurrent: 1,
                antibot: config.antibot.clone(),
                #[cfg(feature = "cdp")]
                chrome_pool: None,
            });
        }

        #[cfg(feature = "cdp")]
        let mut chrome_pool: Option<
            Arc<browser_pool::BrowserContextPool<cdp_conn::CdpConnection>>,
        > = None;

        #[cfg(feature = "cdp")]
        {
            // A renderer is wanted when the mode is `Auto` or names it exactly.
            let want = |m: RendererMode| -> bool {
                matches!(config.mode, RendererMode::Auto) || config.mode == m
            };

            if want(RendererMode::Lightpanda) {
                if let Some(lp) = &config.lightpanda {
                    js_renderers.push(Arc::new(cdp::CdpRenderer::new(
                        "lightpanda",
                        &lp.ws_url,
                        config.lightpanda_timeout(),
                        config.pool_size,
                    )));
                } else if matches!(config.mode, RendererMode::Lightpanda) {
                    // Missing config is only fatal when this renderer was
                    // requested explicitly; in `Auto` it is silently skipped.
                    return Err(CrwError::ConfigError(
                        "renderer.mode = \"lightpanda\" but [renderer.lightpanda] ws_url is not \
                         configured"
                            .into(),
                    ));
                }
            }
            if want(RendererMode::Playwright) {
                if let Some(pw) = &config.playwright {
                    // The playwright renderer uses the chrome timeout.
                    js_renderers.push(Arc::new(cdp::CdpRenderer::new(
                        "playwright",
                        &pw.ws_url,
                        config.chrome_timeout(),
                        config.pool_size,
                    )));
                } else if matches!(config.mode, RendererMode::Playwright) {
                    return Err(CrwError::ConfigError(
                        "renderer.mode = \"playwright\" but [renderer.playwright] ws_url is not \
                         configured"
                            .into(),
                    ));
                }
            }
            if want(RendererMode::Chrome) {
                if let Some(ch) = &config.chrome {
                    let blocklist = blocklist::Blocklist::defaults()
                        .with_stylesheets(config.chrome_intercept_stylesheets);
                    let mut renderer = cdp::CdpRenderer::new(
                        "chrome",
                        &ch.ws_url,
                        config.chrome_timeout(),
                        config.pool_size,
                    )
                    .with_nav_budget(config.chrome_nav_budget_ms)
                    .with_interception(
                        config.chrome_intercept_resources,
                        blocklist,
                        config.chrome_host_intercept_disable.clone(),
                    );

                    if config.chrome_context_pool_enabled {
                        match config.chrome_backend {
                            crw_core::config::ChromeBackend::Vanilla => {
                                let pcfg = &config.chrome_pool;
                                // Default pool size: half the available
                                // parallelism, but never fewer than 2.
                                let size = pcfg.size.unwrap_or_else(|| {
                                    let n = std::thread::available_parallelism()
                                        .map(|p| p.get())
                                        .unwrap_or(2);
                                    std::cmp::max(2, n / 2)
                                });
                                renderer = renderer.with_pool(browser_pool::PoolCfg {
                                    size,
                                    recycle_after_navs: pcfg.recycle_after_navs,
                                    idle_timeout: std::time::Duration::from_secs(
                                        pcfg.idle_timeout_secs,
                                    ),
                                    health_check_after: std::time::Duration::from_secs(
                                        pcfg.health_check_secs,
                                    ),
                                    shutdown_drain: std::time::Duration::from_secs(
                                        pcfg.shutdown_drain_secs,
                                    ),
                                    close_target_timeout: std::time::Duration::from_secs(2),
                                    dispose_ctx_timeout: std::time::Duration::from_secs(1),
                                    create_ctx_timeout: std::time::Duration::from_secs(1),
                                });
                                tracing::info!(
                                    pool_size = size,
                                    "chrome browser-context pool enabled"
                                );
                            }
                            crw_core::config::ChromeBackend::Browserless => {
                                tracing::warn!(
                                    "chrome_context_pool_enabled = true but \
                                     chrome_backend = browserless — pool unsupported on \
                                     this backend in v1, falling back to legacy path"
                                );
                            }
                        }
                    }
                    // Keep a handle so `shutdown_chrome_pool` can drain the
                    // pool later.
                    chrome_pool = renderer.pool();
                    js_renderers.push(Arc::new(renderer));
                } else if matches!(config.mode, RendererMode::Chrome) {
                    return Err(CrwError::ConfigError(
                        "renderer.mode = \"chrome\" but [renderer.chrome] ws_url is not configured"
                            .into(),
                    ));
                }
                // The chrome_proxy tier is evaluated inside the chrome branch,
                // so it is only registered when chrome is wanted. A blank
                // `ws_url` is treated as "not configured".
                if let Some(cp) = config
                    .chrome_proxy
                    .as_ref()
                    .filter(|c| !c.ws_url.trim().is_empty())
                {
                    let blocklist = blocklist::Blocklist::defaults()
                        .with_stylesheets(config.chrome_intercept_stylesheets);
                    let mut renderer = cdp::CdpRenderer::new(
                        "chrome_proxy",
                        &cp.ws_url,
                        config.chrome_proxy_timeout(),
                        config.pool_size,
                    )
                    .with_nav_budget(config.chrome_nav_budget_ms)
                    .with_interception(
                        config.chrome_intercept_resources,
                        blocklist,
                        config.chrome_host_intercept_disable.clone(),
                    );
                    // Proxy auth is only attached when both user and password
                    // are configured.
                    if let (Some(u), Some(p)) = (&config.proxy_base_user, &config.proxy_base_pass) {
                        renderer = renderer.with_proxy_auth_base(
                            u.clone(),
                            p.clone(),
                            config.proxy_default_country.clone(),
                        );
                    }
                    tracing::info!(
                        ws_url = %cp.ws_url,
                        proxy_auth = config.proxy_base_user.is_some(),
                        default_country = ?config.proxy_default_country,
                        "chrome_proxy tier enabled"
                    );
                    js_renderers.push(Arc::new(renderer));
                }
            }
        }

        #[cfg(feature = "cdp")]
        health_telemetry::spawn_once();

        if config.render_js_default == Some(true) && js_renderers.is_empty() {
            tracing::warn!(
                "render_js_default=true but no JS renderer is available; \
                 requests will fall back to HTTP via http_only_fallback"
            );
        }

        Ok(Self {
            http,
            js_renderers,
            render_js_default: config.render_js_default,
            preferences: Arc::new(HostPreferences::with_defaults()),
            breakers: Arc::new(BreakerRegistry::with_defaults()),
            tier_timeouts: tier_timeouts_from(config),
            requests_per_second: 0.0,
            per_host_max_concurrent: 1,
            antibot: config.antibot.clone(),
            #[cfg(feature = "cdp")]
            chrome_pool,
        })
    }
484
485 #[cfg(feature = "cdp")]
489 pub async fn shutdown_chrome_pool(&self, drain: std::time::Duration) {
490 if let Some(pool) = self.chrome_pool.clone() {
491 tracing::info!(
492 drain_secs = drain.as_secs(),
493 "draining chrome browser-context pool"
494 );
495 pool.shutdown(drain).await;
496 }
497 }
498
499 #[cfg(not(feature = "cdp"))]
501 pub async fn shutdown_chrome_pool(&self, _drain: std::time::Duration) {}
502
503 pub fn with_host_limits(
507 mut self,
508 requests_per_second: f64,
509 per_host_max_concurrent: u32,
510 ) -> Self {
511 self.requests_per_second = requests_per_second;
512 self.per_host_max_concurrent = per_host_max_concurrent;
513 self
514 }
515
516 pub fn preferences(&self) -> Arc<HostPreferences> {
518 Arc::clone(&self.preferences)
519 }
520
521 pub fn breakers(&self) -> Arc<BreakerRegistry> {
523 Arc::clone(&self.breakers)
524 }
525
526 pub fn js_renderer_names(&self) -> Vec<&str> {
529 self.js_renderers.iter().map(|r| r.name()).collect()
530 }
531
    /// Fetches `url`, choosing between plain HTTP and the JS renderer chain.
    ///
    /// A per-host limiter permit is acquired first, bounded by `deadline`, and
    /// is kept alive until the function returns. The effective JS setting is
    /// `resolve_render_js(render_js, self.render_js_default)`:
    ///
    /// * `Some(false)`: HTTP only.
    /// * `Some(true)`: an HTTP fetch runs first and its error is propagated.
    ///   PDFs are returned as-is. With no JS renderer registered the HTTP
    ///   result is returned with a warning, otherwise `fetch_with_js` runs.
    /// * `None` (auto): HTTP first. On an HTTP error, or when the response
    ///   looks like an SPA shell, a bot wall, a blocked status or thin
    ///   content, the request escalates to `fetch_with_js` if a JS renderer
    ///   exists. If that escalation fails, the HTTP result is returned unless
    ///   a specific renderer was pinned.
    ///
    /// # Errors
    ///
    /// * `CrwError::Timeout` when the deadline is already exhausted, when
    ///   waiting for the host permit times out, or when the limiter's
    ///   required sleep exceeds the remaining budget.
    /// * `CrwError::RendererError("host limiter closed")` when the limiter
    ///   reports an error.
    /// * Whatever the HTTP fetcher or `fetch_with_js` returns on the paths
    ///   described above.
    pub async fn fetch(
        &self,
        url: &str,
        headers: &HashMap<String, String>,
        render_js: Option<bool>,
        wait_for_ms: Option<u64>,
        requested_renderer: Option<&str>,
        deadline: crw_core::Deadline,
    ) -> CrwResult<FetchResult> {
        // URLs that do not parse or have no host skip host limiting.
        let host_key = url::Url::parse(url)
            .ok()
            .and_then(|u| u.host_str().map(crate::preference::normalize_host));
        let _host_permit = if let Some(key) = host_key.as_deref() {
            let remaining = deadline.remaining();
            if remaining.is_zero() {
                // `.max(1)` keeps the reported timeout from being 0 ms.
                return Err(CrwError::Timeout(
                    deadline.overrun().as_millis().max(1) as u64
                ));
            }
            match tokio::time::timeout(
                remaining,
                crate::host_limiter::acquire(
                    key,
                    self.requests_per_second,
                    self.per_host_max_concurrent as usize,
                ),
            )
            .await
            {
                Ok(Ok((permit, sleep))) => {
                    if !sleep.is_zero() {
                        // Fail fast when the required wait does not fit in
                        // the remaining budget.
                        let budget = deadline.remaining();
                        if sleep > budget {
                            return Err(CrwError::Timeout(sleep.as_millis().max(1) as u64));
                        }
                        tokio::time::sleep(sleep).await;
                    }
                    Some(permit)
                }
                Ok(Err(_)) => return Err(CrwError::RendererError("host limiter closed".into())),
                Err(_) => {
                    return Err(CrwError::Timeout(
                        deadline.overrun().as_millis().max(1) as u64
                    ));
                }
            }
        } else {
            None
        };

        let effective = resolve_render_js(render_js, self.render_js_default);
        tracing::debug!(
            url,
            request_render_js = ?render_js,
            default_render_js = ?self.render_js_default,
            effective_render_js = ?effective,
            requested_renderer,
            "FallbackRenderer::fetch dispatching"
        );
        // Any explicit renderer name other than "auto" counts as a hard pin.
        let is_hard_pinned = matches!(requested_renderer, Some(name) if name != "auto");
        match effective {
            Some(false) => {
                let mut r = self.http.fetch(url, headers, None, deadline).await?;
                stamp_http_decision(&mut r, requested_renderer);
                Ok(r)
            }
            Some(true) => {
                let mut http_result = self.http.fetch(url, headers, None, deadline).await?;
                // PDFs are returned from the HTTP fetch without JS rendering.
                if http_result.content_type.as_deref() == Some("application/pdf") {
                    stamp_http_decision(&mut http_result, requested_renderer);
                    return Ok(http_result);
                }

                if self.js_renderers.is_empty() {
                    tracing::warn!(
                        url,
                        "JS rendering requested but no renderer available — falling back to HTTP"
                    );
                    let mut result = http_result;
                    result.rendered_with = Some("http_only_fallback".to_string());
                    result.warning = Some("JS rendering was requested but no renderer is available. Content was fetched via HTTP only.".to_string());
                    result.warnings.push(
                        "JS rendering requested but no renderer available; HTTP fallback used"
                            .into(),
                    );
                    stamp_http_decision(&mut result, requested_renderer);
                    Ok(result)
                } else {
                    self.fetch_with_js(url, headers, wait_for_ms, requested_renderer, deadline)
                        .await
                }
            }
            None => {
                let mut result = match self.http.fetch(url, headers, None, deadline).await {
                    Ok(r) => r,
                    // HTTP failed but a JS renderer exists: try it before
                    // giving up. If both fail, the JS error is returned.
                    Err(e) if !self.js_renderers.is_empty() => {
                        tracing::info!(
                            url,
                            error = %e,
                            "HTTP fetch failed, escalating to JS renderer"
                        );
                        return self
                            .fetch_with_js(url, headers, wait_for_ms, requested_renderer, deadline)
                            .await
                            .map_err(|js_err| {
                                tracing::warn!("Both HTTP and JS failed: http={e}, js={js_err}");
                                js_err
                            });
                    }
                    Err(e) => return Err(e),
                };

                if result.content_type.as_deref() == Some("application/pdf") {
                    stamp_http_decision(&mut result, requested_renderer);
                    return Ok(result);
                }

                // Escalation signals derived from the HTTP response.
                let needs_js = detector::needs_js_rendering(&result.html);
                let cf_header_signal = result.warning.as_deref() == Some("cloudflare_mitigated");
                let is_generic_bot_wall = detector::looks_like_generic_bot_wall(&result.html);
                let is_blocked = cf_header_signal
                    || detector::looks_like_cloudflare_challenge(&result.html)
                    || is_generic_bot_wall;
                // Despite the name, this covers a fixed list of 4xx and 5xx
                // statuses, not only auth failures.
                let is_auth_blocked = matches!(
                    result.status_code,
                    401 | 403 | 404 | 405 | 406 | 410 | 412 | 429 | 451 | 500 | 503
                );
                let is_2xx = (200..300).contains(&result.status_code);
                let is_thin_content = is_2xx && detector::looks_like_thin_html(&result.html);

                if !self.js_renderers.is_empty()
                    && (needs_js || is_blocked || is_auth_blocked || is_thin_content)
                {
                    // Log the first matching escalation reason.
                    if is_auth_blocked {
                        tracing::info!(
                            url,
                            status_code = result.status_code,
                            "HTTP {} received, escalating to JS renderer",
                            result.status_code
                        );
                    } else if is_blocked {
                        tracing::info!(
                            url,
                            "Anti-bot challenge detected in HTTP response, escalating to JS renderer"
                        );
                        if is_generic_bot_wall {
                            tracing::info!(
                                url,
                                "Generic anti-bot interstitial detected, escalating to JS renderer"
                            );
                        }
                    } else if needs_js {
                        tracing::info!(url, "SPA shell detected, retrying with JS renderer");
                    } else {
                        tracing::info!(
                            url,
                            html_len = result.html.len(),
                            "HTTP 2xx but body is thin, escalating to JS renderer"
                        );
                    }
                    match self
                        .fetch_with_js(url, headers, wait_for_ms, requested_renderer, deadline)
                        .await
                    {
                        Ok(js_result) => Ok(js_result),
                        // A pinned renderer's failure is surfaced instead of
                        // falling back to the HTTP result.
                        Err(e) if is_hard_pinned => {
                            Err(e)
                        }
                        Err(e) => {
                            if is_auth_blocked {
                                tracing::error!(
                                    url,
                                    status_code = result.status_code,
                                    "JS escalation failed for soft-block status; surfacing HTTP shell with warning: {e}"
                                );
                                // Prepend the escalation failure to any
                                // warning already on the HTTP result.
                                let warning = format!("js_escalation_failed: {e}");
                                result.warning = Some(match result.warning.take() {
                                    Some(prev) => format!("{warning}; {prev}"),
                                    None => warning,
                                });
                            } else {
                                tracing::warn!(
                                    "JS rendering failed, falling back to HTTP result: {e}"
                                );
                            }
                            stamp_http_decision(&mut result, requested_renderer);
                            Ok(result)
                        }
                    }
                } else {
                    stamp_http_decision(&mut result, requested_renderer);
                    Ok(result)
                }
            }
        }
    }
774
    /// Minimum body text length, as measured by `html_body_text_len`, that a
    /// JS-rendered page must reach before `fetch_with_js` accepts it.
    const MIN_RENDERED_TEXT_LEN: usize = 50;
779
780 async fn fetch_with_js(
781 &self,
782 url: &str,
783 headers: &HashMap<String, String>,
784 wait_for_ms: Option<u64>,
785 requested_renderer: Option<&str>,
786 deadline: crw_core::Deadline,
787 ) -> CrwResult<FetchResult> {
788 let host = host_of(url);
789 let is_user_pinned = matches!(requested_renderer, Some(name) if name != "auto");
790 if let Some(pinned) = requested_renderer
791 && let Some(kind) = renderer_kind_for(pinned)
792 {
793 metrics()
794 .user_pin_total
795 .with_label_values(&[kind.as_str()])
796 .inc();
797 }
798
799 let mut renderers: Vec<&Arc<dyn PageFetcher>> = match requested_renderer {
802 Some(name) if name != "auto" => self
803 .js_renderers
804 .iter()
805 .filter(|r| r.name() == name)
806 .collect(),
807 _ => self.js_renderers.iter().collect(),
808 };
809
810 if !is_user_pinned
812 && let Some(RendererKind::Chrome) = self.preferences.preferred(&host).await
813 {
814 renderers.sort_by_key(|r| match r.name() {
820 "chrome" => 0,
821 "chrome_proxy" => 1,
822 _ => 2,
823 });
824 tracing::debug!(host = %host, "host promoted to chrome by preference learner");
825 }
826
827 if renderers.is_empty() {
828 let available = self.js_renderer_names();
829 return Err(CrwError::RendererError(format!(
830 "requested renderer '{}' not in pool [{}]",
831 requested_renderer.unwrap_or("auto"),
832 available.join(", ")
833 )));
834 }
835
836 let mut chain: Vec<RendererKind> = Vec::new();
839 let mut breaker_skipped: Vec<RendererKind> = Vec::new();
840 let mut last_error = None;
841 let mut last_failover_reason: Option<FailoverErrorKind> = None;
842 let mut thin_result: Option<FetchResult> = None;
843 let renderers_snapshot: Vec<&Arc<dyn PageFetcher>> = renderers.clone();
848
849 for renderer in renderers {
850 let kind = renderer_kind_for(renderer.name());
851
852 let trackable = kind.filter(|_| !host.is_empty());
855
856 let mut probe_guard: Option<ProbeGuard> = None;
867 if let Some(k) = trackable {
868 let (permit, guard) = self.breakers.acquire_with_guard(&host, k).await;
869 if permit == Permit::Rejected {
870 tracing::info!(
871 renderer = renderer.name(),
872 host = %host,
873 "circuit breaker open, skipping renderer"
874 );
875 metrics()
876 .render_route_decision_total
877 .with_label_values(&[k.as_str(), "breakerSkipped"])
878 .inc();
879 breaker_skipped.push(k);
880 drop(guard); continue;
882 }
883 probe_guard = Some(guard);
884 }
885 if let Some(k) = kind {
886 chain.push(k);
887 }
888
889 let attempt_ctx = {
892 let remaining = deadline.remaining();
893 let tier_budget = kind
894 .and_then(|k| self.tier_timeouts.get(&k).copied())
895 .unwrap_or(remaining);
896 AttemptContext::capture(remaining, tier_budget)
897 };
898 match renderer.fetch(url, headers, wait_for_ms, deadline).await {
899 Ok(mut result) => {
900 let text_len = html_body_text_len(&result.html);
901 let is_placeholder = detector::looks_like_loading_placeholder(&result.html);
902 let failed_render = detector::looks_like_failed_render(&result.html);
903 let is_bot_wall = detector::looks_like_generic_bot_wall(&result.html);
904 let vendor_block = detector::looks_like_vendor_block(&result.html);
905 let is_status_blocked = matches!(
909 result.status_code,
910 401 | 403 | 404 | 405 | 406 | 410 | 412 | 429 | 451 | 500 | 503
911 );
912 let antibot = if self.antibot.enabled {
921 crw_extract::antibot::classify(Some(result.status_code), &result.html)
922 } else {
923 crw_extract::antibot::AntibotResult::none()
924 };
925 let antibot_blocked =
926 self.antibot.escalate_in_failover && antibot.signal.is_blocked();
927 if text_len >= Self::MIN_RENDERED_TEXT_LEN
928 && !is_placeholder
929 && failed_render.is_none()
930 && !is_bot_wall
931 && vendor_block.is_none()
932 && !is_status_blocked
933 && !antibot_blocked
934 {
935 let was_promoted = matches!(
939 self.preferences.preferred(&host).await,
940 Some(RendererKind::Chrome)
941 );
942 if let Some(k) = trackable {
943 let outcome = if result.truncated {
946 BreakerOutcome::Truncated
947 } else {
948 BreakerOutcome::Success
949 };
950 self.breakers.record_outcome(&host, k, outcome).await;
951 self.preferences.record_success(&host).await;
952 metrics()
953 .render_route_decision_total
954 .with_label_values(&[k.as_str(), "success"])
955 .inc();
956 metrics()
957 .host_preferences_size
958 .set(self.preferences.size() as i64);
959 }
960 if let Some(g) = probe_guard.take() {
961 g.disarm();
962 }
963 if let Some(k) = kind {
965 result.credit_cost = credit_for(k);
966 result.render_decision = Some(if is_user_pinned {
967 RenderDecision::UserPinned { renderer: k }
968 } else if !breaker_skipped.is_empty() {
969 RenderDecision::BreakerSkipped {
970 skipped: breaker_skipped[0],
971 chosen: k,
972 }
973 } else if chain.len() > 1 {
974 RenderDecision::Failover {
975 chain: chain.clone(),
976 reason: last_failover_reason
977 .clone()
978 .unwrap_or(FailoverErrorKind::Other),
979 }
980 } else if was_promoted && k == RendererKind::Chrome {
981 RenderDecision::AutoPromoted {
982 chosen: k,
983 from: RendererKind::Lightpanda,
984 reason: "host preference learner".into(),
985 }
986 } else {
987 RenderDecision::AutoDefault { chosen: k }
988 });
989 }
990 return Ok(result);
991 }
992 let err_kind = match failed_render {
995 Some(detector::FailedRenderReason::NextJsClientError) => {
996 FailoverErrorKind::NextJsClientError
997 }
998 Some(detector::FailedRenderReason::ReactMinifiedError) => {
999 FailoverErrorKind::NextJsClientError
1000 }
1001 Some(detector::FailedRenderReason::EmptyNextRoot) => {
1002 FailoverErrorKind::EmptyNextRoot
1003 }
1004 None if vendor_block.is_some() => FailoverErrorKind::VendorBlock,
1005 None if is_status_blocked => FailoverErrorKind::StatusBlocked,
1006 None if is_placeholder => FailoverErrorKind::PlaceholderContent,
1007 None if is_bot_wall => FailoverErrorKind::PlaceholderContent,
1008 None if antibot_blocked => FailoverErrorKind::AntibotBlock,
1010 None => FailoverErrorKind::PlaceholderContent,
1011 };
1012 last_failover_reason = Some(err_kind.clone());
1013 if let Some(k) = trackable {
1014 let outcome = classify_outcome(false, false, false, &attempt_ctx);
1018 self.breakers.record_outcome(&host, k, outcome).await;
1019 if k == RendererKind::Lightpanda
1020 && let Some(target) =
1021 self.preferences.record_failure(&host, &err_kind).await
1022 {
1023 metrics()
1024 .host_preferences_promotions_total
1025 .with_label_values(&[k.as_str(), target.as_str()])
1026 .inc();
1027 tracing::info!(
1028 host = %host,
1029 "host promoted by preference learner: {} -> {}",
1030 k.as_str(),
1031 target.as_str()
1032 );
1033 }
1034 }
1035 if let Some(g) = probe_guard.take() {
1036 g.disarm();
1037 }
1038 if let Some(vendor) = vendor_block {
1039 metrics()
1040 .vendor_block_total
1041 .with_label_values(&[vendor])
1042 .inc();
1043 tracing::warn!(
1044 renderer = renderer.name(),
1045 url,
1046 vendor,
1047 "vendor anti-bot block detected"
1048 );
1049 }
1050 if antibot.signal.is_blocked() {
1053 metrics()
1054 .antibot_escalation_total
1055 .with_label_values(&[antibot.signal.class_name()])
1056 .inc();
1057 tracing::warn!(
1058 renderer = renderer.name(),
1059 url,
1060 signal = antibot.signal.class_name(),
1061 reason = %antibot.reason,
1062 status_code = result.status_code,
1063 text_len,
1064 escalated = antibot_blocked,
1065 "antibot classifier flagged a block"
1066 );
1067 }
1068 tracing::info!(
1069 renderer = renderer.name(),
1070 text_len,
1071 is_placeholder,
1072 is_bot_wall,
1073 vendor_block,
1074 is_status_blocked,
1075 antibot_signal = antibot.signal.class_name(),
1076 antibot_blocked,
1077 status_code = result.status_code,
1078 failed_render = ?failed_render,
1079 "JS renderer returned thin/placeholder/failed content, trying next renderer"
1080 );
1081 let mut annotated = result;
1088 let attempt_warning = if let Some(reason) = failed_render {
1089 format!(
1090 "{} returned a failed render ({})",
1091 renderer.name(),
1092 reason.as_str()
1093 )
1094 } else if is_placeholder {
1095 format!("{} returned a loading placeholder", renderer.name())
1096 } else if let Some(vendor) = vendor_block {
1097 format!(
1098 "{} returned a vendor anti-bot block ({vendor})",
1099 renderer.name()
1100 )
1101 } else if is_bot_wall {
1102 format!(
1103 "{} returned a generic anti-bot interstitial",
1104 renderer.name()
1105 )
1106 } else if is_status_blocked {
1107 format!(
1108 "{} returned HTTP {} (treated as blocked)",
1109 renderer.name(),
1110 annotated.status_code
1111 )
1112 } else if antibot_blocked {
1113 format!(
1114 "{} returned an anti-bot block ({}: {})",
1115 renderer.name(),
1116 antibot.signal.class_name(),
1117 antibot.reason
1118 )
1119 } else {
1120 format!(
1121 "{} returned thin content (text_len={text_len})",
1122 renderer.name()
1123 )
1124 };
1125 if is_bot_wall || vendor_block.is_some() || is_status_blocked || antibot_blocked
1126 {
1127 let msg = if let Some(v) = vendor_block {
1136 format!("{} returned a vendor anti-bot block ({v})", renderer.name())
1137 } else if is_status_blocked {
1138 format!(
1139 "{} returned HTTP {} (treated as blocked)",
1140 renderer.name(),
1141 annotated.status_code
1142 )
1143 } else if is_bot_wall {
1144 format!(
1145 "{} returned a generic anti-bot interstitial",
1146 renderer.name()
1147 )
1148 } else {
1149 format!(
1150 "{} returned an anti-bot block ({}: {})",
1151 renderer.name(),
1152 antibot.signal.class_name(),
1153 antibot.reason
1154 )
1155 };
1156 last_error = Some(CrwError::RendererError(msg));
1157 }
1158 annotated.warnings.push(attempt_warning.clone());
1159 annotated.warning = Some(match annotated.warning {
1160 Some(prev) => format!("{prev}; {attempt_warning}"),
1161 None => attempt_warning.clone(),
1162 });
1163 thin_result = Some(match thin_result {
1164 None => annotated,
1165 Some(existing) => {
1166 let (mut keeper, dropped) =
1173 if annotated.html.len() > existing.html.len() {
1174 (annotated, existing)
1175 } else {
1176 (existing, annotated)
1177 };
1178 keeper.warnings.push(attempt_warning.clone());
1179 keeper.warning = Some(match keeper.warning {
1180 Some(prev) => format!("{prev}; {attempt_warning}"),
1181 None => attempt_warning,
1182 });
1183 for w in dropped.warnings {
1186 if !keeper.warnings.contains(&w) {
1187 keeper.warnings.push(w);
1188 }
1189 }
1190 keeper
1191 }
1192 });
1193 }
1194 Err(e) => {
1195 tracing::warn!(renderer = renderer.name(), "JS renderer failed: {e}");
1196 let err_kind = classify_renderer_error(&e);
1197 last_failover_reason = Some(err_kind.clone());
1198 if let Some(k) = trackable {
1199 let was_timeout = matches!(e, CrwError::Timeout(_));
1200 let outcome = classify_outcome(false, false, was_timeout, &attempt_ctx);
1201 self.breakers.record_outcome(&host, k, outcome).await;
1202 if k == RendererKind::Lightpanda {
1203 let _ = self.preferences.record_failure(&host, &err_kind).await;
1204 }
1205 }
1206 if let Some(g) = probe_guard.take() {
1207 g.disarm();
1208 }
1209 last_error = Some(e);
1210 continue;
1211 }
1212 }
1213 }
1214 const LEAK_MIN_BUDGET: Duration = Duration::from_millis(500);
1232 if thin_result.is_none()
1233 && !breaker_skipped.is_empty()
1234 && !is_user_pinned
1235 && deadline.remaining() >= LEAK_MIN_BUDGET
1236 {
1237 for renderer in &renderers_snapshot {
1238 let kind = renderer_kind_for(renderer.name());
1239 let trackable = kind.filter(|_| !host.is_empty());
1240 let Some(k) = trackable else { continue };
1241 if !breaker_skipped.contains(&k) {
1242 continue;
1243 }
1244 let permit = self.breakers.try_acquire_host_only(&host, k).await;
1245 if permit == Permit::Rejected {
1246 continue;
1247 }
1248 tracing::info!(
1249 renderer = renderer.name(),
1250 host = %host,
1251 "global breaker open, host clean — leaking through one attempt"
1252 );
1253 metrics()
1254 .render_route_decision_total
1255 .with_label_values(&[k.as_str(), "leakThrough"])
1256 .inc();
1257 let attempt_ctx = {
1258 let remaining = deadline.remaining();
1259 let tier_budget = self.tier_timeouts.get(&k).copied().unwrap_or(remaining);
1260 AttemptContext::capture(remaining, tier_budget)
1261 };
1262 let res = renderer.fetch(url, headers, wait_for_ms, deadline).await;
1263 match res {
1264 Ok(mut result) => {
1265 let text_len = html_body_text_len(&result.html);
1266 let is_placeholder = detector::looks_like_loading_placeholder(&result.html);
1267 let failed_render = detector::looks_like_failed_render(&result.html);
1268 let truncated = result.truncated;
1269 let content_ok = text_len >= Self::MIN_RENDERED_TEXT_LEN
1270 && !is_placeholder
1271 && failed_render.is_none();
1272 let outcome = classify_outcome(content_ok, truncated, false, &attempt_ctx);
1273 self.breakers
1276 .record_scoped_outcome(&host, k, None, Some(outcome))
1277 .await;
1278 if content_ok {
1279 result.credit_cost = credit_for(k);
1280 result.render_decision =
1281 Some(RenderDecision::AutoDefault { chosen: k });
1282 return Ok(result);
1283 }
1284 last_error = Some(CrwError::RendererError(format!(
1287 "leak attempt on {} returned thin content (text_len={text_len})",
1288 renderer.name()
1289 )));
1290 break;
1291 }
1292 Err(e) => {
1293 let was_timeout = matches!(e, CrwError::Timeout(_));
1294 let outcome = classify_outcome(false, false, was_timeout, &attempt_ctx);
1295 self.breakers
1296 .record_scoped_outcome(&host, k, None, Some(outcome))
1297 .await;
1298 last_error = Some(e);
1299 break;
1300 }
1301 }
1302 }
1303 }
1304
1305 if let Some(mut result) = thin_result {
1307 if let Some(last) = chain.last().copied() {
1310 result.credit_cost = credit_for(last);
1311 result.render_decision = Some(RenderDecision::Failover {
1312 chain: chain.clone(),
1313 reason: last_failover_reason
1314 .clone()
1315 .unwrap_or(FailoverErrorKind::Other),
1316 });
1317 }
1318 if is_user_pinned
1323 && chain.len() == 1
1324 && let Some(pinned) = chain.first().copied()
1325 {
1326 let reason = last_failover_reason
1327 .as_ref()
1328 .map(|r| r.as_str())
1329 .unwrap_or("unknown");
1330 let hint = format!(
1331 "Pinned renderer '{}' returned a failed render ({}). Content may be unreliable. Retry with renderer=\"chrome\" or omit the renderer field for auto-failover.",
1332 pinned.as_str(),
1333 reason,
1334 );
1335 result.warnings.push(hint);
1336 }
1337 Ok(result)
1338 } else {
1339 Err(last_error
1340 .unwrap_or_else(|| CrwError::RendererError("No JS renderer available".to_string())))
1341 }
1342 }
1343
1344 pub async fn check_health(&self) -> HashMap<String, bool> {
1346 let mut health = HashMap::new();
1347 health.insert("http".to_string(), self.http.is_available().await);
1348 for r in &self.js_renderers {
1349 health.insert(r.name().to_string(), r.is_available().await);
1350 }
1351 health
1352 }
1353}
1354
/// Estimates how much visible text an HTML document carries in its `<body>`.
///
/// The slice between the end of the first `<body ...>` opening tag and the next
/// `</body>` after it is scanned; when the input contains no `<body`, the whole
/// input is scanned. Everything between `<` and `>` is treated as markup and
/// skipped. Of what remains, each non-whitespace character counts as one, and
/// consecutive whitespace (including whitespace separated only by tags) counts
/// as one. Whitespace before the first text character is not counted.
///
/// Matching is purely textual and case-sensitive, and text inside elements such
/// as `<script>` is counted like any other text.
///
/// Returns the count in characters, not bytes.
fn html_body_text_len(html: &str) -> usize {
    let body = match html.find("<body") {
        Some(open) => {
            // Content starts right after the `>` that ends the opening tag. If the
            // tag is never terminated, fall back to the start of the document.
            let start = html[open..].find('>').map_or(0, |i| open + i + 1);
            // Look for the closing tag from `start` onwards: a stray `</body>`
            // earlier in the document would otherwise yield `end < start` and make
            // the slice below panic.
            let end = html[start..]
                .find("</body>")
                .map_or(html.len(), |i| start + i);
            &html[start..end]
        }
        None => html,
    };

    let mut in_tag = false;
    // Starts as `true` so leading whitespace is not counted.
    let mut prev_ws = true;
    let mut text_len = 0;
    for ch in body.chars() {
        match ch {
            '<' => in_tag = true,
            '>' => in_tag = false,
            _ if in_tag => {}
            _ if ch.is_whitespace() => {
                if !prev_ws {
                    text_len += 1;
                    prev_ws = true;
                }
            }
            _ => {
                text_len += 1;
                prev_ws = false;
            }
        }
    }
    text_len
}
1390
#[cfg(test)]
mod tests {
    // Unit tests for `FallbackRenderer`: constructor behaviour per `RendererMode`,
    // and JS-tier selection / failover driven by scripted `PageFetcher` mocks.

    use super::*;
    use crate::breaker::BreakerConfig;
    #[cfg(feature = "cdp")]
    use crw_core::config::CdpEndpoint;
    use std::time::Duration;

    /// URL requested by every fetch test.
    const TEST_URL: &str = "https://example.com";
    /// Host under which tests seed preference / breaker state for `TEST_URL`.
    const TEST_HOST: &str = "example.com";

    /// A deadline 60 seconds in the future.
    fn tdl() -> crw_core::Deadline {
        crw_core::Deadline::now_plus(Duration::from_secs(60))
    }

    /// Default renderer config with only `mode` overridden.
    fn base_cfg(mode: RendererMode) -> RendererConfig {
        RendererConfig {
            mode,
            ..Default::default()
        }
    }

    /// Builds a `CdpEndpoint` pointing at `ws_url`.
    #[cfg(feature = "cdp")]
    fn endpoint(ws_url: &'static str) -> CdpEndpoint {
        CdpEndpoint {
            ws_url: ws_url.into(),
        }
    }

    // ---------------------------------------------------------------------
    // Constructor behaviour
    // ---------------------------------------------------------------------

    #[test]
    fn new_mode_none_ok_no_js_renderers() {
        let cfg = base_cfg(RendererMode::None);
        let r = FallbackRenderer::new(&cfg, "crw-test", None, &StealthConfig::default()).unwrap();
        assert!(r.js_renderer_names().is_empty());
        assert_eq!(r.render_js_default, None);
    }

    #[test]
    fn new_mode_auto_no_endpoints_ok_http_only() {
        let cfg = base_cfg(RendererMode::Auto);
        let r = FallbackRenderer::new(&cfg, "crw-test", None, &StealthConfig::default()).unwrap();
        assert!(r.js_renderer_names().is_empty());
    }

    #[cfg(feature = "cdp")]
    #[test]
    fn new_mode_chrome_without_endpoint_errors() {
        let cfg = base_cfg(RendererMode::Chrome);
        let err =
            FallbackRenderer::new(&cfg, "crw-test", None, &StealthConfig::default()).unwrap_err();
        let msg = err.to_string().to_lowercase();
        assert!(msg.contains("chrome"), "expected chrome in error: {msg}");
        assert!(
            msg.contains("ws_url") || msg.contains("not configured"),
            "expected ws_url hint in error: {msg}"
        );
    }

    #[cfg(feature = "cdp")]
    #[test]
    fn new_mode_chrome_with_endpoint_ok_only_chrome() {
        // Lightpanda is configured too, but the explicit Chrome mode should ignore it.
        let cfg = RendererConfig {
            mode: RendererMode::Chrome,
            chrome: Some(endpoint("ws://127.0.0.1:9222/")),
            lightpanda: Some(endpoint("ws://127.0.0.1:9223/")),
            ..Default::default()
        };
        let r = FallbackRenderer::new(&cfg, "crw-test", None, &StealthConfig::default()).unwrap();
        assert_eq!(r.js_renderer_names(), vec!["chrome"]);
    }

    #[cfg(feature = "cdp")]
    #[test]
    fn new_mode_lightpanda_without_endpoint_errors() {
        let cfg = base_cfg(RendererMode::Lightpanda);
        let err =
            FallbackRenderer::new(&cfg, "crw-test", None, &StealthConfig::default()).unwrap_err();
        assert!(err.to_string().to_lowercase().contains("lightpanda"));
    }

    #[cfg(feature = "cdp")]
    #[test]
    fn new_mode_auto_with_both_endpoints_preserves_order() {
        let cfg = RendererConfig {
            mode: RendererMode::Auto,
            lightpanda: Some(endpoint("ws://127.0.0.1:9222/")),
            chrome: Some(endpoint("ws://127.0.0.1:9223/")),
            ..Default::default()
        };
        let r = FallbackRenderer::new(&cfg, "crw-test", None, &StealthConfig::default()).unwrap();
        assert_eq!(r.js_renderer_names(), vec!["lightpanda", "chrome"]);
    }

    #[cfg(feature = "cdp")]
    #[test]
    fn ladder_includes_chrome_proxy_when_configured() {
        let cfg = RendererConfig {
            mode: RendererMode::Auto,
            lightpanda: Some(endpoint("ws://127.0.0.1:9222/")),
            chrome: Some(endpoint("ws://127.0.0.1:9223/")),
            chrome_proxy: Some(endpoint("ws://127.0.0.1:9224/")),
            ..Default::default()
        };
        let r = FallbackRenderer::new(&cfg, "crw-test", None, &StealthConfig::default()).unwrap();
        assert_eq!(
            r.js_renderer_names(),
            vec!["lightpanda", "chrome", "chrome_proxy"]
        );
    }

    #[cfg(feature = "cdp")]
    #[test]
    fn ladder_omits_chrome_proxy_when_not_configured() {
        let cfg = RendererConfig {
            mode: RendererMode::Auto,
            chrome: Some(endpoint("ws://127.0.0.1:9223/")),
            chrome_proxy: None,
            ..Default::default()
        };
        let r = FallbackRenderer::new(&cfg, "crw-test", None, &StealthConfig::default()).unwrap();
        assert!(!r.js_renderer_names().contains(&"chrome_proxy"));
    }

    #[cfg(not(feature = "cdp"))]
    #[test]
    fn new_mode_chrome_errors_without_cdp_feature() {
        let cfg = base_cfg(RendererMode::Chrome);
        let err =
            FallbackRenderer::new(&cfg, "crw-test", None, &StealthConfig::default()).unwrap_err();
        let msg = err.to_string().to_lowercase();
        assert!(msg.contains("cdp"), "expected cdp in error: {msg}");
    }

    #[test]
    fn new_render_js_default_stored() {
        let cfg = RendererConfig {
            mode: RendererMode::None,
            render_js_default: Some(true),
            ..Default::default()
        };
        let r = FallbackRenderer::new(&cfg, "crw-test", None, &StealthConfig::default()).unwrap();
        assert_eq!(r.render_js_default, Some(true));
    }

    // ---------------------------------------------------------------------
    // Mock renderer and shared fixtures
    // ---------------------------------------------------------------------

    /// Scripted stand-in for a JS renderer.
    struct MockFetcher {
        /// Name returned from `PageFetcher::name` and stamped into `rendered_with`.
        name: &'static str,
        /// What `fetch` should produce.
        behavior: MockBehavior,
    }

    /// Canned outcome of a `MockFetcher::fetch` call.
    #[derive(Clone)]
    enum MockBehavior {
        /// Succeed with status 200 and this HTML.
        Ok(String),
        /// Succeed with the given status code and HTML.
        OkStatus(u16, String),
        /// Fail with `CrwError::RendererError` carrying this message.
        Err(String),
    }

    #[async_trait::async_trait]
    impl PageFetcher for MockFetcher {
        async fn fetch(
            &self,
            url: &str,
            _headers: &HashMap<String, String>,
            _wait_for_ms: Option<u64>,
            _deadline: crw_core::Deadline,
        ) -> CrwResult<FetchResult> {
            let (status_code, html) = match &self.behavior {
                MockBehavior::Err(msg) => return Err(CrwError::RendererError(msg.clone())),
                MockBehavior::OkStatus(code, html) => (*code, html.clone()),
                MockBehavior::Ok(html) => (200u16, html.clone()),
            };
            Ok(FetchResult {
                url: url.to_string(),
                final_url: None,
                status_code,
                html,
                content_type: Some("text/html".to_string()),
                raw_bytes: None,
                rendered_with: Some(self.name.to_string()),
                elapsed_ms: 0,
                warning: None,
                render_decision: None,
                credit_cost: 0,
                warnings: Vec::new(),
                truncated: false,
                deadline_exceeded: false,
                captured_responses: Vec::new(),
            })
        }

        fn name(&self) -> &str {
            self.name
        }

        fn supports_js(&self) -> bool {
            true
        }

        async fn is_available(&self) -> bool {
            true
        }
    }

    /// Wraps a `MockFetcher` in the trait-object form `js_renderers` stores.
    fn mock(name: &'static str, behavior: MockBehavior) -> Arc<dyn PageFetcher> {
        Arc::new(MockFetcher { name, behavior })
    }

    /// Mock that succeeds with `rich_html(marker)`.
    fn rich_mock(name: &'static str, marker: &str) -> Arc<dyn PageFetcher> {
        mock(name, MockBehavior::Ok(rich_html(marker)))
    }

    /// Page whose `<article>` holds `marker` followed by 200 filler characters, so
    /// tests can tell by `marker` which renderer's output was returned.
    fn rich_html(marker: &str) -> String {
        format!(
            "<html><body><article>{}{}</article></body></html>",
            marker,
            "x".repeat(200)
        )
    }

    /// Page consisting of a single `<div id="{div_id}">` filled with 200 copies of
    /// `filler`; used with Next.js error-boundary ids to provoke failed-render handling.
    fn next_error_html(div_id: &str, filler: &str) -> String {
        format!(
            "<html><body><div id=\"{div_id}\">{}</div></body></html>",
            filler.repeat(200)
        )
    }

    /// Page containing a "blocked by network security" message plus filler text.
    fn network_security_block_html() -> String {
        format!(
            "<html><body><article>You've been blocked by network security.{}</article></body></html>",
            "x".repeat(200)
        )
    }

    /// Builds a `FallbackRenderer` in `RendererMode::None` and then swaps in `mocks`
    /// as its JS renderer pool.
    fn make_renderer_with_mocks(mocks: Vec<Arc<dyn PageFetcher>>) -> FallbackRenderer {
        let cfg = base_cfg(RendererMode::None);
        let mut r =
            FallbackRenderer::new(&cfg, "crw-test", None, &StealthConfig::default()).unwrap();
        r.js_renderers = mocks;
        r
    }

    /// Fetches `TEST_URL` with JS rendering requested (`Some(true)`), no extra headers,
    /// `None` for the fourth argument, the given renderer selector, and a fresh
    /// `tdl()` deadline.
    async fn fetch_js(r: &FallbackRenderer, renderer: Option<&str>) -> CrwResult<FetchResult> {
        let headers = HashMap::new();
        r.fetch(TEST_URL, &headers, Some(true), None, renderer, tdl())
            .await
    }

    /// Records three `NextJsClientError` failures for `TEST_HOST`; the tests that call
    /// this then expect the host to be routed to chrome first.
    async fn record_three_failures(r: &FallbackRenderer) {
        for _ in 0..3 {
            r.preferences
                .record_failure(TEST_HOST, &FailoverErrorKind::NextJsClientError)
                .await;
        }
    }

    // ---------------------------------------------------------------------
    // Renderer pinning
    // ---------------------------------------------------------------------

    #[tokio::test]
    async fn fetch_with_pinned_renderer_filters_pool() {
        let r = make_renderer_with_mocks(vec![
            rich_mock("lightpanda", "LP-"),
            rich_mock("chrome", "CHROME-"),
        ]);

        let result = fetch_js(&r, Some("chrome")).await.unwrap();
        assert!(result.html.contains("CHROME-"), "expected chrome output");
        assert_eq!(result.rendered_with.as_deref(), Some("chrome"));
    }

    #[tokio::test]
    async fn fetch_with_pinned_renderer_unknown_returns_error() {
        let r = make_renderer_with_mocks(vec![rich_mock("chrome", "CHROME-")]);

        let err = fetch_js(&r, Some("lightpanda")).await.unwrap_err();
        let msg = err.to_string();
        assert!(
            msg.contains("lightpanda") && msg.contains("chrome"),
            "expected error to name pinned + available: {msg}"
        );
    }

    #[tokio::test]
    async fn fetch_with_renderer_auto_uses_full_chain() {
        let r = make_renderer_with_mocks(vec![
            rich_mock("lightpanda", "LP-"),
            rich_mock("chrome", "CHROME-"),
        ]);

        let result = fetch_js(&r, Some("auto")).await.unwrap();
        assert!(result.html.contains("LP-"), "expected lightpanda first");
    }

    // ---------------------------------------------------------------------
    // Failed-render failover
    // ---------------------------------------------------------------------

    #[tokio::test]
    async fn failover_skips_renderer_that_returns_failed_render() {
        let r = make_renderer_with_mocks(vec![
            mock(
                "lightpanda",
                MockBehavior::Ok(next_error_html("__next-error-0", "x")),
            ),
            rich_mock("chrome", "CHROME-OK"),
        ]);

        let result = fetch_js(&r, None).await.unwrap();
        assert!(result.html.contains("CHROME-OK"));
        assert_eq!(result.rendered_with.as_deref(), Some("chrome"));
    }

    #[tokio::test]
    async fn failover_surfaces_warning_when_only_failed_render_available() {
        let r = make_renderer_with_mocks(vec![mock(
            "lightpanda",
            MockBehavior::Ok(next_error_html("__next-error-0", "x")),
        )]);

        let result = fetch_js(&r, None).await.unwrap();
        let warning = result.warning.expect("expected warning to be set");
        assert!(
            warning.contains("lightpanda") && warning.contains("nextjs_client_error"),
            "warning should name renderer + reason: {warning}"
        );
    }

    #[tokio::test]
    async fn failover_concats_warnings_across_two_failed_renderers() {
        let r = make_renderer_with_mocks(vec![
            mock(
                "lightpanda",
                MockBehavior::Ok(next_error_html("__next-error-0", "x")),
            ),
            mock(
                "chrome",
                MockBehavior::Ok(next_error_html("__next_error__", "y")),
            ),
        ]);

        let result = fetch_js(&r, None).await.unwrap();
        let warning = result.warning.expect("expected warning to be set");
        assert!(
            warning.contains("lightpanda") && warning.contains("chrome"),
            "warning should mention both renderers: {warning}"
        );
    }

    #[tokio::test]
    async fn fetch_pinned_renderer_failure_propagates() {
        let r = make_renderer_with_mocks(vec![mock("chrome", MockBehavior::Err("boom".into()))]);

        let err = fetch_js(&r, Some("chrome")).await.unwrap_err();
        assert!(err.to_string().contains("boom"));
    }

    // ---------------------------------------------------------------------
    // Host promotion and circuit breaker
    // ---------------------------------------------------------------------

    #[tokio::test]
    async fn auto_promoted_host_tries_chrome_first() {
        let r = make_renderer_with_mocks(vec![
            rich_mock("lightpanda", "LP-"),
            rich_mock("chrome", "CHROME-"),
        ]);
        record_three_failures(&r).await;

        let result = fetch_js(&r, None).await.unwrap();
        assert!(
            result.html.contains("CHROME-"),
            "promoted host should hit chrome first, got: {}",
            &result.html[..80.min(result.html.len())]
        );
        assert_eq!(result.credit_cost, 2, "chrome costs 2 credits");
        assert!(matches!(
            result.render_decision,
            Some(RenderDecision::AutoPromoted {
                chosen: RendererKind::Chrome,
                ..
            })
        ));
    }

    #[tokio::test]
    async fn breaker_skipped_renderer_falls_through_to_next() {
        // Lightpanda would return an error if it were ever called; the assertions
        // below expect it to be skipped instead.
        let mut r = make_renderer_with_mocks(vec![
            mock(
                "lightpanda",
                MockBehavior::Err("would fire if reached".into()),
            ),
            rich_mock("chrome", "CHROME-OK"),
        ]);

        // Long cooldowns so the breaker state set up below is still in effect when
        // the fetch runs.
        let breaker_cfg = BreakerConfig {
            base_cooldown: Duration::from_secs(300),
            max_cooldown: Duration::from_secs(300),
            ..BreakerConfig::default()
        };
        r.breakers = Arc::new(BreakerRegistry::new(breaker_cfg));
        for _ in 0..80 {
            r.breakers
                .record_result(TEST_HOST, RendererKind::Lightpanda, false)
                .await;
        }

        let result = fetch_js(&r, None).await.unwrap();
        assert!(result.html.contains("CHROME-OK"));
        assert!(matches!(
            result.render_decision,
            Some(RenderDecision::BreakerSkipped {
                skipped: RendererKind::Lightpanda,
                chosen: RendererKind::Chrome
            })
        ));
    }

    // ---------------------------------------------------------------------
    // User-pinned decisions
    // ---------------------------------------------------------------------

    #[tokio::test]
    async fn user_pinned_failed_render_emits_warning() {
        let r = make_renderer_with_mocks(vec![
            mock(
                "lightpanda",
                MockBehavior::Ok(next_error_html("__next-error-0", "x")),
            ),
            rich_mock("chrome", "CHROME-"),
        ]);

        let result = fetch_js(&r, Some("lightpanda")).await.unwrap();
        let Some(hint) = result
            .warnings
            .iter()
            .find(|w| w.starts_with("Pinned renderer 'lightpanda'"))
        else {
            panic!(
                "expected pin-failure hint in warnings, got: {:?}",
                result.warnings
            );
        };
        assert!(
            hint.contains("nextJsClientError"),
            "hint should name camelCase reason: {hint}"
        );
        assert!(
            hint.contains("renderer=\"chrome\""),
            "hint should suggest a fix: {hint}"
        );
        assert!(matches!(
            result.render_decision,
            Some(RenderDecision::Failover { ref chain, .. }) if chain.len() == 1
        ));
    }

    #[tokio::test]
    async fn user_pinned_decision_records_credit_and_kind() {
        let r = make_renderer_with_mocks(vec![rich_mock("chrome", "CHROME-")]);

        let result = fetch_js(&r, Some("chrome")).await.unwrap();
        assert_eq!(result.credit_cost, 2);
        assert!(matches!(
            result.render_decision,
            Some(RenderDecision::UserPinned {
                renderer: RendererKind::Chrome
            })
        ));
    }

    // ---------------------------------------------------------------------
    // JS-tier escalation on blocks
    // ---------------------------------------------------------------------

    #[tokio::test]
    async fn js_tier_escalates_on_403_status() {
        let r = make_renderer_with_mocks(vec![
            mock(
                "lightpanda",
                MockBehavior::OkStatus(403, rich_html("BLOCKED-")),
            ),
            rich_mock("chrome", "CHROME-"),
        ]);

        let result = fetch_js(&r, Some("auto")).await.unwrap();
        assert!(
            result.html.contains("CHROME-"),
            "expected chrome output after lightpanda 403"
        );
        assert_eq!(result.status_code, 200);
    }

    #[tokio::test]
    async fn js_tier_escalates_on_vendor_block_with_200() {
        // Status 200, but the page references a `/cdn-cgi/challenge-platform/` script.
        let cf_html = format!(
            "<html><head><script src=\"/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1\"></script></head><body>{}</body></html>",
            "x".repeat(200)
        );
        let r = make_renderer_with_mocks(vec![
            mock("lightpanda", MockBehavior::Ok(cf_html)),
            rich_mock("chrome", "CHROME-"),
        ]);

        let result = fetch_js(&r, Some("auto")).await.unwrap();
        assert!(
            result.html.contains("CHROME-"),
            "expected chrome output after lightpanda vendor block"
        );
    }

    #[tokio::test]
    async fn js_tier_accepts_200_clean_response() {
        let r = make_renderer_with_mocks(vec![
            rich_mock("lightpanda", "LP-CLEAN-"),
            rich_mock("chrome", "CHROME-"),
        ]);

        let result = fetch_js(&r, Some("auto")).await.unwrap();
        assert!(result.html.contains("LP-CLEAN-"));
        assert_eq!(result.status_code, 200);
    }

    #[tokio::test]
    async fn js_tier_escalates_to_chrome_proxy_on_antibot_block() {
        let r = make_renderer_with_mocks(vec![
            mock(
                "lightpanda",
                MockBehavior::Ok(network_security_block_html()),
            ),
            mock("chrome", MockBehavior::Ok(network_security_block_html())),
            rich_mock("chrome_proxy", "PROXY-"),
        ]);

        let result = fetch_js(&r, Some("auto")).await.unwrap();
        assert!(
            result.html.contains("PROXY-"),
            "expected chrome_proxy output after antibot block"
        );
        assert_eq!(
            result.render_decision,
            Some(RenderDecision::Failover {
                chain: vec![
                    RendererKind::Lightpanda,
                    RendererKind::Chrome,
                    RendererKind::ChromeProxy,
                ],
                reason: FailoverErrorKind::AntibotBlock,
            })
        );
    }

    #[tokio::test]
    async fn antibot_block_returns_as_success_when_escalation_disabled() {
        let mut r = make_renderer_with_mocks(vec![
            mock(
                "lightpanda",
                MockBehavior::Ok(network_security_block_html()),
            ),
            rich_mock("chrome", "CHROME-"),
        ]);
        r.antibot.escalate_in_failover = false;

        let result = fetch_js(&r, Some("auto")).await.unwrap();
        assert!(
            result.html.contains("network security"),
            "block page should be returned as-is when escalation is disabled"
        );
        assert_eq!(result.rendered_with.as_deref(), Some("lightpanda"));
    }

    #[tokio::test]
    async fn promoted_host_escalates_chrome_to_chrome_proxy_not_lightpanda() {
        let r = make_renderer_with_mocks(vec![
            rich_mock("lightpanda", "LP-"),
            mock("chrome", MockBehavior::Ok(network_security_block_html())),
            rich_mock("chrome_proxy", "PROXY-"),
        ]);
        record_three_failures(&r).await;

        let result = fetch_js(&r, None).await.unwrap();
        assert!(
            result.html.contains("PROXY-"),
            "expected chrome_proxy output"
        );
        assert_eq!(
            result.render_decision,
            Some(RenderDecision::Failover {
                chain: vec![RendererKind::Chrome, RendererKind::ChromeProxy],
                reason: FailoverErrorKind::AntibotBlock,
            }),
            "chrome must escalate straight to chrome_proxy, skipping lightpanda"
        );
    }
}