#[cfg(feature = "cdp")]
pub mod cdp;
pub mod detector;
pub mod http_only;
pub mod traits;
use crw_core::config::{BUILTIN_UA_POOL, RendererConfig, StealthConfig};
use crw_core::error::{CrwError, CrwResult};
use crw_core::types::FetchResult;
use std::collections::HashMap;
use std::sync::Arc;
use traits::PageFetcher;
/// Chooses the User-Agent string the HTTP fetcher will send.
///
/// * Stealth disabled: `default_ua` is returned unchanged.
/// * Stealth enabled with a non-empty `stealth.user_agents`: a random entry
///   from that list is returned.
/// * Stealth enabled with an empty list: a random entry from
///   `BUILTIN_UA_POOL` is returned.
fn pick_ua<'a>(default_ua: &'a str, stealth: &'a StealthConfig) -> String {
    if !stealth.enabled {
        return default_ua.to_string();
    }

    let custom = &stealth.user_agents;
    if !custom.is_empty() {
        // Operator-supplied agents take precedence over the built-in pool.
        let idx = rand::random::<usize>() % custom.len();
        return custom[idx].clone();
    }

    let builtin: &[&str] = BUILTIN_UA_POOL;
    let idx = rand::random::<usize>() % builtin.len();
    builtin[idx].to_string()
}
/// Page fetcher that combines a plain HTTP fetcher with zero or more
/// JavaScript-capable renderers and decides per request which one to use.
pub struct FallbackRenderer {
/// Plain HTTP fetcher; always constructed by `new` and used as the first
/// step of every `fetch` call.
http: Arc<dyn PageFetcher>,
/// JS-capable renderers, tried in order by `fetch_with_js`. Empty when the
/// renderer mode is "none" or no CDP backend was registered.
js_renderers: Vec<Arc<dyn PageFetcher>>,
}
impl FallbackRenderer {
/// Builds a renderer from configuration.
///
/// The HTTP fetcher is always created, using the User-Agent chosen by
/// `pick_ua` and the optional `proxy`. JS renderers are registered only when
/// `config.mode` is not `"none"` and the crate is built with the `cdp`
/// feature; they are added in the order lightpanda, playwright, chrome,
/// which is also the order in which they are tried later. When CDP endpoints
/// are configured but the feature is missing, a warning is logged and the
/// renderer list stays empty.
pub fn new(
    config: &RendererConfig,
    user_agent: &str,
    proxy: Option<&str>,
    stealth: &StealthConfig,
) -> Self {
    let effective_ua = pick_ua(user_agent, stealth);
    // Header injection is only honoured while stealth mode itself is on.
    let inject_headers = stealth.enabled && stealth.inject_headers;
    let http: Arc<dyn PageFetcher> = Arc::new(http_only::HttpFetcher::new(
        &effective_ua,
        proxy,
        inject_headers,
    ));

    // The vector is only mutated when the `cdp` feature is compiled in.
    #[allow(unused_mut)]
    let mut js_renderers: Vec<Arc<dyn PageFetcher>> = Vec::new();

    // Mode "none" switches JS rendering off regardless of configured endpoints.
    if config.mode == "none" {
        return Self { http, js_renderers };
    }

    #[cfg(feature = "cdp")]
    {
        // Every backend shares the same page timeout and pool size.
        let timeout_ms = config.page_timeout_ms;
        let pool_size = config.pool_size;

        js_renderers.extend(config.lightpanda.as_ref().map(|lp| {
            Arc::new(cdp::CdpRenderer::new(
                "lightpanda",
                &lp.ws_url,
                timeout_ms,
                pool_size,
            )) as Arc<dyn PageFetcher>
        }));
        js_renderers.extend(config.playwright.as_ref().map(|pw| {
            Arc::new(cdp::CdpRenderer::new(
                "playwright",
                &pw.ws_url,
                timeout_ms,
                pool_size,
            )) as Arc<dyn PageFetcher>
        }));
        js_renderers.extend(config.chrome.as_ref().map(|ch| {
            Arc::new(cdp::CdpRenderer::new(
                "chrome",
                &ch.ws_url,
                timeout_ms,
                pool_size,
            )) as Arc<dyn PageFetcher>
        }));
    }

    #[cfg(not(feature = "cdp"))]
    {
        let any_cdp_configured = config.lightpanda.is_some()
            || config.playwright.is_some()
            || config.chrome.is_some();
        if any_cdp_configured {
            tracing::warn!(
                "CDP renderers configured but 'cdp' feature not enabled. JS rendering disabled."
            );
        }
    }

    Self { http, js_renderers }
}
/// Fetches `url`, choosing between plain HTTP and JS rendering.
///
/// Every mode starts with a plain HTTP request; an error from that request
/// is returned as-is. What happens next depends on `render_js`:
///
/// * `Some(false)`: the HTTP result is returned.
/// * `Some(true)`: a PDF response (`application/pdf`) is returned directly.
///   Otherwise the JS renderers are used and their error, if any, is
///   propagated. With no renderer registered, the HTTP result is returned
///   tagged `http_only_fallback` and carrying a warning.
/// * `None`: a PDF response is returned directly. Otherwise JS rendering is
///   attempted only when a renderer exists and the HTML looks like a page
///   needing JS or an anti-bot challenge; if that attempt fails, the HTTP
///   result is returned instead of the error.
///
/// `wait_for_ms` is forwarded to the JS renderers only.
pub async fn fetch(
    &self,
    url: &str,
    headers: &HashMap<String, String>,
    render_js: Option<bool>,
    wait_for_ms: Option<u64>,
) -> CrwResult<FetchResult> {
    let http_result = self.http.fetch(url, headers, None).await?;

    // Explicit opt-out: nothing else to decide.
    if render_js == Some(false) {
        return Ok(http_result);
    }

    // PDFs are handed back untouched in both remaining modes.
    if http_result.content_type.as_deref() == Some("application/pdf") {
        return Ok(http_result);
    }

    let has_js = !self.js_renderers.is_empty();

    if render_js == Some(true) {
        if has_js {
            return self.fetch_with_js(url, headers, wait_for_ms).await;
        }
        tracing::warn!(
            url,
            "JS rendering requested but no renderer available — falling back to HTTP"
        );
        let mut degraded = http_result;
        degraded.rendered_with = Some("http_only_fallback".to_string());
        degraded.warning = Some("JS rendering was requested but no renderer is available. Content was fetched via HTTP only.".to_string());
        return Ok(degraded);
    }

    // Auto mode: escalate only when the HTTP response looks insufficient.
    let needs_js = detector::needs_js_rendering(&http_result.html);
    let is_blocked = Self::looks_like_challenge(&http_result.html);
    if !(has_js && (needs_js || is_blocked)) {
        return Ok(http_result);
    }

    if is_blocked {
        tracing::info!(
            url,
            "Anti-bot challenge detected in HTTP response, escalating to JS renderer"
        );
    } else {
        tracing::info!(url, "SPA shell detected, retrying with JS renderer");
    }

    // Escalation is best effort here: a JS failure degrades to the HTTP result.
    self.fetch_with_js(url, headers, wait_for_ms)
        .await
        .or_else(|e| {
            tracing::warn!("JS rendering failed, falling back to HTTP result: {e}");
            Ok(http_result)
        })
}
/// Heuristically decides whether `html` is an anti-bot challenge page.
///
/// Documents longer than 50 000 bytes are never treated as challenges.
/// Otherwise the lower-cased text is searched for a set of marker strings;
/// "attention required" only counts when "cloudflare" is present as well.
fn looks_like_challenge(html: &str) -> bool {
    const MAX_CHALLENGE_LEN: usize = 50_000;
    // Any one of these is sufficient on its own.
    const STANDALONE_MARKERS: [&str; 4] = [
        "just a moment",
        "cf-browser-verification",
        "cf-challenge-running",
        "challenge-platform",
    ];

    if html.len() > MAX_CHALLENGE_LEN {
        return false;
    }

    let haystack = html.to_lowercase();
    if STANDALONE_MARKERS
        .iter()
        .any(|&marker| haystack.contains(marker))
    {
        return true;
    }

    haystack.contains("attention required") && haystack.contains("cloudflare")
}
/// Minimum text length, as measured by `html_body_text_len`, that a
/// JS-rendered page must reach for `fetch_with_js` to accept it immediately
/// instead of trying the next renderer.
const MIN_RENDERED_TEXT_LEN: usize = 50;
/// Runs the registered JS renderers in order until one yields real content.
///
/// A result is accepted as soon as its body text reaches
/// `MIN_RENDERED_TEXT_LEN`. Results below that threshold count as "thin":
/// the first one is kept as a fallback and the next renderer is tried.
/// After all renderers have run, the kept thin result wins over any error;
/// otherwise the most recent renderer error is returned, or a
/// `RendererError` when no renderer ran at all.
async fn fetch_with_js(
    &self,
    url: &str,
    headers: &HashMap<String, String>,
    wait_for_ms: Option<u64>,
) -> CrwResult<FetchResult> {
    let mut first_thin: Option<FetchResult> = None;
    let mut most_recent_err = None;

    for backend in self.js_renderers.iter() {
        let page = match backend.fetch(url, headers, wait_for_ms).await {
            Ok(page) => page,
            Err(e) => {
                tracing::warn!(renderer = backend.name(), "JS renderer failed: {e}");
                most_recent_err = Some(e);
                continue;
            }
        };

        let text_len = html_body_text_len(&page.html);
        if text_len >= Self::MIN_RENDERED_TEXT_LEN {
            return Ok(page);
        }

        tracing::info!(
            renderer = backend.name(),
            text_len,
            "JS renderer returned thin content, trying next renderer"
        );
        // Only the first thin page is remembered; later ones are dropped.
        first_thin = first_thin.or(Some(page));
    }

    first_thin.ok_or_else(|| {
        most_recent_err
            .unwrap_or_else(|| CrwError::RendererError("No JS renderer available".to_string()))
    })
}
/// Reports the availability of every fetcher.
///
/// The returned map has an `"http"` entry for the plain HTTP fetcher and one
/// entry per JS renderer, keyed by the renderer's name. Fetchers are probed
/// one after another, HTTP first.
pub async fn check_health(&self) -> HashMap<String, bool> {
    let mut report = HashMap::new();

    let http_ok = self.http.is_available().await;
    report.insert("http".to_string(), http_ok);

    for backend in self.js_renderers.iter() {
        let key = backend.name().to_string();
        let available = backend.is_available().await;
        report.insert(key, available);
    }

    report
}
}
/// Estimates how much text an HTML document carries.
///
/// The count covers the content between the first `<body …>` opening tag and
/// the first `</body>` that follows it, or the whole input when there is no
/// `<body` at all. Characters inside `<…>` are skipped, and each run of
/// whitespace counts as one character; leading whitespace counts as zero.
/// The scan is a simple character pass, not an HTML parser, so script and
/// style contents are counted like any other text.
///
/// Returns the number of counted characters.
fn html_body_text_len(html: &str) -> usize {
    let body = match html.find("<body") {
        Some(tag_pos) => {
            // Skip past the end of the opening tag. If the tag is never
            // closed, fall back to scanning from the start of the document.
            let start = html[tag_pos..]
                .find('>')
                .map_or(0, |i| tag_pos + i + 1);
            // Look for the closing tag only after the opening one. A
            // "</body>" literal earlier in the document (for example inside
            // a script string in <head>) would otherwise give end < start
            // and make the slice below panic.
            let end = html[start..]
                .find("</body>")
                .map_or(html.len(), |i| start + i);
            &html[start..end]
        }
        None => html,
    };

    let mut in_tag = false;
    let mut text_len = 0;
    // Starts true so leading whitespace contributes nothing.
    let mut prev_ws = true;
    for ch in body.chars() {
        match ch {
            '<' => in_tag = true,
            '>' => in_tag = false,
            _ if in_tag => {}
            c if c.is_whitespace() => {
                // Collapse a whitespace run into a single counted character.
                if !prev_ws {
                    text_len += 1;
                    prev_ws = true;
                }
            }
            _ => {
                text_len += 1;
                prev_ws = false;
            }
        }
    }
    text_len
}