Skip to main content

crw_renderer/
lib.rs

1//! HTTP and headless-browser rendering engine for the CRW web scraper.
2//!
3//! Provides a [`FallbackRenderer`] that fetches pages via plain HTTP and optionally
4//! re-renders them through a CDP-based headless browser when SPA content is detected.
5//!
6//! - [`http_only`] — Simple HTTP fetcher using `reqwest`
7//! - [`detector`] — Heuristic SPA shell detection (empty body, framework markers)
8//! - `cdp` — Chrome DevTools Protocol renderer (LightPanda, Playwright, Chrome) *(requires `cdp` feature)*
9//! - [`traits`] — [`PageFetcher`] trait for pluggable backends
10//!
11//! # Feature flags
12//!
13//! | Flag  | Description |
14//! |-------|-------------|
15//! | `cdp` | Enables CDP WebSocket rendering via `tokio-tungstenite` |
16//!
17//! # Example
18//!
19//! ```rust,no_run
20//! use crw_core::config::RendererConfig;
21//! use crw_renderer::FallbackRenderer;
22//! use std::collections::HashMap;
23//!
24//! # async fn example() -> Result<(), Box<dyn std::error::Error>> {
25//! use crw_core::config::StealthConfig;
26//! let config = RendererConfig::default();
27//! let stealth = StealthConfig::default();
28//! let renderer = FallbackRenderer::new(&config, "crw/0.1", None, &stealth);
29//! let result = renderer.fetch("https://example.com", &HashMap::new(), None, None).await?;
30//! println!("status: {}", result.status_code);
31//! # Ok(())
32//! # }
33//! ```
34
35#[cfg(feature = "auto-browser")]
36pub mod browser;
37#[cfg(feature = "cdp")]
38pub mod cdp;
39pub mod detector;
40pub mod http_only;
41pub mod traits;
42
43use crw_core::config::{BUILTIN_UA_POOL, RendererConfig, StealthConfig};
44use crw_core::error::{CrwError, CrwResult};
45use crw_core::types::FetchResult;
46use std::collections::HashMap;
47use std::sync::Arc;
48use traits::PageFetcher;
49
50/// Pick a user-agent: rotate from stealth pool when stealth is enabled.
51fn pick_ua<'a>(default_ua: &'a str, stealth: &'a StealthConfig) -> String {
52    if stealth.enabled {
53        let pool: &[&str] = if stealth.user_agents.is_empty() {
54            BUILTIN_UA_POOL
55        } else {
56            // Safe: user_agents is non-empty in this branch.
57            return stealth.user_agents[rand::random::<usize>() % stealth.user_agents.len()]
58                .clone();
59        };
60        pool[rand::random::<usize>() % pool.len()].to_string()
61    } else {
62        default_ua.to_string()
63    }
64}
65
/// Composite renderer that tries multiple backends in order.
///
/// Holds one plain-HTTP fetcher plus zero or more JS-capable renderers, all
/// behind the [`PageFetcher`] trait object interface.
pub struct FallbackRenderer {
    /// Plain HTTP backend; every `fetch` call issues its first request here.
    http: Arc<dyn PageFetcher>,
    /// JS-capable backends, tried front to back. Empty when JS rendering is
    /// disabled or no CDP endpoint was configured.
    js_renderers: Vec<Arc<dyn PageFetcher>>,
}
71
72impl FallbackRenderer {
73    pub fn new(
74        config: &RendererConfig,
75        user_agent: &str,
76        proxy: Option<&str>,
77        stealth: &StealthConfig,
78    ) -> Self {
79        let effective_ua = pick_ua(user_agent, stealth);
80        let inject_headers = stealth.enabled && stealth.inject_headers;
81        let http = Arc::new(http_only::HttpFetcher::new(
82            &effective_ua,
83            proxy,
84            inject_headers,
85        )) as Arc<dyn PageFetcher>;
86
87        #[allow(unused_mut)]
88        let mut js_renderers: Vec<Arc<dyn PageFetcher>> = Vec::new();
89
90        if config.mode == "none" {
91            return Self { http, js_renderers };
92        }
93
94        #[cfg(feature = "cdp")]
95        {
96            if let Some(lp) = &config.lightpanda {
97                js_renderers.push(Arc::new(cdp::CdpRenderer::new(
98                    "lightpanda",
99                    &lp.ws_url,
100                    config.page_timeout_ms,
101                    config.pool_size,
102                )));
103            }
104            if let Some(pw) = &config.playwright {
105                js_renderers.push(Arc::new(cdp::CdpRenderer::new(
106                    "playwright",
107                    &pw.ws_url,
108                    config.page_timeout_ms,
109                    config.pool_size,
110                )));
111            }
112            if let Some(ch) = &config.chrome {
113                js_renderers.push(Arc::new(cdp::CdpRenderer::new(
114                    "chrome",
115                    &ch.ws_url,
116                    config.page_timeout_ms,
117                    config.pool_size,
118                )));
119            }
120        }
121
122        #[cfg(not(feature = "cdp"))]
123        if config.lightpanda.is_some() || config.playwright.is_some() || config.chrome.is_some() {
124            tracing::warn!(
125                "CDP renderers configured but 'cdp' feature not enabled. JS rendering disabled."
126            );
127        }
128
129        Self { http, js_renderers }
130    }
131
132    /// Fetch a URL with smart mode: HTTP first, then JS if needed.
133    ///
134    /// When `render_js` is `None` (auto-detect), the renderer also escalates to
135    /// JS rendering if the HTTP response looks like an anti-bot challenge page
136    /// (Cloudflare "Just a moment...", etc.). The CDP renderer has built-in
137    /// challenge retry logic that waits for non-interactive JS challenges to
138    /// auto-resolve.
139    pub async fn fetch(
140        &self,
141        url: &str,
142        headers: &HashMap<String, String>,
143        render_js: Option<bool>,
144        wait_for_ms: Option<u64>,
145    ) -> CrwResult<FetchResult> {
146        match render_js {
147            Some(false) => self.http.fetch(url, headers, None).await,
148            Some(true) => {
149                // Fetch via HTTP first to check content type — PDFs can't be JS-rendered.
150                let http_result = self.http.fetch(url, headers, None).await?;
151                if http_result.content_type.as_deref() == Some("application/pdf") {
152                    return Ok(http_result);
153                }
154
155                if self.js_renderers.is_empty() {
156                    tracing::warn!(
157                        url,
158                        "JS rendering requested but no renderer available — falling back to HTTP"
159                    );
160                    let mut result = http_result;
161                    result.rendered_with = Some("http_only_fallback".to_string());
162                    result.warning = Some("JS rendering was requested but no renderer is available. Content was fetched via HTTP only.".to_string());
163                    Ok(result)
164                } else {
165                    self.fetch_with_js(url, headers, wait_for_ms).await
166                }
167            }
168            None => {
169                let result = self.http.fetch(url, headers, None).await?;
170
171                // PDFs don't need JS rendering — return immediately.
172                if result.content_type.as_deref() == Some("application/pdf") {
173                    return Ok(result);
174                }
175
176                let needs_js = detector::needs_js_rendering(&result.html);
177                let is_blocked = Self::looks_like_challenge(&result.html);
178                let is_auth_blocked = matches!(result.status_code, 401 | 403);
179
180                if !self.js_renderers.is_empty() && (needs_js || is_blocked || is_auth_blocked) {
181                    if is_auth_blocked {
182                        tracing::info!(
183                            url,
184                            status_code = result.status_code,
185                            "HTTP {} received, escalating to JS renderer",
186                            result.status_code
187                        );
188                    } else if is_blocked {
189                        tracing::info!(
190                            url,
191                            "Anti-bot challenge detected in HTTP response, escalating to JS renderer"
192                        );
193                    } else {
194                        tracing::info!(url, "SPA shell detected, retrying with JS renderer");
195                    }
196                    match self.fetch_with_js(url, headers, wait_for_ms).await {
197                        Ok(js_result) => Ok(js_result),
198                        Err(e) => {
199                            tracing::warn!("JS rendering failed, falling back to HTTP result: {e}");
200                            Ok(result)
201                        }
202                    }
203                } else {
204                    Ok(result)
205                }
206            }
207        }
208    }
209
210    /// Quick check if HTML looks like an anti-bot challenge/interstitial page.
211    fn looks_like_challenge(html: &str) -> bool {
212        if html.len() > 50_000 {
213            return false;
214        }
215        let lower = html.to_lowercase();
216        lower.contains("just a moment")
217            || lower.contains("cf-browser-verification")
218            || lower.contains("cf-challenge-running")
219            || lower.contains("challenge-platform")
220            || (lower.contains("attention required") && lower.contains("cloudflare"))
221    }
222
223    /// Minimum body text length for a JS-rendered result to be considered
224    /// successful. If the rendered page has less visible text than this, the
225    /// next renderer in the chain is tried.
226    const MIN_RENDERED_TEXT_LEN: usize = 50;
227
228    async fn fetch_with_js(
229        &self,
230        url: &str,
231        headers: &HashMap<String, String>,
232        wait_for_ms: Option<u64>,
233    ) -> CrwResult<FetchResult> {
234        let mut last_error = None;
235        let mut thin_result: Option<FetchResult> = None;
236        for renderer in &self.js_renderers {
237            match renderer.fetch(url, headers, wait_for_ms).await {
238                Ok(result) => {
239                    let text_len = html_body_text_len(&result.html);
240                    if text_len >= Self::MIN_RENDERED_TEXT_LEN {
241                        return Ok(result);
242                    }
243                    tracing::info!(
244                        renderer = renderer.name(),
245                        text_len,
246                        "JS renderer returned thin content, trying next renderer"
247                    );
248                    if thin_result.is_none() {
249                        thin_result = Some(result);
250                    }
251                }
252                Err(e) => {
253                    tracing::warn!(renderer = renderer.name(), "JS renderer failed: {e}");
254                    last_error = Some(e);
255                    continue;
256                }
257            }
258        }
259        // Return the best thin result if we have one, otherwise the last error.
260        if let Some(result) = thin_result {
261            Ok(result)
262        } else {
263            Err(last_error
264                .unwrap_or_else(|| CrwError::RendererError("No JS renderer available".to_string())))
265        }
266    }
267
268    /// Check availability of all renderers.
269    pub async fn check_health(&self) -> HashMap<String, bool> {
270        let mut health = HashMap::new();
271        health.insert("http".to_string(), self.http.is_available().await);
272        for r in &self.js_renderers {
273            health.insert(r.name().to_string(), r.is_available().await);
274        }
275        health
276    }
277}
278
/// Rough estimate of visible text length in an HTML document.
///
/// Strips tags and collapses whitespace. Used to detect "thin" renders
/// where a renderer returned HTML but failed to execute JavaScript.
///
/// Only the content between the first `<body ...>` tag and the next
/// `</body>` after it is measured; when there is no `<body` tag the whole
/// input is used. Each run of whitespace counts as one character and leading
/// whitespace is not counted. Text inside elements such as `<script>` is not
/// excluded — this is a heuristic, not an HTML parser.
///
/// Never panics, even on malformed markup such as a stray `</body>` that
/// precedes the opening `<body>` tag.
fn html_body_text_len(html: &str) -> usize {
    // Extract body content if present, otherwise use entire HTML.
    let body = match html.find("<body") {
        Some(tag_start) => {
            // Skip past the end of the opening tag; if it is unterminated,
            // fall back to the start of the document.
            let content_start = html[tag_start..]
                .find('>')
                .map_or(0, |offset| tag_start + offset + 1);
            // Look for the closing tag only *after* the content start so the
            // slice bounds can never be inverted.
            let content_end = html[content_start..]
                .find("</body>")
                .map_or(html.len(), |offset| content_start + offset);
            &html[content_start..content_end]
        }
        None => html,
    };

    // Strip tags crudely: everything between '<' and '>' is markup.
    let mut in_tag = false;
    let mut text_len = 0;
    // Starts true so leading whitespace is not counted.
    let mut prev_ws = true;
    for ch in body.chars() {
        if ch == '<' {
            in_tag = true;
        } else if ch == '>' {
            in_tag = false;
        } else if !in_tag {
            if ch.is_whitespace() {
                // Collapse a whitespace run into a single counted character.
                if !prev_ws {
                    text_len += 1;
                    prev_ws = true;
                }
            } else {
                text_len += 1;
                prev_ws = false;
            }
        }
    }
    text_len
}