Skip to main content

crw_renderer/
lib.rs

//! HTTP and headless-browser rendering engine for the CRW web scraper.
//!
//! Provides a [`FallbackRenderer`] that fetches pages via plain HTTP and optionally
//! re-renders them through a CDP-based headless browser when SPA content is detected.
//!
//! - [`http_only`] — Simple HTTP fetcher using `reqwest`
//! - [`detector`] — Heuristic SPA shell detection (empty body, framework markers)
//! - `cdp` — Chrome DevTools Protocol renderer (LightPanda, Playwright, Chrome) *(requires `cdp` feature)*
//! - [`traits`] — [`PageFetcher`] trait for pluggable backends
//!
//! # Feature flags
//!
//! | Flag  | Description |
//! |-------|-------------|
//! | `cdp` | Enables CDP WebSocket rendering via `tokio-tungstenite` |
//!
//! # Example
//!
//! ```rust,no_run
//! use crw_core::config::RendererConfig;
//! use crw_renderer::FallbackRenderer;
//! use std::collections::HashMap;
//!
//! # async fn example() -> Result<(), Box<dyn std::error::Error>> {
//! use crw_core::config::StealthConfig;
//! let config = RendererConfig::default();
//! let stealth = StealthConfig::default();
//! let renderer = FallbackRenderer::new(&config, "crw/0.1", None, &stealth);
//! let result = renderer.fetch("https://example.com", &HashMap::new(), None, None).await?;
//! println!("status: {}", result.status_code);
//! # Ok(())
//! # }
//! ```
34
35#[cfg(feature = "cdp")]
36pub mod cdp;
37pub mod detector;
38pub mod http_only;
39pub mod traits;
40
41use crw_core::config::{BUILTIN_UA_POOL, RendererConfig, StealthConfig};
42use crw_core::error::{CrwError, CrwResult};
43use crw_core::types::FetchResult;
44use std::collections::HashMap;
45use std::sync::Arc;
46use traits::PageFetcher;
47
48/// Pick a user-agent: rotate from stealth pool when stealth is enabled.
49fn pick_ua<'a>(default_ua: &'a str, stealth: &'a StealthConfig) -> String {
50    if stealth.enabled {
51        let pool: &[&str] = if stealth.user_agents.is_empty() {
52            BUILTIN_UA_POOL
53        } else {
54            // Safe: user_agents is non-empty in this branch.
55            return stealth.user_agents[rand::random::<usize>() % stealth.user_agents.len()]
56                .clone();
57        };
58        pool[rand::random::<usize>() % pool.len()].to_string()
59    } else {
60        default_ua.to_string()
61    }
62}
63
64/// Composite renderer that tries multiple backends in order.
65pub struct FallbackRenderer {
66    http: Arc<dyn PageFetcher>,
67    js_renderers: Vec<Arc<dyn PageFetcher>>,
68}
69
70impl FallbackRenderer {
71    pub fn new(
72        config: &RendererConfig,
73        user_agent: &str,
74        proxy: Option<&str>,
75        stealth: &StealthConfig,
76    ) -> Self {
77        let effective_ua = pick_ua(user_agent, stealth);
78        let inject_headers = stealth.enabled && stealth.inject_headers;
79        let http = Arc::new(http_only::HttpFetcher::new(
80            &effective_ua,
81            proxy,
82            inject_headers,
83        )) as Arc<dyn PageFetcher>;
84
85        #[allow(unused_mut)]
86        let mut js_renderers: Vec<Arc<dyn PageFetcher>> = Vec::new();
87
88        if config.mode == "none" {
89            return Self { http, js_renderers };
90        }
91
92        #[cfg(feature = "cdp")]
93        {
94            if let Some(lp) = &config.lightpanda {
95                js_renderers.push(Arc::new(cdp::CdpRenderer::new(
96                    "lightpanda",
97                    &lp.ws_url,
98                    config.page_timeout_ms,
99                    config.pool_size,
100                )));
101            }
102            if let Some(pw) = &config.playwright {
103                js_renderers.push(Arc::new(cdp::CdpRenderer::new(
104                    "playwright",
105                    &pw.ws_url,
106                    config.page_timeout_ms,
107                    config.pool_size,
108                )));
109            }
110            if let Some(ch) = &config.chrome {
111                js_renderers.push(Arc::new(cdp::CdpRenderer::new(
112                    "chrome",
113                    &ch.ws_url,
114                    config.page_timeout_ms,
115                    config.pool_size,
116                )));
117            }
118        }
119
120        #[cfg(not(feature = "cdp"))]
121        if config.lightpanda.is_some() || config.playwright.is_some() || config.chrome.is_some() {
122            tracing::warn!(
123                "CDP renderers configured but 'cdp' feature not enabled. JS rendering disabled."
124            );
125        }
126
127        Self { http, js_renderers }
128    }
129
130    /// Fetch a URL with smart mode: HTTP first, then JS if needed.
131    ///
132    /// When `render_js` is `None` (auto-detect), the renderer also escalates to
133    /// JS rendering if the HTTP response looks like an anti-bot challenge page
134    /// (Cloudflare "Just a moment...", etc.). The CDP renderer has built-in
135    /// challenge retry logic that waits for non-interactive JS challenges to
136    /// auto-resolve.
137    pub async fn fetch(
138        &self,
139        url: &str,
140        headers: &HashMap<String, String>,
141        render_js: Option<bool>,
142        wait_for_ms: Option<u64>,
143    ) -> CrwResult<FetchResult> {
144        match render_js {
145            Some(false) => self.http.fetch(url, headers, None).await,
146            Some(true) => {
147                // Fetch via HTTP first to check content type — PDFs can't be JS-rendered.
148                let http_result = self.http.fetch(url, headers, None).await?;
149                if http_result.content_type.as_deref() == Some("application/pdf") {
150                    return Ok(http_result);
151                }
152
153                if self.js_renderers.is_empty() {
154                    tracing::warn!(
155                        url,
156                        "JS rendering requested but no renderer available — falling back to HTTP"
157                    );
158                    let mut result = http_result;
159                    result.rendered_with = Some("http_only_fallback".to_string());
160                    result.warning = Some("JS rendering was requested but no renderer is available. Content was fetched via HTTP only.".to_string());
161                    Ok(result)
162                } else {
163                    self.fetch_with_js(url, headers, wait_for_ms).await
164                }
165            }
166            None => {
167                let result = self.http.fetch(url, headers, None).await?;
168
169                // PDFs don't need JS rendering — return immediately.
170                if result.content_type.as_deref() == Some("application/pdf") {
171                    return Ok(result);
172                }
173
174                let needs_js = detector::needs_js_rendering(&result.html);
175                let is_blocked = Self::looks_like_challenge(&result.html);
176                let is_auth_blocked = matches!(result.status_code, 401 | 403);
177
178                if !self.js_renderers.is_empty() && (needs_js || is_blocked || is_auth_blocked) {
179                    if is_auth_blocked {
180                        tracing::info!(
181                            url,
182                            status_code = result.status_code,
183                            "HTTP {} received, escalating to JS renderer",
184                            result.status_code
185                        );
186                    } else if is_blocked {
187                        tracing::info!(
188                            url,
189                            "Anti-bot challenge detected in HTTP response, escalating to JS renderer"
190                        );
191                    } else {
192                        tracing::info!(url, "SPA shell detected, retrying with JS renderer");
193                    }
194                    match self.fetch_with_js(url, headers, wait_for_ms).await {
195                        Ok(js_result) => Ok(js_result),
196                        Err(e) => {
197                            tracing::warn!("JS rendering failed, falling back to HTTP result: {e}");
198                            Ok(result)
199                        }
200                    }
201                } else {
202                    Ok(result)
203                }
204            }
205        }
206    }
207
208    /// Quick check if HTML looks like an anti-bot challenge/interstitial page.
209    fn looks_like_challenge(html: &str) -> bool {
210        if html.len() > 50_000 {
211            return false;
212        }
213        let lower = html.to_lowercase();
214        lower.contains("just a moment")
215            || lower.contains("cf-browser-verification")
216            || lower.contains("cf-challenge-running")
217            || lower.contains("challenge-platform")
218            || (lower.contains("attention required") && lower.contains("cloudflare"))
219    }
220
221    /// Minimum body text length for a JS-rendered result to be considered
222    /// successful. If the rendered page has less visible text than this, the
223    /// next renderer in the chain is tried.
224    const MIN_RENDERED_TEXT_LEN: usize = 50;
225
226    async fn fetch_with_js(
227        &self,
228        url: &str,
229        headers: &HashMap<String, String>,
230        wait_for_ms: Option<u64>,
231    ) -> CrwResult<FetchResult> {
232        let mut last_error = None;
233        let mut thin_result: Option<FetchResult> = None;
234        for renderer in &self.js_renderers {
235            match renderer.fetch(url, headers, wait_for_ms).await {
236                Ok(result) => {
237                    let text_len = html_body_text_len(&result.html);
238                    if text_len >= Self::MIN_RENDERED_TEXT_LEN {
239                        return Ok(result);
240                    }
241                    tracing::info!(
242                        renderer = renderer.name(),
243                        text_len,
244                        "JS renderer returned thin content, trying next renderer"
245                    );
246                    if thin_result.is_none() {
247                        thin_result = Some(result);
248                    }
249                }
250                Err(e) => {
251                    tracing::warn!(renderer = renderer.name(), "JS renderer failed: {e}");
252                    last_error = Some(e);
253                    continue;
254                }
255            }
256        }
257        // Return the best thin result if we have one, otherwise the last error.
258        if let Some(result) = thin_result {
259            Ok(result)
260        } else {
261            Err(last_error
262                .unwrap_or_else(|| CrwError::RendererError("No JS renderer available".to_string())))
263        }
264    }
265
266    /// Check availability of all renderers.
267    pub async fn check_health(&self) -> HashMap<String, bool> {
268        let mut health = HashMap::new();
269        health.insert("http".to_string(), self.http.is_available().await);
270        for r in &self.js_renderers {
271            health.insert(r.name().to_string(), r.is_available().await);
272        }
273        health
274    }
275}
276
/// Rough estimate of visible text length in an HTML document.
///
/// Restricts the scan to the contents of `<body …>` … `</body>` when a
/// `<body` tag is present (otherwise the whole input is used), drops
/// everything between `<` and `>`, and counts the remaining characters with
/// each run of whitespace collapsed to a single character; leading whitespace
/// is not counted. Used to detect "thin" renders where a renderer returned
/// HTML but failed to execute JavaScript.
///
/// The closing `</body>` is searched for *after* the opening tag, so a stray
/// `</body>` earlier in the document cannot yield an inverted slice range
/// (which would panic).
fn html_body_text_len(html: &str) -> usize {
    // Extract body content if present, otherwise use entire HTML.
    let body = match html.find("<body") {
        Some(tag_start) => {
            // If the opening tag is never closed, scan from the very beginning.
            let content_start = html[tag_start..]
                .find('>')
                .map_or(0, |offset| tag_start + offset + 1);
            let rest = &html[content_start..];
            rest.find("</body>").map_or(rest, |end| &rest[..end])
        }
        None => html,
    };

    // Strip tags crudely and collapse whitespace runs while counting.
    let mut in_tag = false;
    let mut prev_ws = true; // starts true so leading whitespace is not counted
    let mut text_len = 0usize;
    for ch in body.chars() {
        match ch {
            '<' => in_tag = true,
            '>' => in_tag = false,
            _ if in_tag => {}
            c if c.is_whitespace() => {
                if !prev_ws {
                    text_len += 1;
                    prev_ws = true;
                }
            }
            _ => {
                text_len += 1;
                prev_ws = false;
            }
        }
    }
    text_len
}