Skip to main content

crw_core/
config.rs

1use serde::Deserialize;
2
3#[derive(Debug, Clone, Deserialize, Default)]
4pub struct AppConfig {
5    #[serde(default)]
6    pub server: ServerConfig,
7    #[serde(default)]
8    pub renderer: RendererConfig,
9    #[serde(default)]
10    pub crawler: CrawlerConfig,
11    #[serde(default)]
12    pub extraction: ExtractionConfig,
13    #[serde(default)]
14    pub auth: AuthConfig,
15    #[serde(default)]
16    pub request: RequestConfig,
17    #[serde(default)]
18    pub search: SearchConfig,
19}
20
/// Extra budget, in milliseconds, charged once per active CDP tier.
///
/// It is the sum of the SPA selector poll budget, the challenge retry
/// budget, the content-stability budget and the fetch overhead. The value
/// mirrors constants defined in `crw-renderer::cdp`; drift between the two
/// is regression-tested by `crates/crw-server/tests/cdp_constants_test.rs`
/// (gated behind `feature = "cdp"`).
///
/// [`RendererConfig::min_deadline_for_full_ladder_ms`] adds it per tier so
/// that the request deadline covers each CDP tier's outer fetch timeout and
/// not merely its configured `page_timeout`.
pub const CDP_TIER_OVERHEAD_MS: u64 = 28_000;
31
/// Largest per-request `wait_for_ms` budget that is honoured, in
/// milliseconds.
///
/// The Tower outer timeout is sized so that a worst-case implicit scrape
/// (no `deadlineMs`, `wait_for` at this maximum) still finishes inside it.
/// [`AppConfig::effective_deadline_ms`] clamps larger values to this bound,
/// so the inner deadline cannot escape the outer envelope. The accepted
/// range is documented as `(0, 60000]` in
/// `types.rs::ScrapeRequest::wait_for`.
pub const MAX_WAIT_FOR_MS: u64 = 60_000;
39
40/// Configuration for the `/v1/search` endpoint and its SearXNG backend.
41///
42/// When `searxng_url` is unset the endpoint returns HTTP 503 with
43/// `error_code: "search_disabled"` — the route remains mounted so that
44/// startup doesn't have to know whether search will ever be configured.
45#[derive(Debug, Clone, Deserialize)]
46pub struct SearchConfig {
47    /// Master switch. Defaults to `true`; set to `false` to refuse all
48    /// `/v1/search` requests even if `searxng_url` is configured.
49    #[serde(default = "default_true_search")]
50    pub enabled: bool,
51    /// Base URL of the SearXNG instance (e.g. `http://searxng:8080`).
52    /// `None` (the default) disables the endpoint with a clear error.
53    #[serde(default)]
54    pub searxng_url: Option<String>,
55    /// End-to-end timeout for the SearXNG call in milliseconds.
56    #[serde(default = "default_search_timeout_ms")]
57    pub timeout_ms: u64,
58    /// Default `limit` when the request omits it.
59    #[serde(default = "default_search_limit")]
60    pub default_limit: u32,
61    /// Hard cap on `limit` per request. SaaS uses 20.
62    #[serde(default = "default_search_max_limit")]
63    pub max_limit: u32,
64    /// SearXNG engines invoked when the request includes `categories: ["research"]`.
65    /// Defaults match the SaaS implementation.
66    #[serde(default = "default_research_engines")]
67    pub research_engines: Vec<String>,
68    /// SearXNG engines invoked when the request includes `categories: ["github"]`.
69    #[serde(default = "default_github_engines")]
70    pub github_engines: Vec<String>,
71}
72
73impl Default for SearchConfig {
74    fn default() -> Self {
75        Self {
76            enabled: true,
77            searxng_url: None,
78            timeout_ms: default_search_timeout_ms(),
79            default_limit: default_search_limit(),
80            max_limit: default_search_max_limit(),
81            research_engines: default_research_engines(),
82            github_engines: default_github_engines(),
83        }
84    }
85}
86
/// Serde default for `SearchConfig::enabled`.
fn default_true_search() -> bool {
    true
}

/// Serde default for `SearchConfig::timeout_ms` (15 seconds).
fn default_search_timeout_ms() -> u64 {
    15_000_u64
}

/// Serde default for `SearchConfig::default_limit`.
fn default_search_limit() -> u32 {
    5_u32
}

/// Serde default for `SearchConfig::max_limit`.
fn default_search_max_limit() -> u32 {
    20_u32
}

/// Serde default for `SearchConfig::research_engines`.
fn default_research_engines() -> Vec<String> {
    ["arxiv", "crossref", "google scholar", "semantic scholar"]
        .iter()
        .map(|&engine| String::from(engine))
        .collect()
}

/// Serde default for `SearchConfig::github_engines`.
fn default_github_engines() -> Vec<String> {
    vec![String::from("github")]
}
110
111/// Per-request defaults that apply to every scrape, crawl, or map call when
112/// the caller does not specify an override. Currently only governs the
113/// end-to-end deadline budget (see `crw-core/src/deadline.rs`).
114#[derive(Debug, Clone, Deserialize)]
115pub struct RequestConfig {
116    /// Default end-to-end deadline budget in milliseconds when a request does
117    /// not specify `deadlineMs`. The SLO p95 latency metric is computed only
118    /// over requests with `deadline_ms <= 8000`; longer values land in a
119    /// separate slow-path histogram.
120    #[serde(default = "default_deadline_ms")]
121    pub deadline_ms_default: u64,
122    /// When `true` (default), an implicit deadline (no per-request `deadlineMs`)
123    /// is auto-extended to `max(deadline_ms_default, ladder_min)` where
124    /// `ladder_min = sum(http+lightpanda+chrome timeouts) + N_cdp_tiers * 28s`.
125    /// This prevents `chrome_timeout_ms = 30000` from appearing inert when
126    /// `deadline_ms_default` is small (issue #35).
127    ///
128    /// Set to `false` to enforce a strict SLO regardless of tier sizing —
129    /// requests that would have completed under the extended budget will
130    /// instead time out at `deadline_ms_default`.
131    #[serde(default = "default_true_request")]
132    pub auto_extend_deadline_for_ladder: bool,
133}
134
135impl Default for RequestConfig {
136    fn default() -> Self {
137        Self {
138            deadline_ms_default: default_deadline_ms(),
139            auto_extend_deadline_for_ladder: true,
140        }
141    }
142}
143
/// Serde default for `RequestConfig::auto_extend_deadline_for_ladder`.
fn default_true_request() -> bool {
    true
}

/// Serde default for `RequestConfig::deadline_ms_default` (8 seconds).
fn default_deadline_ms() -> u64 {
    8_000
}
151
152#[derive(Debug, Clone, Deserialize)]
153pub struct ServerConfig {
154    #[serde(default = "default_host")]
155    pub host: String,
156    #[serde(default = "default_port")]
157    pub port: u16,
158    #[serde(default = "default_request_timeout")]
159    pub request_timeout_secs: u64,
160    /// Maximum requests per second (global). 0 = unlimited.
161    #[serde(default = "default_rate_limit_rps")]
162    pub rate_limit_rps: u64,
163}
164
165impl Default for ServerConfig {
166    fn default() -> Self {
167        Self {
168            host: default_host(),
169            port: default_port(),
170            request_timeout_secs: default_request_timeout(),
171            rate_limit_rps: default_rate_limit_rps(),
172        }
173    }
174}
175
/// Serde default for `ServerConfig::rate_limit_rps`.
fn default_rate_limit_rps() -> u64 {
    10_u64
}

/// Serde default for `ServerConfig::host`: the IPv4 wildcard address.
fn default_host() -> String {
    String::from("0.0.0.0")
}

/// Serde default for `ServerConfig::port`.
fn default_port() -> u16 {
    3_000
}

/// Serde default for `ServerConfig::request_timeout_secs`.
fn default_request_timeout() -> u64 {
    60_u64
}
189
190/// Selects which JS renderer(s) the [`FallbackRenderer`] will build.
191///
192/// - `Auto` (default): try every configured CDP endpoint (Lightpanda, Playwright, Chrome)
193///   in order. If none is configured, JS rendering is disabled but HTTP still works.
194/// - `None`: HTTP-only. Never attempt JS rendering.
195/// - `Lightpanda` / `Chrome` / `Playwright`: require the matching `[renderer.<name>]`
196///   endpoint; fail startup if missing. Only the named backend is used.
197///
198/// [`FallbackRenderer`]: https://docs.rs/crw-renderer/latest/crw_renderer/struct.FallbackRenderer.html
199#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Deserialize)]
200#[serde(rename_all = "lowercase")]
201pub enum RendererMode {
202    #[default]
203    Auto,
204    None,
205    Lightpanda,
206    Chrome,
207    Playwright,
208}
209
210#[derive(Debug, Clone, Deserialize)]
211pub struct RendererConfig {
212    #[serde(default)]
213    pub mode: RendererMode,
214    /// Generic per-page navigation timeout. Used as the fallback when no
215    /// per-tier override is configured. Kept for backward compatibility — the
216    /// per-tier knobs below are preferred for new deployments.
217    #[serde(default = "default_page_timeout")]
218    pub page_timeout_ms: u64,
219    /// Override for the HTTP-only fetcher request timeout. Falls back to
220    /// `page_timeout_ms` when unset. HTTP responses arrive quickly when they
221    /// arrive at all, so 15s is generous and keeps slow upstreams from
222    /// hogging the request budget that should be spent on JS retries.
223    #[serde(default)]
224    pub http_timeout_ms: Option<u64>,
225    /// Override for the LightPanda CDP renderer. LightPanda completes most
226    /// renders in <10s; if it stalls past 20s it almost always means an
227    /// adversarial page that Chrome will render anyway, so failing fast and
228    /// escalating beats waiting it out.
229    #[serde(default)]
230    pub lightpanda_timeout_ms: Option<u64>,
231    /// Override for the full-Chromium tier. Chrome is the slow path
232    /// (gov/legal SPAs need 30–40s for `networkidle`); the larger budget here
233    /// recovers ~6 URLs per fc-wins iteration without affecting the fast path.
234    #[serde(default)]
235    pub chrome_timeout_ms: Option<u64>,
236    #[serde(default = "default_pool_size")]
237    pub pool_size: usize,
238    /// If set, applies to every request that doesn't specify `renderJs` explicitly.
239    /// `Some(true)` = force JS rendering; `Some(false)` = skip JS; `None` = auto-detect.
240    ///
241    /// Accepts the `force_js` alias for backward compatibility.
242    #[serde(default, alias = "force_js")]
243    pub render_js_default: Option<bool>,
244    #[serde(default)]
245    pub lightpanda: Option<CdpEndpoint>,
246    #[serde(default)]
247    pub playwright: Option<CdpEndpoint>,
248    #[serde(default)]
249    pub chrome: Option<CdpEndpoint>,
250    /// Enable Chrome resource interception (`Fetch.enable` blocking of media,
251    /// fonts, trackers). Default `false`; flipped after the CDP-fake suite
252    /// validates pump + cleanup behaviour. See plan Phase 2.
253    #[serde(default)]
254    pub chrome_intercept_resources: bool,
255    /// Additionally block `stylesheet` requests when interception is enabled.
256    /// Default `false` — kept off in v1 because some extractors depend on
257    /// CSS-driven visibility / lazy-content triggers.
258    #[serde(default)]
259    pub chrome_intercept_stylesheets: bool,
260    /// Per-host opt-out for chrome interception. Hosts in this list run with
261    /// interception disabled even when `chrome_intercept_resources = true`.
262    #[serde(default)]
263    pub chrome_host_intercept_disable: Vec<String>,
264    /// Hard chrome-tier navigation budget in ms. Wraps `wait_for_page_ready`
265    /// in an inner race; on budget hit the renderer snapshots whatever DOM is
266    /// present and returns `truncated = true`. Calibrated as
267    /// `p90(successful chrome renders)` clamped to `[8_000, 12_000]`.
268    #[serde(default = "default_chrome_nav_budget_ms")]
269    pub chrome_nav_budget_ms: u64,
270    /// Enable the bounded browser-context pool. Default `false`; v1 ships
271    /// `RECYCLE_AFTER_NAV = 1` (recreate every release) before optimising to
272    /// reuse-with-clearing. See plan Phase 4.
273    #[serde(default)]
274    pub chrome_context_pool_enabled: bool,
275    /// Enable the success-ratio renderer predictor in `HostPreferences`.
276    /// Default `false`; flipped after the predictor replay harness gates
277    /// on the 1k bench (false-skip < 2 %, false-escalate < 5 %, churn < 3 / 1k).
278    #[serde(default)]
279    pub use_predictor: bool,
280    /// Engine escalation policy (firecrawl-shaped: race + on-error). When
281    /// disabled (default), the renderer keeps its current ladder unchanged.
282    #[serde(default)]
283    pub escalation: EscalationConfig,
284    /// Anti-bot detection policy (crawl4ai 3-tier classifier).
285    #[serde(default)]
286    pub antibot: AntibotConfig,
287}
288
289/// Engine escalation policy — adds `ChromeStealth` and `ChromeStealthProxy`
290/// tiers behind a feature flag. See `plans/recall-next-tier.md` Phase 2.
291#[derive(Debug, Clone, Deserialize)]
292pub struct EscalationConfig {
293    /// Master switch. Default `false` — current ladder runs unchanged.
294    #[serde(default)]
295    pub enabled: bool,
296    /// Per-tier waterfall trigger in ms. If the current engine hasn't returned
297    /// after this long, the next tier is started in parallel (firecrawl
298    /// `WaterfallNextEngineSignal`).
299    #[serde(default = "default_waterfall_timeout_ms")]
300    pub waterfall_timeout_ms: u64,
301    /// Hard global cap across the whole ladder.
302    #[serde(default = "default_escalation_global_timeout_ms")]
303    pub global_timeout_ms: u64,
304    /// Send `?proxy=residential&proxyCountry=…` to browserless on the
305    /// `ChromeStealthProxy` tier. Off by default — bears cost.
306    #[serde(default)]
307    pub residential_proxy: bool,
308    /// Country code passed to browserless when `residential_proxy = true`.
309    #[serde(default = "default_proxy_country")]
310    pub proxy_country: String,
311}
312
313impl Default for EscalationConfig {
314    fn default() -> Self {
315        Self {
316            enabled: false,
317            waterfall_timeout_ms: default_waterfall_timeout_ms(),
318            global_timeout_ms: default_escalation_global_timeout_ms(),
319            residential_proxy: false,
320            proxy_country: default_proxy_country(),
321        }
322    }
323}
324
/// Serde default for `EscalationConfig::waterfall_timeout_ms` (8 seconds).
fn default_waterfall_timeout_ms() -> u64 {
    8_000_u64
}

/// Serde default for `EscalationConfig::global_timeout_ms` (60 seconds).
fn default_escalation_global_timeout_ms() -> u64 {
    60_000_u64
}

/// Serde default for `EscalationConfig::proxy_country`.
fn default_proxy_country() -> String {
    String::from("us")
}
334
335/// Anti-bot classifier policy. Default: detect+log only; escalation requires
336/// `escalate_on_signal = true` AND `escalation.enabled = true`.
337#[derive(Debug, Clone, Deserialize)]
338pub struct AntibotConfig {
339    /// Run the classifier on every fetch result. Cheap; default on.
340    #[serde(default = "default_true")]
341    pub enabled: bool,
342    /// When the classifier returns a non-`None` signal, advance to the next
343    /// engine tier (requires `escalation.enabled`).
344    #[serde(default)]
345    pub escalate_on_signal: bool,
346}
347
348impl Default for AntibotConfig {
349    fn default() -> Self {
350        Self {
351            enabled: true,
352            escalate_on_signal: false,
353        }
354    }
355}
356
/// Serde default for `RendererConfig::chrome_nav_budget_ms` (12 seconds).
fn default_chrome_nav_budget_ms() -> u64 {
    12_000_u64
}
360
361impl Default for RendererConfig {
362    fn default() -> Self {
363        Self {
364            mode: RendererMode::default(),
365            page_timeout_ms: default_page_timeout(),
366            http_timeout_ms: None,
367            lightpanda_timeout_ms: None,
368            chrome_timeout_ms: None,
369            pool_size: default_pool_size(),
370            render_js_default: None,
371            lightpanda: None,
372            playwright: None,
373            chrome: None,
374            chrome_intercept_resources: false,
375            chrome_intercept_stylesheets: false,
376            chrome_host_intercept_disable: Vec::new(),
377            chrome_nav_budget_ms: default_chrome_nav_budget_ms(),
378            chrome_context_pool_enabled: false,
379            use_predictor: false,
380            escalation: EscalationConfig::default(),
381            antibot: AntibotConfig::default(),
382        }
383    }
384}
/// Serde default for `RendererConfig::page_timeout_ms` (30 seconds).
fn default_page_timeout() -> u64 {
    30_000
}
388
389impl RendererConfig {
390    /// Resolved per-tier nav timeout in milliseconds. Resolution rules:
391    ///   1. If the explicit per-tier field is set, use it verbatim.
392    ///   2. Otherwise fall back to `page_timeout_ms` (which itself defaults
393    ///      to 30s for backward compatibility with pre-multi-tier configs).
394    ///
395    /// New deployments are encouraged to set the per-tier knobs to 15/20/45s
396    /// (see config.docker.toml) — these match the bench-tuned values that
397    /// recover slow gov sites in the chrome tier without giving the http
398    /// tier permission to hog the request budget.
399    pub fn http_timeout(&self) -> u64 {
400        self.http_timeout_ms.unwrap_or(self.page_timeout_ms)
401    }
402    pub fn lightpanda_timeout(&self) -> u64 {
403        self.lightpanda_timeout_ms.unwrap_or(self.page_timeout_ms)
404    }
405    pub fn chrome_timeout(&self) -> u64 {
406        self.chrome_timeout_ms.unwrap_or(self.page_timeout_ms)
407    }
408
409    /// Number of active CDP tiers (lightpanda + playwright + chrome) under
410    /// the current `mode`. Mirrors the predicate used at runtime in
411    /// `crw-renderer/src/lib.rs` when constructing the renderer ladder:
412    /// `want(mode) && config.<tier>.is_some()`.
413    ///
414    /// Returns `0` when the binary is built without the `cdp` feature — in
415    /// that case no JS renderer can be constructed regardless of the config,
416    /// so the deadline auto-extension policy must collapse to HTTP-only.
417    pub fn cdp_tier_count(&self) -> usize {
418        if !cfg!(feature = "cdp") {
419            return 0;
420        }
421        let want =
422            |m: RendererMode| -> bool { matches!(self.mode, RendererMode::Auto) || self.mode == m };
423        let mut n = 0;
424        if want(RendererMode::Lightpanda) && self.lightpanda.is_some() {
425            n += 1;
426        }
427        if want(RendererMode::Playwright) && self.playwright.is_some() {
428            n += 1;
429        }
430        if want(RendererMode::Chrome) && self.chrome.is_some() {
431            n += 1;
432        }
433        n
434    }
435
436    /// Minimum request deadline budget (ms) required so that every configured
437    /// tier can use its full allowance when fallback exhausts the chain.
438    /// Sums the per-tier timeouts and adds [`CDP_TIER_OVERHEAD_MS`] for each
439    /// active CDP tier, matching the runtime ladder built in
440    /// `crw-renderer/src/lib.rs`.
441    pub fn min_deadline_for_full_ladder_ms(&self) -> u64 {
442        let want =
443            |m: RendererMode| -> bool { matches!(self.mode, RendererMode::Auto) || self.mode == m };
444
445        let mut sum: u64 = 0;
446        // HTTP prefetch runs ahead of any JS tier (content-type sniffing,
447        // direct PDF/binary handling) regardless of pinned mode. Skipped only
448        // when mode is `None` (no fetching at all).
449        if !matches!(self.mode, RendererMode::None) {
450            sum = sum.saturating_add(self.http_timeout());
451        }
452
453        // CDP tiers only contribute when the binary was built with the `cdp`
454        // feature; otherwise no JS renderer is constructable at runtime and
455        // including their budgets would over-extend the deadline.
456        if !cfg!(feature = "cdp") {
457            return sum;
458        }
459
460        let mut cdp_tier_count: u64 = 0;
461        if want(RendererMode::Lightpanda) && self.lightpanda.is_some() {
462            sum = sum.saturating_add(self.lightpanda_timeout());
463            cdp_tier_count += 1;
464        }
465        if want(RendererMode::Playwright) && self.playwright.is_some() {
466            sum = sum.saturating_add(self.chrome_timeout());
467            cdp_tier_count += 1;
468        }
469        if want(RendererMode::Chrome) && self.chrome.is_some() {
470            sum = sum.saturating_add(self.chrome_timeout());
471            cdp_tier_count += 1;
472        }
473        sum.saturating_add(cdp_tier_count.saturating_mul(CDP_TIER_OVERHEAD_MS))
474    }
475}
/// Serde default for `RendererConfig::pool_size`.
fn default_pool_size() -> usize {
    4_usize
}
479
480#[derive(Debug, Clone, Deserialize)]
481pub struct CdpEndpoint {
482    pub ws_url: String,
483}
484
485/// Stealth mode configuration for evading bot detection.
486#[derive(Debug, Clone, Deserialize)]
487pub struct StealthConfig {
488    /// Enable stealth mode globally.
489    #[serde(default)]
490    pub enabled: bool,
491    /// Custom user-agent pool. Empty = use built-in pool.
492    #[serde(default)]
493    pub user_agents: Vec<String>,
494    /// Jitter factor for rate limiting (0.0–1.0, default 0.2 = ±20%).
495    #[serde(default = "default_jitter")]
496    pub jitter_factor: f64,
497    /// Inject realistic browser headers (Accept, Sec-Fetch-*, etc.).
498    #[serde(default = "default_true")]
499    pub inject_headers: bool,
500}
501
502impl Default for StealthConfig {
503    fn default() -> Self {
504        Self {
505            enabled: false,
506            user_agents: vec![],
507            jitter_factor: default_jitter(),
508            inject_headers: true,
509        }
510    }
511}
512
/// Serde default for `StealthConfig::jitter_factor`.
fn default_jitter() -> f64 {
    0.2_f64
}
516
/// Realistic user-agent strings shipped with the crate and used when
/// stealth is enabled.
pub const BUILTIN_UA_POOL: &[&str] = &[
    // Chrome 131 on Windows 10 (x64).
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    // Chrome 131 on macOS.
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    // Chrome 131 on Linux (x86_64).
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    // Firefox 133 on Windows 10 (x64).
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
    // Safari 18.2 on macOS.
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15",
];
525
526#[derive(Debug, Clone, Deserialize)]
527pub struct CrawlerConfig {
528    #[serde(default = "default_concurrency")]
529    pub max_concurrency: usize,
530    #[serde(default = "default_rps")]
531    pub requests_per_second: f64,
532    #[serde(default = "default_true")]
533    pub respect_robots_txt: bool,
534    #[serde(default = "default_ua")]
535    pub user_agent: String,
536    #[serde(default = "default_depth")]
537    pub default_max_depth: u32,
538    #[serde(default = "default_max_pages")]
539    pub default_max_pages: u32,
540    /// Proxy URL for crawler requests. Supports HTTP, HTTPS, and SOCKS5
541    /// (e.g. "http://proxy:8080" or "socks5://user:pass@proxy:1080").
542    #[serde(default)]
543    pub proxy: Option<String>,
544    /// TTL in seconds for completed crawl jobs before cleanup (default: 3600)
545    #[serde(default = "default_job_ttl")]
546    pub job_ttl_secs: u64,
547    #[serde(default)]
548    pub stealth: StealthConfig,
549    /// Floor for the per-host limiter interval, in milliseconds. When a host
550    /// advertises `Crawl-delay` in robots.txt, the higher of the two wins.
551    /// Default `0` — robots.txt is the authoritative source, this is a
552    /// per-deployment safety net.
553    #[serde(default)]
554    pub per_host_min_interval_ms: u64,
555    /// Maximum concurrent in-flight requests against a single eTLD+1.
556    /// Default `1` — strict ethics posture; operators raise consciously via
557    /// config when scraping their own infrastructure.
558    #[serde(default = "default_per_host_max_concurrent")]
559    pub per_host_max_concurrent: u32,
560}
561
/// Serde default for `CrawlerConfig::per_host_max_concurrent`.
fn default_per_host_max_concurrent() -> u32 {
    1_u32
}
565
566impl Default for CrawlerConfig {
567    fn default() -> Self {
568        Self {
569            max_concurrency: default_concurrency(),
570            requests_per_second: default_rps(),
571            respect_robots_txt: true,
572            user_agent: default_ua(),
573            default_max_depth: default_depth(),
574            default_max_pages: default_max_pages(),
575            proxy: None,
576            job_ttl_secs: default_job_ttl(),
577            stealth: StealthConfig::default(),
578            per_host_min_interval_ms: 0,
579            per_host_max_concurrent: default_per_host_max_concurrent(),
580        }
581    }
582}
583
/// Serde default for `CrawlerConfig::max_concurrency`.
fn default_concurrency() -> usize {
    10_usize
}

/// Serde default for `CrawlerConfig::requests_per_second`.
fn default_rps() -> f64 {
    10.0_f64
}

/// Shared serde default for boolean fields that are on unless configured
/// otherwise.
fn default_true() -> bool {
    true
}

/// Serde default for `CrawlerConfig::user_agent`: a modern Chrome UA.
///
/// The legacy "CRW/0.1" was rejected by UA-filtering sites (opencorporates,
/// killeenisd, wsj) with 403/404. Kept in sync with the Sec-Ch-Ua client
/// hint in `crw-renderer/src/http_only.rs`.
fn default_ua() -> String {
    String::from(concat!(
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 ",
        "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    ))
}

/// Serde default for `CrawlerConfig::default_max_depth`.
fn default_depth() -> u32 {
    2_u32
}

/// Serde default for `CrawlerConfig::default_max_pages`.
fn default_max_pages() -> u32 {
    100_u32
}

/// Serde default for `CrawlerConfig::job_ttl_secs` (one hour).
fn default_job_ttl() -> u64 {
    3_600
}
610
611#[derive(Debug, Clone, Deserialize)]
612pub struct ExtractionConfig {
613    #[serde(default = "default_format")]
614    pub default_format: String,
615    #[serde(default = "default_true_ext")]
616    pub only_main_content: bool,
617    #[serde(default)]
618    pub llm: Option<LlmConfig>,
619    /// Hostname → CSS selector overrides applied before readability narrowing.
620    /// Match is exact host (no wildcard); user-supplied selector still wins.
621    #[serde(default)]
622    pub domain_selectors: std::collections::HashMap<String, String>,
623    #[serde(default)]
624    pub llm_fallback: LlmFallbackConfig,
625    /// Bytes below which an HTTP-tier extraction is treated as "thin"
626    /// and triggers a JS-renderer escalation. Default 100.
627    #[serde(default = "default_http_retry_threshold")]
628    pub http_retry_threshold_bytes: usize,
629    /// Bytes below which a LightPanda-tier extraction is treated as
630    /// "thin" and triggers a Chrome escalation. Default 2000 (LP often
631    /// returns SPA husks of 90–500B that pass HTML-shape checks).
632    #[serde(default = "default_lightpanda_retry_threshold")]
633    pub lightpanda_retry_threshold_bytes: usize,
634}
635
/// Serde default for `ExtractionConfig::http_retry_threshold_bytes`.
fn default_http_retry_threshold() -> usize {
    100_usize
}

/// Serde default for `ExtractionConfig::lightpanda_retry_threshold_bytes`.
fn default_lightpanda_retry_threshold() -> usize {
    2_000
}
643
644impl Default for ExtractionConfig {
645    fn default() -> Self {
646        Self {
647            default_format: default_format(),
648            only_main_content: true,
649            llm: None,
650            domain_selectors: std::collections::HashMap::new(),
651            llm_fallback: LlmFallbackConfig::default(),
652            http_retry_threshold_bytes: default_http_retry_threshold(),
653            lightpanda_retry_threshold_bytes: default_lightpanda_retry_threshold(),
654        }
655    }
656}
657
658#[derive(Debug, Clone, Deserialize)]
659pub struct LlmFallbackConfig {
660    #[serde(default)]
661    pub enable: bool,
662    #[serde(default = "default_llm_quality_threshold")]
663    pub quality_threshold: f32,
664    #[serde(default = "default_llm_max_html_bytes")]
665    pub max_html_bytes: usize,
666    /// When true (and `enable` is true), invoke the LLM on every page rather
667    /// than only when DOM-based extraction scores below `quality_threshold`.
668    /// Mirrors the "LLM as primary extractor" pattern used by Reader-LM,
669    /// Firecrawl, and similar services. Higher cost, higher recall.
670    #[serde(default)]
671    pub always_run: bool,
672}
673
674impl Default for LlmFallbackConfig {
675    fn default() -> Self {
676        Self {
677            enable: false,
678            quality_threshold: default_llm_quality_threshold(),
679            max_html_bytes: default_llm_max_html_bytes(),
680            always_run: false,
681        }
682    }
683}
684
/// Serde default for `LlmFallbackConfig::quality_threshold`.
fn default_llm_quality_threshold() -> f32 {
    0.3_f32
}

/// Serde default for `LlmFallbackConfig::max_html_bytes`.
fn default_llm_max_html_bytes() -> usize {
    100_000_usize
}
691
692#[derive(Debug, Clone, Deserialize)]
693pub struct LlmConfig {
694    #[serde(default = "default_llm_provider")]
695    pub provider: String,
696    pub api_key: String,
697    #[serde(default = "default_llm_model")]
698    pub model: String,
699    #[serde(default)]
700    pub base_url: Option<String>,
701    #[serde(default = "default_llm_max_tokens")]
702    pub max_tokens: u32,
703    /// Azure OpenAI API version (e.g. "2024-05-01-preview"). Required when
704    /// `provider = "azure"`; ignored otherwise.
705    #[serde(default)]
706    pub azure_api_version: Option<String>,
707}
708
/// Serde default for `LlmConfig::provider`.
fn default_llm_provider() -> String {
    String::from("anthropic")
}

/// Serde default for `LlmConfig::model`.
fn default_llm_model() -> String {
    String::from("claude-sonnet-4-20250514")
}

/// Serde default for `LlmConfig::max_tokens`.
fn default_llm_max_tokens() -> u32 {
    4_096
}

/// Serde default for `ExtractionConfig::default_format`.
fn default_format() -> String {
    String::from("markdown")
}

/// Serde default for `ExtractionConfig::only_main_content`.
fn default_true_ext() -> bool {
    true
}
725
726#[derive(Debug, Clone, Default, Deserialize)]
727pub struct AuthConfig {
728    #[serde(default)]
729    pub api_keys: Vec<String>,
730}
731
732impl AppConfig {
733    /// Load config from config.default.toml + environment variable overrides.
734    /// Env vars use `CRW_` prefix, `__` as separator. E.g. `CRW_SERVER__PORT=8080`.
735    pub fn load() -> Result<Self, config::ConfigError> {
736        let mut builder = config::Config::builder()
737            .add_source(config::File::with_name("config.default").required(false));
738
739        // Load optional override config file (e.g. config.docker.toml in containers).
740        if let Ok(extra) = std::env::var("CRW_CONFIG") {
741            builder = builder.add_source(config::File::with_name(&extra).required(true));
742        } else {
743            builder = builder.add_source(config::File::with_name("config.local").required(false));
744        }
745
746        let cfg = builder
747            .add_source(
748                config::Environment::with_prefix("CRW")
749                    .prefix_separator("_")
750                    .separator("__")
751                    .try_parsing(true),
752            )
753            .build()?;
754        cfg.try_deserialize()
755    }
756
757    /// Compute the effective end-to-end request deadline (ms). Implements the
758    /// issue-#35 auto-extension policy:
759    ///
760    /// 1. If the caller supplied an explicit `requested_deadline_ms`, return it
761    ///    verbatim — operators trust the request budget over our heuristic.
762    /// 2. Otherwise, when `request.auto_extend_deadline_for_ladder` is on,
763    ///    return `max(deadline_ms_default, ladder_min + wait_for_extra)`.
764    ///    `ladder_min` covers the configured tier ladder; `wait_for_extra`
765    ///    compensates for callers that bumped `wait_for_ms` above the default
766    ///    SPA budget (8s) — without it, a long `wait_for` would silently
767    ///    re-clamp inside CDP.
768    /// 3. When the policy is disabled, return `deadline_ms_default` unchanged.
769    ///
770    /// `wait_for_ms` is the per-request override (ScrapeRequest::wait_for /
771    /// CrawlRequest::wait_for); pass `None` for sub-fetches that don't
772    /// surface a wait_for to the caller (search/map enrichment).
773    pub fn effective_deadline_ms(
774        &self,
775        requested_deadline_ms: Option<u64>,
776        wait_for_ms: Option<u64>,
777    ) -> u64 {
778        if let Some(explicit) = requested_deadline_ms {
779            return explicit;
780        }
781        let default_ms = self.request.deadline_ms_default;
782        if !self.request.auto_extend_deadline_for_ladder {
783            return default_ms;
784        }
785        // Issue #35 is specifically about CDP tier overhead silently clamping
786        // chrome_timeout_ms. HTTP-only deployments don't suffer the same
787        // problem (the HTTP renderer respects deadline.remaining without the
788        // extra fetch/challenge/stability overhead). Skip the extension when
789        // no CDP tiers are configured so HTTP-only users keep the strict
790        // operator-configured default.
791        if self.renderer.cdp_tier_count() == 0 {
792            return default_ms;
793        }
794        let ladder_min = self.renderer.min_deadline_for_full_ladder_ms();
795        // Mirrors crw_renderer::cdp::SPA_SELECTOR_MAX_MS. The CDP module
796        // adds `wait_for_ms.unwrap_or(SPA_SELECTOR_MAX_MS)` to its internal
797        // timeout, so when the caller exceeds the default we need to extend
798        // the deadline per active CDP tier.
799        const SPA_DEFAULT_MS: u64 = 8_000;
800        // Clamp `wait_for_ms` to MAX_WAIT_FOR_MS so the inner deadline never
801        // exceeds the Tower envelope, which is sized off the same constant in
802        // `effective_request_timeout_secs`. A pathological caller passing
803        // `wait_for: 600_000` without `deadlineMs` would otherwise be cancelled
804        // by Tower before the inner CDP loop noticed the bigger budget.
805        let extra = if let Some(w) = wait_for_ms {
806            let bounded = w.min(MAX_WAIT_FOR_MS);
807            let per_tier = bounded.saturating_sub(SPA_DEFAULT_MS);
808            per_tier.saturating_mul(self.renderer.cdp_tier_count() as u64)
809        } else {
810            0
811        };
812        default_ms.max(ladder_min.saturating_add(extra))
813    }
814
815    /// Tower middleware outer timeout (seconds). Must accommodate the longest
816    /// legitimate handler runtime so a healthy request isn't cancelled by the
817    /// outer layer before the inner deadline fires.
818    ///
819    /// Covers the three route envelopes:
820    /// - `/scrape`, `/mcp` — auto-extended scrape deadline.
821    /// - `/search` — SearXNG fetch + bounded enrichment fan-out
822    ///   (`ceil(max_limit / max_concurrency)` batches × scrape_ms).
823    /// - `/crawl/jobs/:id`, `/map` — handler-side caps up to 300s.
824    ///
825    /// When auto-extend is disabled, returns the operator-configured baseline
826    /// unchanged.
827    pub fn effective_request_timeout_secs(&self) -> u64 {
828        let baseline = self.server.request_timeout_secs;
829        if !self.request.auto_extend_deadline_for_ladder {
830            return baseline;
831        }
832        const OUTER_BUFFER_SECS: u64 = 5;
833        // `/map` handler caps `req.timeout.unwrap_or(120).min(300)`; the outer
834        // must cover the upper bound so callers passing `timeout=300` aren't
835        // cancelled mid-flight.
836        const MAP_REQUEST_TIMEOUT_CEILING_MS: u64 = 300_000;
837        // Cover the worst-case implicit scrape: caller bumps `wait_for` to the
838        // configured maximum without supplying `deadlineMs`. The same
839        // [`MAX_WAIT_FOR_MS`] constant is used inside `effective_deadline_ms`
840        // to clamp the inner extension, so the inner deadline can never
841        // exceed this outer envelope.
842        let scrape_ms = self.effective_deadline_ms(None, Some(MAX_WAIT_FOR_MS));
843
844        // Search enrichment: bounded by max_concurrency. Worst case sequential
845        // batching with low concurrency: ceil(max_limit / max_concurrency)
846        // batches each bounded by scrape_ms.
847        let conc = (self.crawler.max_concurrency.max(1)) as u64;
848        let max_results = self.search.max_limit as u64;
849        let enrich_batches = max_results.div_ceil(conc);
850        let search_enrichment_ms = enrich_batches.saturating_mul(scrape_ms);
851        let search_ms = self.search.timeout_ms.saturating_add(search_enrichment_ms);
852
853        let max_handler_ms = scrape_ms.max(search_ms).max(MAP_REQUEST_TIMEOUT_CEILING_MS);
854        let needed_secs = max_handler_ms
855            .div_ceil(1_000)
856            .saturating_add(OUTER_BUFFER_SECS);
857        baseline.max(needed_secs)
858    }
859}
860
861#[cfg(test)]
862mod tests {
863    use super::*;
864
865    /// Env var tests modify process-wide state; serialize them to avoid cross-test
866    /// interference (e.g. `force_js` alias + `render_js_default` direct both set).
867    static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());
868
869    fn clear_renderer_env() {
870        for k in [
871            "CRW_RENDERER__MODE",
872            "CRW_RENDERER__FORCE_JS",
873            "CRW_RENDERER__RENDER_JS_DEFAULT",
874            "CRW_RENDERER__LIGHTPANDA__WS_URL",
875            "CRW_SERVER__PORT",
876        ] {
877            unsafe { std::env::remove_var(k) };
878        }
879    }
880
881    #[test]
882    fn renderer_mode_parses_variants() {
883        #[derive(Deserialize)]
884        struct Wrap {
885            mode: RendererMode,
886        }
887        let cases = [
888            ("mode = \"auto\"", RendererMode::Auto),
889            ("mode = \"none\"", RendererMode::None),
890            ("mode = \"lightpanda\"", RendererMode::Lightpanda),
891            ("mode = \"chrome\"", RendererMode::Chrome),
892            ("mode = \"playwright\"", RendererMode::Playwright),
893        ];
894        for (toml_str, expected) in cases {
895            let w: Wrap = toml::from_str(toml_str).unwrap();
896            assert_eq!(w.mode, expected, "toml: {toml_str}");
897        }
898    }
899
900    #[test]
901    fn renderer_mode_bogus_errors() {
902        #[derive(Deserialize)]
903        struct Wrap {
904            #[allow(dead_code)]
905            mode: RendererMode,
906        }
907        let err: Result<Wrap, _> = toml::from_str("mode = \"bogus\"");
908        assert!(err.is_err(), "bogus mode should fail to parse");
909    }
910
911    #[test]
912    fn renderer_config_default_mode_is_auto() {
913        let cfg = RendererConfig::default();
914        assert_eq!(cfg.mode, RendererMode::Auto);
915        assert_eq!(cfg.render_js_default, None);
916    }
917
918    #[test]
919    fn render_js_default_force_js_alias() {
920        let cfg: RendererConfig = toml::from_str("force_js = true").unwrap();
921        assert_eq!(cfg.render_js_default, Some(true));
922    }
923
924    #[test]
925    fn render_js_default_direct_field() {
926        let cfg: RendererConfig = toml::from_str("render_js_default = false").unwrap();
927        assert_eq!(cfg.render_js_default, Some(false));
928    }
929
930    #[test]
931    fn env_var_renderer_mode_chrome() {
932        let _g = ENV_LOCK.lock().unwrap();
933        clear_renderer_env();
934        unsafe { std::env::set_var("CRW_RENDERER__MODE", "chrome") };
935        let cfg = AppConfig::load().unwrap();
936        clear_renderer_env();
937        assert_eq!(cfg.renderer.mode, RendererMode::Chrome);
938    }
939
940    #[test]
941    fn env_var_force_js_alias_works() {
942        let _g = ENV_LOCK.lock().unwrap();
943        clear_renderer_env();
944        unsafe { std::env::set_var("CRW_RENDERER__FORCE_JS", "true") };
945        let cfg = AppConfig::load().unwrap();
946        clear_renderer_env();
947        assert_eq!(cfg.renderer.render_js_default, Some(true));
948    }
949
950    #[test]
951    fn env_var_render_js_default_direct() {
952        let _g = ENV_LOCK.lock().unwrap();
953        clear_renderer_env();
954        unsafe { std::env::set_var("CRW_RENDERER__RENDER_JS_DEFAULT", "true") };
955        let cfg = AppConfig::load().unwrap();
956        clear_renderer_env();
957        assert_eq!(cfg.renderer.render_js_default, Some(true));
958    }
959
960    #[test]
961    fn request_config_defaults_match_plan() {
962        let r = RequestConfig::default();
963        assert_eq!(r.deadline_ms_default, 8000);
964        assert!(r.auto_extend_deadline_for_ladder);
965    }
966
967    #[test]
968    fn default_app_config_enables_auto_extend() {
969        // Programmatic Default must mirror serde defaults — issue #35.
970        let cfg = AppConfig::default();
971        assert!(cfg.request.auto_extend_deadline_for_ladder);
972        assert_eq!(cfg.request.deadline_ms_default, 8000);
973    }
974
975    fn renderer_with_chrome_only(chrome_ms: u64) -> RendererConfig {
976        RendererConfig {
977            mode: RendererMode::Chrome,
978            page_timeout_ms: chrome_ms,
979            chrome_timeout_ms: Some(chrome_ms),
980            chrome: Some(CdpEndpoint {
981                ws_url: "ws://chrome:9222".into(),
982            }),
983            ..Default::default()
984        }
985    }
986
987    #[test]
988    #[cfg(feature = "cdp")]
989    fn min_deadline_full_ladder_chrome_only() {
990        // chrome-only mode: http (page_timeout) + chrome + 1 * 28000.
991        let r = renderer_with_chrome_only(30_000);
992        // page_timeout_ms is set to chrome_ms here, so http_timeout() → 30s.
993        assert_eq!(
994            r.min_deadline_for_full_ladder_ms(),
995            30_000 + 30_000 + 28_000
996        );
997    }
998
999    #[test]
1000    #[cfg(feature = "cdp")]
1001    fn min_deadline_full_ladder_auto_three_tiers() {
1002        let r = RendererConfig {
1003            mode: RendererMode::Auto,
1004            page_timeout_ms: 15_000,
1005            http_timeout_ms: Some(15_000),
1006            lightpanda_timeout_ms: Some(2_500),
1007            chrome_timeout_ms: Some(30_000),
1008            lightpanda: Some(CdpEndpoint {
1009                ws_url: "ws://lp:9222".into(),
1010            }),
1011            chrome: Some(CdpEndpoint {
1012                ws_url: "ws://chrome:9222".into(),
1013            }),
1014            ..Default::default()
1015        };
1016        // http(15) + lp(2.5) + chrome(30) + 2*28 = 47.5 + 56 = 103_500.
1017        assert_eq!(
1018            r.min_deadline_for_full_ladder_ms(),
1019            15_000 + 2_500 + 30_000 + 2 * 28_000
1020        );
1021        assert_eq!(r.cdp_tier_count(), 2);
1022    }
1023
1024    #[test]
1025    fn effective_deadline_explicit_bypasses_auto_extend() {
1026        let mut cfg = AppConfig::default();
1027        cfg.request.auto_extend_deadline_for_ladder = true;
1028        cfg.renderer = renderer_with_chrome_only(30_000);
1029        // Explicit override beats both default and ladder_min.
1030        assert_eq!(cfg.effective_deadline_ms(Some(5_000), None), 5_000);
1031        assert_eq!(cfg.effective_deadline_ms(Some(500_000), None), 500_000);
1032    }
1033
1034    #[test]
1035    #[cfg(feature = "cdp")]
1036    fn effective_deadline_auto_extend_raises_to_ladder_min() {
1037        let mut cfg = AppConfig::default();
1038        cfg.request.auto_extend_deadline_for_ladder = true;
1039        cfg.request.deadline_ms_default = 8_000;
1040        cfg.renderer = renderer_with_chrome_only(30_000);
1041        let expected = cfg.renderer.min_deadline_for_full_ladder_ms();
1042        assert!(expected > 8_000);
1043        assert_eq!(cfg.effective_deadline_ms(None, None), expected);
1044    }
1045
1046    #[test]
1047    fn effective_deadline_default_wins_when_higher_than_ladder() {
1048        let mut cfg = AppConfig::default();
1049        cfg.request.auto_extend_deadline_for_ladder = true;
1050        cfg.request.deadline_ms_default = 1_000_000;
1051        cfg.renderer = renderer_with_chrome_only(30_000);
1052        assert_eq!(cfg.effective_deadline_ms(None, None), 1_000_000);
1053    }
1054
1055    #[test]
1056    fn effective_deadline_auto_extend_disabled_returns_baseline() {
1057        let mut cfg = AppConfig::default();
1058        cfg.request.auto_extend_deadline_for_ladder = false;
1059        cfg.request.deadline_ms_default = 8_000;
1060        cfg.renderer = renderer_with_chrome_only(30_000);
1061        assert_eq!(cfg.effective_deadline_ms(None, None), 8_000);
1062    }
1063
1064    #[test]
1065    #[cfg(feature = "cdp")]
1066    fn effective_deadline_extends_for_long_wait_for() {
1067        let mut cfg = AppConfig::default();
1068        cfg.request.auto_extend_deadline_for_ladder = true;
1069        cfg.request.deadline_ms_default = 8_000;
1070        cfg.renderer = renderer_with_chrome_only(30_000);
1071        let base = cfg.renderer.min_deadline_for_full_ladder_ms();
1072        let tier_count = cfg.renderer.cdp_tier_count() as u64;
1073        // wait_for = 20000 → per-tier extra = 12000 over SPA_DEFAULT_MS (8000).
1074        let with_wait = cfg.effective_deadline_ms(None, Some(20_000));
1075        assert_eq!(with_wait, base + 12_000 * tier_count);
1076        // wait_for below SPA default → no extra.
1077        assert_eq!(cfg.effective_deadline_ms(None, Some(2_000)), base);
1078    }
1079
1080    #[test]
1081    fn effective_request_timeout_covers_map_ceiling() {
1082        let mut cfg = AppConfig::default();
1083        cfg.request.auto_extend_deadline_for_ladder = true;
1084        cfg.request.deadline_ms_default = 8_000;
1085        cfg.renderer = renderer_with_chrome_only(30_000);
1086        cfg.search.timeout_ms = 15_000;
1087        cfg.crawler.max_concurrency = 10;
1088        cfg.search.max_limit = 20;
1089        cfg.server.request_timeout_secs = 60;
1090        // Map ceiling 300s + 5s buffer = 305s minimum.
1091        assert!(cfg.effective_request_timeout_secs() >= 305);
1092    }
1093
1094    #[test]
1095    fn effective_request_timeout_disabled_returns_baseline() {
1096        let mut cfg = AppConfig::default();
1097        cfg.request.auto_extend_deadline_for_ladder = false;
1098        cfg.server.request_timeout_secs = 60;
1099        assert_eq!(cfg.effective_request_timeout_secs(), 60);
1100    }
1101
1102    #[test]
1103    fn effective_request_timeout_respects_operator_override() {
1104        let mut cfg = AppConfig::default();
1105        cfg.request.auto_extend_deadline_for_ladder = true;
1106        cfg.server.request_timeout_secs = 600; // operator-configured high
1107        cfg.renderer = renderer_with_chrome_only(30_000);
1108        // Operator's explicit 600s should win over the auto-computed 305s.
1109        assert_eq!(cfg.effective_request_timeout_secs(), 600);
1110    }
1111
1112    #[test]
1113    fn effective_request_timeout_search_sequential_batching() {
1114        // Low concurrency forces ceil(max_limit/conc) batches → larger search_ms.
1115        let mut cfg = AppConfig::default();
1116        cfg.request.auto_extend_deadline_for_ladder = true;
1117        cfg.request.deadline_ms_default = 8_000;
1118        cfg.renderer = renderer_with_chrome_only(30_000);
1119        cfg.search.timeout_ms = 15_000;
1120        cfg.search.max_limit = 20;
1121        cfg.crawler.max_concurrency = 1;
1122        cfg.server.request_timeout_secs = 60;
1123        // The Tower envelope must cover the worst-case implicit scrape with
1124        // `wait_for` bumped to MAX_WAIT_FOR_MS (60s), because callers can do
1125        // that without supplying `deadlineMs`. Mirror that in the expected.
1126        let secs = cfg.effective_request_timeout_secs();
1127        let scrape_ms = cfg.effective_deadline_ms(None, Some(60_000));
1128        let expected_search_ms = 15_000 + 20 * scrape_ms;
1129        let expected_max_ms = scrape_ms.max(expected_search_ms).max(300_000);
1130        let expected_secs = expected_max_ms.div_ceil(1_000) + 5;
1131        assert_eq!(secs, 60u64.max(expected_secs));
1132    }
1133
1134    #[test]
1135    #[cfg(not(feature = "cdp"))]
1136    fn cdp_tier_count_zero_without_cdp_feature() {
1137        // Even when chrome/lightpanda are configured, a binary built without
1138        // the `cdp` feature can never construct a JS renderer. The deadline
1139        // policy must observe that and collapse to HTTP-only behavior.
1140        let r = RendererConfig {
1141            mode: RendererMode::Auto,
1142            page_timeout_ms: 15_000,
1143            chrome_timeout_ms: Some(30_000),
1144            chrome: Some(CdpEndpoint {
1145                ws_url: "ws://chrome:9222".into(),
1146            }),
1147            lightpanda: Some(CdpEndpoint {
1148                ws_url: "ws://lp:9222".into(),
1149            }),
1150            ..Default::default()
1151        };
1152        assert_eq!(r.cdp_tier_count(), 0);
1153        // Only the HTTP tier contributes to the ladder budget.
1154        assert_eq!(r.min_deadline_for_full_ladder_ms(), 15_000);
1155    }
1156
1157    #[test]
1158    fn effective_deadline_skipped_for_http_only_mode() {
1159        // P2 from codex review: HTTP-only deployments don't suffer the CDP
1160        // clamping problem (no fetch/challenge/stability overhead). The
1161        // auto-extension must NOT silently bump their default from 8s to 30s
1162        // just because page_timeout_ms defaults high.
1163        let mut cfg = AppConfig::default();
1164        cfg.request.auto_extend_deadline_for_ladder = true;
1165        cfg.request.deadline_ms_default = 8_000;
1166        cfg.renderer = RendererConfig {
1167            mode: RendererMode::Auto,
1168            page_timeout_ms: 30_000,
1169            // No CDP endpoints configured.
1170            lightpanda: None,
1171            playwright: None,
1172            chrome: None,
1173            ..Default::default()
1174        };
1175        assert_eq!(cfg.renderer.cdp_tier_count(), 0);
1176        assert_eq!(cfg.effective_deadline_ms(None, None), 8_000);
1177        assert_eq!(cfg.effective_deadline_ms(None, Some(30_000)), 8_000);
1178    }
1179
1180    #[test]
1181    #[cfg(feature = "cdp")]
1182    fn min_deadline_full_ladder_playwright_only() {
1183        // Playwright tier contributes one chrome_timeout + one CDP overhead,
1184        // matching the runtime predicate in `crw-renderer/src/lib.rs`.
1185        let r = RendererConfig {
1186            mode: RendererMode::Playwright,
1187            page_timeout_ms: 15_000,
1188            http_timeout_ms: Some(15_000),
1189            chrome_timeout_ms: Some(30_000),
1190            playwright: Some(CdpEndpoint {
1191                ws_url: "ws://playwright:9222".into(),
1192            }),
1193            ..Default::default()
1194        };
1195        assert_eq!(r.cdp_tier_count(), 1);
1196        // http(15) + chrome-equivalent(30) + 1 * 28 overhead.
1197        assert_eq!(
1198            r.min_deadline_for_full_ladder_ms(),
1199            15_000 + 30_000 + 28_000
1200        );
1201    }
1202
1203    #[test]
1204    fn renderer_phase_toggles_default_off_or_safe() {
1205        let r = RendererConfig::default();
1206        assert!(!r.chrome_intercept_resources);
1207        assert!(!r.chrome_intercept_stylesheets);
1208        assert!(r.chrome_host_intercept_disable.is_empty());
1209        assert_eq!(r.chrome_nav_budget_ms, 12_000);
1210        assert!(!r.chrome_context_pool_enabled);
1211        assert!(!r.use_predictor);
1212    }
1213
1214    #[test]
1215    fn crawler_per_host_limiter_defaults() {
1216        let c = CrawlerConfig::default();
1217        assert_eq!(c.per_host_min_interval_ms, 0);
1218        assert_eq!(c.per_host_max_concurrent, 1);
1219    }
1220
1221    #[test]
1222    fn env_var_overrides_toml_defaults() {
1223        let _g = ENV_LOCK.lock().unwrap();
1224        clear_renderer_env();
1225        unsafe {
1226            std::env::set_var("CRW_SERVER__PORT", "4444");
1227            std::env::set_var("CRW_RENDERER__LIGHTPANDA__WS_URL", "ws://test:9999/");
1228        }
1229        let cfg = AppConfig::load().unwrap();
1230        clear_renderer_env();
1231
1232        assert_eq!(cfg.server.port, 4444, "env var should override server.port");
1233        assert_eq!(
1234            cfg.renderer.lightpanda.as_ref().unwrap().ws_url,
1235            "ws://test:9999/",
1236            "env var should override renderer.lightpanda.ws_url"
1237        );
1238    }
1239}