Skip to main content

crw_core/
config.rs

1use serde::Deserialize;
2
3#[derive(Debug, Clone, Deserialize, Default)]
4pub struct AppConfig {
5    #[serde(default)]
6    pub server: ServerConfig,
7    #[serde(default)]
8    pub renderer: RendererConfig,
9    #[serde(default)]
10    pub crawler: CrawlerConfig,
11    #[serde(default)]
12    pub extraction: ExtractionConfig,
13    #[serde(default)]
14    pub auth: AuthConfig,
15    #[serde(default)]
16    pub request: RequestConfig,
17    #[serde(default)]
18    pub search: SearchConfig,
19    #[serde(default)]
20    pub map: MapConfig,
21}
22
23/// `[map]` section — currently only carries `[map.url_filter]`.
24#[derive(Debug, Clone, Deserialize, Default)]
25pub struct MapConfig {
26    #[serde(default)]
27    pub url_filter: MapUrlFilterConfig,
28}
29
30/// `[map.url_filter]` — raw TOML view of the filter knobs. Conversion to
31/// the runtime `UrlFilterCfg` lives in `crw-crawl` (which can see both this
32/// type and the filter module). Keeping this struct dependency-free here
33/// avoids a cycle (`crw-core` does not depend on `crw-crawl`).
34#[derive(Debug, Clone, Deserialize)]
35pub struct MapUrlFilterConfig {
36    /// Tier B — strip tracking params. Default: `true`.
37    #[serde(default = "default_true_filter")]
38    pub strip_tracking_params: bool,
39    /// Tier A — drop action URLs entirely. Default: `true`.
40    #[serde(default = "default_true_filter")]
41    pub drop_action_urls: bool,
42    /// When `true`, `.gov`/`.mil` hosts run Tier A too. Default `false`.
43    #[serde(default)]
44    pub gov_tld_drop_actions: bool,
45    /// Additive on top of `DEFAULT_TRACKING_PARAMS`.
46    #[serde(default)]
47    pub extra_tracking_params: Vec<String>,
48    /// Additive on top of `DEFAULT_ACTION_PARAMS`.
49    #[serde(default)]
50    pub extra_action_params: Vec<String>,
51    /// Additive on top of `ALWAYS_PRESERVE`.
52    #[serde(default)]
53    pub extra_preserve_params: Vec<String>,
54}
55
56impl Default for MapUrlFilterConfig {
57    fn default() -> Self {
58        Self {
59            strip_tracking_params: true,
60            drop_action_urls: true,
61            gov_tld_drop_actions: false,
62            extra_tracking_params: Vec::new(),
63            extra_action_params: Vec::new(),
64            extra_preserve_params: Vec::new(),
65        }
66    }
67}
68
/// Serde default for the `MapUrlFilterConfig` switches that start enabled.
fn default_true_filter() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
72
/// CDP overhead budgeted per tier, in milliseconds: the sum of the SPA
/// selector poll budget, the challenge retry budget, the content-stability
/// budget, and fetch overhead.
///
/// The value mirrors the constants in `crw-renderer::cdp`; drift between the
/// two is regression-tested by
/// `crates/crw-server/tests/cdp_constants_test.rs` (gated behind
/// `feature = "cdp"`).
///
/// [`RendererConfig::min_deadline_for_full_ladder_ms`] uses it so that the
/// request deadline accommodates each CDP tier's outer fetch timeout, not
/// just its configured `page_timeout`.
pub const CDP_TIER_OVERHEAD_MS: u64 = 28 * 1_000;
83
/// Hard ceiling for the per-request `wait_for_ms` budget, in milliseconds.
///
/// The Tower outer timeout is sized so that a worst-case implicit scrape (no
/// `deadlineMs`, `wait_for` at this maximum) still finishes inside it.
/// Larger values are clamped by [`AppConfig::effective_deadline_ms`], so the
/// inner deadline can never escape the outer envelope. The accepted range is
/// documented as `(0, 60000]` in `types.rs::ScrapeRequest::wait_for`.
pub const MAX_WAIT_FOR_MS: u64 = 60 * 1_000;
91
92/// Configuration for the `/v1/search` endpoint and its SearXNG backend.
93///
94/// When `searxng_url` is unset the endpoint returns HTTP 503 with
95/// `error_code: "search_disabled"` — the route remains mounted so that
96/// startup doesn't have to know whether search will ever be configured.
97#[derive(Debug, Clone, Deserialize)]
98pub struct SearchConfig {
99    /// Master switch. Defaults to `true`; set to `false` to refuse all
100    /// `/v1/search` requests even if `searxng_url` is configured.
101    #[serde(default = "default_true_search")]
102    pub enabled: bool,
103    /// Base URL of the SearXNG instance (e.g. `http://searxng:8080`).
104    /// `None` (the default) disables the endpoint with a clear error.
105    #[serde(default)]
106    pub searxng_url: Option<String>,
107    /// End-to-end timeout for the SearXNG call in milliseconds.
108    #[serde(default = "default_search_timeout_ms")]
109    pub timeout_ms: u64,
110    /// Default `limit` when the request omits it.
111    #[serde(default = "default_search_limit")]
112    pub default_limit: u32,
113    /// Hard cap on `limit` per request. SaaS uses 20.
114    #[serde(default = "default_search_max_limit")]
115    pub max_limit: u32,
116    /// SearXNG engines invoked when the request includes `categories: ["research"]`.
117    /// Defaults match the SaaS implementation.
118    #[serde(default = "default_research_engines")]
119    pub research_engines: Vec<String>,
120    /// SearXNG engines invoked when the request includes `categories: ["github"]`.
121    #[serde(default = "default_github_engines")]
122    pub github_engines: Vec<String>,
123}
124
125impl Default for SearchConfig {
126    fn default() -> Self {
127        Self {
128            enabled: true,
129            searxng_url: None,
130            timeout_ms: default_search_timeout_ms(),
131            default_limit: default_search_limit(),
132            max_limit: default_search_max_limit(),
133            research_engines: default_research_engines(),
134            github_engines: default_github_engines(),
135        }
136    }
137}
138
/// Serde default for `SearchConfig::enabled`.
fn default_true_search() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
/// Serde default for `SearchConfig::timeout_ms`: 15 seconds, in milliseconds.
fn default_search_timeout_ms() -> u64 {
    15 * 1_000
}
/// Serde default for `SearchConfig::default_limit`.
fn default_search_limit() -> u32 {
    const LIMIT: u32 = 5;
    LIMIT
}
/// Serde default for `SearchConfig::max_limit`.
fn default_search_max_limit() -> u32 {
    const CAP: u32 = 20;
    CAP
}
/// Serde default for `SearchConfig::research_engines`.
fn default_research_engines() -> Vec<String> {
    const ENGINES: [&str; 4] = ["arxiv", "crossref", "google scholar", "semantic scholar"];
    ENGINES.iter().map(|engine| engine.to_string()).collect()
}
/// Serde default for `SearchConfig::github_engines`.
fn default_github_engines() -> Vec<String> {
    let mut engines = Vec::with_capacity(1);
    engines.push(String::from("github"));
    engines
}
162
163/// Per-request defaults that apply to every scrape, crawl, or map call when
164/// the caller does not specify an override. Currently only governs the
165/// end-to-end deadline budget (see `crw-core/src/deadline.rs`).
166#[derive(Debug, Clone, Deserialize)]
167pub struct RequestConfig {
168    /// Default end-to-end deadline budget in milliseconds when a request does
169    /// not specify `deadlineMs`. The SLO p95 latency metric is computed only
170    /// over requests with `deadline_ms <= 8000`; longer values land in a
171    /// separate slow-path histogram.
172    #[serde(default = "default_deadline_ms")]
173    pub deadline_ms_default: u64,
174    /// When `true` (default), an implicit deadline (no per-request `deadlineMs`)
175    /// is auto-extended to `max(deadline_ms_default, ladder_min)` where
176    /// `ladder_min = sum(http+lightpanda+chrome timeouts) + N_cdp_tiers * 28s`.
177    /// This prevents `chrome_timeout_ms = 30000` from appearing inert when
178    /// `deadline_ms_default` is small (issue #35).
179    ///
180    /// Set to `false` to enforce a strict SLO regardless of tier sizing —
181    /// requests that would have completed under the extended budget will
182    /// instead time out at `deadline_ms_default`.
183    #[serde(default = "default_true_request")]
184    pub auto_extend_deadline_for_ladder: bool,
185}
186
187impl Default for RequestConfig {
188    fn default() -> Self {
189        Self {
190            deadline_ms_default: default_deadline_ms(),
191            auto_extend_deadline_for_ladder: true,
192        }
193    }
194}
195
/// Serde default for `RequestConfig::auto_extend_deadline_for_ladder`.
fn default_true_request() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
199
/// Serde default for `RequestConfig::deadline_ms_default`: 8 seconds, in
/// milliseconds.
fn default_deadline_ms() -> u64 {
    8 * 1_000
}
203
204#[derive(Debug, Clone, Deserialize)]
205pub struct ServerConfig {
206    #[serde(default = "default_host")]
207    pub host: String,
208    #[serde(default = "default_port")]
209    pub port: u16,
210    #[serde(default = "default_request_timeout")]
211    pub request_timeout_secs: u64,
212    /// Maximum requests per second (global). 0 = unlimited.
213    #[serde(default = "default_rate_limit_rps")]
214    pub rate_limit_rps: u64,
215}
216
217impl Default for ServerConfig {
218    fn default() -> Self {
219        Self {
220            host: default_host(),
221            port: default_port(),
222            request_timeout_secs: default_request_timeout(),
223            rate_limit_rps: default_rate_limit_rps(),
224        }
225    }
226}
227
/// Serde default for `ServerConfig::rate_limit_rps`.
fn default_rate_limit_rps() -> u64 {
    const RPS: u64 = 10;
    RPS
}
231
/// Serde default for `ServerConfig::host`.
fn default_host() -> String {
    String::from("0.0.0.0")
}
/// Serde default for `ServerConfig::port`.
fn default_port() -> u16 {
    const PORT: u16 = 3000;
    PORT
}
/// Serde default for `ServerConfig::request_timeout_secs`.
fn default_request_timeout() -> u64 {
    const SECS: u64 = 60;
    SECS
}
241
242/// Selects which JS renderer(s) the [`FallbackRenderer`] will build.
243///
244/// - `Auto` (default): try every configured CDP endpoint (Lightpanda, Playwright, Chrome)
245///   in order. If none is configured, JS rendering is disabled but HTTP still works.
246/// - `None`: HTTP-only. Never attempt JS rendering.
247/// - `Lightpanda` / `Chrome` / `Playwright`: require the matching `[renderer.<name>]`
248///   endpoint; fail startup if missing. Only the named backend is used.
249///
250/// [`FallbackRenderer`]: https://docs.rs/crw-renderer/latest/crw_renderer/struct.FallbackRenderer.html
251#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Deserialize)]
252#[serde(rename_all = "lowercase")]
253pub enum RendererMode {
254    #[default]
255    Auto,
256    None,
257    Lightpanda,
258    Chrome,
259    Playwright,
260}
261
262#[derive(Debug, Clone, Deserialize)]
263pub struct RendererConfig {
264    #[serde(default)]
265    pub mode: RendererMode,
266    /// Generic per-page navigation timeout. Used as the fallback when no
267    /// per-tier override is configured. Kept for backward compatibility — the
268    /// per-tier knobs below are preferred for new deployments.
269    #[serde(default = "default_page_timeout")]
270    pub page_timeout_ms: u64,
271    /// Override for the HTTP-only fetcher request timeout. Falls back to
272    /// `page_timeout_ms` when unset. HTTP responses arrive quickly when they
273    /// arrive at all, so 15s is generous and keeps slow upstreams from
274    /// hogging the request budget that should be spent on JS retries.
275    #[serde(default)]
276    pub http_timeout_ms: Option<u64>,
277    /// Override for the LightPanda CDP renderer. LightPanda completes most
278    /// renders in <10s; if it stalls past 20s it almost always means an
279    /// adversarial page that Chrome will render anyway, so failing fast and
280    /// escalating beats waiting it out.
281    #[serde(default)]
282    pub lightpanda_timeout_ms: Option<u64>,
283    /// Override for the full-Chromium tier. Chrome is the slow path
284    /// (gov/legal SPAs need 30–40s for `networkidle`); the larger budget here
285    /// recovers ~6 URLs per fc-wins iteration without affecting the fast path.
286    #[serde(default)]
287    pub chrome_timeout_ms: Option<u64>,
288    #[serde(default = "default_pool_size")]
289    pub pool_size: usize,
290    /// If set, applies to every request that doesn't specify `renderJs` explicitly.
291    /// `Some(true)` = force JS rendering; `Some(false)` = skip JS; `None` = auto-detect.
292    ///
293    /// Accepts the `force_js` alias for backward compatibility.
294    #[serde(default, alias = "force_js")]
295    pub render_js_default: Option<bool>,
296    #[serde(default)]
297    pub lightpanda: Option<CdpEndpoint>,
298    #[serde(default)]
299    pub playwright: Option<CdpEndpoint>,
300    #[serde(default)]
301    pub chrome: Option<CdpEndpoint>,
302    /// Enable Chrome resource interception (`Fetch.enable` blocking of media,
303    /// fonts, trackers). Default `false`; flipped after the CDP-fake suite
304    /// validates pump + cleanup behaviour. See plan Phase 2.
305    #[serde(default)]
306    pub chrome_intercept_resources: bool,
307    /// Additionally block `stylesheet` requests when interception is enabled.
308    /// Default `false` — kept off in v1 because some extractors depend on
309    /// CSS-driven visibility / lazy-content triggers.
310    #[serde(default)]
311    pub chrome_intercept_stylesheets: bool,
312    /// Per-host opt-out for chrome interception. Hosts in this list run with
313    /// interception disabled even when `chrome_intercept_resources = true`.
314    #[serde(default)]
315    pub chrome_host_intercept_disable: Vec<String>,
316    /// Hard chrome-tier navigation budget in ms. Wraps `wait_for_page_ready`
317    /// in an inner race; on budget hit the renderer snapshots whatever DOM is
318    /// present and returns `truncated = true`. Calibrated as
319    /// `p90(successful chrome renders)` clamped to `[8_000, 12_000]`.
320    #[serde(default = "default_chrome_nav_budget_ms")]
321    pub chrome_nav_budget_ms: u64,
322    /// Enable the bounded browser-context pool. Default `false`; v1 ships
323    /// `RECYCLE_AFTER_NAV = 1` (recreate every release) before optimising to
324    /// reuse-with-clearing. See plan Phase 4. **Gated off when
325    /// `chrome_backend = "browserless"`** — browserless v2's
326    /// `Target.createBrowserContext` semantics with long-lived sessions are
327    /// unproven; lib.rs forces this to `false` with a WARN log in that case.
328    #[serde(default)]
329    pub chrome_context_pool_enabled: bool,
330    /// Per-knob pool configuration. Read only when
331    /// `chrome_context_pool_enabled = true` AND backend is `Vanilla`.
332    #[serde(default)]
333    pub chrome_pool: ChromePoolConfig,
334    /// Which Chrome backend the WS URL points at. **Explicit** — never sniff
335    /// from URL substrings (k8s svc names, port-forwards, custom routes break
336    /// substring detection per plan §C2). Default `Vanilla`.
337    #[serde(default)]
338    pub chrome_backend: ChromeBackend,
339    /// Enable the success-ratio renderer predictor in `HostPreferences`.
340    /// Default `false`; flipped after the predictor replay harness gates
341    /// on the 1k bench (false-skip < 2 %, false-escalate < 5 %, churn < 3 / 1k).
342    #[serde(default)]
343    pub use_predictor: bool,
344    /// Engine escalation policy (firecrawl-shaped: race + on-error). When
345    /// disabled (default), the renderer keeps its current ladder unchanged.
346    #[serde(default)]
347    pub escalation: EscalationConfig,
348    /// Anti-bot detection policy (crawl4ai 3-tier classifier).
349    #[serde(default)]
350    pub antibot: AntibotConfig,
351}
352
353/// Engine escalation policy — adds `ChromeStealth` and `ChromeStealthProxy`
354/// tiers behind a feature flag. See `plans/recall-next-tier.md` Phase 2.
355#[derive(Debug, Clone, Deserialize)]
356pub struct EscalationConfig {
357    /// Master switch. Default `false` — current ladder runs unchanged.
358    #[serde(default)]
359    pub enabled: bool,
360    /// Per-tier waterfall trigger in ms. If the current engine hasn't returned
361    /// after this long, the next tier is started in parallel (firecrawl
362    /// `WaterfallNextEngineSignal`).
363    #[serde(default = "default_waterfall_timeout_ms")]
364    pub waterfall_timeout_ms: u64,
365    /// Hard global cap across the whole ladder.
366    #[serde(default = "default_escalation_global_timeout_ms")]
367    pub global_timeout_ms: u64,
368    /// Send `?proxy=residential&proxyCountry=…` to browserless on the
369    /// `ChromeStealthProxy` tier. Off by default — bears cost.
370    #[serde(default)]
371    pub residential_proxy: bool,
372    /// Country code passed to browserless when `residential_proxy = true`.
373    #[serde(default = "default_proxy_country")]
374    pub proxy_country: String,
375}
376
377impl Default for EscalationConfig {
378    fn default() -> Self {
379        Self {
380            enabled: false,
381            waterfall_timeout_ms: default_waterfall_timeout_ms(),
382            global_timeout_ms: default_escalation_global_timeout_ms(),
383            residential_proxy: false,
384            proxy_country: default_proxy_country(),
385        }
386    }
387}
388
/// Serde default for `EscalationConfig::waterfall_timeout_ms`: 8 seconds, in
/// milliseconds.
fn default_waterfall_timeout_ms() -> u64 {
    8 * 1_000
}
/// Serde default for `EscalationConfig::global_timeout_ms`: 60 seconds, in
/// milliseconds.
fn default_escalation_global_timeout_ms() -> u64 {
    60 * 1_000
}
/// Serde default for `EscalationConfig::proxy_country`.
fn default_proxy_country() -> String {
    String::from("us")
}
398
399/// Anti-bot classifier policy. Default: detect+log only; escalation requires
400/// `escalate_on_signal = true` AND `escalation.enabled = true`.
401#[derive(Debug, Clone, Deserialize)]
402pub struct AntibotConfig {
403    /// Run the classifier on every fetch result. Cheap; default on.
404    #[serde(default = "default_true")]
405    pub enabled: bool,
406    /// When the classifier returns a non-`None` signal, advance to the next
407    /// engine tier (requires `escalation.enabled`).
408    #[serde(default)]
409    pub escalate_on_signal: bool,
410}
411
412impl Default for AntibotConfig {
413    fn default() -> Self {
414        Self {
415            enabled: true,
416            escalate_on_signal: false,
417        }
418    }
419}
420
/// Serde default for `RendererConfig::chrome_nav_budget_ms`: 12 seconds, in
/// milliseconds.
fn default_chrome_nav_budget_ms() -> u64 {
    12 * 1_000
}
424
425/// Per-knob configuration for the bounded browser-context pool. Loaded under
426/// `[renderer.chrome_pool]`. Inactive unless
427/// `chrome_context_pool_enabled = true` AND `chrome_backend = "vanilla"`.
428#[derive(Debug, Clone, Deserialize)]
429pub struct ChromePoolConfig {
430    /// Pool size. `None` → `max(2, num_cpus / 2)`. Caps simultaneous
431    /// in-flight chrome requests per pool.
432    #[serde(default)]
433    pub size: Option<usize>,
434    /// Recycle policy: v1 always recreates the context after each release.
435    /// Reserved for a future "reuse N navigations then recreate" mode.
436    #[serde(default = "default_recycle_after_navs")]
437    pub recycle_after_navs: u32,
438    /// Idle slots older than this are health-checked on next acquire.
439    #[serde(default = "default_idle_timeout_secs")]
440    pub idle_timeout_secs: u64,
441    /// `Browser.getVersion` probe deadline (idle-slot liveness).
442    #[serde(default = "default_health_check_secs")]
443    pub health_check_secs: u64,
444    /// SIGTERM drain window before phase 3 force-close.
445    #[serde(default = "default_shutdown_drain_secs")]
446    pub shutdown_drain_secs: u64,
447}
448
449impl Default for ChromePoolConfig {
450    fn default() -> Self {
451        Self {
452            size: None,
453            recycle_after_navs: default_recycle_after_navs(),
454            idle_timeout_secs: default_idle_timeout_secs(),
455            health_check_secs: default_health_check_secs(),
456            shutdown_drain_secs: default_shutdown_drain_secs(),
457        }
458    }
459}
460
/// Serde default for `ChromePoolConfig::recycle_after_navs`.
fn default_recycle_after_navs() -> u32 {
    const NAVS: u32 = 1;
    NAVS
}
/// Serde default for `ChromePoolConfig::idle_timeout_secs`: five minutes.
fn default_idle_timeout_secs() -> u64 {
    5 * 60
}
/// Serde default for `ChromePoolConfig::health_check_secs`.
fn default_health_check_secs() -> u64 {
    const SECS: u64 = 60;
    SECS
}
/// Serde default for `ChromePoolConfig::shutdown_drain_secs`.
fn default_shutdown_drain_secs() -> u64 {
    const SECS: u64 = 30;
    SECS
}
473
474/// Chrome backend kind. Set explicitly under `[renderer]` as
475/// `chrome_backend = "vanilla"` or `chrome_backend = "browserless"`. **Never
476/// inferred from URL substrings** — k8s service names, port-forwards, and
477/// custom routes break substring detection. See plan §C2.
478#[derive(Debug, Clone, Copy, Default, Deserialize, PartialEq, Eq)]
479#[serde(rename_all = "lowercase")]
480pub enum ChromeBackend {
481    /// chromedp/headless-shell or vanilla Chrome with `/json/version`. Pool
482    /// is enabled here when `chrome_context_pool_enabled = true`.
483    #[default]
484    Vanilla,
485    /// Browserless v2 / commercial CDP endpoint. Pool is **gated off** in v1
486    /// — see plan §"Out of scope (v1)".
487    Browserless,
488}
489
490impl Default for RendererConfig {
491    fn default() -> Self {
492        Self {
493            mode: RendererMode::default(),
494            page_timeout_ms: default_page_timeout(),
495            http_timeout_ms: None,
496            lightpanda_timeout_ms: None,
497            chrome_timeout_ms: None,
498            pool_size: default_pool_size(),
499            render_js_default: None,
500            lightpanda: None,
501            playwright: None,
502            chrome: None,
503            chrome_intercept_resources: false,
504            chrome_intercept_stylesheets: false,
505            chrome_host_intercept_disable: Vec::new(),
506            chrome_nav_budget_ms: default_chrome_nav_budget_ms(),
507            chrome_context_pool_enabled: false,
508            chrome_pool: ChromePoolConfig::default(),
509            chrome_backend: ChromeBackend::default(),
510            use_predictor: false,
511            escalation: EscalationConfig::default(),
512            antibot: AntibotConfig::default(),
513        }
514    }
515}
/// Serde default for `RendererConfig::page_timeout_ms`: 30 seconds, in
/// milliseconds.
fn default_page_timeout() -> u64 {
    30 * 1_000
}
519
520impl RendererConfig {
521    /// Resolved per-tier nav timeout in milliseconds. Resolution rules:
522    ///   1. If the explicit per-tier field is set, use it verbatim.
523    ///   2. Otherwise fall back to `page_timeout_ms` (which itself defaults
524    ///      to 30s for backward compatibility with pre-multi-tier configs).
525    ///
526    /// New deployments are encouraged to set the per-tier knobs to 15/20/45s
527    /// (see config.docker.toml) — these match the bench-tuned values that
528    /// recover slow gov sites in the chrome tier without giving the http
529    /// tier permission to hog the request budget.
530    pub fn http_timeout(&self) -> u64 {
531        self.http_timeout_ms.unwrap_or(self.page_timeout_ms)
532    }
533    pub fn lightpanda_timeout(&self) -> u64 {
534        self.lightpanda_timeout_ms.unwrap_or(self.page_timeout_ms)
535    }
536    pub fn chrome_timeout(&self) -> u64 {
537        self.chrome_timeout_ms.unwrap_or(self.page_timeout_ms)
538    }
539
540    /// Number of active CDP tiers (lightpanda + playwright + chrome) under
541    /// the current `mode`. Mirrors the predicate used at runtime in
542    /// `crw-renderer/src/lib.rs` when constructing the renderer ladder:
543    /// `want(mode) && config.<tier>.is_some()`.
544    ///
545    /// Returns `0` when the binary is built without the `cdp` feature — in
546    /// that case no JS renderer can be constructed regardless of the config,
547    /// so the deadline auto-extension policy must collapse to HTTP-only.
548    pub fn cdp_tier_count(&self) -> usize {
549        if !cfg!(feature = "cdp") {
550            return 0;
551        }
552        let want =
553            |m: RendererMode| -> bool { matches!(self.mode, RendererMode::Auto) || self.mode == m };
554        let mut n = 0;
555        if want(RendererMode::Lightpanda) && self.lightpanda.is_some() {
556            n += 1;
557        }
558        if want(RendererMode::Playwright) && self.playwright.is_some() {
559            n += 1;
560        }
561        if want(RendererMode::Chrome) && self.chrome.is_some() {
562            n += 1;
563        }
564        n
565    }
566
567    /// Minimum request deadline budget (ms) required so that every configured
568    /// tier can use its full allowance when fallback exhausts the chain.
569    /// Sums the per-tier timeouts and adds [`CDP_TIER_OVERHEAD_MS`] for each
570    /// active CDP tier, matching the runtime ladder built in
571    /// `crw-renderer/src/lib.rs`.
572    pub fn min_deadline_for_full_ladder_ms(&self) -> u64 {
573        let want =
574            |m: RendererMode| -> bool { matches!(self.mode, RendererMode::Auto) || self.mode == m };
575
576        let mut sum: u64 = 0;
577        // HTTP prefetch runs ahead of any JS tier (content-type sniffing,
578        // direct PDF/binary handling) regardless of pinned mode. Skipped only
579        // when mode is `None` (no fetching at all).
580        if !matches!(self.mode, RendererMode::None) {
581            sum = sum.saturating_add(self.http_timeout());
582        }
583
584        // CDP tiers only contribute when the binary was built with the `cdp`
585        // feature; otherwise no JS renderer is constructable at runtime and
586        // including their budgets would over-extend the deadline.
587        if !cfg!(feature = "cdp") {
588            return sum;
589        }
590
591        let mut cdp_tier_count: u64 = 0;
592        if want(RendererMode::Lightpanda) && self.lightpanda.is_some() {
593            sum = sum.saturating_add(self.lightpanda_timeout());
594            cdp_tier_count += 1;
595        }
596        if want(RendererMode::Playwright) && self.playwright.is_some() {
597            sum = sum.saturating_add(self.chrome_timeout());
598            cdp_tier_count += 1;
599        }
600        if want(RendererMode::Chrome) && self.chrome.is_some() {
601            sum = sum.saturating_add(self.chrome_timeout());
602            cdp_tier_count += 1;
603        }
604        sum.saturating_add(cdp_tier_count.saturating_mul(CDP_TIER_OVERHEAD_MS))
605    }
606}
/// Serde default for `RendererConfig::pool_size`.
fn default_pool_size() -> usize {
    const SIZE: usize = 4;
    SIZE
}
610
611#[derive(Debug, Clone, Deserialize)]
612pub struct CdpEndpoint {
613    pub ws_url: String,
614}
615
616/// Stealth mode configuration for evading bot detection.
617#[derive(Debug, Clone, Deserialize)]
618pub struct StealthConfig {
619    /// Enable stealth mode globally.
620    #[serde(default)]
621    pub enabled: bool,
622    /// Custom user-agent pool. Empty = use built-in pool.
623    #[serde(default)]
624    pub user_agents: Vec<String>,
625    /// Jitter factor for rate limiting (0.0–1.0, default 0.2 = ±20%).
626    #[serde(default = "default_jitter")]
627    pub jitter_factor: f64,
628    /// Inject realistic browser headers (Accept, Sec-Fetch-*, etc.).
629    #[serde(default = "default_true")]
630    pub inject_headers: bool,
631}
632
633impl Default for StealthConfig {
634    fn default() -> Self {
635        Self {
636            enabled: false,
637            user_agents: vec![],
638            jitter_factor: default_jitter(),
639            inject_headers: true,
640        }
641    }
642}
643
/// Serde default for `StealthConfig::jitter_factor`.
fn default_jitter() -> f64 {
    const JITTER: f64 = 0.2;
    JITTER
}
647
/// Built-in pool of realistic user-agent strings, used when stealth is
/// enabled. Each entry is assembled at compile time from a platform prefix
/// and an engine/browser suffix.
pub const BUILTIN_UA_POOL: &[&str] = &[
    concat!(
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ",
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    ),
    concat!(
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ",
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    ),
    concat!(
        "Mozilla/5.0 (X11; Linux x86_64) ",
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    ),
    concat!(
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) ",
        "Gecko/20100101 Firefox/133.0",
    ),
    concat!(
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) ",
        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15",
    ),
];
656
657#[derive(Debug, Clone, Deserialize)]
658pub struct CrawlerConfig {
659    #[serde(default = "default_concurrency")]
660    pub max_concurrency: usize,
661    #[serde(default = "default_rps")]
662    pub requests_per_second: f64,
663    #[serde(default = "default_true")]
664    pub respect_robots_txt: bool,
665    #[serde(default = "default_ua")]
666    pub user_agent: String,
667    #[serde(default = "default_depth")]
668    pub default_max_depth: u32,
669    #[serde(default = "default_max_pages")]
670    pub default_max_pages: u32,
671    /// Proxy URL for crawler requests. Supports HTTP, HTTPS, and SOCKS5
672    /// (e.g. "http://proxy:8080" or "socks5://user:pass@proxy:1080").
673    #[serde(default)]
674    pub proxy: Option<String>,
675    /// TTL in seconds for completed crawl jobs before cleanup (default: 3600)
676    #[serde(default = "default_job_ttl")]
677    pub job_ttl_secs: u64,
678    #[serde(default)]
679    pub stealth: StealthConfig,
680    /// Floor for the per-host limiter interval, in milliseconds. When a host
681    /// advertises `Crawl-delay` in robots.txt, the higher of the two wins.
682    /// Default `0` — robots.txt is the authoritative source, this is a
683    /// per-deployment safety net.
684    #[serde(default)]
685    pub per_host_min_interval_ms: u64,
686    /// Maximum concurrent in-flight requests against a single eTLD+1.
687    /// Default `1` — strict ethics posture; operators raise consciously via
688    /// config when scraping their own infrastructure.
689    #[serde(default = "default_per_host_max_concurrent")]
690    pub per_host_max_concurrent: u32,
691}
692
/// Serde default for `CrawlerConfig::per_host_max_concurrent`.
fn default_per_host_max_concurrent() -> u32 {
    const MAX_IN_FLIGHT: u32 = 1;
    MAX_IN_FLIGHT
}
696
697impl Default for CrawlerConfig {
698    fn default() -> Self {
699        Self {
700            max_concurrency: default_concurrency(),
701            requests_per_second: default_rps(),
702            respect_robots_txt: true,
703            user_agent: default_ua(),
704            default_max_depth: default_depth(),
705            default_max_pages: default_max_pages(),
706            proxy: None,
707            job_ttl_secs: default_job_ttl(),
708            stealth: StealthConfig::default(),
709            per_host_min_interval_ms: 0,
710            per_host_max_concurrent: default_per_host_max_concurrent(),
711        }
712    }
713}
714
/// Serde default for `CrawlerConfig::max_concurrency`.
fn default_concurrency() -> usize {
    const WORKERS: usize = 10;
    WORKERS
}
/// Serde default for `CrawlerConfig::requests_per_second`.
fn default_rps() -> f64 {
    const RPS: f64 = 10.0;
    RPS
}
/// Shared serde default helper for boolean fields that start as `true`.
fn default_true() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
/// Serde default for `CrawlerConfig::user_agent`.
fn default_ua() -> String {
    // Modern Chrome UA. The legacy "CRW/0.1" was rejected by UA-filtering sites
    // (opencorporates, killeenisd, wsj) returning 403/404. Kept in sync with the
    // Sec-Ch-Ua client hint in `crw-renderer/src/http_only.rs`.
    String::from(concat!(
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ",
        "AppleWebKit/537.36 (KHTML, like Gecko) ",
        "Chrome/131.0.0.0 Safari/537.36",
    ))
}
/// Serde default for `CrawlerConfig::default_max_depth`.
fn default_depth() -> u32 {
    const DEPTH: u32 = 2;
    DEPTH
}
/// Serde default for `CrawlerConfig::default_max_pages`.
fn default_max_pages() -> u32 {
    const PAGES: u32 = 100;
    PAGES
}
/// Serde default for `CrawlerConfig::job_ttl_secs`: one hour, in seconds.
fn default_job_ttl() -> u64 {
    60 * 60
}
741
742#[derive(Debug, Clone, Deserialize)]
743pub struct ExtractionConfig {
744    #[serde(default = "default_format")]
745    pub default_format: String,
746    #[serde(default = "default_true_ext")]
747    pub only_main_content: bool,
748    #[serde(default)]
749    pub llm: Option<LlmConfig>,
750    /// Hostname → CSS selector overrides applied before readability narrowing.
751    /// Match is exact host (no wildcard); user-supplied selector still wins.
752    #[serde(default)]
753    pub domain_selectors: std::collections::HashMap<String, String>,
754    #[serde(default)]
755    pub llm_fallback: LlmFallbackConfig,
756    /// Bytes below which an HTTP-tier extraction is treated as "thin"
757    /// and triggers a JS-renderer escalation. Default 100.
758    #[serde(default = "default_http_retry_threshold")]
759    pub http_retry_threshold_bytes: usize,
760    /// Bytes below which a LightPanda-tier extraction is treated as
761    /// "thin" and triggers a Chrome escalation. Default 2000 (LP often
762    /// returns SPA husks of 90–500B that pass HTML-shape checks).
763    #[serde(default = "default_lightpanda_retry_threshold")]
764    pub lightpanda_retry_threshold_bytes: usize,
765}
766
/// Serde default for `ExtractionConfig::http_retry_threshold_bytes`.
fn default_http_retry_threshold() -> usize {
    const BYTES: usize = 100;
    BYTES
}
770
/// Serde default for `ExtractionConfig::lightpanda_retry_threshold_bytes`.
fn default_lightpanda_retry_threshold() -> usize {
    const BYTES: usize = 2000;
    BYTES
}
774
775impl Default for ExtractionConfig {
776    fn default() -> Self {
777        Self {
778            default_format: default_format(),
779            only_main_content: true,
780            llm: None,
781            domain_selectors: std::collections::HashMap::new(),
782            llm_fallback: LlmFallbackConfig::default(),
783            http_retry_threshold_bytes: default_http_retry_threshold(),
784            lightpanda_retry_threshold_bytes: default_lightpanda_retry_threshold(),
785        }
786    }
787}
788
789#[derive(Debug, Clone, Deserialize)]
790pub struct LlmFallbackConfig {
791    #[serde(default)]
792    pub enable: bool,
793    #[serde(default = "default_llm_quality_threshold")]
794    pub quality_threshold: f32,
795    #[serde(default = "default_llm_max_html_bytes")]
796    pub max_html_bytes: usize,
797    /// When true (and `enable` is true), invoke the LLM on every page rather
798    /// than only when DOM-based extraction scores below `quality_threshold`.
799    /// Mirrors the "LLM as primary extractor" pattern used by Reader-LM,
800    /// Firecrawl, and similar services. Higher cost, higher recall.
801    #[serde(default)]
802    pub always_run: bool,
803}
804
805impl Default for LlmFallbackConfig {
806    fn default() -> Self {
807        Self {
808            enable: false,
809            quality_threshold: default_llm_quality_threshold(),
810            max_html_bytes: default_llm_max_html_bytes(),
811            always_run: false,
812        }
813    }
814}
815
/// Serde default for `LlmFallbackConfig::quality_threshold`.
fn default_llm_quality_threshold() -> f32 {
    const DEFAULT_THRESHOLD: f32 = 0.3;
    DEFAULT_THRESHOLD
}
/// Serde default for `LlmFallbackConfig::max_html_bytes`.
fn default_llm_max_html_bytes() -> usize {
    const DEFAULT_BYTES: usize = 100_000;
    DEFAULT_BYTES
}
822
823#[derive(Debug, Clone, Deserialize)]
824pub struct LlmConfig {
825    #[serde(default = "default_llm_provider")]
826    pub provider: String,
827    pub api_key: String,
828    #[serde(default = "default_llm_model")]
829    pub model: String,
830    #[serde(default)]
831    pub base_url: Option<String>,
832    #[serde(default = "default_llm_max_tokens")]
833    pub max_tokens: u32,
834    /// Azure OpenAI API version (e.g. "2024-05-01-preview"). Required when
835    /// `provider = "azure"`; ignored otherwise.
836    #[serde(default)]
837    pub azure_api_version: Option<String>,
838}
839
/// Serde default for `LlmConfig::provider`.
fn default_llm_provider() -> String {
    String::from("anthropic")
}
/// Serde default for `LlmConfig::model`.
fn default_llm_model() -> String {
    String::from("claude-sonnet-4-20250514")
}
/// Serde default for `LlmConfig::max_tokens`.
fn default_llm_max_tokens() -> u32 {
    const DEFAULT_TOKENS: u32 = 4_096;
    DEFAULT_TOKENS
}
849
/// Serde default for `ExtractionConfig::default_format`.
fn default_format() -> String {
    String::from("markdown")
}
/// Serde default for `ExtractionConfig::only_main_content`.
fn default_true_ext() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
856
857#[derive(Debug, Clone, Default, Deserialize)]
858pub struct AuthConfig {
859    #[serde(default)]
860    pub api_keys: Vec<String>,
861}
862
863impl AppConfig {
864    /// Load config from config.default.toml + environment variable overrides.
865    /// Env vars use `CRW_` prefix, `__` as separator. E.g. `CRW_SERVER__PORT=8080`.
866    pub fn load() -> Result<Self, config::ConfigError> {
867        let mut builder = config::Config::builder()
868            .add_source(config::File::with_name("config.default").required(false));
869
870        // Load optional override config file (e.g. config.docker.toml in containers).
871        if let Ok(extra) = std::env::var("CRW_CONFIG") {
872            builder = builder.add_source(config::File::with_name(&extra).required(true));
873        } else {
874            builder = builder.add_source(config::File::with_name("config.local").required(false));
875        }
876
877        let cfg = builder
878            .add_source(
879                config::Environment::with_prefix("CRW")
880                    .prefix_separator("_")
881                    .separator("__")
882                    .try_parsing(true),
883            )
884            .build()?;
885        cfg.try_deserialize()
886    }
887
888    /// Compute the effective end-to-end request deadline (ms). Implements the
889    /// issue-#35 auto-extension policy:
890    ///
891    /// 1. If the caller supplied an explicit `requested_deadline_ms`, return it
892    ///    verbatim — operators trust the request budget over our heuristic.
893    /// 2. Otherwise, when `request.auto_extend_deadline_for_ladder` is on,
894    ///    return `max(deadline_ms_default, ladder_min + wait_for_extra)`.
895    ///    `ladder_min` covers the configured tier ladder; `wait_for_extra`
896    ///    compensates for callers that bumped `wait_for_ms` above the default
897    ///    SPA budget (8s) — without it, a long `wait_for` would silently
898    ///    re-clamp inside CDP.
899    /// 3. When the policy is disabled, return `deadline_ms_default` unchanged.
900    ///
901    /// `wait_for_ms` is the per-request override (ScrapeRequest::wait_for /
902    /// CrawlRequest::wait_for); pass `None` for sub-fetches that don't
903    /// surface a wait_for to the caller (search/map enrichment).
904    pub fn effective_deadline_ms(
905        &self,
906        requested_deadline_ms: Option<u64>,
907        wait_for_ms: Option<u64>,
908    ) -> u64 {
909        if let Some(explicit) = requested_deadline_ms {
910            return explicit;
911        }
912        let default_ms = self.request.deadline_ms_default;
913        if !self.request.auto_extend_deadline_for_ladder {
914            return default_ms;
915        }
916        // Issue #35 is specifically about CDP tier overhead silently clamping
917        // chrome_timeout_ms. HTTP-only deployments don't suffer the same
918        // problem (the HTTP renderer respects deadline.remaining without the
919        // extra fetch/challenge/stability overhead). Skip the extension when
920        // no CDP tiers are configured so HTTP-only users keep the strict
921        // operator-configured default.
922        if self.renderer.cdp_tier_count() == 0 {
923            return default_ms;
924        }
925        let ladder_min = self.renderer.min_deadline_for_full_ladder_ms();
926        // Mirrors crw_renderer::cdp::SPA_SELECTOR_MAX_MS. The CDP module
927        // adds `wait_for_ms.unwrap_or(SPA_SELECTOR_MAX_MS)` to its internal
928        // timeout, so when the caller exceeds the default we need to extend
929        // the deadline per active CDP tier.
930        const SPA_DEFAULT_MS: u64 = 8_000;
931        // Clamp `wait_for_ms` to MAX_WAIT_FOR_MS so the inner deadline never
932        // exceeds the Tower envelope, which is sized off the same constant in
933        // `effective_request_timeout_secs`. A pathological caller passing
934        // `wait_for: 600_000` without `deadlineMs` would otherwise be cancelled
935        // by Tower before the inner CDP loop noticed the bigger budget.
936        let extra = if let Some(w) = wait_for_ms {
937            let bounded = w.min(MAX_WAIT_FOR_MS);
938            let per_tier = bounded.saturating_sub(SPA_DEFAULT_MS);
939            per_tier.saturating_mul(self.renderer.cdp_tier_count() as u64)
940        } else {
941            0
942        };
943        default_ms.max(ladder_min.saturating_add(extra))
944    }
945
946    /// Tower middleware outer timeout (seconds). Must accommodate the longest
947    /// legitimate handler runtime so a healthy request isn't cancelled by the
948    /// outer layer before the inner deadline fires.
949    ///
950    /// Covers the three route envelopes:
951    /// - `/scrape`, `/mcp` — auto-extended scrape deadline.
952    /// - `/search` — SearXNG fetch + bounded enrichment fan-out
953    ///   (`ceil(max_limit / max_concurrency)` batches × scrape_ms).
954    /// - `/crawl/jobs/:id`, `/map` — handler-side caps up to 300s.
955    ///
956    /// When auto-extend is disabled, returns the operator-configured baseline
957    /// unchanged.
958    pub fn effective_request_timeout_secs(&self) -> u64 {
959        let baseline = self.server.request_timeout_secs;
960        if !self.request.auto_extend_deadline_for_ladder {
961            return baseline;
962        }
963        const OUTER_BUFFER_SECS: u64 = 5;
964        // `/map` handler caps `req.timeout.unwrap_or(120).min(300)`; the outer
965        // must cover the upper bound so callers passing `timeout=300` aren't
966        // cancelled mid-flight.
967        const MAP_REQUEST_TIMEOUT_CEILING_MS: u64 = 300_000;
968        // Cover the worst-case implicit scrape: caller bumps `wait_for` to the
969        // configured maximum without supplying `deadlineMs`. The same
970        // [`MAX_WAIT_FOR_MS`] constant is used inside `effective_deadline_ms`
971        // to clamp the inner extension, so the inner deadline can never
972        // exceed this outer envelope.
973        let scrape_ms = self.effective_deadline_ms(None, Some(MAX_WAIT_FOR_MS));
974
975        // Search enrichment: bounded by max_concurrency. Worst case sequential
976        // batching with low concurrency: ceil(max_limit / max_concurrency)
977        // batches each bounded by scrape_ms.
978        let conc = (self.crawler.max_concurrency.max(1)) as u64;
979        let max_results = self.search.max_limit as u64;
980        let enrich_batches = max_results.div_ceil(conc);
981        let search_enrichment_ms = enrich_batches.saturating_mul(scrape_ms);
982        let search_ms = self.search.timeout_ms.saturating_add(search_enrichment_ms);
983
984        let max_handler_ms = scrape_ms.max(search_ms).max(MAP_REQUEST_TIMEOUT_CEILING_MS);
985        let needed_secs = max_handler_ms
986            .div_ceil(1_000)
987            .saturating_add(OUTER_BUFFER_SECS);
988        baseline.max(needed_secs)
989    }
990}
991
#[cfg(test)]
mod tests {
    use super::*;

    /// Env var tests modify process-wide state; serialize them to avoid cross-test
    /// interference (e.g. `force_js` alias + `render_js_default` direct both set).
    static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());

    /// Remove every `CRW_*` variable the env tests in this module may set.
    /// Callers are expected to hold `ENV_LOCK` (see `EnvGuard`).
    fn clear_renderer_env() {
        for k in [
            "CRW_RENDERER__MODE",
            "CRW_RENDERER__FORCE_JS",
            "CRW_RENDERER__RENDER_JS_DEFAULT",
            "CRW_RENDERER__LIGHTPANDA__WS_URL",
            "CRW_SERVER__PORT",
        ] {
            // SAFETY: environment mutation is only sound while no other thread
            // touches the environment. Every mutation in this module happens
            // with `ENV_LOCK` held; this assumes no other test in the same
            // binary reads or writes the environment concurrently.
            unsafe { std::env::remove_var(k) };
        }
    }

    /// Scope guard for tests that mutate the environment.
    ///
    /// Acquiring it takes `ENV_LOCK` and starts from a clean set of variables;
    /// dropping it clears them again while the lock is still held. Because the
    /// cleanup runs during unwinding too, a failing test can neither leak
    /// variables into the next test nor make it fail on a poisoned lock.
    struct EnvGuard {
        _lock: std::sync::MutexGuard<'static, ()>,
    }

    impl EnvGuard {
        fn acquire() -> Self {
            // The mutex protects no data, so a poisoned lock (an earlier env
            // test panicked) carries no broken invariant and is safe to reuse.
            let lock = ENV_LOCK
                .lock()
                .unwrap_or_else(std::sync::PoisonError::into_inner);
            clear_renderer_env();
            Self { _lock: lock }
        }
    }

    impl Drop for EnvGuard {
        fn drop(&mut self) {
            // Runs before `_lock` is released, so cleanup stays serialized.
            clear_renderer_env();
        }
    }

    #[test]
    fn renderer_mode_parses_variants() {
        #[derive(Deserialize)]
        struct Wrap {
            mode: RendererMode,
        }
        let cases = [
            ("mode = \"auto\"", RendererMode::Auto),
            ("mode = \"none\"", RendererMode::None),
            ("mode = \"lightpanda\"", RendererMode::Lightpanda),
            ("mode = \"chrome\"", RendererMode::Chrome),
            ("mode = \"playwright\"", RendererMode::Playwright),
        ];
        for (toml_str, expected) in cases {
            let w: Wrap = toml::from_str(toml_str).unwrap();
            assert_eq!(w.mode, expected, "toml: {toml_str}");
        }
    }

    #[test]
    fn renderer_mode_bogus_errors() {
        #[derive(Deserialize)]
        struct Wrap {
            // Only deserialization is exercised; the field is never read.
            #[allow(dead_code)]
            mode: RendererMode,
        }
        let err: Result<Wrap, _> = toml::from_str("mode = \"bogus\"");
        assert!(err.is_err(), "bogus mode should fail to parse");
    }

    #[test]
    fn renderer_config_default_mode_is_auto() {
        let cfg = RendererConfig::default();
        assert_eq!(cfg.mode, RendererMode::Auto);
        assert_eq!(cfg.render_js_default, None);
    }

    #[test]
    fn render_js_default_force_js_alias() {
        let cfg: RendererConfig = toml::from_str("force_js = true").unwrap();
        assert_eq!(cfg.render_js_default, Some(true));
    }

    #[test]
    fn render_js_default_direct_field() {
        let cfg: RendererConfig = toml::from_str("render_js_default = false").unwrap();
        assert_eq!(cfg.render_js_default, Some(false));
    }

    #[test]
    fn env_var_renderer_mode_chrome() {
        let _env = EnvGuard::acquire();
        // SAFETY: `ENV_LOCK` is held through `_env`; see `clear_renderer_env`.
        unsafe { std::env::set_var("CRW_RENDERER__MODE", "chrome") };
        let cfg = AppConfig::load().unwrap();
        assert_eq!(cfg.renderer.mode, RendererMode::Chrome);
    }

    #[test]
    fn env_var_force_js_alias_works() {
        let _env = EnvGuard::acquire();
        // SAFETY: `ENV_LOCK` is held through `_env`; see `clear_renderer_env`.
        unsafe { std::env::set_var("CRW_RENDERER__FORCE_JS", "true") };
        let cfg = AppConfig::load().unwrap();
        assert_eq!(cfg.renderer.render_js_default, Some(true));
    }

    #[test]
    fn env_var_render_js_default_direct() {
        let _env = EnvGuard::acquire();
        // SAFETY: `ENV_LOCK` is held through `_env`; see `clear_renderer_env`.
        unsafe { std::env::set_var("CRW_RENDERER__RENDER_JS_DEFAULT", "true") };
        let cfg = AppConfig::load().unwrap();
        assert_eq!(cfg.renderer.render_js_default, Some(true));
    }

    #[test]
    fn request_config_defaults_match_plan() {
        let r = RequestConfig::default();
        assert_eq!(r.deadline_ms_default, 8000);
        assert!(r.auto_extend_deadline_for_ladder);
    }

    #[test]
    fn default_app_config_enables_auto_extend() {
        // Programmatic Default must mirror serde defaults — issue #35.
        let cfg = AppConfig::default();
        assert!(cfg.request.auto_extend_deadline_for_ladder);
        assert_eq!(cfg.request.deadline_ms_default, 8000);
    }

    /// Chrome-only renderer fixture: `page_timeout_ms` and `chrome_timeout_ms`
    /// are both set to `chrome_ms`, with a Chrome endpoint configured.
    fn renderer_with_chrome_only(chrome_ms: u64) -> RendererConfig {
        RendererConfig {
            mode: RendererMode::Chrome,
            page_timeout_ms: chrome_ms,
            chrome_timeout_ms: Some(chrome_ms),
            chrome: Some(CdpEndpoint {
                ws_url: "ws://chrome:9222".into(),
            }),
            ..Default::default()
        }
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn min_deadline_full_ladder_chrome_only() {
        // chrome-only mode: http (page_timeout) + chrome + 1 * 28000.
        let r = renderer_with_chrome_only(30_000);
        // page_timeout_ms is set to chrome_ms here, so http_timeout() → 30s.
        assert_eq!(
            r.min_deadline_for_full_ladder_ms(),
            30_000 + 30_000 + 28_000
        );
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn min_deadline_full_ladder_auto_three_tiers() {
        let r = RendererConfig {
            mode: RendererMode::Auto,
            page_timeout_ms: 15_000,
            http_timeout_ms: Some(15_000),
            lightpanda_timeout_ms: Some(2_500),
            chrome_timeout_ms: Some(30_000),
            lightpanda: Some(CdpEndpoint {
                ws_url: "ws://lp:9222".into(),
            }),
            chrome: Some(CdpEndpoint {
                ws_url: "ws://chrome:9222".into(),
            }),
            ..Default::default()
        };
        // http(15) + lp(2.5) + chrome(30) + 2*28 = 47.5 + 56 = 103_500.
        assert_eq!(
            r.min_deadline_for_full_ladder_ms(),
            15_000 + 2_500 + 30_000 + 2 * 28_000
        );
        assert_eq!(r.cdp_tier_count(), 2);
    }

    #[test]
    fn effective_deadline_explicit_bypasses_auto_extend() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.renderer = renderer_with_chrome_only(30_000);
        // Explicit override beats both default and ladder_min.
        assert_eq!(cfg.effective_deadline_ms(Some(5_000), None), 5_000);
        assert_eq!(cfg.effective_deadline_ms(Some(500_000), None), 500_000);
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn effective_deadline_auto_extend_raises_to_ladder_min() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        let expected = cfg.renderer.min_deadline_for_full_ladder_ms();
        assert!(expected > 8_000);
        assert_eq!(cfg.effective_deadline_ms(None, None), expected);
    }

    #[test]
    fn effective_deadline_default_wins_when_higher_than_ladder() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 1_000_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        assert_eq!(cfg.effective_deadline_ms(None, None), 1_000_000);
    }

    #[test]
    fn effective_deadline_auto_extend_disabled_returns_baseline() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = false;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        assert_eq!(cfg.effective_deadline_ms(None, None), 8_000);
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn effective_deadline_extends_for_long_wait_for() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        let base = cfg.renderer.min_deadline_for_full_ladder_ms();
        let tier_count = cfg.renderer.cdp_tier_count() as u64;
        // wait_for = 20000 → per-tier extra = 12000 over SPA_DEFAULT_MS (8000).
        let with_wait = cfg.effective_deadline_ms(None, Some(20_000));
        assert_eq!(with_wait, base + 12_000 * tier_count);
        // wait_for below SPA default → no extra.
        assert_eq!(cfg.effective_deadline_ms(None, Some(2_000)), base);
    }

    #[test]
    fn effective_request_timeout_covers_map_ceiling() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        cfg.search.timeout_ms = 15_000;
        cfg.crawler.max_concurrency = 10;
        cfg.search.max_limit = 20;
        cfg.server.request_timeout_secs = 60;
        // Map ceiling 300s + 5s buffer = 305s minimum.
        assert!(cfg.effective_request_timeout_secs() >= 305);
    }

    #[test]
    fn effective_request_timeout_disabled_returns_baseline() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = false;
        cfg.server.request_timeout_secs = 60;
        assert_eq!(cfg.effective_request_timeout_secs(), 60);
    }

    #[test]
    fn effective_request_timeout_respects_operator_override() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.server.request_timeout_secs = 600; // operator-configured high
        cfg.renderer = renderer_with_chrome_only(30_000);
        // Operator's explicit 600s should win over the auto-computed 305s.
        assert_eq!(cfg.effective_request_timeout_secs(), 600);
    }

    #[test]
    fn effective_request_timeout_search_sequential_batching() {
        // Low concurrency forces ceil(max_limit/conc) batches → larger search_ms.
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        cfg.search.timeout_ms = 15_000;
        cfg.search.max_limit = 20;
        cfg.crawler.max_concurrency = 1;
        cfg.server.request_timeout_secs = 60;
        // The Tower envelope must cover the worst-case implicit scrape with
        // `wait_for` bumped to MAX_WAIT_FOR_MS (60s), because callers can do
        // that without supplying `deadlineMs`. Mirror that in the expected.
        let secs = cfg.effective_request_timeout_secs();
        let scrape_ms = cfg.effective_deadline_ms(None, Some(60_000));
        let expected_search_ms = 15_000 + 20 * scrape_ms;
        let expected_max_ms = scrape_ms.max(expected_search_ms).max(300_000);
        let expected_secs = expected_max_ms.div_ceil(1_000) + 5;
        assert_eq!(secs, 60u64.max(expected_secs));
    }

    #[test]
    #[cfg(not(feature = "cdp"))]
    fn cdp_tier_count_zero_without_cdp_feature() {
        // Even when chrome/lightpanda are configured, a binary built without
        // the `cdp` feature can never construct a JS renderer. The deadline
        // policy must observe that and collapse to HTTP-only behavior.
        let r = RendererConfig {
            mode: RendererMode::Auto,
            page_timeout_ms: 15_000,
            chrome_timeout_ms: Some(30_000),
            chrome: Some(CdpEndpoint {
                ws_url: "ws://chrome:9222".into(),
            }),
            lightpanda: Some(CdpEndpoint {
                ws_url: "ws://lp:9222".into(),
            }),
            ..Default::default()
        };
        assert_eq!(r.cdp_tier_count(), 0);
        // Only the HTTP tier contributes to the ladder budget.
        assert_eq!(r.min_deadline_for_full_ladder_ms(), 15_000);
    }

    #[test]
    fn effective_deadline_skipped_for_http_only_mode() {
        // P2 from codex review: HTTP-only deployments don't suffer the CDP
        // clamping problem (no fetch/challenge/stability overhead). The
        // auto-extension must NOT silently bump their default from 8s to 30s
        // just because page_timeout_ms defaults high.
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = RendererConfig {
            mode: RendererMode::Auto,
            page_timeout_ms: 30_000,
            // No CDP endpoints configured.
            lightpanda: None,
            playwright: None,
            chrome: None,
            ..Default::default()
        };
        assert_eq!(cfg.renderer.cdp_tier_count(), 0);
        assert_eq!(cfg.effective_deadline_ms(None, None), 8_000);
        assert_eq!(cfg.effective_deadline_ms(None, Some(30_000)), 8_000);
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn min_deadline_full_ladder_playwright_only() {
        // Playwright tier contributes one chrome_timeout + one CDP overhead,
        // matching the runtime predicate in `crw-renderer/src/lib.rs`.
        let r = RendererConfig {
            mode: RendererMode::Playwright,
            page_timeout_ms: 15_000,
            http_timeout_ms: Some(15_000),
            chrome_timeout_ms: Some(30_000),
            playwright: Some(CdpEndpoint {
                ws_url: "ws://playwright:9222".into(),
            }),
            ..Default::default()
        };
        assert_eq!(r.cdp_tier_count(), 1);
        // http(15) + chrome-equivalent(30) + 1 * 28 overhead.
        assert_eq!(
            r.min_deadline_for_full_ladder_ms(),
            15_000 + 30_000 + 28_000
        );
    }

    #[test]
    fn renderer_phase_toggles_default_off_or_safe() {
        let r = RendererConfig::default();
        assert!(!r.chrome_intercept_resources);
        assert!(!r.chrome_intercept_stylesheets);
        assert!(r.chrome_host_intercept_disable.is_empty());
        assert_eq!(r.chrome_nav_budget_ms, 12_000);
        assert!(!r.chrome_context_pool_enabled);
        assert!(!r.use_predictor);
    }

    #[test]
    fn crawler_per_host_limiter_defaults() {
        let c = CrawlerConfig::default();
        assert_eq!(c.per_host_min_interval_ms, 0);
        assert_eq!(c.per_host_max_concurrent, 1);
    }

    #[test]
    fn env_var_overrides_toml_defaults() {
        let _env = EnvGuard::acquire();
        // SAFETY: `ENV_LOCK` is held through `_env`; see `clear_renderer_env`.
        unsafe {
            std::env::set_var("CRW_SERVER__PORT", "4444");
            std::env::set_var("CRW_RENDERER__LIGHTPANDA__WS_URL", "ws://test:9999/");
        }
        let cfg = AppConfig::load().unwrap();

        assert_eq!(cfg.server.port, 4444, "env var should override server.port");
        assert_eq!(
            cfg.renderer.lightpanda.as_ref().unwrap().ws_url,
            "ws://test:9999/",
            "env var should override renderer.lightpanda.ws_url"
        );
    }
}