Skip to main content

crw_core/
config.rs

1use serde::Deserialize;
2
/// Root application configuration, deserialized as a single document.
///
/// Every section carries `#[serde(default)]`, so a section that is absent
/// from the input is filled in from that section type's `Default` impl.
#[derive(Debug, Clone, Deserialize, Default)]
pub struct AppConfig {
    /// `server` section — see [`ServerConfig`].
    #[serde(default)]
    pub server: ServerConfig,
    /// `renderer` section — see [`RendererConfig`].
    #[serde(default)]
    pub renderer: RendererConfig,
    /// `crawler` section — see [`CrawlerConfig`].
    #[serde(default)]
    pub crawler: CrawlerConfig,
    /// `extraction` section — see [`ExtractionConfig`].
    #[serde(default)]
    pub extraction: ExtractionConfig,
    /// `auth` section — see `AuthConfig`.
    #[serde(default)]
    pub auth: AuthConfig,
    /// `request` section — see [`RequestConfig`].
    #[serde(default)]
    pub request: RequestConfig,
    /// `search` section — see [`SearchConfig`].
    #[serde(default)]
    pub search: SearchConfig,
    /// `map` section — see [`MapConfig`].
    #[serde(default)]
    pub map: MapConfig,
    /// `[client]` — settings for the local CLI/MCP when it proxies to the
    /// hosted SaaS. Written by `crw setup` into the user-config file.
    #[serde(default)]
    pub client: ClientConfig,
}
26
27/// `[client]` — cloud-proxy credentials populated by `crw setup` and read by
28/// `crw mcp` / `crw-mcp`. Both fields are `Option` so an unconfigured user runs
29/// in local mode without surprise overrides.
30#[derive(Debug, Clone, Default, Deserialize)]
31pub struct ClientConfig {
32    /// Base URL of the hosted CRW API, e.g. `https://api.fastcrw.com`.
33    #[serde(default)]
34    pub api_url: Option<String>,
35    /// API key for the hosted CRW API.
36    #[serde(default)]
37    pub api_key: Option<String>,
38}
39
/// `[map]` section — currently only carries `[map.url_filter]`.
#[derive(Debug, Clone, Deserialize, Default)]
pub struct MapConfig {
    /// `[map.url_filter]` knobs; falls back to
    /// `MapUrlFilterConfig::default()` when the table is absent.
    #[serde(default)]
    pub url_filter: MapUrlFilterConfig,
}
46
/// `[map.url_filter]` — raw TOML view of the filter knobs. Conversion to
/// the runtime `UrlFilterCfg` lives in `crw-crawl` (which can see both this
/// type and the filter module). Keeping this struct dependency-free here
/// avoids a cycle (`crw-core` does not depend on `crw-crawl`).
///
/// Missing keys are filled per field: the two tier switches default to
/// `true` via `default_true_filter`, everything else to its type default
/// (`false` / empty list).
#[derive(Debug, Clone, Deserialize)]
pub struct MapUrlFilterConfig {
    /// Tier B — strip tracking params. Default: `true`.
    #[serde(default = "default_true_filter")]
    pub strip_tracking_params: bool,
    /// Tier A — drop action URLs entirely. Default: `true`.
    #[serde(default = "default_true_filter")]
    pub drop_action_urls: bool,
    /// When `true`, `.gov`/`.mil` hosts run Tier A too. Default `false`.
    #[serde(default)]
    pub gov_tld_drop_actions: bool,
    /// Additive on top of `DEFAULT_TRACKING_PARAMS`. Default: empty.
    #[serde(default)]
    pub extra_tracking_params: Vec<String>,
    /// Additive on top of `DEFAULT_ACTION_PARAMS`. Default: empty.
    #[serde(default)]
    pub extra_action_params: Vec<String>,
    /// Additive on top of `ALWAYS_PRESERVE`. Default: empty.
    #[serde(default)]
    pub extra_preserve_params: Vec<String>,
}
72
73impl Default for MapUrlFilterConfig {
74    fn default() -> Self {
75        Self {
76            strip_tracking_params: true,
77            drop_action_urls: true,
78            gov_tld_drop_actions: false,
79            extra_tracking_params: Vec::new(),
80            extra_action_params: Vec::new(),
81            extra_preserve_params: Vec::new(),
82        }
83    }
84}
85
/// Serde default for the `MapUrlFilterConfig` tier switches
/// (`strip_tracking_params`, `drop_action_urls`): enabled.
fn default_true_filter() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
89
/// Fixed overhead, in milliseconds, budgeted for every CDP tier on top of
/// its page timeout: SPA selector poll budget + challenge retry budget +
/// content-stability budget + fetch overhead.
///
/// Mirrors the constants in `crw-renderer::cdp`; drift between the two is
/// regression-tested by `crates/crw-server/tests/cdp_constants_test.rs`
/// (gated behind `feature = "cdp"`).
///
/// Consumed by [`RendererConfig::min_deadline_for_full_ladder_ms`] so the
/// request deadline covers each CDP tier's outer fetch timeout, not just its
/// configured `page_timeout`.
pub const CDP_TIER_OVERHEAD_MS: u64 = 28 * 1_000;
100
/// Ceiling for the per-request `wait_for_ms` budget (60 s).
///
/// The Tower outer timeout is sized so that a worst-case implicit scrape (no
/// `deadlineMs`, `wait_for` at this maximum) still finishes inside it. Larger
/// values are clamped by [`AppConfig::effective_deadline_ms`], so the inner
/// deadline can never escape the outer envelope. Documented as `(0, 60000]`
/// in `types.rs::ScrapeRequest::wait_for`.
pub const MAX_WAIT_FOR_MS: u64 = 60 * 1_000;
108
/// Configuration for the `/v1/search` endpoint and its SearXNG backend.
///
/// When `searxng_url` is unset the endpoint returns HTTP 503 with
/// `error_code: "search_disabled"` — the route remains mounted so that
/// startup doesn't have to know whether search will ever be configured.
#[derive(Debug, Clone, Deserialize)]
pub struct SearchConfig {
    /// Master switch. Defaults to `true`; set to `false` to refuse all
    /// `/v1/search` requests even if `searxng_url` is configured.
    #[serde(default = "default_true_search")]
    pub enabled: bool,
    /// Base URL of the SearXNG instance (e.g. `http://searxng:8080`).
    /// `None` (the default) disables the endpoint with a clear error.
    #[serde(default)]
    pub searxng_url: Option<String>,
    /// End-to-end timeout for the SearXNG call in milliseconds.
    /// Default: `15000`.
    #[serde(default = "default_search_timeout_ms")]
    pub timeout_ms: u64,
    /// Default `limit` when the request omits it. Default: `5`.
    #[serde(default = "default_search_limit")]
    pub default_limit: u32,
    /// Hard cap on `limit` per request. SaaS uses 20. Default: `20`.
    #[serde(default = "default_search_max_limit")]
    pub max_limit: u32,
    /// SearXNG engines invoked when the request includes `categories: ["research"]`.
    /// Defaults match the SaaS implementation: `arxiv`, `crossref`,
    /// `google scholar`, `semantic scholar`.
    #[serde(default = "default_research_engines")]
    pub research_engines: Vec<String>,
    /// SearXNG engines invoked when the request includes `categories: ["github"]`.
    /// Default: `["github"]`.
    #[serde(default = "default_github_engines")]
    pub github_engines: Vec<String>,
}
141
142impl Default for SearchConfig {
143    fn default() -> Self {
144        Self {
145            enabled: true,
146            searxng_url: None,
147            timeout_ms: default_search_timeout_ms(),
148            default_limit: default_search_limit(),
149            max_limit: default_search_max_limit(),
150            research_engines: default_research_engines(),
151            github_engines: default_github_engines(),
152        }
153    }
154}
155
/// Serde default for `SearchConfig::enabled`: search is on unless disabled.
fn default_true_search() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
/// Serde default for `SearchConfig::timeout_ms`: 15 seconds, in milliseconds.
fn default_search_timeout_ms() -> u64 {
    const TIMEOUT_MS: u64 = 15_000;
    TIMEOUT_MS
}
/// Serde default for `SearchConfig::default_limit`: 5 results.
fn default_search_limit() -> u32 {
    const LIMIT: u32 = 5;
    LIMIT
}
/// Serde default for `SearchConfig::max_limit`: at most 20 results.
fn default_search_max_limit() -> u32 {
    const MAX_LIMIT: u32 = 20;
    MAX_LIMIT
}
/// Serde default for `SearchConfig::research_engines`: the four engine
/// names below, in this order.
fn default_research_engines() -> Vec<String> {
    const ENGINES: [&str; 4] = ["arxiv", "crossref", "google scholar", "semantic scholar"];
    ENGINES.iter().map(|engine| (*engine).to_owned()).collect()
}
/// Serde default for `SearchConfig::github_engines`: just `github`.
fn default_github_engines() -> Vec<String> {
    let mut engines = Vec::with_capacity(1);
    engines.push(String::from("github"));
    engines
}
179
/// Per-request defaults that apply to every scrape, crawl, or map call when
/// the caller does not specify an override. Currently only governs the
/// end-to-end deadline budget (see `crw-core/src/deadline.rs`).
#[derive(Debug, Clone, Deserialize)]
pub struct RequestConfig {
    /// Default end-to-end deadline budget in milliseconds when a request does
    /// not specify `deadlineMs`. The SLO p95 latency metric is computed only
    /// over requests with `deadline_ms <= 8000`; longer values land in a
    /// separate slow-path histogram.
    ///
    /// Default: `8000` (from `default_deadline_ms`).
    #[serde(default = "default_deadline_ms")]
    pub deadline_ms_default: u64,
    /// When `true` (default), an implicit deadline (no per-request `deadlineMs`)
    /// is auto-extended to `max(deadline_ms_default, ladder_min)` where
    /// `ladder_min = sum(http+lightpanda+chrome timeouts) + N_cdp_tiers * 28s`.
    /// This prevents `chrome_timeout_ms = 30000` from appearing inert when
    /// `deadline_ms_default` is small (issue #35).
    ///
    /// Set to `false` to enforce a strict SLO regardless of tier sizing —
    /// requests that would have completed under the extended budget will
    /// instead time out at `deadline_ms_default`.
    #[serde(default = "default_true_request")]
    pub auto_extend_deadline_for_ladder: bool,
}
203
204impl Default for RequestConfig {
205    fn default() -> Self {
206        Self {
207            deadline_ms_default: default_deadline_ms(),
208            auto_extend_deadline_for_ladder: true,
209        }
210    }
211}
212
/// Serde default for `RequestConfig::auto_extend_deadline_for_ladder`: on.
fn default_true_request() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
216
/// Serde default for `RequestConfig::deadline_ms_default`: 8 seconds, in
/// milliseconds.
fn default_deadline_ms() -> u64 {
    const DEADLINE_MS: u64 = 8_000;
    DEADLINE_MS
}
220
/// `server` section of [`AppConfig`]. Every field has a serde default, so
/// the whole section may be omitted.
#[derive(Debug, Clone, Deserialize)]
pub struct ServerConfig {
    /// Host value for the server. Default: `"0.0.0.0"`.
    #[serde(default = "default_host")]
    pub host: String,
    /// Port value for the server. Default: `3000`.
    #[serde(default = "default_port")]
    pub port: u16,
    /// Request timeout in seconds. Default: `60`.
    #[serde(default = "default_request_timeout")]
    pub request_timeout_secs: u64,
    /// Maximum requests per second (global). 0 = unlimited. Default: `10`.
    #[serde(default = "default_rate_limit_rps")]
    pub rate_limit_rps: u64,
}
233
234impl Default for ServerConfig {
235    fn default() -> Self {
236        Self {
237            host: default_host(),
238            port: default_port(),
239            request_timeout_secs: default_request_timeout(),
240            rate_limit_rps: default_rate_limit_rps(),
241        }
242    }
243}
244
/// Serde default for `ServerConfig::rate_limit_rps`: 10 requests per second.
fn default_rate_limit_rps() -> u64 {
    const REQUESTS_PER_SECOND: u64 = 10;
    REQUESTS_PER_SECOND
}
248
/// Serde default for `ServerConfig::host`: the IPv4 wildcard address.
fn default_host() -> String {
    String::from("0.0.0.0")
}
/// Serde default for `ServerConfig::port`.
fn default_port() -> u16 {
    const PORT: u16 = 3000;
    PORT
}
/// Serde default for `ServerConfig::request_timeout_secs`: 60 seconds.
fn default_request_timeout() -> u64 {
    const TIMEOUT_SECS: u64 = 60;
    TIMEOUT_SECS
}
258
/// Selects which JS renderer(s) the [`FallbackRenderer`] will build.
///
/// - `Auto` (default): try every configured CDP endpoint (Lightpanda, Playwright, Chrome)
///   in order. If none is configured, JS rendering is disabled but HTTP still works.
/// - `None`: HTTP-only. Never attempt JS rendering.
/// - `Lightpanda` / `Chrome` / `Playwright`: require the matching `[renderer.<name>]`
///   endpoint; fail startup if missing. Only the named backend is used.
///
/// Deserialized from the lowercase variant name (`"auto"`, `"none"`,
/// `"lightpanda"`, `"chrome"`, `"playwright"`).
///
/// [`FallbackRenderer`]: https://docs.rs/crw-renderer/latest/crw_renderer/struct.FallbackRenderer.html
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum RendererMode {
    /// Use every configured CDP endpoint, in order. This is the default.
    #[default]
    Auto,
    /// HTTP-only; no JS rendering.
    None,
    /// Pin to the Lightpanda endpoint.
    Lightpanda,
    /// Pin to the Chrome endpoint.
    Chrome,
    /// Pin to the Playwright endpoint.
    Playwright,
}
278
/// `renderer` section of [`AppConfig`]: renderer selection, per-tier
/// timeouts, CDP endpoints, and Chrome-tier feature switches. Every field
/// has a serde default, so the whole section may be omitted.
#[derive(Debug, Clone, Deserialize)]
pub struct RendererConfig {
    /// Which renderer(s) to use; see [`RendererMode`]. Default: `Auto`.
    #[serde(default)]
    pub mode: RendererMode,
    /// Generic per-page navigation timeout. Used as the fallback when no
    /// per-tier override is configured. Kept for backward compatibility — the
    /// per-tier knobs below are preferred for new deployments.
    #[serde(default = "default_page_timeout")]
    pub page_timeout_ms: u64,
    /// Override for the HTTP-only fetcher request timeout. Falls back to
    /// `page_timeout_ms` when unset. HTTP responses arrive quickly when they
    /// arrive at all, so 15s is generous and keeps slow upstreams from
    /// hogging the request budget that should be spent on JS retries.
    #[serde(default)]
    pub http_timeout_ms: Option<u64>,
    /// Override for the LightPanda CDP renderer. LightPanda completes most
    /// renders in <10s; if it stalls past 20s it almost always means an
    /// adversarial page that Chrome will render anyway, so failing fast and
    /// escalating beats waiting it out.
    #[serde(default)]
    pub lightpanda_timeout_ms: Option<u64>,
    /// Override for the full-Chromium tier. Chrome is the slow path
    /// (gov/legal SPAs need 30–40s for `networkidle`); the larger budget here
    /// recovers ~6 URLs per fc-wins iteration without affecting the fast path.
    #[serde(default)]
    pub chrome_timeout_ms: Option<u64>,
    /// Pool size. Default: `4` (from `default_pool_size`).
    /// NOTE(review): what exactly is pooled is not visible in this file —
    /// confirm against the renderer crate.
    #[serde(default = "default_pool_size")]
    pub pool_size: usize,
    /// If set, applies to every request that doesn't specify `renderJs` explicitly.
    /// `Some(true)` = force JS rendering; `Some(false)` = skip JS; `None` = auto-detect.
    ///
    /// Accepts the `force_js` alias for backward compatibility.
    #[serde(default, alias = "force_js")]
    pub render_js_default: Option<bool>,
    /// Lightpanda CDP endpoint; `None` (default) means not configured.
    #[serde(default)]
    pub lightpanda: Option<CdpEndpoint>,
    /// Playwright CDP endpoint; `None` (default) means not configured.
    #[serde(default)]
    pub playwright: Option<CdpEndpoint>,
    /// Chrome CDP endpoint; `None` (default) means not configured.
    #[serde(default)]
    pub chrome: Option<CdpEndpoint>,
    /// Enable Chrome resource interception (`Fetch.enable` blocking of media,
    /// fonts, trackers). Default `false`; flipped after the CDP-fake suite
    /// validates pump + cleanup behaviour. See plan Phase 2.
    #[serde(default)]
    pub chrome_intercept_resources: bool,
    /// Additionally block `stylesheet` requests when interception is enabled.
    /// Default `false` — kept off in v1 because some extractors depend on
    /// CSS-driven visibility / lazy-content triggers.
    #[serde(default)]
    pub chrome_intercept_stylesheets: bool,
    /// Per-host opt-out for chrome interception. Hosts in this list run with
    /// interception disabled even when `chrome_intercept_resources = true`.
    #[serde(default)]
    pub chrome_host_intercept_disable: Vec<String>,
    /// Hard chrome-tier navigation budget in ms. Wraps `wait_for_page_ready`
    /// in an inner race; on budget hit the renderer snapshots whatever DOM is
    /// present and returns `truncated = true`. Calibrated as
    /// `p90(successful chrome renders)` clamped to `[8_000, 12_000]`.
    /// Default: `12000`.
    #[serde(default = "default_chrome_nav_budget_ms")]
    pub chrome_nav_budget_ms: u64,
    /// Enable the bounded browser-context pool. Default `false`; v1 ships
    /// `RECYCLE_AFTER_NAV = 1` (recreate every release) before optimising to
    /// reuse-with-clearing. See plan Phase 4. **Gated off when
    /// `chrome_backend = "browserless"`** — browserless v2's
    /// `Target.createBrowserContext` semantics with long-lived sessions are
    /// unproven; lib.rs forces this to `false` with a WARN log in that case.
    #[serde(default)]
    pub chrome_context_pool_enabled: bool,
    /// Per-knob pool configuration. Read only when
    /// `chrome_context_pool_enabled = true` AND backend is `Vanilla`.
    #[serde(default)]
    pub chrome_pool: ChromePoolConfig,
    /// Which Chrome backend the WS URL points at. **Explicit** — never sniff
    /// from URL substrings (k8s svc names, port-forwards, custom routes break
    /// substring detection per plan §C2). Default `Vanilla`.
    #[serde(default)]
    pub chrome_backend: ChromeBackend,
    /// Enable the success-ratio renderer predictor in `HostPreferences`.
    /// Default `false`; flipped after the predictor replay harness gates
    /// on the 1k bench (false-skip < 2 %, false-escalate < 5 %, churn < 3 / 1k).
    #[serde(default)]
    pub use_predictor: bool,
    /// Engine escalation policy (firecrawl-shaped: race + on-error). When
    /// disabled (default), the renderer keeps its current ladder unchanged.
    #[serde(default)]
    pub escalation: EscalationConfig,
    /// Anti-bot detection policy (crawl4ai 3-tier classifier).
    #[serde(default)]
    pub antibot: AntibotConfig,
}
369
/// Engine escalation policy — adds `ChromeStealth` and `ChromeStealthProxy`
/// tiers behind a feature flag. See `plans/recall-next-tier.md` Phase 2.
#[derive(Debug, Clone, Deserialize)]
pub struct EscalationConfig {
    /// Master switch. Default `false` — current ladder runs unchanged.
    #[serde(default)]
    pub enabled: bool,
    /// Per-tier waterfall trigger in ms. If the current engine hasn't returned
    /// after this long, the next tier is started in parallel (firecrawl
    /// `WaterfallNextEngineSignal`). Default: `8000`.
    #[serde(default = "default_waterfall_timeout_ms")]
    pub waterfall_timeout_ms: u64,
    /// Hard global cap across the whole ladder, in ms. Default: `60000`.
    #[serde(default = "default_escalation_global_timeout_ms")]
    pub global_timeout_ms: u64,
    /// Send `?proxy=residential&proxyCountry=…` to browserless on the
    /// `ChromeStealthProxy` tier. Off by default — bears cost.
    #[serde(default)]
    pub residential_proxy: bool,
    /// Country code passed to browserless when `residential_proxy = true`.
    /// Default: `"us"`.
    #[serde(default = "default_proxy_country")]
    pub proxy_country: String,
}
393
394impl Default for EscalationConfig {
395    fn default() -> Self {
396        Self {
397            enabled: false,
398            waterfall_timeout_ms: default_waterfall_timeout_ms(),
399            global_timeout_ms: default_escalation_global_timeout_ms(),
400            residential_proxy: false,
401            proxy_country: default_proxy_country(),
402        }
403    }
404}
405
/// Serde default for `EscalationConfig::waterfall_timeout_ms`: 8 seconds,
/// in milliseconds.
fn default_waterfall_timeout_ms() -> u64 {
    const WATERFALL_MS: u64 = 8_000;
    WATERFALL_MS
}
/// Serde default for `EscalationConfig::global_timeout_ms`: 60 seconds, in
/// milliseconds.
fn default_escalation_global_timeout_ms() -> u64 {
    const GLOBAL_CAP_MS: u64 = 60_000;
    GLOBAL_CAP_MS
}
/// Serde default for `EscalationConfig::proxy_country`.
fn default_proxy_country() -> String {
    String::from("us")
}
415
/// Anti-bot classifier policy. Default: detect+log only; escalation requires
/// `escalate_on_signal = true` AND `escalation.enabled = true`.
#[derive(Debug, Clone, Deserialize)]
pub struct AntibotConfig {
    /// Run the classifier on every fetch result. Cheap; default on
    /// (`true` via `default_true`).
    #[serde(default = "default_true")]
    pub enabled: bool,
    /// When the classifier returns a non-`None` signal, advance to the next
    /// engine tier (requires `escalation.enabled`). Default: `false`.
    #[serde(default)]
    pub escalate_on_signal: bool,
}
428
429impl Default for AntibotConfig {
430    fn default() -> Self {
431        Self {
432            enabled: true,
433            escalate_on_signal: false,
434        }
435    }
436}
437
/// Serde default for `RendererConfig::chrome_nav_budget_ms`: 12 seconds, in
/// milliseconds.
fn default_chrome_nav_budget_ms() -> u64 {
    const NAV_BUDGET_MS: u64 = 12_000;
    NAV_BUDGET_MS
}
441
/// Per-knob configuration for the bounded browser-context pool. Loaded under
/// `[renderer.chrome_pool]`. Inactive unless
/// `chrome_context_pool_enabled = true` AND `chrome_backend = "vanilla"`.
#[derive(Debug, Clone, Deserialize)]
pub struct ChromePoolConfig {
    /// Pool size. `None` → `max(2, num_cpus / 2)`. Caps simultaneous
    /// in-flight chrome requests per pool.
    #[serde(default)]
    pub size: Option<usize>,
    /// Recycle policy: v1 always recreates the context after each release.
    /// Reserved for a future "reuse N navigations then recreate" mode.
    /// Default: `1`.
    #[serde(default = "default_recycle_after_navs")]
    pub recycle_after_navs: u32,
    /// Idle slots older than this are health-checked on next acquire.
    /// In seconds. Default: `300`.
    #[serde(default = "default_idle_timeout_secs")]
    pub idle_timeout_secs: u64,
    /// `Browser.getVersion` probe deadline (idle-slot liveness).
    /// In seconds. Default: `60`.
    #[serde(default = "default_health_check_secs")]
    pub health_check_secs: u64,
    /// SIGTERM drain window before phase 3 force-close.
    /// In seconds. Default: `30`.
    #[serde(default = "default_shutdown_drain_secs")]
    pub shutdown_drain_secs: u64,
}
465
466impl Default for ChromePoolConfig {
467    fn default() -> Self {
468        Self {
469            size: None,
470            recycle_after_navs: default_recycle_after_navs(),
471            idle_timeout_secs: default_idle_timeout_secs(),
472            health_check_secs: default_health_check_secs(),
473            shutdown_drain_secs: default_shutdown_drain_secs(),
474        }
475    }
476}
477
/// Serde default for `ChromePoolConfig::recycle_after_navs`.
fn default_recycle_after_navs() -> u32 {
    const NAVS_BEFORE_RECYCLE: u32 = 1;
    NAVS_BEFORE_RECYCLE
}
/// Serde default for `ChromePoolConfig::idle_timeout_secs`: 300 seconds.
fn default_idle_timeout_secs() -> u64 {
    const IDLE_SECS: u64 = 300;
    IDLE_SECS
}
/// Serde default for `ChromePoolConfig::health_check_secs`: 60 seconds.
fn default_health_check_secs() -> u64 {
    const PROBE_SECS: u64 = 60;
    PROBE_SECS
}
/// Serde default for `ChromePoolConfig::shutdown_drain_secs`: 30 seconds.
fn default_shutdown_drain_secs() -> u64 {
    const DRAIN_SECS: u64 = 30;
    DRAIN_SECS
}
490
/// Chrome backend kind. Set explicitly under `[renderer]` as
/// `chrome_backend = "vanilla"` or `chrome_backend = "browserless"`. **Never
/// inferred from URL substrings** — k8s service names, port-forwards, and
/// custom routes break substring detection. See plan §C2.
///
/// Deserialized from the lowercase variant name; `Vanilla` is the default.
#[derive(Debug, Clone, Copy, Default, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
pub enum ChromeBackend {
    /// chromedp/headless-shell or vanilla Chrome with `/json/version`. Pool
    /// is enabled here when `chrome_context_pool_enabled = true`.
    #[default]
    Vanilla,
    /// Browserless v2 / commercial CDP endpoint. Pool is **gated off** in v1
    /// — see plan §"Out of scope (v1)".
    Browserless,
}
506
507impl Default for RendererConfig {
508    fn default() -> Self {
509        Self {
510            mode: RendererMode::default(),
511            page_timeout_ms: default_page_timeout(),
512            http_timeout_ms: None,
513            lightpanda_timeout_ms: None,
514            chrome_timeout_ms: None,
515            pool_size: default_pool_size(),
516            render_js_default: None,
517            lightpanda: None,
518            playwright: None,
519            chrome: None,
520            chrome_intercept_resources: false,
521            chrome_intercept_stylesheets: false,
522            chrome_host_intercept_disable: Vec::new(),
523            chrome_nav_budget_ms: default_chrome_nav_budget_ms(),
524            chrome_context_pool_enabled: false,
525            chrome_pool: ChromePoolConfig::default(),
526            chrome_backend: ChromeBackend::default(),
527            use_predictor: false,
528            escalation: EscalationConfig::default(),
529            antibot: AntibotConfig::default(),
530        }
531    }
532}
/// Serde default for `RendererConfig::page_timeout_ms`: 30 seconds, in
/// milliseconds.
fn default_page_timeout() -> u64 {
    const PAGE_TIMEOUT_MS: u64 = 30_000;
    PAGE_TIMEOUT_MS
}
536
537impl RendererConfig {
538    /// Resolved per-tier nav timeout in milliseconds. Resolution rules:
539    ///   1. If the explicit per-tier field is set, use it verbatim.
540    ///   2. Otherwise fall back to `page_timeout_ms` (which itself defaults
541    ///      to 30s for backward compatibility with pre-multi-tier configs).
542    ///
543    /// New deployments are encouraged to set the per-tier knobs to 15/20/45s
544    /// (see config.docker.toml) — these match the bench-tuned values that
545    /// recover slow gov sites in the chrome tier without giving the http
546    /// tier permission to hog the request budget.
547    pub fn http_timeout(&self) -> u64 {
548        self.http_timeout_ms.unwrap_or(self.page_timeout_ms)
549    }
550    pub fn lightpanda_timeout(&self) -> u64 {
551        self.lightpanda_timeout_ms.unwrap_or(self.page_timeout_ms)
552    }
553    pub fn chrome_timeout(&self) -> u64 {
554        self.chrome_timeout_ms.unwrap_or(self.page_timeout_ms)
555    }
556
557    /// Number of active CDP tiers (lightpanda + playwright + chrome) under
558    /// the current `mode`. Mirrors the predicate used at runtime in
559    /// `crw-renderer/src/lib.rs` when constructing the renderer ladder:
560    /// `want(mode) && config.<tier>.is_some()`.
561    ///
562    /// Returns `0` when the binary is built without the `cdp` feature — in
563    /// that case no JS renderer can be constructed regardless of the config,
564    /// so the deadline auto-extension policy must collapse to HTTP-only.
565    pub fn cdp_tier_count(&self) -> usize {
566        if !cfg!(feature = "cdp") {
567            return 0;
568        }
569        let want =
570            |m: RendererMode| -> bool { matches!(self.mode, RendererMode::Auto) || self.mode == m };
571        let mut n = 0;
572        if want(RendererMode::Lightpanda) && self.lightpanda.is_some() {
573            n += 1;
574        }
575        if want(RendererMode::Playwright) && self.playwright.is_some() {
576            n += 1;
577        }
578        if want(RendererMode::Chrome) && self.chrome.is_some() {
579            n += 1;
580        }
581        n
582    }
583
584    /// Minimum request deadline budget (ms) required so that every configured
585    /// tier can use its full allowance when fallback exhausts the chain.
586    /// Sums the per-tier timeouts and adds [`CDP_TIER_OVERHEAD_MS`] for each
587    /// active CDP tier, matching the runtime ladder built in
588    /// `crw-renderer/src/lib.rs`.
589    pub fn min_deadline_for_full_ladder_ms(&self) -> u64 {
590        let want =
591            |m: RendererMode| -> bool { matches!(self.mode, RendererMode::Auto) || self.mode == m };
592
593        let mut sum: u64 = 0;
594        // HTTP prefetch runs ahead of any JS tier (content-type sniffing,
595        // direct PDF/binary handling) regardless of pinned mode. Skipped only
596        // when mode is `None` (no fetching at all).
597        if !matches!(self.mode, RendererMode::None) {
598            sum = sum.saturating_add(self.http_timeout());
599        }
600
601        // CDP tiers only contribute when the binary was built with the `cdp`
602        // feature; otherwise no JS renderer is constructable at runtime and
603        // including their budgets would over-extend the deadline.
604        if !cfg!(feature = "cdp") {
605            return sum;
606        }
607
608        let mut cdp_tier_count: u64 = 0;
609        if want(RendererMode::Lightpanda) && self.lightpanda.is_some() {
610            sum = sum.saturating_add(self.lightpanda_timeout());
611            cdp_tier_count += 1;
612        }
613        if want(RendererMode::Playwright) && self.playwright.is_some() {
614            sum = sum.saturating_add(self.chrome_timeout());
615            cdp_tier_count += 1;
616        }
617        if want(RendererMode::Chrome) && self.chrome.is_some() {
618            sum = sum.saturating_add(self.chrome_timeout());
619            cdp_tier_count += 1;
620        }
621        sum.saturating_add(cdp_tier_count.saturating_mul(CDP_TIER_OVERHEAD_MS))
622    }
623}
/// Serde default for `RendererConfig::pool_size`.
fn default_pool_size() -> usize {
    const POOL_SIZE: usize = 4;
    POOL_SIZE
}
627
/// A single CDP endpoint, as used by the `lightpanda`, `playwright`, and
/// `chrome` fields of [`RendererConfig`].
#[derive(Debug, Clone, Deserialize)]
pub struct CdpEndpoint {
    /// WebSocket URL of the endpoint. Required: there is no serde default,
    /// so a table without `ws_url` fails to deserialize.
    pub ws_url: String,
}
632
/// Stealth mode configuration for evading bot detection.
///
/// Nested in [`CrawlerConfig`] as its `stealth` field.
#[derive(Debug, Clone, Deserialize)]
pub struct StealthConfig {
    /// Enable stealth mode globally. Default: `false`.
    #[serde(default)]
    pub enabled: bool,
    /// Custom user-agent pool. Empty = use built-in pool. Default: empty.
    #[serde(default)]
    pub user_agents: Vec<String>,
    /// Jitter factor for rate limiting (0.0–1.0, default 0.2 = ±20%).
    #[serde(default = "default_jitter")]
    pub jitter_factor: f64,
    /// Inject realistic browser headers (Accept, Sec-Fetch-*, etc.).
    /// Default: `true`.
    #[serde(default = "default_true")]
    pub inject_headers: bool,
}
649
650impl Default for StealthConfig {
651    fn default() -> Self {
652        Self {
653            enabled: false,
654            user_agents: vec![],
655            jitter_factor: default_jitter(),
656            inject_headers: true,
657        }
658    }
659}
660
/// Serde default for `StealthConfig::jitter_factor` (0.2 = ±20%).
fn default_jitter() -> f64 {
    const JITTER: f64 = 0.2;
    JITTER
}
664
/// Built-in realistic user-agent pool used when stealth is enabled.
///
/// Five desktop user-agent strings: Chrome 131 on Windows, macOS, and
/// Linux, plus Firefox 133 on Windows and Safari 18.2 on macOS.
pub const BUILTIN_UA_POOL: &[&str] = &[
    // Chrome 131 — Windows 10, x64.
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    // Chrome 131 — macOS 10.15.7.
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    // Chrome 131 — Linux x86_64.
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    // Firefox 133 — Windows 10, x64.
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
    // Safari 18.2 — macOS 14.7.2.
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15",
];
673
/// `crawler` section of [`AppConfig`]. Every field has a serde default, so
/// the whole section may be omitted.
#[derive(Debug, Clone, Deserialize)]
pub struct CrawlerConfig {
    /// Concurrency ceiling for the crawler. Default: `10`.
    #[serde(default = "default_concurrency")]
    pub max_concurrency: usize,
    /// Request rate, per second. Default: `10.0`.
    #[serde(default = "default_rps")]
    pub requests_per_second: f64,
    /// Whether robots.txt is honoured. Default: `true`.
    #[serde(default = "default_true")]
    pub respect_robots_txt: bool,
    /// User-Agent string. Defaults to the Chrome 131 (macOS) string
    /// returned by `default_ua`.
    #[serde(default = "default_ua")]
    pub user_agent: String,
    /// Crawl depth used when a request does not specify one —
    /// presumably; confirm against the crawl handler. Default: `2`.
    #[serde(default = "default_depth")]
    pub default_max_depth: u32,
    /// Page cap used when a request does not specify one —
    /// presumably; confirm against the crawl handler. Default: `100`.
    #[serde(default = "default_max_pages")]
    pub default_max_pages: u32,
    /// Proxy URL for crawler requests. Supports HTTP, HTTPS, and SOCKS5
    /// (e.g. "http://proxy:8080" or "socks5://user:pass@proxy:1080").
    #[serde(default)]
    pub proxy: Option<String>,
    /// TTL in seconds for completed crawl jobs before cleanup (default: 3600)
    #[serde(default = "default_job_ttl")]
    pub job_ttl_secs: u64,
    /// Stealth settings; see [`StealthConfig`]. Defaults to
    /// `StealthConfig::default()`.
    #[serde(default)]
    pub stealth: StealthConfig,
    /// Floor for the per-host limiter interval, in milliseconds. When a host
    /// advertises `Crawl-delay` in robots.txt, the higher of the two wins.
    /// Default `0` — robots.txt is the authoritative source, this is a
    /// per-deployment safety net.
    #[serde(default)]
    pub per_host_min_interval_ms: u64,
    /// Maximum concurrent in-flight requests against a single eTLD+1.
    /// Default `1` — strict ethics posture; operators raise consciously via
    /// config when scraping their own infrastructure.
    #[serde(default = "default_per_host_max_concurrent")]
    pub per_host_max_concurrent: u32,
}
709
/// Serde default for `CrawlerConfig::per_host_max_concurrent`: one request
/// in flight per host.
fn default_per_host_max_concurrent() -> u32 {
    const MAX_IN_FLIGHT: u32 = 1;
    MAX_IN_FLIGHT
}
713
714impl Default for CrawlerConfig {
715    fn default() -> Self {
716        Self {
717            max_concurrency: default_concurrency(),
718            requests_per_second: default_rps(),
719            respect_robots_txt: true,
720            user_agent: default_ua(),
721            default_max_depth: default_depth(),
722            default_max_pages: default_max_pages(),
723            proxy: None,
724            job_ttl_secs: default_job_ttl(),
725            stealth: StealthConfig::default(),
726            per_host_min_interval_ms: 0,
727            per_host_max_concurrent: default_per_host_max_concurrent(),
728        }
729    }
730}
731
/// Serde default for `CrawlerConfig::max_concurrency`.
fn default_concurrency() -> usize {
    const MAX_CONCURRENCY: usize = 10;
    MAX_CONCURRENCY
}
/// Serde default for `CrawlerConfig::requests_per_second`.
fn default_rps() -> f64 {
    const REQUESTS_PER_SECOND: f64 = 10.0;
    REQUESTS_PER_SECOND
}
/// Shared serde default for boolean switches that are on unless disabled
/// (e.g. `AntibotConfig::enabled`, `StealthConfig::inject_headers`,
/// `CrawlerConfig::respect_robots_txt`).
fn default_true() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
/// Serde default for `CrawlerConfig::user_agent`: a Chrome 131 on macOS
/// user-agent string.
fn default_ua() -> String {
    // Modern Chrome UA. The legacy "CRW/0.1" was rejected by UA-filtering sites
    // (opencorporates, killeenisd, wsj) returning 403/404. Kept in sync with the
    // Sec-Ch-Ua client hint in `crw-renderer/src/http_only.rs`.
    String::from(concat!(
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 ",
        "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    ))
}
/// Serde default for `CrawlerConfig::default_max_depth`.
fn default_depth() -> u32 {
    const MAX_DEPTH: u32 = 2;
    MAX_DEPTH
}
/// Serde default for `CrawlerConfig::default_max_pages`.
fn default_max_pages() -> u32 {
    const MAX_PAGES: u32 = 100;
    MAX_PAGES
}
/// Serde default for `CrawlerConfig::job_ttl_secs`: one hour, in seconds.
fn default_job_ttl() -> u64 {
    const ONE_HOUR_SECS: u64 = 60 * 60;
    ONE_HOUR_SECS
}
758
/// `extraction` section of [`AppConfig`]. Every field has a serde default,
/// so the whole section may be omitted.
#[derive(Debug, Clone, Deserialize)]
pub struct ExtractionConfig {
    /// Output format name. Default supplied by `default_format`.
    #[serde(default = "default_format")]
    pub default_format: String,
    /// Main-content-only switch. Default supplied by `default_true_ext`;
    /// the `Default` impl sets it to `true`.
    #[serde(default = "default_true_ext")]
    pub only_main_content: bool,
    /// Optional LLM settings; `None` (default) when not configured.
    #[serde(default)]
    pub llm: Option<LlmConfig>,
    /// Hostname → CSS selector overrides applied before readability narrowing.
    /// Match is exact host (no wildcard); user-supplied selector still wins.
    #[serde(default)]
    pub domain_selectors: std::collections::HashMap<String, String>,
    /// LLM fallback policy; see [`LlmFallbackConfig`].
    #[serde(default)]
    pub llm_fallback: LlmFallbackConfig,
    /// Bytes below which an HTTP-tier extraction is treated as "thin"
    /// and triggers a JS-renderer escalation. Default 100.
    #[serde(default = "default_http_retry_threshold")]
    pub http_retry_threshold_bytes: usize,
    /// Bytes below which a LightPanda-tier extraction is treated as
    /// "thin" and triggers a Chrome escalation. Default 2000 (LP often
    /// returns SPA husks of 90–500B that pass HTML-shape checks).
    #[serde(default = "default_lightpanda_retry_threshold")]
    pub lightpanda_retry_threshold_bytes: usize,
}
783
/// Serde default for `ExtractionConfig::http_retry_threshold_bytes`.
fn default_http_retry_threshold() -> usize {
    const THIN_HTTP_BYTES: usize = 100;
    THIN_HTTP_BYTES
}
787
/// Serde default for `ExtractionConfig::lightpanda_retry_threshold_bytes`.
fn default_lightpanda_retry_threshold() -> usize {
    const THIN_LIGHTPANDA_BYTES: usize = 2_000;
    THIN_LIGHTPANDA_BYTES
}
791
792impl Default for ExtractionConfig {
793    fn default() -> Self {
794        Self {
795            default_format: default_format(),
796            only_main_content: true,
797            llm: None,
798            domain_selectors: std::collections::HashMap::new(),
799            llm_fallback: LlmFallbackConfig::default(),
800            http_retry_threshold_bytes: default_http_retry_threshold(),
801            lightpanda_retry_threshold_bytes: default_lightpanda_retry_threshold(),
802        }
803    }
804}
805
806#[derive(Debug, Clone, Deserialize)]
807pub struct LlmFallbackConfig {
808    #[serde(default)]
809    pub enable: bool,
810    #[serde(default = "default_llm_quality_threshold")]
811    pub quality_threshold: f32,
812    #[serde(default = "default_llm_max_html_bytes")]
813    pub max_html_bytes: usize,
814    /// When true (and `enable` is true), invoke the LLM on every page rather
815    /// than only when DOM-based extraction scores below `quality_threshold`.
816    /// Mirrors the "LLM as primary extractor" pattern used by Reader-LM,
817    /// Firecrawl, and similar services. Higher cost, higher recall.
818    #[serde(default)]
819    pub always_run: bool,
820}
821
822impl Default for LlmFallbackConfig {
823    fn default() -> Self {
824        Self {
825            enable: false,
826            quality_threshold: default_llm_quality_threshold(),
827            max_html_bytes: default_llm_max_html_bytes(),
828            always_run: false,
829        }
830    }
831}
832
/// Serde default for `LlmFallbackConfig::quality_threshold`: `0.3`.
fn default_llm_quality_threshold() -> f32 {
    const QUALITY_THRESHOLD: f32 = 0.3;
    QUALITY_THRESHOLD
}
/// Serde default shared by `LlmFallbackConfig::max_html_bytes` and
/// `LlmConfig::max_html_bytes`: `100_000` bytes.
fn default_llm_max_html_bytes() -> usize {
    const MAX_HTML_BYTES: usize = 100_000;
    MAX_HTML_BYTES
}
839
840#[derive(Debug, Clone, Deserialize)]
841pub struct LlmConfig {
842    #[serde(default = "default_llm_provider")]
843    pub provider: String,
844    pub api_key: String,
845    #[serde(default = "default_llm_model")]
846    pub model: String,
847    #[serde(default)]
848    pub base_url: Option<String>,
849    #[serde(default = "default_llm_max_tokens")]
850    pub max_tokens: u32,
851    /// Azure OpenAI API version (e.g. "2024-05-01-preview"). Required when
852    /// `provider = "azure"`; ignored otherwise.
853    #[serde(default)]
854    pub azure_api_version: Option<String>,
855    /// Max parallel LLM calls for fan-out (e.g. per-result search summaries).
856    /// Bounded to avoid hitting provider rate limits.
857    #[serde(default = "default_llm_max_concurrency")]
858    pub max_concurrency: usize,
859    /// Byte cap on content sent to the LLM in a single call. Content beyond
860    /// the cap is truncated on a UTF-8 char boundary.
861    #[serde(default = "default_llm_max_html_bytes")]
862    pub max_html_bytes: usize,
863    /// When set, opencore refuses LLM-touching requests that lack this header
864    /// AND do not supply `llm_api_key` in the body. SaaS deploys set this so
865    /// direct public callers can't access LLM features.
866    #[serde(default)]
867    pub require_byok_header: Option<String>,
868}
869
870impl Default for LlmConfig {
871    fn default() -> Self {
872        Self {
873            provider: default_llm_provider(),
874            api_key: String::new(),
875            model: default_llm_model(),
876            base_url: None,
877            max_tokens: default_llm_max_tokens(),
878            azure_api_version: None,
879            max_concurrency: default_llm_max_concurrency(),
880            max_html_bytes: default_llm_max_html_bytes(),
881            require_byok_header: None,
882        }
883    }
884}
885
/// Serde default for `LlmConfig::max_concurrency`: `4` parallel calls.
fn default_llm_max_concurrency() -> usize {
    const MAX_PARALLEL_CALLS: usize = 4;
    MAX_PARALLEL_CALLS
}
889
/// Serde default for `LlmConfig::provider`: `"anthropic"`.
fn default_llm_provider() -> String {
    String::from("anthropic")
}
/// Serde default for `LlmConfig::model`: `"claude-sonnet-4-20250514"`.
fn default_llm_model() -> String {
    String::from("claude-sonnet-4-20250514")
}
/// Serde default for `LlmConfig::max_tokens`: `4096`.
fn default_llm_max_tokens() -> u32 {
    const MAX_TOKENS: u32 = 4_096;
    MAX_TOKENS
}
899
/// Serde default for `ExtractionConfig::default_format`: `"markdown"`.
fn default_format() -> String {
    String::from("markdown")
}
/// Serde default for `ExtractionConfig::only_main_content`: `true`.
fn default_true_ext() -> bool {
    const ONLY_MAIN_CONTENT: bool = true;
    ONLY_MAIN_CONTENT
}
906
907#[derive(Debug, Clone, Default, Deserialize)]
908pub struct AuthConfig {
909    #[serde(default)]
910    pub api_keys: Vec<String>,
911}
912
/// Path of the per-user config file written by `crw setup`.
///
/// Resolution order:
/// 1. `$CRW_USER_CONFIG_DIR/config.toml` — an override so tests don't have to
///    monkey-patch `$HOME`.
/// 2. `$HOME/.config/crw/config.toml`.
///
/// Returns `None` if neither variable yields a directory (e.g. headless
/// container with no `$HOME`).
///
/// Both variables are read with `var_os`, so a directory whose name is not
/// valid UTF-8 is still honored. A variable that is set but empty is treated
/// as unset: joining onto an empty path would produce a path relative to the
/// current working directory, which is never the per-user config location.
pub fn user_config_path() -> Option<std::path::PathBuf> {
    let non_empty_dir = |key: &str| {
        std::env::var_os(key)
            .filter(|value| !value.is_empty())
            .map(std::path::PathBuf::from)
    };

    if let Some(override_dir) = non_empty_dir("CRW_USER_CONFIG_DIR") {
        return Some(override_dir.join("config.toml"));
    }

    let home = non_empty_dir("HOME")?;
    Some(home.join(".config").join("crw").join("config.toml"))
}
929
930impl AppConfig {
931    /// Load config from config.default.toml + per-user config + environment
932    /// variable overrides.
933    ///
934    /// Precedence (highest wins):
935    ///   1. `CRW_*` env vars (CI/Docker)
936    ///   2. `$CRW_CONFIG` file (or `config.local.toml` in cwd)
937    ///   3. `~/.config/crw/config.toml` (written by `crw setup`)
938    ///   4. `config.default.toml` (bundled defaults)
939    ///
940    /// Env stays on top so a one-off `CRW_FOO=bar crw …` always wins over
941    /// whatever the user has saved, matching how every other shell tool works.
942    pub fn load() -> Result<Self, config::ConfigError> {
943        let mut builder = config::Config::builder()
944            .add_source(config::File::with_name("config.default").required(false));
945
946        // User-level config — written atomically by `crw setup`. Optional, so
947        // a never-configured machine simply reads defaults + env.
948        if let Some(user_cfg) = user_config_path()
949            && user_cfg.exists()
950        {
951            builder = builder.add_source(config::File::from(user_cfg).required(false));
952        }
953
954        // Load optional override config file (e.g. config.docker.toml in containers).
955        if let Ok(extra) = std::env::var("CRW_CONFIG") {
956            builder = builder.add_source(config::File::with_name(&extra).required(true));
957        } else {
958            builder = builder.add_source(config::File::with_name("config.local").required(false));
959        }
960
961        let cfg = builder
962            .add_source(
963                config::Environment::with_prefix("CRW")
964                    .prefix_separator("_")
965                    .separator("__")
966                    .try_parsing(true),
967            )
968            .build()?;
969        cfg.try_deserialize()
970    }
971
972    /// Compute the effective end-to-end request deadline (ms). Implements the
973    /// issue-#35 auto-extension policy:
974    ///
975    /// 1. If the caller supplied an explicit `requested_deadline_ms`, return it
976    ///    verbatim — operators trust the request budget over our heuristic.
977    /// 2. Otherwise, when `request.auto_extend_deadline_for_ladder` is on,
978    ///    return `max(deadline_ms_default, ladder_min + wait_for_extra)`.
979    ///    `ladder_min` covers the configured tier ladder; `wait_for_extra`
980    ///    compensates for callers that bumped `wait_for_ms` above the default
981    ///    SPA budget (8s) — without it, a long `wait_for` would silently
982    ///    re-clamp inside CDP.
983    /// 3. When the policy is disabled, return `deadline_ms_default` unchanged.
984    ///
985    /// `wait_for_ms` is the per-request override (ScrapeRequest::wait_for /
986    /// CrawlRequest::wait_for); pass `None` for sub-fetches that don't
987    /// surface a wait_for to the caller (search/map enrichment).
988    pub fn effective_deadline_ms(
989        &self,
990        requested_deadline_ms: Option<u64>,
991        wait_for_ms: Option<u64>,
992    ) -> u64 {
993        if let Some(explicit) = requested_deadline_ms {
994            return explicit;
995        }
996        let default_ms = self.request.deadline_ms_default;
997        if !self.request.auto_extend_deadline_for_ladder {
998            return default_ms;
999        }
1000        // Issue #35 is specifically about CDP tier overhead silently clamping
1001        // chrome_timeout_ms. HTTP-only deployments don't suffer the same
1002        // problem (the HTTP renderer respects deadline.remaining without the
1003        // extra fetch/challenge/stability overhead). Skip the extension when
1004        // no CDP tiers are configured so HTTP-only users keep the strict
1005        // operator-configured default.
1006        if self.renderer.cdp_tier_count() == 0 {
1007            return default_ms;
1008        }
1009        let ladder_min = self.renderer.min_deadline_for_full_ladder_ms();
1010        // Mirrors crw_renderer::cdp::SPA_SELECTOR_MAX_MS. The CDP module
1011        // adds `wait_for_ms.unwrap_or(SPA_SELECTOR_MAX_MS)` to its internal
1012        // timeout, so when the caller exceeds the default we need to extend
1013        // the deadline per active CDP tier.
1014        const SPA_DEFAULT_MS: u64 = 8_000;
1015        // Clamp `wait_for_ms` to MAX_WAIT_FOR_MS so the inner deadline never
1016        // exceeds the Tower envelope, which is sized off the same constant in
1017        // `effective_request_timeout_secs`. A pathological caller passing
1018        // `wait_for: 600_000` without `deadlineMs` would otherwise be cancelled
1019        // by Tower before the inner CDP loop noticed the bigger budget.
1020        let extra = if let Some(w) = wait_for_ms {
1021            let bounded = w.min(MAX_WAIT_FOR_MS);
1022            let per_tier = bounded.saturating_sub(SPA_DEFAULT_MS);
1023            per_tier.saturating_mul(self.renderer.cdp_tier_count() as u64)
1024        } else {
1025            0
1026        };
1027        default_ms.max(ladder_min.saturating_add(extra))
1028    }
1029
1030    /// Tower middleware outer timeout (seconds). Must accommodate the longest
1031    /// legitimate handler runtime so a healthy request isn't cancelled by the
1032    /// outer layer before the inner deadline fires.
1033    ///
1034    /// Covers the three route envelopes:
1035    /// - `/scrape`, `/mcp` — auto-extended scrape deadline.
1036    /// - `/search` — SearXNG fetch + bounded enrichment fan-out
1037    ///   (`ceil(max_limit / max_concurrency)` batches × scrape_ms).
1038    /// - `/crawl/jobs/:id`, `/map` — handler-side caps up to 300s.
1039    ///
1040    /// When auto-extend is disabled, returns the operator-configured baseline
1041    /// unchanged.
1042    pub fn effective_request_timeout_secs(&self) -> u64 {
1043        let baseline = self.server.request_timeout_secs;
1044        if !self.request.auto_extend_deadline_for_ladder {
1045            return baseline;
1046        }
1047        const OUTER_BUFFER_SECS: u64 = 5;
1048        // `/map` handler caps `req.timeout.unwrap_or(120).min(300)`; the outer
1049        // must cover the upper bound so callers passing `timeout=300` aren't
1050        // cancelled mid-flight.
1051        const MAP_REQUEST_TIMEOUT_CEILING_MS: u64 = 300_000;
1052        // Cover the worst-case implicit scrape: caller bumps `wait_for` to the
1053        // configured maximum without supplying `deadlineMs`. The same
1054        // [`MAX_WAIT_FOR_MS`] constant is used inside `effective_deadline_ms`
1055        // to clamp the inner extension, so the inner deadline can never
1056        // exceed this outer envelope.
1057        let scrape_ms = self.effective_deadline_ms(None, Some(MAX_WAIT_FOR_MS));
1058
1059        // Search enrichment: bounded by max_concurrency. Worst case sequential
1060        // batching with low concurrency: ceil(max_limit / max_concurrency)
1061        // batches each bounded by scrape_ms.
1062        let conc = (self.crawler.max_concurrency.max(1)) as u64;
1063        let max_results = self.search.max_limit as u64;
1064        let enrich_batches = max_results.div_ceil(conc);
1065        let search_enrichment_ms = enrich_batches.saturating_mul(scrape_ms);
1066        let search_ms = self.search.timeout_ms.saturating_add(search_enrichment_ms);
1067
1068        let max_handler_ms = scrape_ms.max(search_ms).max(MAP_REQUEST_TIMEOUT_CEILING_MS);
1069        let needed_secs = max_handler_ms
1070            .div_ceil(1_000)
1071            .saturating_add(OUTER_BUFFER_SECS);
1072        baseline.max(needed_secs)
1073    }
1074}
1075
#[cfg(test)]
mod tests {
    use super::*;

    /// Env var tests modify process-wide state; serialize them to avoid cross-test
    /// interference (e.g. `force_js` alias + `render_js_default` direct both set).
    /// Acquire it through [`EnvGuard`], never directly.
    static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());

    /// Every environment variable any test in this module sets. All of them
    /// are removed both before and after each env-touching test.
    const MANAGED_ENV_VARS: [&str; 7] = [
        "CRW_RENDERER__MODE",
        "CRW_RENDERER__FORCE_JS",
        "CRW_RENDERER__RENDER_JS_DEFAULT",
        "CRW_RENDERER__LIGHTPANDA__WS_URL",
        "CRW_SERVER__PORT",
        "CRW_SEARCH__SEARXNG_URL",
        "CRW_USER_CONFIG_DIR",
    ];

    /// Remove every managed variable. Call only while holding `ENV_LOCK`.
    fn clear_managed_env() {
        for key in MANAGED_ENV_VARS {
            // SAFETY: env mutation in this module is serialized by ENV_LOCK,
            // which the only callers (`EnvGuard`) hold for their lifetime.
            unsafe { std::env::remove_var(key) };
        }
    }

    /// RAII wrapper around `ENV_LOCK`.
    ///
    /// * Acquiring recovers from a poisoned mutex, so one failing env test
    ///   does not make every later env test fail on `lock().unwrap()`.
    /// * The managed variables are cleared on acquire and again on drop —
    ///   including during a panic unwind — so a failed assertion or a failed
    ///   `AppConfig::load()` cannot leak state into the next test.
    struct EnvGuard {
        _lock: std::sync::MutexGuard<'static, ()>,
    }

    impl EnvGuard {
        fn acquire() -> Self {
            let lock = ENV_LOCK
                .lock()
                .unwrap_or_else(std::sync::PoisonError::into_inner);
            clear_managed_env();
            Self { _lock: lock }
        }

        /// Set one variable for the duration of the guard. `key` should be
        /// listed in `MANAGED_ENV_VARS` so it is cleaned up on drop.
        fn set(&self, key: &str, value: impl AsRef<std::ffi::OsStr>) {
            // SAFETY: `self` proves ENV_LOCK is held.
            unsafe { std::env::set_var(key, value) };
        }
    }

    impl Drop for EnvGuard {
        fn drop(&mut self) {
            // Runs before `_lock` is released, so cleanup is still serialized.
            clear_managed_env();
        }
    }

    /// Temporary directory holding a `config.toml`; removed on drop so a
    /// panicking test does not leave it behind.
    struct TempConfigDir(std::path::PathBuf);

    impl TempConfigDir {
        /// Create `<tmp>/crw-<tag>-<pid>/config.toml` with the given contents.
        fn with_config(tag: &str, toml_body: &str) -> Self {
            let dir = Self(std::env::temp_dir().join(format!("crw-{tag}-{}", std::process::id())));
            std::fs::create_dir_all(&dir.0).unwrap();
            std::fs::write(dir.0.join("config.toml"), toml_body).unwrap();
            dir
        }

        fn path(&self) -> &std::path::Path {
            &self.0
        }
    }

    impl Drop for TempConfigDir {
        fn drop(&mut self) {
            std::fs::remove_dir_all(&self.0).ok();
        }
    }

    #[test]
    fn renderer_mode_parses_variants() {
        #[derive(Deserialize)]
        struct Wrap {
            mode: RendererMode,
        }
        let cases = [
            ("mode = \"auto\"", RendererMode::Auto),
            ("mode = \"none\"", RendererMode::None),
            ("mode = \"lightpanda\"", RendererMode::Lightpanda),
            ("mode = \"chrome\"", RendererMode::Chrome),
            ("mode = \"playwright\"", RendererMode::Playwright),
        ];
        for (toml_str, expected) in cases {
            let w: Wrap = toml::from_str(toml_str).unwrap();
            assert_eq!(w.mode, expected, "toml: {toml_str}");
        }
    }

    #[test]
    fn renderer_mode_bogus_errors() {
        #[derive(Deserialize)]
        struct Wrap {
            // Only deserialization success/failure matters here.
            #[allow(dead_code)]
            mode: RendererMode,
        }
        let err: Result<Wrap, _> = toml::from_str("mode = \"bogus\"");
        assert!(err.is_err(), "bogus mode should fail to parse");
    }

    #[test]
    fn renderer_config_default_mode_is_auto() {
        let cfg = RendererConfig::default();
        assert_eq!(cfg.mode, RendererMode::Auto);
        assert_eq!(cfg.render_js_default, None);
    }

    #[test]
    fn render_js_default_force_js_alias() {
        let cfg: RendererConfig = toml::from_str("force_js = true").unwrap();
        assert_eq!(cfg.render_js_default, Some(true));
    }

    #[test]
    fn render_js_default_direct_field() {
        let cfg: RendererConfig = toml::from_str("render_js_default = false").unwrap();
        assert_eq!(cfg.render_js_default, Some(false));
    }

    #[test]
    fn env_var_renderer_mode_chrome() {
        let env = EnvGuard::acquire();
        env.set("CRW_RENDERER__MODE", "chrome");
        let cfg = AppConfig::load().unwrap();
        assert_eq!(cfg.renderer.mode, RendererMode::Chrome);
    }

    #[test]
    fn env_var_force_js_alias_works() {
        let env = EnvGuard::acquire();
        env.set("CRW_RENDERER__FORCE_JS", "true");
        let cfg = AppConfig::load().unwrap();
        assert_eq!(cfg.renderer.render_js_default, Some(true));
    }

    #[test]
    fn env_var_render_js_default_direct() {
        let env = EnvGuard::acquire();
        env.set("CRW_RENDERER__RENDER_JS_DEFAULT", "true");
        let cfg = AppConfig::load().unwrap();
        assert_eq!(cfg.renderer.render_js_default, Some(true));
    }

    #[test]
    fn request_config_defaults_match_plan() {
        let r = RequestConfig::default();
        assert_eq!(r.deadline_ms_default, 8000);
        assert!(r.auto_extend_deadline_for_ladder);
    }

    #[test]
    fn default_app_config_enables_auto_extend() {
        // Programmatic Default must mirror serde defaults — issue #35.
        let cfg = AppConfig::default();
        assert!(cfg.request.auto_extend_deadline_for_ladder);
        assert_eq!(cfg.request.deadline_ms_default, 8000);
    }

    /// Chrome-only renderer whose page and chrome timeouts are both `chrome_ms`.
    fn renderer_with_chrome_only(chrome_ms: u64) -> RendererConfig {
        RendererConfig {
            mode: RendererMode::Chrome,
            page_timeout_ms: chrome_ms,
            chrome_timeout_ms: Some(chrome_ms),
            chrome: Some(CdpEndpoint {
                ws_url: "ws://chrome:9222".into(),
            }),
            ..Default::default()
        }
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn min_deadline_full_ladder_chrome_only() {
        // chrome-only mode: http (page_timeout) + chrome + 1 * 28000.
        let r = renderer_with_chrome_only(30_000);
        // page_timeout_ms is set to chrome_ms here, so http_timeout() → 30s.
        assert_eq!(
            r.min_deadline_for_full_ladder_ms(),
            30_000 + 30_000 + 28_000
        );
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn min_deadline_full_ladder_auto_three_tiers() {
        let r = RendererConfig {
            mode: RendererMode::Auto,
            page_timeout_ms: 15_000,
            http_timeout_ms: Some(15_000),
            lightpanda_timeout_ms: Some(2_500),
            chrome_timeout_ms: Some(30_000),
            lightpanda: Some(CdpEndpoint {
                ws_url: "ws://lp:9222".into(),
            }),
            chrome: Some(CdpEndpoint {
                ws_url: "ws://chrome:9222".into(),
            }),
            ..Default::default()
        };
        // http(15) + lp(2.5) + chrome(30) + 2*28 = 47.5 + 56 = 103_500.
        assert_eq!(
            r.min_deadline_for_full_ladder_ms(),
            15_000 + 2_500 + 30_000 + 2 * 28_000
        );
        assert_eq!(r.cdp_tier_count(), 2);
    }

    #[test]
    fn effective_deadline_explicit_bypasses_auto_extend() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.renderer = renderer_with_chrome_only(30_000);
        // Explicit override beats both default and ladder_min.
        assert_eq!(cfg.effective_deadline_ms(Some(5_000), None), 5_000);
        assert_eq!(cfg.effective_deadline_ms(Some(500_000), None), 500_000);
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn effective_deadline_auto_extend_raises_to_ladder_min() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        let expected = cfg.renderer.min_deadline_for_full_ladder_ms();
        assert!(expected > 8_000);
        assert_eq!(cfg.effective_deadline_ms(None, None), expected);
    }

    #[test]
    fn effective_deadline_default_wins_when_higher_than_ladder() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 1_000_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        assert_eq!(cfg.effective_deadline_ms(None, None), 1_000_000);
    }

    #[test]
    fn effective_deadline_auto_extend_disabled_returns_baseline() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = false;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        assert_eq!(cfg.effective_deadline_ms(None, None), 8_000);
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn effective_deadline_extends_for_long_wait_for() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        let base = cfg.renderer.min_deadline_for_full_ladder_ms();
        let tier_count = cfg.renderer.cdp_tier_count() as u64;
        // wait_for = 20000 → per-tier extra = 12000 over SPA_DEFAULT_MS (8000).
        let with_wait = cfg.effective_deadline_ms(None, Some(20_000));
        assert_eq!(with_wait, base + 12_000 * tier_count);
        // wait_for below SPA default → no extra.
        assert_eq!(cfg.effective_deadline_ms(None, Some(2_000)), base);
    }

    #[test]
    fn effective_request_timeout_covers_map_ceiling() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        cfg.search.timeout_ms = 15_000;
        cfg.crawler.max_concurrency = 10;
        cfg.search.max_limit = 20;
        cfg.server.request_timeout_secs = 60;
        // Map ceiling 300s + 5s buffer = 305s minimum.
        assert!(cfg.effective_request_timeout_secs() >= 305);
    }

    #[test]
    fn effective_request_timeout_disabled_returns_baseline() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = false;
        cfg.server.request_timeout_secs = 60;
        assert_eq!(cfg.effective_request_timeout_secs(), 60);
    }

    #[test]
    fn effective_request_timeout_respects_operator_override() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.server.request_timeout_secs = 600; // operator-configured high
        cfg.renderer = renderer_with_chrome_only(30_000);
        // Operator's explicit 600s should win over the auto-computed 305s.
        assert_eq!(cfg.effective_request_timeout_secs(), 600);
    }

    #[test]
    fn effective_request_timeout_search_sequential_batching() {
        // Low concurrency forces ceil(max_limit/conc) batches → larger search_ms.
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        cfg.search.timeout_ms = 15_000;
        cfg.search.max_limit = 20;
        cfg.crawler.max_concurrency = 1;
        cfg.server.request_timeout_secs = 60;
        // The Tower envelope must cover the worst-case implicit scrape with
        // `wait_for` bumped to MAX_WAIT_FOR_MS (60s), because callers can do
        // that without supplying `deadlineMs`. Mirror that in the expected.
        let secs = cfg.effective_request_timeout_secs();
        let scrape_ms = cfg.effective_deadline_ms(None, Some(60_000));
        let expected_search_ms = 15_000 + 20 * scrape_ms;
        let expected_max_ms = scrape_ms.max(expected_search_ms).max(300_000);
        let expected_secs = expected_max_ms.div_ceil(1_000) + 5;
        assert_eq!(secs, 60u64.max(expected_secs));
    }

    #[test]
    #[cfg(not(feature = "cdp"))]
    fn cdp_tier_count_zero_without_cdp_feature() {
        // Even when chrome/lightpanda are configured, a binary built without
        // the `cdp` feature can never construct a JS renderer. The deadline
        // policy must observe that and collapse to HTTP-only behavior.
        let r = RendererConfig {
            mode: RendererMode::Auto,
            page_timeout_ms: 15_000,
            chrome_timeout_ms: Some(30_000),
            chrome: Some(CdpEndpoint {
                ws_url: "ws://chrome:9222".into(),
            }),
            lightpanda: Some(CdpEndpoint {
                ws_url: "ws://lp:9222".into(),
            }),
            ..Default::default()
        };
        assert_eq!(r.cdp_tier_count(), 0);
        // Only the HTTP tier contributes to the ladder budget.
        assert_eq!(r.min_deadline_for_full_ladder_ms(), 15_000);
    }

    #[test]
    fn effective_deadline_skipped_for_http_only_mode() {
        // P2 from codex review: HTTP-only deployments don't suffer the CDP
        // clamping problem (no fetch/challenge/stability overhead). The
        // auto-extension must NOT silently bump their default from 8s to 30s
        // just because page_timeout_ms defaults high.
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = RendererConfig {
            mode: RendererMode::Auto,
            page_timeout_ms: 30_000,
            // No CDP endpoints configured.
            lightpanda: None,
            playwright: None,
            chrome: None,
            ..Default::default()
        };
        assert_eq!(cfg.renderer.cdp_tier_count(), 0);
        assert_eq!(cfg.effective_deadline_ms(None, None), 8_000);
        assert_eq!(cfg.effective_deadline_ms(None, Some(30_000)), 8_000);
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn min_deadline_full_ladder_playwright_only() {
        // Playwright tier contributes one chrome_timeout + one CDP overhead,
        // matching the runtime predicate in `crw-renderer/src/lib.rs`.
        let r = RendererConfig {
            mode: RendererMode::Playwright,
            page_timeout_ms: 15_000,
            http_timeout_ms: Some(15_000),
            chrome_timeout_ms: Some(30_000),
            playwright: Some(CdpEndpoint {
                ws_url: "ws://playwright:9222".into(),
            }),
            ..Default::default()
        };
        assert_eq!(r.cdp_tier_count(), 1);
        // http(15) + chrome-equivalent(30) + 1 * 28 overhead.
        assert_eq!(
            r.min_deadline_for_full_ladder_ms(),
            15_000 + 30_000 + 28_000
        );
    }

    #[test]
    fn renderer_phase_toggles_default_off_or_safe() {
        let r = RendererConfig::default();
        assert!(!r.chrome_intercept_resources);
        assert!(!r.chrome_intercept_stylesheets);
        assert!(r.chrome_host_intercept_disable.is_empty());
        assert_eq!(r.chrome_nav_budget_ms, 12_000);
        assert!(!r.chrome_context_pool_enabled);
        assert!(!r.use_predictor);
    }

    #[test]
    fn crawler_per_host_limiter_defaults() {
        let c = CrawlerConfig::default();
        assert_eq!(c.per_host_min_interval_ms, 0);
        assert_eq!(c.per_host_max_concurrent, 1);
    }

    #[test]
    fn env_var_overrides_toml_defaults() {
        let env = EnvGuard::acquire();
        env.set("CRW_SERVER__PORT", "4444");
        env.set("CRW_RENDERER__LIGHTPANDA__WS_URL", "ws://test:9999/");
        let cfg = AppConfig::load().unwrap();

        assert_eq!(cfg.server.port, 4444, "env var should override server.port");
        assert_eq!(
            cfg.renderer.lightpanda.as_ref().unwrap().ws_url,
            "ws://test:9999/",
            "env var should override renderer.lightpanda.ws_url"
        );
    }

    #[test]
    fn user_config_path_honors_override_env() {
        let env = EnvGuard::acquire();
        let tmp = std::env::temp_dir().join(format!("crw-cfg-test-{}", std::process::id()));
        env.set("CRW_USER_CONFIG_DIR", &tmp);
        let p = user_config_path().unwrap();
        assert_eq!(p, tmp.join("config.toml"));
    }

    #[test]
    fn user_config_file_is_picked_up_by_load() {
        // `env` is declared first so it is dropped last: the temp dir is
        // removed before the env is cleared and the lock released.
        let env = EnvGuard::acquire();
        let tmp = TempConfigDir::with_config(
            "load-test",
            r#"
[client]
api_url = "https://api.example.com"
api_key = "test-key-123"

[search]
searxng_url = "http://localhost:9999"

[extraction.llm]
provider = "deepseek"
api_key = "sk-test"
model = "deepseek-chat"
"#,
        );

        env.set("CRW_USER_CONFIG_DIR", tmp.path());
        let cfg = AppConfig::load().unwrap();

        assert_eq!(
            cfg.client.api_url.as_deref(),
            Some("https://api.example.com")
        );
        assert_eq!(cfg.client.api_key.as_deref(), Some("test-key-123"));
        assert_eq!(
            cfg.search.searxng_url.as_deref(),
            Some("http://localhost:9999")
        );
        let llm = cfg.extraction.llm.expect("llm config present");
        assert_eq!(llm.provider, "deepseek");
        assert_eq!(llm.api_key, "sk-test");
    }

    #[test]
    fn env_var_beats_user_config() {
        let env = EnvGuard::acquire();
        let tmp = TempConfigDir::with_config(
            "prec-test",
            r#"
[search]
searxng_url = "http://from-file:8080"
"#,
        );

        env.set("CRW_USER_CONFIG_DIR", tmp.path());
        env.set("CRW_SEARCH__SEARXNG_URL", "http://from-env:8080");
        let cfg = AppConfig::load().unwrap();

        assert_eq!(
            cfg.search.searxng_url.as_deref(),
            Some("http://from-env:8080"),
            "env var must win over user config file"
        );
    }
}