// crw_core/config.rs

use serde::Deserialize;
2
3#[derive(Debug, Clone, Deserialize, Default)]
4pub struct AppConfig {
5    #[serde(default)]
6    pub server: ServerConfig,
7    #[serde(default)]
8    pub renderer: RendererConfig,
9    #[serde(default)]
10    pub crawler: CrawlerConfig,
11    #[serde(default)]
12    pub extraction: ExtractionConfig,
13    #[serde(default)]
14    pub auth: AuthConfig,
15    #[serde(default)]
16    pub request: RequestConfig,
17    #[serde(default)]
18    pub search: SearchConfig,
19    #[serde(default)]
20    pub map: MapConfig,
21}
22
23/// `[map]` section — currently only carries `[map.url_filter]`.
24#[derive(Debug, Clone, Deserialize, Default)]
25pub struct MapConfig {
26    #[serde(default)]
27    pub url_filter: MapUrlFilterConfig,
28}
29
30/// `[map.url_filter]` — raw TOML view of the filter knobs. Conversion to
31/// the runtime `UrlFilterCfg` lives in `crw-crawl` (which can see both this
32/// type and the filter module). Keeping this struct dependency-free here
33/// avoids a cycle (`crw-core` does not depend on `crw-crawl`).
34#[derive(Debug, Clone, Deserialize)]
35pub struct MapUrlFilterConfig {
36    /// Tier B — strip tracking params. Default: `true`.
37    #[serde(default = "default_true_filter")]
38    pub strip_tracking_params: bool,
39    /// Tier A — drop action URLs entirely. Default: `true`.
40    #[serde(default = "default_true_filter")]
41    pub drop_action_urls: bool,
42    /// When `true`, `.gov`/`.mil` hosts run Tier A too. Default `false`.
43    #[serde(default)]
44    pub gov_tld_drop_actions: bool,
45    /// Additive on top of `DEFAULT_TRACKING_PARAMS`.
46    #[serde(default)]
47    pub extra_tracking_params: Vec<String>,
48    /// Additive on top of `DEFAULT_ACTION_PARAMS`.
49    #[serde(default)]
50    pub extra_action_params: Vec<String>,
51    /// Additive on top of `ALWAYS_PRESERVE`.
52    #[serde(default)]
53    pub extra_preserve_params: Vec<String>,
54}
55
56impl Default for MapUrlFilterConfig {
57    fn default() -> Self {
58        Self {
59            strip_tracking_params: true,
60            drop_action_urls: true,
61            gov_tld_drop_actions: false,
62            extra_tracking_params: Vec::new(),
63            extra_action_params: Vec::new(),
64            extra_preserve_params: Vec::new(),
65        }
66    }
67}
68
/// Serde default for the `[map.url_filter]` switches that are on by default.
fn default_true_filter() -> bool {
    true
}
72
/// Per-tier CDP overhead in milliseconds — sum of SPA selector poll budget,
/// challenge retry budget, content-stability budget, and fetch overhead.
/// Mirrors the constants in `crw-renderer::cdp`. The drift between the two
/// is regression-tested by `crates/crw-server/tests/cdp_constants_test.rs`
/// (gated behind `feature = "cdp"`).
///
/// Used by [`RendererConfig::min_deadline_for_full_ladder_ms`] so the request
/// deadline accommodates each CDP tier's outer fetch timeout, not just its
/// configured `page_timeout`.
pub const CDP_TIER_OVERHEAD_MS: u64 = 28_000;
83
/// Hard upper bound on the per-request `wait_for_ms` budget. The Tower outer
/// timeout is sized so a worst-case implicit scrape (no `deadlineMs`,
/// `wait_for` at this maximum) still completes inside it; values above this
/// are clamped by [`AppConfig::effective_deadline_ms`] so the inner deadline
/// can never escape the outer envelope. Documented as `(0, 60000]` in
/// `types.rs::ScrapeRequest::wait_for`.
pub const MAX_WAIT_FOR_MS: u64 = 60_000;
91
92/// Configuration for the `/v1/search` endpoint and its SearXNG backend.
93///
94/// When `searxng_url` is unset the endpoint returns HTTP 503 with
95/// `error_code: "search_disabled"` — the route remains mounted so that
96/// startup doesn't have to know whether search will ever be configured.
97#[derive(Debug, Clone, Deserialize)]
98pub struct SearchConfig {
99    /// Master switch. Defaults to `true`; set to `false` to refuse all
100    /// `/v1/search` requests even if `searxng_url` is configured.
101    #[serde(default = "default_true_search")]
102    pub enabled: bool,
103    /// Base URL of the SearXNG instance (e.g. `http://searxng:8080`).
104    /// `None` (the default) disables the endpoint with a clear error.
105    #[serde(default)]
106    pub searxng_url: Option<String>,
107    /// End-to-end timeout for the SearXNG call in milliseconds.
108    #[serde(default = "default_search_timeout_ms")]
109    pub timeout_ms: u64,
110    /// Default `limit` when the request omits it.
111    #[serde(default = "default_search_limit")]
112    pub default_limit: u32,
113    /// Hard cap on `limit` per request. SaaS uses 20.
114    #[serde(default = "default_search_max_limit")]
115    pub max_limit: u32,
116    /// SearXNG engines invoked when the request includes `categories: ["research"]`.
117    /// Defaults match the SaaS implementation.
118    #[serde(default = "default_research_engines")]
119    pub research_engines: Vec<String>,
120    /// SearXNG engines invoked when the request includes `categories: ["github"]`.
121    #[serde(default = "default_github_engines")]
122    pub github_engines: Vec<String>,
123}
124
125impl Default for SearchConfig {
126    fn default() -> Self {
127        Self {
128            enabled: true,
129            searxng_url: None,
130            timeout_ms: default_search_timeout_ms(),
131            default_limit: default_search_limit(),
132            max_limit: default_search_max_limit(),
133            research_engines: default_research_engines(),
134            github_engines: default_github_engines(),
135        }
136    }
137}
138
/// Serde default for `SearchConfig::enabled`.
fn default_true_search() -> bool {
    true
}
/// Serde default for `SearchConfig::timeout_ms` (15 s).
fn default_search_timeout_ms() -> u64 {
    15_000
}
/// Serde default for `SearchConfig::default_limit`.
fn default_search_limit() -> u32 {
    5
}
/// Serde default for `SearchConfig::max_limit`.
fn default_search_max_limit() -> u32 {
    20
}
/// Serde default for `SearchConfig::research_engines`.
fn default_research_engines() -> Vec<String> {
    vec![
        "arxiv".into(),
        "crossref".into(),
        "google scholar".into(),
        "semantic scholar".into(),
    ]
}
/// Serde default for `SearchConfig::github_engines`.
fn default_github_engines() -> Vec<String> {
    vec!["github".into()]
}
162
163/// Per-request defaults that apply to every scrape, crawl, or map call when
164/// the caller does not specify an override. Currently only governs the
165/// end-to-end deadline budget (see `crw-core/src/deadline.rs`).
166#[derive(Debug, Clone, Deserialize)]
167pub struct RequestConfig {
168    /// Default end-to-end deadline budget in milliseconds when a request does
169    /// not specify `deadlineMs`. The SLO p95 latency metric is computed only
170    /// over requests with `deadline_ms <= 8000`; longer values land in a
171    /// separate slow-path histogram.
172    #[serde(default = "default_deadline_ms")]
173    pub deadline_ms_default: u64,
174    /// When `true` (default), an implicit deadline (no per-request `deadlineMs`)
175    /// is auto-extended to `max(deadline_ms_default, ladder_min)` where
176    /// `ladder_min = sum(http+lightpanda+chrome timeouts) + N_cdp_tiers * 28s`.
177    /// This prevents `chrome_timeout_ms = 30000` from appearing inert when
178    /// `deadline_ms_default` is small (issue #35).
179    ///
180    /// Set to `false` to enforce a strict SLO regardless of tier sizing —
181    /// requests that would have completed under the extended budget will
182    /// instead time out at `deadline_ms_default`.
183    #[serde(default = "default_true_request")]
184    pub auto_extend_deadline_for_ladder: bool,
185}
186
187impl Default for RequestConfig {
188    fn default() -> Self {
189        Self {
190            deadline_ms_default: default_deadline_ms(),
191            auto_extend_deadline_for_ladder: true,
192        }
193    }
194}
195
/// Serde default for `RequestConfig::auto_extend_deadline_for_ladder`.
fn default_true_request() -> bool {
    true
}
199
/// Serde default for `RequestConfig::deadline_ms_default` (8 s).
fn default_deadline_ms() -> u64 {
    8_000
}
203
204#[derive(Debug, Clone, Deserialize)]
205pub struct ServerConfig {
206    #[serde(default = "default_host")]
207    pub host: String,
208    #[serde(default = "default_port")]
209    pub port: u16,
210    #[serde(default = "default_request_timeout")]
211    pub request_timeout_secs: u64,
212    /// Maximum requests per second (global). 0 = unlimited.
213    #[serde(default = "default_rate_limit_rps")]
214    pub rate_limit_rps: u64,
215}
216
217impl Default for ServerConfig {
218    fn default() -> Self {
219        Self {
220            host: default_host(),
221            port: default_port(),
222            request_timeout_secs: default_request_timeout(),
223            rate_limit_rps: default_rate_limit_rps(),
224        }
225    }
226}
227
/// Serde default for `ServerConfig::rate_limit_rps`.
fn default_rate_limit_rps() -> u64 {
    10
}
231
/// Serde default for `ServerConfig::host`.
fn default_host() -> String {
    "0.0.0.0".into()
}
/// Serde default for `ServerConfig::port`.
fn default_port() -> u16 {
    3000
}
/// Serde default for `ServerConfig::request_timeout_secs`.
fn default_request_timeout() -> u64 {
    60
}
241
242/// Selects which JS renderer(s) the [`FallbackRenderer`] will build.
243///
244/// - `Auto` (default): try every configured CDP endpoint (Lightpanda, Playwright, Chrome)
245///   in order. If none is configured, JS rendering is disabled but HTTP still works.
246/// - `None`: HTTP-only. Never attempt JS rendering.
247/// - `Lightpanda` / `Chrome` / `Playwright`: require the matching `[renderer.<name>]`
248///   endpoint; fail startup if missing. Only the named backend is used.
249///
250/// [`FallbackRenderer`]: https://docs.rs/crw-renderer/latest/crw_renderer/struct.FallbackRenderer.html
251#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Deserialize)]
252#[serde(rename_all = "lowercase")]
253pub enum RendererMode {
254    #[default]
255    Auto,
256    None,
257    Lightpanda,
258    Chrome,
259    Playwright,
260}
261
262#[derive(Debug, Clone, Deserialize)]
263pub struct RendererConfig {
264    #[serde(default)]
265    pub mode: RendererMode,
266    /// Generic per-page navigation timeout. Used as the fallback when no
267    /// per-tier override is configured. Kept for backward compatibility — the
268    /// per-tier knobs below are preferred for new deployments.
269    #[serde(default = "default_page_timeout")]
270    pub page_timeout_ms: u64,
271    /// Override for the HTTP-only fetcher request timeout. Falls back to
272    /// `page_timeout_ms` when unset. HTTP responses arrive quickly when they
273    /// arrive at all, so 15s is generous and keeps slow upstreams from
274    /// hogging the request budget that should be spent on JS retries.
275    #[serde(default)]
276    pub http_timeout_ms: Option<u64>,
277    /// Override for the LightPanda CDP renderer. LightPanda completes most
278    /// renders in <10s; if it stalls past 20s it almost always means an
279    /// adversarial page that Chrome will render anyway, so failing fast and
280    /// escalating beats waiting it out.
281    #[serde(default)]
282    pub lightpanda_timeout_ms: Option<u64>,
283    /// Override for the full-Chromium tier. Chrome is the slow path
284    /// (gov/legal SPAs need 30–40s for `networkidle`); the larger budget here
285    /// recovers ~6 URLs per fc-wins iteration without affecting the fast path.
286    #[serde(default)]
287    pub chrome_timeout_ms: Option<u64>,
288    #[serde(default = "default_pool_size")]
289    pub pool_size: usize,
290    /// If set, applies to every request that doesn't specify `renderJs` explicitly.
291    /// `Some(true)` = force JS rendering; `Some(false)` = skip JS; `None` = auto-detect.
292    ///
293    /// Accepts the `force_js` alias for backward compatibility.
294    #[serde(default, alias = "force_js")]
295    pub render_js_default: Option<bool>,
296    #[serde(default)]
297    pub lightpanda: Option<CdpEndpoint>,
298    #[serde(default)]
299    pub playwright: Option<CdpEndpoint>,
300    #[serde(default)]
301    pub chrome: Option<CdpEndpoint>,
302    /// Enable Chrome resource interception (`Fetch.enable` blocking of media,
303    /// fonts, trackers). Default `false`; flipped after the CDP-fake suite
304    /// validates pump + cleanup behaviour. See plan Phase 2.
305    #[serde(default)]
306    pub chrome_intercept_resources: bool,
307    /// Additionally block `stylesheet` requests when interception is enabled.
308    /// Default `false` — kept off in v1 because some extractors depend on
309    /// CSS-driven visibility / lazy-content triggers.
310    #[serde(default)]
311    pub chrome_intercept_stylesheets: bool,
312    /// Per-host opt-out for chrome interception. Hosts in this list run with
313    /// interception disabled even when `chrome_intercept_resources = true`.
314    #[serde(default)]
315    pub chrome_host_intercept_disable: Vec<String>,
316    /// Hard chrome-tier navigation budget in ms. Wraps `wait_for_page_ready`
317    /// in an inner race; on budget hit the renderer snapshots whatever DOM is
318    /// present and returns `truncated = true`. Calibrated as
319    /// `p90(successful chrome renders)` clamped to `[8_000, 12_000]`.
320    #[serde(default = "default_chrome_nav_budget_ms")]
321    pub chrome_nav_budget_ms: u64,
322    /// Enable the bounded browser-context pool. Default `false`; v1 ships
323    /// `RECYCLE_AFTER_NAV = 1` (recreate every release) before optimising to
324    /// reuse-with-clearing. See plan Phase 4.
325    #[serde(default)]
326    pub chrome_context_pool_enabled: bool,
327    /// Enable the success-ratio renderer predictor in `HostPreferences`.
328    /// Default `false`; flipped after the predictor replay harness gates
329    /// on the 1k bench (false-skip < 2 %, false-escalate < 5 %, churn < 3 / 1k).
330    #[serde(default)]
331    pub use_predictor: bool,
332    /// Engine escalation policy (firecrawl-shaped: race + on-error). When
333    /// disabled (default), the renderer keeps its current ladder unchanged.
334    #[serde(default)]
335    pub escalation: EscalationConfig,
336    /// Anti-bot detection policy (crawl4ai 3-tier classifier).
337    #[serde(default)]
338    pub antibot: AntibotConfig,
339}
340
341/// Engine escalation policy — adds `ChromeStealth` and `ChromeStealthProxy`
342/// tiers behind a feature flag. See `plans/recall-next-tier.md` Phase 2.
343#[derive(Debug, Clone, Deserialize)]
344pub struct EscalationConfig {
345    /// Master switch. Default `false` — current ladder runs unchanged.
346    #[serde(default)]
347    pub enabled: bool,
348    /// Per-tier waterfall trigger in ms. If the current engine hasn't returned
349    /// after this long, the next tier is started in parallel (firecrawl
350    /// `WaterfallNextEngineSignal`).
351    #[serde(default = "default_waterfall_timeout_ms")]
352    pub waterfall_timeout_ms: u64,
353    /// Hard global cap across the whole ladder.
354    #[serde(default = "default_escalation_global_timeout_ms")]
355    pub global_timeout_ms: u64,
356    /// Send `?proxy=residential&proxyCountry=…` to browserless on the
357    /// `ChromeStealthProxy` tier. Off by default — bears cost.
358    #[serde(default)]
359    pub residential_proxy: bool,
360    /// Country code passed to browserless when `residential_proxy = true`.
361    #[serde(default = "default_proxy_country")]
362    pub proxy_country: String,
363}
364
365impl Default for EscalationConfig {
366    fn default() -> Self {
367        Self {
368            enabled: false,
369            waterfall_timeout_ms: default_waterfall_timeout_ms(),
370            global_timeout_ms: default_escalation_global_timeout_ms(),
371            residential_proxy: false,
372            proxy_country: default_proxy_country(),
373        }
374    }
375}
376
/// Serde default for `EscalationConfig::waterfall_timeout_ms` (8 s).
fn default_waterfall_timeout_ms() -> u64 {
    8_000
}
/// Serde default for `EscalationConfig::global_timeout_ms` (60 s).
fn default_escalation_global_timeout_ms() -> u64 {
    60_000
}
/// Serde default for `EscalationConfig::proxy_country`.
fn default_proxy_country() -> String {
    "us".to_string()
}
386
387/// Anti-bot classifier policy. Default: detect+log only; escalation requires
388/// `escalate_on_signal = true` AND `escalation.enabled = true`.
389#[derive(Debug, Clone, Deserialize)]
390pub struct AntibotConfig {
391    /// Run the classifier on every fetch result. Cheap; default on.
392    #[serde(default = "default_true")]
393    pub enabled: bool,
394    /// When the classifier returns a non-`None` signal, advance to the next
395    /// engine tier (requires `escalation.enabled`).
396    #[serde(default)]
397    pub escalate_on_signal: bool,
398}
399
400impl Default for AntibotConfig {
401    fn default() -> Self {
402        Self {
403            enabled: true,
404            escalate_on_signal: false,
405        }
406    }
407}
408
/// Serde default for `RendererConfig::chrome_nav_budget_ms` (12 s).
fn default_chrome_nav_budget_ms() -> u64 {
    12_000
}
412
413impl Default for RendererConfig {
414    fn default() -> Self {
415        Self {
416            mode: RendererMode::default(),
417            page_timeout_ms: default_page_timeout(),
418            http_timeout_ms: None,
419            lightpanda_timeout_ms: None,
420            chrome_timeout_ms: None,
421            pool_size: default_pool_size(),
422            render_js_default: None,
423            lightpanda: None,
424            playwright: None,
425            chrome: None,
426            chrome_intercept_resources: false,
427            chrome_intercept_stylesheets: false,
428            chrome_host_intercept_disable: Vec::new(),
429            chrome_nav_budget_ms: default_chrome_nav_budget_ms(),
430            chrome_context_pool_enabled: false,
431            use_predictor: false,
432            escalation: EscalationConfig::default(),
433            antibot: AntibotConfig::default(),
434        }
435    }
436}
/// Serde default for `RendererConfig::page_timeout_ms` (30 s).
fn default_page_timeout() -> u64 {
    30_000
}
440
441impl RendererConfig {
442    /// Resolved per-tier nav timeout in milliseconds. Resolution rules:
443    ///   1. If the explicit per-tier field is set, use it verbatim.
444    ///   2. Otherwise fall back to `page_timeout_ms` (which itself defaults
445    ///      to 30s for backward compatibility with pre-multi-tier configs).
446    ///
447    /// New deployments are encouraged to set the per-tier knobs to 15/20/45s
448    /// (see config.docker.toml) — these match the bench-tuned values that
449    /// recover slow gov sites in the chrome tier without giving the http
450    /// tier permission to hog the request budget.
451    pub fn http_timeout(&self) -> u64 {
452        self.http_timeout_ms.unwrap_or(self.page_timeout_ms)
453    }
454    pub fn lightpanda_timeout(&self) -> u64 {
455        self.lightpanda_timeout_ms.unwrap_or(self.page_timeout_ms)
456    }
457    pub fn chrome_timeout(&self) -> u64 {
458        self.chrome_timeout_ms.unwrap_or(self.page_timeout_ms)
459    }
460
461    /// Number of active CDP tiers (lightpanda + playwright + chrome) under
462    /// the current `mode`. Mirrors the predicate used at runtime in
463    /// `crw-renderer/src/lib.rs` when constructing the renderer ladder:
464    /// `want(mode) && config.<tier>.is_some()`.
465    ///
466    /// Returns `0` when the binary is built without the `cdp` feature — in
467    /// that case no JS renderer can be constructed regardless of the config,
468    /// so the deadline auto-extension policy must collapse to HTTP-only.
469    pub fn cdp_tier_count(&self) -> usize {
470        if !cfg!(feature = "cdp") {
471            return 0;
472        }
473        let want =
474            |m: RendererMode| -> bool { matches!(self.mode, RendererMode::Auto) || self.mode == m };
475        let mut n = 0;
476        if want(RendererMode::Lightpanda) && self.lightpanda.is_some() {
477            n += 1;
478        }
479        if want(RendererMode::Playwright) && self.playwright.is_some() {
480            n += 1;
481        }
482        if want(RendererMode::Chrome) && self.chrome.is_some() {
483            n += 1;
484        }
485        n
486    }
487
488    /// Minimum request deadline budget (ms) required so that every configured
489    /// tier can use its full allowance when fallback exhausts the chain.
490    /// Sums the per-tier timeouts and adds [`CDP_TIER_OVERHEAD_MS`] for each
491    /// active CDP tier, matching the runtime ladder built in
492    /// `crw-renderer/src/lib.rs`.
493    pub fn min_deadline_for_full_ladder_ms(&self) -> u64 {
494        let want =
495            |m: RendererMode| -> bool { matches!(self.mode, RendererMode::Auto) || self.mode == m };
496
497        let mut sum: u64 = 0;
498        // HTTP prefetch runs ahead of any JS tier (content-type sniffing,
499        // direct PDF/binary handling) regardless of pinned mode. Skipped only
500        // when mode is `None` (no fetching at all).
501        if !matches!(self.mode, RendererMode::None) {
502            sum = sum.saturating_add(self.http_timeout());
503        }
504
505        // CDP tiers only contribute when the binary was built with the `cdp`
506        // feature; otherwise no JS renderer is constructable at runtime and
507        // including their budgets would over-extend the deadline.
508        if !cfg!(feature = "cdp") {
509            return sum;
510        }
511
512        let mut cdp_tier_count: u64 = 0;
513        if want(RendererMode::Lightpanda) && self.lightpanda.is_some() {
514            sum = sum.saturating_add(self.lightpanda_timeout());
515            cdp_tier_count += 1;
516        }
517        if want(RendererMode::Playwright) && self.playwright.is_some() {
518            sum = sum.saturating_add(self.chrome_timeout());
519            cdp_tier_count += 1;
520        }
521        if want(RendererMode::Chrome) && self.chrome.is_some() {
522            sum = sum.saturating_add(self.chrome_timeout());
523            cdp_tier_count += 1;
524        }
525        sum.saturating_add(cdp_tier_count.saturating_mul(CDP_TIER_OVERHEAD_MS))
526    }
527}
/// Serde default for `RendererConfig::pool_size`.
fn default_pool_size() -> usize {
    4
}
531
532#[derive(Debug, Clone, Deserialize)]
533pub struct CdpEndpoint {
534    pub ws_url: String,
535}
536
537/// Stealth mode configuration for evading bot detection.
538#[derive(Debug, Clone, Deserialize)]
539pub struct StealthConfig {
540    /// Enable stealth mode globally.
541    #[serde(default)]
542    pub enabled: bool,
543    /// Custom user-agent pool. Empty = use built-in pool.
544    #[serde(default)]
545    pub user_agents: Vec<String>,
546    /// Jitter factor for rate limiting (0.0–1.0, default 0.2 = ±20%).
547    #[serde(default = "default_jitter")]
548    pub jitter_factor: f64,
549    /// Inject realistic browser headers (Accept, Sec-Fetch-*, etc.).
550    #[serde(default = "default_true")]
551    pub inject_headers: bool,
552}
553
554impl Default for StealthConfig {
555    fn default() -> Self {
556        Self {
557            enabled: false,
558            user_agents: vec![],
559            jitter_factor: default_jitter(),
560            inject_headers: true,
561        }
562    }
563}
564
/// Serde default for `StealthConfig::jitter_factor` (±20 %).
fn default_jitter() -> f64 {
    0.2
}
568
/// Built-in realistic user-agent pool used when stealth is enabled.
pub const BUILTIN_UA_POOL: &[&str] = &[
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15",
];
577
578#[derive(Debug, Clone, Deserialize)]
579pub struct CrawlerConfig {
580    #[serde(default = "default_concurrency")]
581    pub max_concurrency: usize,
582    #[serde(default = "default_rps")]
583    pub requests_per_second: f64,
584    #[serde(default = "default_true")]
585    pub respect_robots_txt: bool,
586    #[serde(default = "default_ua")]
587    pub user_agent: String,
588    #[serde(default = "default_depth")]
589    pub default_max_depth: u32,
590    #[serde(default = "default_max_pages")]
591    pub default_max_pages: u32,
592    /// Proxy URL for crawler requests. Supports HTTP, HTTPS, and SOCKS5
593    /// (e.g. "http://proxy:8080" or "socks5://user:pass@proxy:1080").
594    #[serde(default)]
595    pub proxy: Option<String>,
596    /// TTL in seconds for completed crawl jobs before cleanup (default: 3600)
597    #[serde(default = "default_job_ttl")]
598    pub job_ttl_secs: u64,
599    #[serde(default)]
600    pub stealth: StealthConfig,
601    /// Floor for the per-host limiter interval, in milliseconds. When a host
602    /// advertises `Crawl-delay` in robots.txt, the higher of the two wins.
603    /// Default `0` — robots.txt is the authoritative source, this is a
604    /// per-deployment safety net.
605    #[serde(default)]
606    pub per_host_min_interval_ms: u64,
607    /// Maximum concurrent in-flight requests against a single eTLD+1.
608    /// Default `1` — strict ethics posture; operators raise consciously via
609    /// config when scraping their own infrastructure.
610    #[serde(default = "default_per_host_max_concurrent")]
611    pub per_host_max_concurrent: u32,
612}
613
/// Serde default for `CrawlerConfig::per_host_max_concurrent`.
fn default_per_host_max_concurrent() -> u32 {
    1
}
617
618impl Default for CrawlerConfig {
619    fn default() -> Self {
620        Self {
621            max_concurrency: default_concurrency(),
622            requests_per_second: default_rps(),
623            respect_robots_txt: true,
624            user_agent: default_ua(),
625            default_max_depth: default_depth(),
626            default_max_pages: default_max_pages(),
627            proxy: None,
628            job_ttl_secs: default_job_ttl(),
629            stealth: StealthConfig::default(),
630            per_host_min_interval_ms: 0,
631            per_host_max_concurrent: default_per_host_max_concurrent(),
632        }
633    }
634}
635
/// Serde default for `CrawlerConfig::max_concurrency`.
fn default_concurrency() -> usize {
    10
}
/// Serde default for `CrawlerConfig::requests_per_second`.
fn default_rps() -> f64 {
    10.0
}
/// Shared serde default for boolean switches that are on unless configured
/// otherwise.
fn default_true() -> bool {
    true
}
/// Serde default for `CrawlerConfig::user_agent`.
fn default_ua() -> String {
    // Modern Chrome UA. The legacy "CRW/0.1" was rejected by UA-filtering sites
    // (opencorporates, killeenisd, wsj) returning 403/404. Kept in sync with the
    // Sec-Ch-Ua client hint in `crw-renderer/src/http_only.rs`.
    //
    // The trailing `\` joins the two source lines; the leading whitespace of
    // the continuation line is not part of the string.
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 \
     (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
        .into()
}
/// Serde default for `CrawlerConfig::default_max_depth`.
fn default_depth() -> u32 {
    2
}
/// Serde default for `CrawlerConfig::default_max_pages`.
fn default_max_pages() -> u32 {
    100
}
/// Serde default for `CrawlerConfig::job_ttl_secs` (one hour).
fn default_job_ttl() -> u64 {
    3600
}
662
663#[derive(Debug, Clone, Deserialize)]
664pub struct ExtractionConfig {
665    #[serde(default = "default_format")]
666    pub default_format: String,
667    #[serde(default = "default_true_ext")]
668    pub only_main_content: bool,
669    #[serde(default)]
670    pub llm: Option<LlmConfig>,
671    /// Hostname → CSS selector overrides applied before readability narrowing.
672    /// Match is exact host (no wildcard); user-supplied selector still wins.
673    #[serde(default)]
674    pub domain_selectors: std::collections::HashMap<String, String>,
675    #[serde(default)]
676    pub llm_fallback: LlmFallbackConfig,
677    /// Bytes below which an HTTP-tier extraction is treated as "thin"
678    /// and triggers a JS-renderer escalation. Default 100.
679    #[serde(default = "default_http_retry_threshold")]
680    pub http_retry_threshold_bytes: usize,
681    /// Bytes below which a LightPanda-tier extraction is treated as
682    /// "thin" and triggers a Chrome escalation. Default 2000 (LP often
683    /// returns SPA husks of 90–500B that pass HTML-shape checks).
684    #[serde(default = "default_lightpanda_retry_threshold")]
685    pub lightpanda_retry_threshold_bytes: usize,
686}
687
/// Serde default for `ExtractionConfig::http_retry_threshold_bytes`.
fn default_http_retry_threshold() -> usize {
    100
}
691
/// Serde default for `ExtractionConfig::lightpanda_retry_threshold_bytes`.
fn default_lightpanda_retry_threshold() -> usize {
    2000
}
695
696impl Default for ExtractionConfig {
697    fn default() -> Self {
698        Self {
699            default_format: default_format(),
700            only_main_content: true,
701            llm: None,
702            domain_selectors: std::collections::HashMap::new(),
703            llm_fallback: LlmFallbackConfig::default(),
704            http_retry_threshold_bytes: default_http_retry_threshold(),
705            lightpanda_retry_threshold_bytes: default_lightpanda_retry_threshold(),
706        }
707    }
708}
709
710#[derive(Debug, Clone, Deserialize)]
711pub struct LlmFallbackConfig {
712    #[serde(default)]
713    pub enable: bool,
714    #[serde(default = "default_llm_quality_threshold")]
715    pub quality_threshold: f32,
716    #[serde(default = "default_llm_max_html_bytes")]
717    pub max_html_bytes: usize,
718    /// When true (and `enable` is true), invoke the LLM on every page rather
719    /// than only when DOM-based extraction scores below `quality_threshold`.
720    /// Mirrors the "LLM as primary extractor" pattern used by Reader-LM,
721    /// Firecrawl, and similar services. Higher cost, higher recall.
722    #[serde(default)]
723    pub always_run: bool,
724}
725
726impl Default for LlmFallbackConfig {
727    fn default() -> Self {
728        Self {
729            enable: false,
730            quality_threshold: default_llm_quality_threshold(),
731            max_html_bytes: default_llm_max_html_bytes(),
732            always_run: false,
733        }
734    }
735}
736
/// Serde default for `LlmFallbackConfig::quality_threshold`.
fn default_llm_quality_threshold() -> f32 {
    0.3
}
/// Serde default for `LlmFallbackConfig::max_html_bytes`.
fn default_llm_max_html_bytes() -> usize {
    100_000
}
743
744#[derive(Debug, Clone, Deserialize)]
745pub struct LlmConfig {
746    #[serde(default = "default_llm_provider")]
747    pub provider: String,
748    pub api_key: String,
749    #[serde(default = "default_llm_model")]
750    pub model: String,
751    #[serde(default)]
752    pub base_url: Option<String>,
753    #[serde(default = "default_llm_max_tokens")]
754    pub max_tokens: u32,
755    /// Azure OpenAI API version (e.g. "2024-05-01-preview"). Required when
756    /// `provider = "azure"`; ignored otherwise.
757    #[serde(default)]
758    pub azure_api_version: Option<String>,
759}
760
/// Serde default for `LlmConfig::provider`.
fn default_llm_provider() -> String {
    "anthropic".into()
}
/// Serde default for `LlmConfig::model`.
fn default_llm_model() -> String {
    "claude-sonnet-4-20250514".into()
}
/// Serde default for `LlmConfig::max_tokens`.
fn default_llm_max_tokens() -> u32 {
    4096
}
770
/// Serde default for `ExtractionConfig::default_format`.
fn default_format() -> String {
    "markdown".into()
}
/// Serde default for `ExtractionConfig::only_main_content`.
fn default_true_ext() -> bool {
    true
}
777
778#[derive(Debug, Clone, Default, Deserialize)]
779pub struct AuthConfig {
780    #[serde(default)]
781    pub api_keys: Vec<String>,
782}
783
784impl AppConfig {
785    /// Load config from config.default.toml + environment variable overrides.
786    /// Env vars use `CRW_` prefix, `__` as separator. E.g. `CRW_SERVER__PORT=8080`.
787    pub fn load() -> Result<Self, config::ConfigError> {
788        let mut builder = config::Config::builder()
789            .add_source(config::File::with_name("config.default").required(false));
790
791        // Load optional override config file (e.g. config.docker.toml in containers).
792        if let Ok(extra) = std::env::var("CRW_CONFIG") {
793            builder = builder.add_source(config::File::with_name(&extra).required(true));
794        } else {
795            builder = builder.add_source(config::File::with_name("config.local").required(false));
796        }
797
798        let cfg = builder
799            .add_source(
800                config::Environment::with_prefix("CRW")
801                    .prefix_separator("_")
802                    .separator("__")
803                    .try_parsing(true),
804            )
805            .build()?;
806        cfg.try_deserialize()
807    }
808
809    /// Compute the effective end-to-end request deadline (ms). Implements the
810    /// issue-#35 auto-extension policy:
811    ///
812    /// 1. If the caller supplied an explicit `requested_deadline_ms`, return it
813    ///    verbatim — operators trust the request budget over our heuristic.
814    /// 2. Otherwise, when `request.auto_extend_deadline_for_ladder` is on,
815    ///    return `max(deadline_ms_default, ladder_min + wait_for_extra)`.
816    ///    `ladder_min` covers the configured tier ladder; `wait_for_extra`
817    ///    compensates for callers that bumped `wait_for_ms` above the default
818    ///    SPA budget (8s) — without it, a long `wait_for` would silently
819    ///    re-clamp inside CDP.
820    /// 3. When the policy is disabled, return `deadline_ms_default` unchanged.
821    ///
822    /// `wait_for_ms` is the per-request override (ScrapeRequest::wait_for /
823    /// CrawlRequest::wait_for); pass `None` for sub-fetches that don't
824    /// surface a wait_for to the caller (search/map enrichment).
825    pub fn effective_deadline_ms(
826        &self,
827        requested_deadline_ms: Option<u64>,
828        wait_for_ms: Option<u64>,
829    ) -> u64 {
830        if let Some(explicit) = requested_deadline_ms {
831            return explicit;
832        }
833        let default_ms = self.request.deadline_ms_default;
834        if !self.request.auto_extend_deadline_for_ladder {
835            return default_ms;
836        }
837        // Issue #35 is specifically about CDP tier overhead silently clamping
838        // chrome_timeout_ms. HTTP-only deployments don't suffer the same
839        // problem (the HTTP renderer respects deadline.remaining without the
840        // extra fetch/challenge/stability overhead). Skip the extension when
841        // no CDP tiers are configured so HTTP-only users keep the strict
842        // operator-configured default.
843        if self.renderer.cdp_tier_count() == 0 {
844            return default_ms;
845        }
846        let ladder_min = self.renderer.min_deadline_for_full_ladder_ms();
847        // Mirrors crw_renderer::cdp::SPA_SELECTOR_MAX_MS. The CDP module
848        // adds `wait_for_ms.unwrap_or(SPA_SELECTOR_MAX_MS)` to its internal
849        // timeout, so when the caller exceeds the default we need to extend
850        // the deadline per active CDP tier.
851        const SPA_DEFAULT_MS: u64 = 8_000;
852        // Clamp `wait_for_ms` to MAX_WAIT_FOR_MS so the inner deadline never
853        // exceeds the Tower envelope, which is sized off the same constant in
854        // `effective_request_timeout_secs`. A pathological caller passing
855        // `wait_for: 600_000` without `deadlineMs` would otherwise be cancelled
856        // by Tower before the inner CDP loop noticed the bigger budget.
857        let extra = if let Some(w) = wait_for_ms {
858            let bounded = w.min(MAX_WAIT_FOR_MS);
859            let per_tier = bounded.saturating_sub(SPA_DEFAULT_MS);
860            per_tier.saturating_mul(self.renderer.cdp_tier_count() as u64)
861        } else {
862            0
863        };
864        default_ms.max(ladder_min.saturating_add(extra))
865    }
866
867    /// Tower middleware outer timeout (seconds). Must accommodate the longest
868    /// legitimate handler runtime so a healthy request isn't cancelled by the
869    /// outer layer before the inner deadline fires.
870    ///
871    /// Covers the three route envelopes:
872    /// - `/scrape`, `/mcp` — auto-extended scrape deadline.
873    /// - `/search` — SearXNG fetch + bounded enrichment fan-out
874    ///   (`ceil(max_limit / max_concurrency)` batches × scrape_ms).
875    /// - `/crawl/jobs/:id`, `/map` — handler-side caps up to 300s.
876    ///
877    /// When auto-extend is disabled, returns the operator-configured baseline
878    /// unchanged.
879    pub fn effective_request_timeout_secs(&self) -> u64 {
880        let baseline = self.server.request_timeout_secs;
881        if !self.request.auto_extend_deadline_for_ladder {
882            return baseline;
883        }
884        const OUTER_BUFFER_SECS: u64 = 5;
885        // `/map` handler caps `req.timeout.unwrap_or(120).min(300)`; the outer
886        // must cover the upper bound so callers passing `timeout=300` aren't
887        // cancelled mid-flight.
888        const MAP_REQUEST_TIMEOUT_CEILING_MS: u64 = 300_000;
889        // Cover the worst-case implicit scrape: caller bumps `wait_for` to the
890        // configured maximum without supplying `deadlineMs`. The same
891        // [`MAX_WAIT_FOR_MS`] constant is used inside `effective_deadline_ms`
892        // to clamp the inner extension, so the inner deadline can never
893        // exceed this outer envelope.
894        let scrape_ms = self.effective_deadline_ms(None, Some(MAX_WAIT_FOR_MS));
895
896        // Search enrichment: bounded by max_concurrency. Worst case sequential
897        // batching with low concurrency: ceil(max_limit / max_concurrency)
898        // batches each bounded by scrape_ms.
899        let conc = (self.crawler.max_concurrency.max(1)) as u64;
900        let max_results = self.search.max_limit as u64;
901        let enrich_batches = max_results.div_ceil(conc);
902        let search_enrichment_ms = enrich_batches.saturating_mul(scrape_ms);
903        let search_ms = self.search.timeout_ms.saturating_add(search_enrichment_ms);
904
905        let max_handler_ms = scrape_ms.max(search_ms).max(MAP_REQUEST_TIMEOUT_CEILING_MS);
906        let needed_secs = max_handler_ms
907            .div_ceil(1_000)
908            .saturating_add(OUTER_BUFFER_SECS);
909        baseline.max(needed_secs)
910    }
911}
912
#[cfg(test)]
mod tests {
    use super::*;

    /// Env var tests modify process-wide state; serialize them to avoid cross-test
    /// interference (e.g. `force_js` alias + `render_js_default` direct both set).
    static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());

    /// Remove every `CRW_*` variable the env tests in this module may set.
    /// Callers must hold `ENV_LOCK` (see [`EnvGuard`]).
    fn clear_renderer_env() {
        for k in [
            "CRW_RENDERER__MODE",
            "CRW_RENDERER__FORCE_JS",
            "CRW_RENDERER__RENDER_JS_DEFAULT",
            "CRW_RENDERER__LIGHTPANDA__WS_URL",
            "CRW_SERVER__PORT",
        ] {
            // SAFETY: env mutation is only sound while nothing else touches
            // the process environment concurrently. Every env-mutating test
            // in this module holds `ENV_LOCK` for the duration; this assumes
            // no other thread reads or writes the environment outside it.
            unsafe { std::env::remove_var(k) };
        }
    }

    /// RAII handle for env-var tests.
    ///
    /// Holds `ENV_LOCK` for its whole lifetime, starts each test from a clean
    /// environment, and scrubs the managed variables again on drop. Because
    /// cleanup happens in `Drop`, it also runs when the test body panics
    /// (e.g. `AppConfig::load().unwrap()` failing), so a failing test cannot
    /// leak `CRW_*` variables into the tests that run after it.
    struct EnvGuard {
        _lock: std::sync::MutexGuard<'static, ()>,
    }

    impl EnvGuard {
        /// Take the env lock and clear all managed variables.
        fn acquire() -> Self {
            // A test that panicked while holding the lock poisons it. The
            // `()` payload carries no state that could be left inconsistent,
            // so recover the guard instead of cascading the failure.
            let lock = ENV_LOCK
                .lock()
                .unwrap_or_else(std::sync::PoisonError::into_inner);
            clear_renderer_env();
            Self { _lock: lock }
        }

        /// Set one environment variable for the duration of this guard.
        /// `key` should be one of the names `clear_renderer_env` removes,
        /// otherwise it will outlive the guard.
        fn set(&self, key: &str, value: &str) {
            // SAFETY: `self` proves `ENV_LOCK` is held, which serializes all
            // env mutations made by this module; assumes no other thread
            // touches the environment outside that lock.
            unsafe { std::env::set_var(key, value) };
        }
    }

    impl Drop for EnvGuard {
        fn drop(&mut self) {
            // Runs before `_lock` is released, so the scrub is still
            // serialized against the other env tests.
            clear_renderer_env();
        }
    }

    #[test]
    fn renderer_mode_parses_variants() {
        #[derive(Deserialize)]
        struct Wrap {
            mode: RendererMode,
        }
        let cases = [
            ("mode = \"auto\"", RendererMode::Auto),
            ("mode = \"none\"", RendererMode::None),
            ("mode = \"lightpanda\"", RendererMode::Lightpanda),
            ("mode = \"chrome\"", RendererMode::Chrome),
            ("mode = \"playwright\"", RendererMode::Playwright),
        ];
        for (toml_str, expected) in cases {
            let w: Wrap = toml::from_str(toml_str).unwrap();
            assert_eq!(w.mode, expected, "toml: {toml_str}");
        }
    }

    #[test]
    fn renderer_mode_bogus_errors() {
        #[derive(Deserialize)]
        struct Wrap {
            // Only deserialization success/failure matters; the value is never read.
            #[allow(dead_code)]
            mode: RendererMode,
        }
        let err: Result<Wrap, _> = toml::from_str("mode = \"bogus\"");
        assert!(err.is_err(), "bogus mode should fail to parse");
    }

    #[test]
    fn renderer_config_default_mode_is_auto() {
        let cfg = RendererConfig::default();
        assert_eq!(cfg.mode, RendererMode::Auto);
        assert_eq!(cfg.render_js_default, None);
    }

    #[test]
    fn render_js_default_force_js_alias() {
        let cfg: RendererConfig = toml::from_str("force_js = true").unwrap();
        assert_eq!(cfg.render_js_default, Some(true));
    }

    #[test]
    fn render_js_default_direct_field() {
        let cfg: RendererConfig = toml::from_str("render_js_default = false").unwrap();
        assert_eq!(cfg.render_js_default, Some(false));
    }

    #[test]
    fn env_var_renderer_mode_chrome() {
        let env = EnvGuard::acquire();
        env.set("CRW_RENDERER__MODE", "chrome");
        let cfg = AppConfig::load().unwrap();
        assert_eq!(cfg.renderer.mode, RendererMode::Chrome);
    }

    #[test]
    fn env_var_force_js_alias_works() {
        let env = EnvGuard::acquire();
        env.set("CRW_RENDERER__FORCE_JS", "true");
        let cfg = AppConfig::load().unwrap();
        assert_eq!(cfg.renderer.render_js_default, Some(true));
    }

    #[test]
    fn env_var_render_js_default_direct() {
        let env = EnvGuard::acquire();
        env.set("CRW_RENDERER__RENDER_JS_DEFAULT", "true");
        let cfg = AppConfig::load().unwrap();
        assert_eq!(cfg.renderer.render_js_default, Some(true));
    }

    #[test]
    fn request_config_defaults_match_plan() {
        let r = RequestConfig::default();
        assert_eq!(r.deadline_ms_default, 8000);
        assert!(r.auto_extend_deadline_for_ladder);
    }

    #[test]
    fn default_app_config_enables_auto_extend() {
        // Programmatic Default must mirror serde defaults — issue #35.
        let cfg = AppConfig::default();
        assert!(cfg.request.auto_extend_deadline_for_ladder);
        assert_eq!(cfg.request.deadline_ms_default, 8000);
    }

    /// Renderer fixture: chrome-only mode with a single chrome endpoint, where
    /// both `page_timeout_ms` and `chrome_timeout_ms` equal `chrome_ms`.
    fn renderer_with_chrome_only(chrome_ms: u64) -> RendererConfig {
        RendererConfig {
            mode: RendererMode::Chrome,
            page_timeout_ms: chrome_ms,
            chrome_timeout_ms: Some(chrome_ms),
            chrome: Some(CdpEndpoint {
                ws_url: "ws://chrome:9222".into(),
            }),
            ..Default::default()
        }
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn min_deadline_full_ladder_chrome_only() {
        // chrome-only mode: http (page_timeout) + chrome + 1 * 28000.
        let r = renderer_with_chrome_only(30_000);
        // page_timeout_ms is set to chrome_ms here, so http_timeout() → 30s.
        assert_eq!(
            r.min_deadline_for_full_ladder_ms(),
            30_000 + 30_000 + 28_000
        );
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn min_deadline_full_ladder_auto_three_tiers() {
        let r = RendererConfig {
            mode: RendererMode::Auto,
            page_timeout_ms: 15_000,
            http_timeout_ms: Some(15_000),
            lightpanda_timeout_ms: Some(2_500),
            chrome_timeout_ms: Some(30_000),
            lightpanda: Some(CdpEndpoint {
                ws_url: "ws://lp:9222".into(),
            }),
            chrome: Some(CdpEndpoint {
                ws_url: "ws://chrome:9222".into(),
            }),
            ..Default::default()
        };
        // http(15) + lp(2.5) + chrome(30) + 2*28 = 47.5 + 56 = 103_500.
        assert_eq!(
            r.min_deadline_for_full_ladder_ms(),
            15_000 + 2_500 + 30_000 + 2 * 28_000
        );
        assert_eq!(r.cdp_tier_count(), 2);
    }

    #[test]
    fn effective_deadline_explicit_bypasses_auto_extend() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.renderer = renderer_with_chrome_only(30_000);
        // Explicit override beats both default and ladder_min.
        assert_eq!(cfg.effective_deadline_ms(Some(5_000), None), 5_000);
        assert_eq!(cfg.effective_deadline_ms(Some(500_000), None), 500_000);
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn effective_deadline_auto_extend_raises_to_ladder_min() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        let expected = cfg.renderer.min_deadline_for_full_ladder_ms();
        assert!(expected > 8_000);
        assert_eq!(cfg.effective_deadline_ms(None, None), expected);
    }

    #[test]
    fn effective_deadline_default_wins_when_higher_than_ladder() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 1_000_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        assert_eq!(cfg.effective_deadline_ms(None, None), 1_000_000);
    }

    #[test]
    fn effective_deadline_auto_extend_disabled_returns_baseline() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = false;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        assert_eq!(cfg.effective_deadline_ms(None, None), 8_000);
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn effective_deadline_extends_for_long_wait_for() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        let base = cfg.renderer.min_deadline_for_full_ladder_ms();
        let tier_count = cfg.renderer.cdp_tier_count() as u64;
        // wait_for = 20000 → per-tier extra = 12000 over SPA_DEFAULT_MS (8000).
        let with_wait = cfg.effective_deadline_ms(None, Some(20_000));
        assert_eq!(with_wait, base + 12_000 * tier_count);
        // wait_for below SPA default → no extra.
        assert_eq!(cfg.effective_deadline_ms(None, Some(2_000)), base);
    }

    #[test]
    fn effective_request_timeout_covers_map_ceiling() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        cfg.search.timeout_ms = 15_000;
        cfg.crawler.max_concurrency = 10;
        cfg.search.max_limit = 20;
        cfg.server.request_timeout_secs = 60;
        // Map ceiling 300s + 5s buffer = 305s minimum.
        assert!(cfg.effective_request_timeout_secs() >= 305);
    }

    #[test]
    fn effective_request_timeout_disabled_returns_baseline() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = false;
        cfg.server.request_timeout_secs = 60;
        assert_eq!(cfg.effective_request_timeout_secs(), 60);
    }

    #[test]
    fn effective_request_timeout_respects_operator_override() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.server.request_timeout_secs = 600; // operator-configured high
        cfg.renderer = renderer_with_chrome_only(30_000);
        // Operator's explicit 600s should win over the auto-computed 305s.
        assert_eq!(cfg.effective_request_timeout_secs(), 600);
    }

    #[test]
    fn effective_request_timeout_search_sequential_batching() {
        // Low concurrency forces ceil(max_limit/conc) batches → larger search_ms.
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        cfg.search.timeout_ms = 15_000;
        cfg.search.max_limit = 20;
        cfg.crawler.max_concurrency = 1;
        cfg.server.request_timeout_secs = 60;
        // The Tower envelope must cover the worst-case implicit scrape with
        // `wait_for` bumped to `MAX_WAIT_FOR_MS`, because callers can do that
        // without supplying `deadlineMs`. Use the same constant as the
        // production code so this expectation tracks it instead of pinning a
        // copy of its current value.
        let secs = cfg.effective_request_timeout_secs();
        let scrape_ms = cfg.effective_deadline_ms(None, Some(MAX_WAIT_FOR_MS));
        let expected_search_ms = 15_000 + 20 * scrape_ms;
        let expected_max_ms = scrape_ms.max(expected_search_ms).max(300_000);
        let expected_secs = expected_max_ms.div_ceil(1_000) + 5;
        assert_eq!(secs, 60u64.max(expected_secs));
    }

    #[test]
    #[cfg(not(feature = "cdp"))]
    fn cdp_tier_count_zero_without_cdp_feature() {
        // Even when chrome/lightpanda are configured, a binary built without
        // the `cdp` feature can never construct a JS renderer. The deadline
        // policy must observe that and collapse to HTTP-only behavior.
        let r = RendererConfig {
            mode: RendererMode::Auto,
            page_timeout_ms: 15_000,
            chrome_timeout_ms: Some(30_000),
            chrome: Some(CdpEndpoint {
                ws_url: "ws://chrome:9222".into(),
            }),
            lightpanda: Some(CdpEndpoint {
                ws_url: "ws://lp:9222".into(),
            }),
            ..Default::default()
        };
        assert_eq!(r.cdp_tier_count(), 0);
        // Only the HTTP tier contributes to the ladder budget.
        assert_eq!(r.min_deadline_for_full_ladder_ms(), 15_000);
    }

    #[test]
    fn effective_deadline_skipped_for_http_only_mode() {
        // P2 from codex review: HTTP-only deployments don't suffer the CDP
        // clamping problem (no fetch/challenge/stability overhead). The
        // auto-extension must NOT silently bump their default from 8s to 30s
        // just because page_timeout_ms defaults high.
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = RendererConfig {
            mode: RendererMode::Auto,
            page_timeout_ms: 30_000,
            // No CDP endpoints configured.
            lightpanda: None,
            playwright: None,
            chrome: None,
            ..Default::default()
        };
        assert_eq!(cfg.renderer.cdp_tier_count(), 0);
        assert_eq!(cfg.effective_deadline_ms(None, None), 8_000);
        assert_eq!(cfg.effective_deadline_ms(None, Some(30_000)), 8_000);
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn min_deadline_full_ladder_playwright_only() {
        // Playwright tier contributes one chrome_timeout + one CDP overhead,
        // matching the runtime predicate in `crw-renderer/src/lib.rs`.
        let r = RendererConfig {
            mode: RendererMode::Playwright,
            page_timeout_ms: 15_000,
            http_timeout_ms: Some(15_000),
            chrome_timeout_ms: Some(30_000),
            playwright: Some(CdpEndpoint {
                ws_url: "ws://playwright:9222".into(),
            }),
            ..Default::default()
        };
        assert_eq!(r.cdp_tier_count(), 1);
        // http(15) + chrome-equivalent(30) + 1 * 28 overhead.
        assert_eq!(
            r.min_deadline_for_full_ladder_ms(),
            15_000 + 30_000 + 28_000
        );
    }

    #[test]
    fn renderer_phase_toggles_default_off_or_safe() {
        let r = RendererConfig::default();
        assert!(!r.chrome_intercept_resources);
        assert!(!r.chrome_intercept_stylesheets);
        assert!(r.chrome_host_intercept_disable.is_empty());
        assert_eq!(r.chrome_nav_budget_ms, 12_000);
        assert!(!r.chrome_context_pool_enabled);
        assert!(!r.use_predictor);
    }

    #[test]
    fn crawler_per_host_limiter_defaults() {
        let c = CrawlerConfig::default();
        assert_eq!(c.per_host_min_interval_ms, 0);
        assert_eq!(c.per_host_max_concurrent, 1);
    }

    #[test]
    fn env_var_overrides_toml_defaults() {
        let env = EnvGuard::acquire();
        env.set("CRW_SERVER__PORT", "4444");
        env.set("CRW_RENDERER__LIGHTPANDA__WS_URL", "ws://test:9999/");
        let cfg = AppConfig::load().unwrap();

        assert_eq!(cfg.server.port, 4444, "env var should override server.port");
        assert_eq!(
            cfg.renderer.lightpanda.as_ref().unwrap().ws_url,
            "ws://test:9999/",
            "env var should override renderer.lightpanda.ws_url"
        );
    }
}