Skip to main content

crw_core/
config.rs

1use serde::Deserialize;
2
/// Root of the deserialized application configuration.
///
/// Every section carries `#[serde(default)]`, so a section that is missing
/// from the input is filled in from that section type's `Default` impl
/// instead of failing deserialization.
#[derive(Debug, Clone, Deserialize, Default)]
pub struct AppConfig {
    /// `server` section — see [`ServerConfig`].
    #[serde(default)]
    pub server: ServerConfig,
    /// `renderer` section — see [`RendererConfig`].
    #[serde(default)]
    pub renderer: RendererConfig,
    /// `crawler` section — see [`CrawlerConfig`].
    #[serde(default)]
    pub crawler: CrawlerConfig,
    /// `extraction` section — see [`ExtractionConfig`].
    #[serde(default)]
    pub extraction: ExtractionConfig,
    /// `auth` section — see [`AuthConfig`].
    #[serde(default)]
    pub auth: AuthConfig,
    /// `request` section — per-request defaults, see [`RequestConfig`].
    #[serde(default)]
    pub request: RequestConfig,
    /// `search` section — see [`SearchConfig`].
    #[serde(default)]
    pub search: SearchConfig,
    /// `map` section — see [`MapConfig`].
    #[serde(default)]
    pub map: MapConfig,
    /// `[client]` — settings for the local CLI/MCP when it proxies to the
    /// hosted SaaS. Written by `crw setup` into the user-config file.
    #[serde(default)]
    pub client: ClientConfig,
}
26
/// `[client]` — cloud-proxy credentials populated by `crw setup` and read by
/// `crw mcp` / `crw-mcp`. Both fields are `Option` so an unconfigured user runs
/// in local mode without surprise overrides.
#[derive(Debug, Clone, Default, Deserialize)]
pub struct ClientConfig {
    /// Base URL of the hosted CRW API, e.g. `https://api.fastcrw.com`.
    /// `None` when the key is absent.
    #[serde(default)]
    pub api_url: Option<String>,
    /// API key for the hosted CRW API. `None` when the key is absent.
    #[serde(default)]
    pub api_key: Option<String>,
}
39
/// `[map]` section — currently only carries `[map.url_filter]`.
#[derive(Debug, Clone, Deserialize, Default)]
pub struct MapConfig {
    /// `[map.url_filter]` knobs; falls back to
    /// `MapUrlFilterConfig::default()` when the sub-table is missing.
    #[serde(default)]
    pub url_filter: MapUrlFilterConfig,
}
46
/// `[map.url_filter]` — raw TOML view of the filter knobs. Conversion to
/// the runtime `UrlFilterCfg` lives in `crw-crawl` (which can see both this
/// type and the filter module). Keeping this struct dependency-free here
/// avoids a cycle (`crw-core` does not depend on `crw-crawl`).
///
/// Each field has its own serde default, so a partially specified table
/// deserializes with the remaining knobs at their defaults.
#[derive(Debug, Clone, Deserialize)]
pub struct MapUrlFilterConfig {
    /// Tier B — strip tracking params. Default: `true`.
    #[serde(default = "default_true_filter")]
    pub strip_tracking_params: bool,
    /// Tier A — drop action URLs entirely. Default: `true`.
    #[serde(default = "default_true_filter")]
    pub drop_action_urls: bool,
    /// When `true`, `.gov`/`.mil` hosts run Tier A too. Default `false`.
    #[serde(default)]
    pub gov_tld_drop_actions: bool,
    /// Additive on top of `DEFAULT_TRACKING_PARAMS`. Default: empty.
    #[serde(default)]
    pub extra_tracking_params: Vec<String>,
    /// Additive on top of `DEFAULT_ACTION_PARAMS`. Default: empty.
    #[serde(default)]
    pub extra_action_params: Vec<String>,
    /// Additive on top of `ALWAYS_PRESERVE`. Default: empty.
    #[serde(default)]
    pub extra_preserve_params: Vec<String>,
}
72
73impl Default for MapUrlFilterConfig {
74    fn default() -> Self {
75        Self {
76            strip_tracking_params: true,
77            drop_action_urls: true,
78            gov_tld_drop_actions: false,
79            extra_tracking_params: Vec::new(),
80            extra_action_params: Vec::new(),
81            extra_preserve_params: Vec::new(),
82        }
83    }
84}
85
/// Serde default for the `[map.url_filter]` switches that ship enabled.
fn default_true_filter() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
89
/// Fixed overhead, in milliseconds, charged once per CDP tier. It is the sum
/// of the SPA selector poll budget, the challenge retry budget, the
/// content-stability budget, and the fetch overhead.
///
/// The value mirrors the constants in `crw-renderer::cdp`; drift between the
/// two is regression-tested by
/// `crates/crw-server/tests/cdp_constants_test.rs` (gated behind
/// `feature = "cdp"`).
///
/// [`RendererConfig::min_deadline_for_full_ladder_ms`] adds this so the
/// request deadline covers each CDP tier's outer fetch timeout rather than
/// only its configured `page_timeout`.
pub const CDP_TIER_OVERHEAD_MS: u64 = 28 * 1_000;
100
/// Ceiling on the per-request `wait_for_ms` budget, in milliseconds.
///
/// The Tower outer timeout is sized so that a worst-case implicit scrape (no
/// `deadlineMs`, `wait_for` at this ceiling) still finishes inside it. Larger
/// values are clamped by [`AppConfig::effective_deadline_ms`], which keeps the
/// inner deadline from escaping the outer envelope. The accepted range is
/// documented as `(0, 60000]` in `types.rs::ScrapeRequest::wait_for`.
pub const MAX_WAIT_FOR_MS: u64 = 60 * 1_000;
108
/// Configuration for the `/v1/search` endpoint and its SearXNG backend.
///
/// When `searxng_url` is unset the endpoint returns HTTP 503 with
/// `error_code: "search_disabled"` — the route remains mounted so that
/// startup doesn't have to know whether search will ever be configured.
///
/// Every field has a serde default, so any subset of keys may be supplied.
/// The feature flags below marked "(gated)" all default to `false`.
#[derive(Debug, Clone, Deserialize)]
pub struct SearchConfig {
    /// Master switch. Defaults to `true`; set to `false` to refuse all
    /// `/v1/search` requests even if `searxng_url` is configured.
    #[serde(default = "default_true_search")]
    pub enabled: bool,
    /// Base URL of the SearXNG instance (e.g. `http://searxng:8080`).
    /// `None` (the default) disables the endpoint with a clear error.
    #[serde(default)]
    pub searxng_url: Option<String>,
    /// End-to-end timeout for the SearXNG call in milliseconds.
    /// Defaults to `15_000`.
    #[serde(default = "default_search_timeout_ms")]
    pub timeout_ms: u64,
    /// Default `limit` when the request omits it. Defaults to `5`.
    #[serde(default = "default_search_limit")]
    pub default_limit: u32,
    /// Hard cap on `limit` per request. SaaS uses 20, which is also the
    /// default here.
    #[serde(default = "default_search_max_limit")]
    pub max_limit: u32,
    /// SearXNG engines invoked when the request includes `categories: ["research"]`.
    /// Defaults match the SaaS implementation (`arxiv`, `crossref`,
    /// `google scholar`, `semantic scholar`).
    #[serde(default = "default_research_engines")]
    pub research_engines: Vec<String>,
    /// SearXNG engines invoked when the request includes `categories: ["github"]`.
    /// Defaults to `["github"]`.
    #[serde(default = "default_github_engines")]
    pub github_engines: Vec<String>,
    /// Re-rank the flat result pool for the LLM answer / summarize path
    /// (RRF + junk/coverage/geo filter + BM25 + domain dedupe) instead of the
    /// raw SearXNG-score sort. Defaults to `true`. The plain (non-LLM) path is
    /// unaffected and keeps SaaS byte-parity regardless of this flag.
    #[serde(default = "default_true_search")]
    pub rerank_enabled: bool,
    /// Multi-query expansion for the LLM answer / summarize path: before the
    /// SearXNG fetch, generate an entity/keyword-focused rewrite of the query,
    /// fetch both the original and the rewrite, and UNION the candidate pools
    /// (recall can only increase — the original's results are always kept).
    /// Targets "retrieval-miss" failures where the answer's source never
    /// surfaced for the user's phrasing. Costs one extra small LLM call + one
    /// extra SearXNG fetch. Defaults to `false` (gated); the plain path and the
    /// answer layer are untouched, so precision/SaaS-parity are preserved.
    #[serde(default)]
    pub query_expand: bool,
    /// Number of LLM-generated query rewrites to fetch + union when
    /// `query_expand` is on. `1` (the default) reproduces the original
    /// single-variant behavior. Higher values request more DIVERSE
    /// reformulations (abbreviation/acronym-expanded, keyword-focused) and
    /// fetch their pools in parallel, raising recall on retrieval-miss queries
    /// (e.g. an unexpanded acronym whose page never surfaced) at the cost of
    /// one extra SearXNG fetch each. Clamped to `MAX_QUERY_EXPAND_VARIANTS` in
    /// the route.
    #[serde(default = "default_query_expand_variants")]
    pub query_expand_variants: usize,
    /// Adaptive multi-round retrieval (the "evidence-scout" loop). When the
    /// round-1 answer ABSTAINS (sources lacked the fact), an LLM scout reads the
    /// round-1 evidence and emits targeted follow-up queries (acronym-expanded,
    /// exact-entity, predicate/date-specific); their results are scraped, unioned
    /// into the pool, and the answer is re-synthesized ONCE. Bounded (one extra
    /// round, capped follow-up queries) so worst-case stays within the request
    /// deadline. Only fires on abstention, so most queries keep the single-shot
    /// fast path. Recall-only + monotone-safe: a still-abstaining round-2 is
    /// discarded, keeping round-1. Targets "the answer page never entered the
    /// first pool" — the dominant remaining miss. Defaults to `false` (gated).
    #[serde(default)]
    pub multi_round: bool,
    /// Passage-level relevance gate for the LLM answer path: split each scraped
    /// source into passages and feed the answer LLM only the query-relevant
    /// ones (DeepSeek-scored, no new ML deps). Subtractive — removes noise, never
    /// adds sources or forces commits; falls back to the full source on any
    /// failure (byte-identical to off), so it is monotone-safe. Defaults to
    /// `false` (gated); answer prompt + plain path untouched.
    #[serde(default)]
    pub passage_select: bool,
    /// Page-2 fallback for the LLM answer / summarize path: if the reranked
    /// (junk-filtered, deduped) candidate pool comes back thinner than the
    /// answer needs (`< answer_top_n`), fetch the SAME query's SearXNG page 2
    /// once and union it in, then re-rank. The trigger is evaluated POST-rerank,
    /// so a junk-heavy first page does not suppress it; the extra fetch only
    /// fires on already-under-yielding queries (QPS never doubles across the
    /// corpus). Recall-only + abstention is untouched (a sparse page1+page2 pool
    /// still abstains). Defaults to `false` (gated); requires `rerank_enabled`.
    #[serde(default)]
    pub page2_fallback: bool,
    /// Calibrated answer path (gated): reduce recoverable OVER-abstentions by
    /// (a) feeding more sources to the answer LLM by default (top_n 5->8, so the
    /// answer in result #6-8 or behind a failed top-5 scrape still reaches it)
    /// and (b) swapping the answer prompt's abstention rule for an anti-hedge
    /// variant — commit when the sources DO contain the answer (even indirectly
    /// / one inference step), abstain ONLY when they genuinely lack it. The
    /// "use ONLY sources" grounding is untouched, so this is the precise inverse
    /// of the cycle-1 blunt "always commit" failure (which forced commits on
    /// no-source cases). Default false; A/B with an INCORRECT-guard before flip.
    #[serde(default)]
    pub answer_calibrated: bool,
    /// Moat-hardening abstention (gated). Appends a clause making the answer
    /// model (a) REJECT a false/unverifiable premise instead of answering as
    /// though it were true, (b) report when sources CONFLICT rather than picking
    /// one confidently, and (c) abstain when not confident. Targets the
    /// adversarial failure SealQA Seal-0 exposed: 32% confident-WRONG
    /// (hallucination) on conflicting-source / false-premise questions, where
    /// the "use ONLY sources" rule alone is insufficient. Complements (does not
    /// replace) `answer_calibrated`. Default false; A/B requires Seal-0
    /// hallucination DOWN with SimpleQA accuracy NOT regressed before flip.
    #[serde(default)]
    pub answer_guarded: bool,
    /// Use SearXNG structured sources (gated, W0). SearXNG's `infoboxes[]` /
    /// `answers[]` arrays carry Wikidata/Wikipedia knowledge-panel facts
    /// (entity attributes like religion/capital/director) that the `results[]`
    /// transform path discards. With this on, those facts are parsed and pinned
    /// as a high-trust source at the FRONT of the answer pool (still
    /// UNTRUSTED-wrapped — widens evidence, never bypasses the safety wrapper).
    /// Targets the obscure-entity recall gap (PopQA). Default false; A/B on
    /// diag500 gold-in-sources with the wrong-non-abstain invariant before flip.
    #[serde(default)]
    pub use_structured_sources: bool,
    /// Deterministic Wikidata entity-relation lookup (gated, W3). For
    /// `<relation> of <entity>` questions (PopQA's obscure long tail that web
    /// search can't surface), classify -> wbsearchentities -> property fetch and
    /// pin the fact as a structured source (UNTRUSTED-wrapped, runs in parallel
    /// with SearXNG, 3s-bounded, any error falls through). Free open data, no
    /// AI, no SPARQL hot-path. Default false; A/B on diag500 PopQA accuracy +
    /// the wrong-non-abstain invariant before flip.
    #[serde(default)]
    pub wikidata_lookup: bool,
    /// Snippet fallback for the LLM answer path (gated): when a top-N result's
    /// scrape failed (empty `markdown`), the result is normally dropped from the
    /// answer pool — if it was the answer-bearing page, crw abstains though
    /// retrieval succeeded (diagnosed Pattern A). With this on, such results
    /// fall back to their SearXNG `description` snippet as a thin source instead
    /// of vanishing. The snippet is verbatim upstream text, so it cannot inject
    /// a fact not already present — near-zero INCORRECT exposure. Default false.
    #[serde(default)]
    pub snippet_fallback: bool,
}
246
247impl Default for SearchConfig {
248    fn default() -> Self {
249        Self {
250            enabled: true,
251            searxng_url: None,
252            timeout_ms: default_search_timeout_ms(),
253            default_limit: default_search_limit(),
254            max_limit: default_search_max_limit(),
255            research_engines: default_research_engines(),
256            github_engines: default_github_engines(),
257            rerank_enabled: true,
258            query_expand: false,
259            query_expand_variants: default_query_expand_variants(),
260            multi_round: false,
261            passage_select: false,
262            page2_fallback: false,
263            answer_calibrated: false,
264            answer_guarded: false,
265            use_structured_sources: false,
266            wikidata_lookup: false,
267            snippet_fallback: false,
268        }
269    }
270}
271
/// Serde default for `SearchConfig::query_expand_variants`: a single rewrite.
fn default_query_expand_variants() -> usize {
    const SINGLE_VARIANT: usize = 1;
    SINGLE_VARIANT
}
/// Serde default for the `SearchConfig` switches that ship enabled
/// (`enabled`, `rerank_enabled`).
fn default_true_search() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
/// Serde default for `SearchConfig::timeout_ms`: 15 seconds, in milliseconds.
fn default_search_timeout_ms() -> u64 {
    const TIMEOUT_MS: u64 = 15 * 1_000;
    TIMEOUT_MS
}
/// Serde default for `SearchConfig::default_limit`.
fn default_search_limit() -> u32 {
    const LIMIT: u32 = 5;
    LIMIT
}
/// Serde default for `SearchConfig::max_limit`.
fn default_search_max_limit() -> u32 {
    const MAX_LIMIT: u32 = 20;
    MAX_LIMIT
}
/// Serde default for `SearchConfig::research_engines`, in invocation order.
fn default_research_engines() -> Vec<String> {
    let engine_names = ["arxiv", "crossref", "google scholar", "semantic scholar"];
    engine_names
        .iter()
        .map(|engine| String::from(*engine))
        .collect()
}
/// Serde default for `SearchConfig::github_engines`.
fn default_github_engines() -> Vec<String> {
    let mut engines = Vec::with_capacity(1);
    engines.push(String::from("github"));
    engines
}
298
/// Per-request defaults that apply to every scrape, crawl, or map call when
/// the caller does not specify an override. Currently only governs the
/// end-to-end deadline budget (see `crw-core/src/deadline.rs`).
#[derive(Debug, Clone, Deserialize)]
pub struct RequestConfig {
    /// Default end-to-end deadline budget in milliseconds when a request does
    /// not specify `deadlineMs`. Defaults to `8000`. The SLO p95 latency
    /// metric is computed only over requests with `deadline_ms <= 8000`;
    /// longer values land in a separate slow-path histogram.
    #[serde(default = "default_deadline_ms")]
    pub deadline_ms_default: u64,
    /// When `true` (default), an implicit deadline (no per-request `deadlineMs`)
    /// is auto-extended to `max(deadline_ms_default, ladder_min)` where
    /// `ladder_min = sum(http+lightpanda+chrome timeouts) + N_cdp_tiers * 28s`
    /// (the 28 s term is `CDP_TIER_OVERHEAD_MS`).
    /// This prevents `chrome_timeout_ms = 30000` from appearing inert when
    /// `deadline_ms_default` is small (issue #35).
    ///
    /// Set to `false` to enforce a strict SLO regardless of tier sizing —
    /// requests that would have completed under the extended budget will
    /// instead time out at `deadline_ms_default`.
    #[serde(default = "default_true_request")]
    pub auto_extend_deadline_for_ladder: bool,
}
322
323impl Default for RequestConfig {
324    fn default() -> Self {
325        Self {
326            deadline_ms_default: default_deadline_ms(),
327            auto_extend_deadline_for_ladder: true,
328        }
329    }
330}
331
/// Serde default for `RequestConfig::auto_extend_deadline_for_ladder`.
fn default_true_request() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
335
/// Serde default for `RequestConfig::deadline_ms_default`: 8 seconds, in
/// milliseconds.
fn default_deadline_ms() -> u64 {
    const DEADLINE_MS: u64 = 8_000;
    DEADLINE_MS
}
339
/// `server` section. Every field has a serde default, so the whole section
/// (or any individual key) may be omitted.
#[derive(Debug, Clone, Deserialize)]
pub struct ServerConfig {
    /// Host/address string. Defaults to `"0.0.0.0"`.
    #[serde(default = "default_host")]
    pub host: String,
    /// Port number. Defaults to `3000`.
    #[serde(default = "default_port")]
    pub port: u16,
    /// Request timeout in seconds. Defaults to `60`.
    #[serde(default = "default_request_timeout")]
    pub request_timeout_secs: u64,
    /// Maximum requests per second (global). 0 = unlimited. Defaults to `10`.
    #[serde(default = "default_rate_limit_rps")]
    pub rate_limit_rps: u64,
}
352
353impl Default for ServerConfig {
354    fn default() -> Self {
355        Self {
356            host: default_host(),
357            port: default_port(),
358            request_timeout_secs: default_request_timeout(),
359            rate_limit_rps: default_rate_limit_rps(),
360        }
361    }
362}
363
/// Serde default for `ServerConfig::rate_limit_rps` (requests per second).
fn default_rate_limit_rps() -> u64 {
    const REQUESTS_PER_SECOND: u64 = 10;
    REQUESTS_PER_SECOND
}
367
/// Serde default for `ServerConfig::host`.
fn default_host() -> String {
    String::from("0.0.0.0")
}
/// Serde default for `ServerConfig::port`.
fn default_port() -> u16 {
    const PORT: u16 = 3_000;
    PORT
}
/// Serde default for `ServerConfig::request_timeout_secs`.
fn default_request_timeout() -> u64 {
    const TIMEOUT_SECS: u64 = 60;
    TIMEOUT_SECS
}
377
/// Selects which JS renderer(s) the [`FallbackRenderer`] will build.
///
/// - `Auto` (default): try every configured CDP endpoint (Lightpanda, Playwright, Chrome)
///   in order. If none is configured, JS rendering is disabled but HTTP still works.
/// - `None`: HTTP-only. Never attempt JS rendering.
/// - `Lightpanda` / `Chrome` / `Playwright`: require the matching `[renderer.<name>]`
///   endpoint; fail startup if missing. Only the named backend is used.
///
/// Variant names are deserialized in lowercase (`"auto"`, `"none"`,
/// `"lightpanda"`, `"chrome"`, `"playwright"`).
///
/// [`FallbackRenderer`]: https://docs.rs/crw-renderer/latest/crw_renderer/struct.FallbackRenderer.html
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum RendererMode {
    /// Use whichever CDP endpoints are configured. This is the default.
    #[default]
    Auto,
    /// HTTP-only; JS rendering is never attempted.
    None,
    /// Use only the Lightpanda endpoint.
    Lightpanda,
    /// Use only the Chrome endpoint.
    Chrome,
    /// Use only the Playwright endpoint.
    Playwright,
}
397
/// `renderer` section: renderer selection, per-tier timeouts, CDP endpoints,
/// Chrome interception/pool knobs, escalation and anti-bot policy, and
/// residential-proxy credentials. Every field has a serde default.
#[derive(Debug, Clone, Deserialize)]
pub struct RendererConfig {
    /// Which renderer backend(s) to build — see [`RendererMode`].
    /// Defaults to `RendererMode::Auto`.
    #[serde(default)]
    pub mode: RendererMode,
    /// Generic per-page navigation timeout in milliseconds (default `30000`).
    /// Used as the fallback when no per-tier override is configured. Kept for
    /// backward compatibility — the per-tier knobs below are preferred for
    /// new deployments.
    #[serde(default = "default_page_timeout")]
    pub page_timeout_ms: u64,
    /// Override for the HTTP-only fetcher request timeout. Falls back to
    /// `page_timeout_ms` when unset. HTTP responses arrive quickly when they
    /// arrive at all, so 15s is generous and keeps slow upstreams from
    /// hogging the request budget that should be spent on JS retries.
    #[serde(default)]
    pub http_timeout_ms: Option<u64>,
    /// Override for the LightPanda CDP renderer. Falls back to
    /// `page_timeout_ms` when unset. LightPanda completes most renders in
    /// <10s; if it stalls past 20s it almost always means an adversarial page
    /// that Chrome will render anyway, so failing fast and escalating beats
    /// waiting it out.
    #[serde(default)]
    pub lightpanda_timeout_ms: Option<u64>,
    /// Override for the full-Chromium tier. Falls back to `page_timeout_ms`
    /// when unset. Chrome is the slow path (gov/legal SPAs need 30–40s for
    /// `networkidle`); the larger budget here recovers ~6 URLs per fc-wins
    /// iteration without affecting the fast path.
    #[serde(default)]
    pub chrome_timeout_ms: Option<u64>,
    /// Renderer pool size. The default comes from `default_pool_size`, which
    /// is defined outside this view.
    /// NOTE(review): what exactly is pooled is not visible here — confirm
    /// against the renderer crate.
    #[serde(default = "default_pool_size")]
    pub pool_size: usize,
    /// If set, applies to every request that doesn't specify `renderJs` explicitly.
    /// `Some(true)` = force JS rendering; `Some(false)` = skip JS; `None` = auto-detect.
    ///
    /// Accepts the `force_js` alias for backward compatibility.
    #[serde(default, alias = "force_js")]
    pub render_js_default: Option<bool>,
    /// `[renderer.lightpanda]` CDP endpoint; `None` when not configured.
    #[serde(default)]
    pub lightpanda: Option<CdpEndpoint>,
    /// `[renderer.playwright]` CDP endpoint; `None` when not configured.
    #[serde(default)]
    pub playwright: Option<CdpEndpoint>,
    /// `[renderer.chrome]` CDP endpoint; `None` when not configured.
    #[serde(default)]
    pub chrome: Option<CdpEndpoint>,
    /// Residential-proxy Chrome tier (opt-in 4th renderer). Same Chromium
    /// browser as `chrome`, but egress routed through a forwarder that adds
    /// upstream proxy auth (e.g. DataImpulse). Tried after Chrome fails —
    /// covers IP-blocked targets where the browser fingerprint is fine but
    /// the VPS egress IP is flagged.
    #[serde(default)]
    pub chrome_proxy: Option<CdpEndpoint>,
    /// Per-tier nav timeout override for `chrome_proxy`. When unset, defaults
    /// to `chrome_timeout() + 15_000` — the proxy hop adds latency, so the
    /// fallback tier needs more headroom than direct Chrome.
    #[serde(default)]
    pub chrome_proxy_timeout_ms: Option<u64>,
    /// Enable Chrome resource interception (`Fetch.enable` blocking of media,
    /// fonts, trackers). Default `false`; flipped after the CDP-fake suite
    /// validates pump + cleanup behaviour. See plan Phase 2.
    #[serde(default)]
    pub chrome_intercept_resources: bool,
    /// Additionally block `stylesheet` requests when interception is enabled.
    /// Default `false` — kept off in v1 because some extractors depend on
    /// CSS-driven visibility / lazy-content triggers.
    #[serde(default)]
    pub chrome_intercept_stylesheets: bool,
    /// Per-host opt-out for chrome interception. Hosts in this list run with
    /// interception disabled even when `chrome_intercept_resources = true`.
    /// Default: empty.
    #[serde(default)]
    pub chrome_host_intercept_disable: Vec<String>,
    /// Hard chrome-tier navigation budget in ms (default `12_000`). Wraps
    /// `wait_for_page_ready` in an inner race; on budget hit the renderer
    /// snapshots whatever DOM is present and returns `truncated = true`.
    /// Calibrated as `p90(successful chrome renders)` clamped to
    /// `[8_000, 12_000]`.
    #[serde(default = "default_chrome_nav_budget_ms")]
    pub chrome_nav_budget_ms: u64,
    /// Enable the bounded browser-context pool. Default `false`; v1 ships
    /// `RECYCLE_AFTER_NAV = 1` (recreate every release) before optimising to
    /// reuse-with-clearing. See plan Phase 4. **Gated off when
    /// `chrome_backend = "browserless"`** — browserless v2's
    /// `Target.createBrowserContext` semantics with long-lived sessions are
    /// unproven; lib.rs forces this to `false` with a WARN log in that case.
    #[serde(default)]
    pub chrome_context_pool_enabled: bool,
    /// Per-knob pool configuration. Read only when
    /// `chrome_context_pool_enabled = true` AND backend is `Vanilla`.
    #[serde(default)]
    pub chrome_pool: ChromePoolConfig,
    /// Which Chrome backend the WS URL points at. **Explicit** — never sniff
    /// from URL substrings (k8s svc names, port-forwards, custom routes break
    /// substring detection per plan §C2). Default `Vanilla`.
    #[serde(default)]
    pub chrome_backend: ChromeBackend,
    /// Enable the success-ratio renderer predictor in `HostPreferences`.
    /// Default `false`; flipped after the predictor replay harness gates
    /// on the 1k bench (false-skip < 2 %, false-escalate < 5 %, churn < 3 / 1k).
    #[serde(default)]
    pub use_predictor: bool,
    /// Engine escalation policy (firecrawl-shaped: race + on-error). When
    /// disabled (default), the renderer keeps its current ladder unchanged.
    #[serde(default)]
    pub escalation: EscalationConfig,
    /// Anti-bot detection policy (crawl4ai 3-tier classifier).
    #[serde(default)]
    pub antibot: AntibotConfig,
    /// DataImpulse residential-proxy base username (without `__cr.<cc>`
    /// country suffix). When set alongside [`proxy_base_pass`], the engine
    /// drives Chrome's proxy auth via CDP `Fetch.authRequired` and composes
    /// the country-suffixed username per request. Read only by the
    /// `chrome_proxy` tier. None = no upstream proxy auth (chrome_proxy
    /// tier still functional only if a no-auth or pre-authed proxy is in
    /// front of Chrome).
    #[serde(default)]
    pub proxy_base_user: Option<String>,
    /// DataImpulse base password — see [`proxy_base_user`].
    #[serde(default)]
    pub proxy_base_pass: Option<String>,
    /// Fallback country code used when a request omits `country`. Lowercased
    /// 2-letter ISO 3166-1 alpha-2 (e.g. "us"). None = global pool (no suffix).
    #[serde(default)]
    pub proxy_default_country: Option<String>,
}
516
/// Engine escalation policy — adds `ChromeStealth` and `ChromeStealthProxy`
/// tiers behind a feature flag. See `plans/recall-next-tier.md` Phase 2.
#[derive(Debug, Clone, Deserialize)]
pub struct EscalationConfig {
    /// Master switch. Default `false` — current ladder runs unchanged.
    #[serde(default)]
    pub enabled: bool,
    /// Per-tier waterfall trigger in ms (default `8_000`). If the current
    /// engine hasn't returned after this long, the next tier is started in
    /// parallel (firecrawl `WaterfallNextEngineSignal`).
    #[serde(default = "default_waterfall_timeout_ms")]
    pub waterfall_timeout_ms: u64,
    /// Hard global cap across the whole ladder, in ms (default `60_000`).
    #[serde(default = "default_escalation_global_timeout_ms")]
    pub global_timeout_ms: u64,
    /// Send `?proxy=residential&proxyCountry=…` to browserless on the
    /// `ChromeStealthProxy` tier. Off by default — bears cost.
    #[serde(default)]
    pub residential_proxy: bool,
    /// Country code passed to browserless when `residential_proxy = true`.
    /// Defaults to `"us"`.
    #[serde(default = "default_proxy_country")]
    pub proxy_country: String,
}
540
541impl Default for EscalationConfig {
542    fn default() -> Self {
543        Self {
544            enabled: false,
545            waterfall_timeout_ms: default_waterfall_timeout_ms(),
546            global_timeout_ms: default_escalation_global_timeout_ms(),
547            residential_proxy: false,
548            proxy_country: default_proxy_country(),
549        }
550    }
551}
552
/// Serde default for `EscalationConfig::waterfall_timeout_ms`: 8 seconds, in
/// milliseconds.
fn default_waterfall_timeout_ms() -> u64 {
    const WATERFALL_MS: u64 = 8 * 1_000;
    WATERFALL_MS
}
/// Serde default for `EscalationConfig::global_timeout_ms`: 60 seconds, in
/// milliseconds.
fn default_escalation_global_timeout_ms() -> u64 {
    const GLOBAL_CAP_MS: u64 = 60 * 1_000;
    GLOBAL_CAP_MS
}
/// Serde default for `EscalationConfig::proxy_country`.
fn default_proxy_country() -> String {
    String::from("us")
}
562
/// Anti-bot classifier policy. Default: detect+log only; escalation requires
/// `escalate_on_signal = true` AND `escalation.enabled = true`.
#[derive(Debug, Clone, Deserialize)]
pub struct AntibotConfig {
    /// Run the classifier on every fetch result. Cheap; default on.
    #[serde(default = "default_true")]
    pub enabled: bool,
    /// When the classifier returns a non-`None` signal, advance to the next
    /// engine tier (requires `escalation.enabled`). Default `false`.
    #[serde(default)]
    pub escalate_on_signal: bool,
    /// When the classifier flags a block during the renderer failover loop,
    /// treat the result as a soft failure so the loop advances to the next
    /// tier — ending at `chrome_proxy` (residential). Default `true`. Set
    /// `false` to keep the classifier running (error_code + telemetry) while
    /// disabling in-loop escalation — the one-line kill switch.
    #[serde(default = "default_true")]
    pub escalate_in_failover: bool,
}
582
583impl Default for AntibotConfig {
584    fn default() -> Self {
585        Self {
586            enabled: true,
587            escalate_on_signal: false,
588            escalate_in_failover: true,
589        }
590    }
591}
592
/// Serde default for `RendererConfig::chrome_nav_budget_ms`: 12 seconds, in
/// milliseconds.
fn default_chrome_nav_budget_ms() -> u64 {
    const NAV_BUDGET_MS: u64 = 12 * 1_000;
    NAV_BUDGET_MS
}
596
/// Per-knob configuration for the bounded browser-context pool. Loaded under
/// `[renderer.chrome_pool]`. Inactive unless
/// `chrome_context_pool_enabled = true` AND `chrome_backend = "vanilla"`.
#[derive(Debug, Clone, Deserialize)]
pub struct ChromePoolConfig {
    /// Pool size. `None` (the default) → `max(2, num_cpus / 2)`. Caps
    /// simultaneous in-flight chrome requests per pool.
    #[serde(default)]
    pub size: Option<usize>,
    /// Recycle policy: v1 always recreates the context after each release.
    /// Reserved for a future "reuse N navigations then recreate" mode.
    /// Defaults to `1`.
    #[serde(default = "default_recycle_after_navs")]
    pub recycle_after_navs: u32,
    /// Idle slots older than this are health-checked on next acquire.
    /// In seconds; defaults to `300`.
    #[serde(default = "default_idle_timeout_secs")]
    pub idle_timeout_secs: u64,
    /// `Browser.getVersion` probe deadline (idle-slot liveness).
    /// In seconds; defaults to `60`.
    #[serde(default = "default_health_check_secs")]
    pub health_check_secs: u64,
    /// SIGTERM drain window before phase 3 force-close.
    /// In seconds; defaults to `30`.
    #[serde(default = "default_shutdown_drain_secs")]
    pub shutdown_drain_secs: u64,
}
620
621impl Default for ChromePoolConfig {
622    fn default() -> Self {
623        Self {
624            size: None,
625            recycle_after_navs: default_recycle_after_navs(),
626            idle_timeout_secs: default_idle_timeout_secs(),
627            health_check_secs: default_health_check_secs(),
628            shutdown_drain_secs: default_shutdown_drain_secs(),
629        }
630    }
631}
632
/// Serde default for `ChromePoolConfig::recycle_after_navs`.
fn default_recycle_after_navs() -> u32 {
    const NAVS_PER_CONTEXT: u32 = 1;
    NAVS_PER_CONTEXT
}
/// Serde default for `ChromePoolConfig::idle_timeout_secs` (five minutes).
fn default_idle_timeout_secs() -> u64 {
    const IDLE_SECS: u64 = 5 * 60;
    IDLE_SECS
}
/// Serde default for `ChromePoolConfig::health_check_secs`.
fn default_health_check_secs() -> u64 {
    const PROBE_SECS: u64 = 60;
    PROBE_SECS
}
/// Serde default for `ChromePoolConfig::shutdown_drain_secs`.
fn default_shutdown_drain_secs() -> u64 {
    const DRAIN_SECS: u64 = 30;
    DRAIN_SECS
}
645
/// Chrome backend kind. Set explicitly under `[renderer]` as
/// `chrome_backend = "vanilla"` or `chrome_backend = "browserless"`. **Never
/// inferred from URL substrings** — k8s service names, port-forwards, and
/// custom routes break substring detection. See plan §C2.
///
/// Variant names are deserialized in lowercase; `Vanilla` is the default.
#[derive(Debug, Clone, Copy, Default, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
pub enum ChromeBackend {
    /// chromedp/headless-shell or vanilla Chrome with `/json/version`. Pool
    /// is enabled here when `chrome_context_pool_enabled = true`.
    #[default]
    Vanilla,
    /// Browserless v2 / commercial CDP endpoint. Pool is **gated off** in v1
    /// — see plan §"Out of scope (v1)".
    Browserless,
}
661
662impl Default for RendererConfig {
663    fn default() -> Self {
664        Self {
665            mode: RendererMode::default(),
666            page_timeout_ms: default_page_timeout(),
667            http_timeout_ms: None,
668            lightpanda_timeout_ms: None,
669            chrome_timeout_ms: None,
670            pool_size: default_pool_size(),
671            render_js_default: None,
672            lightpanda: None,
673            playwright: None,
674            chrome: None,
675            chrome_proxy: None,
676            chrome_proxy_timeout_ms: None,
677            chrome_intercept_resources: false,
678            chrome_intercept_stylesheets: false,
679            chrome_host_intercept_disable: Vec::new(),
680            chrome_nav_budget_ms: default_chrome_nav_budget_ms(),
681            chrome_context_pool_enabled: false,
682            chrome_pool: ChromePoolConfig::default(),
683            chrome_backend: ChromeBackend::default(),
684            use_predictor: false,
685            escalation: EscalationConfig::default(),
686            antibot: AntibotConfig::default(),
687            proxy_base_user: None,
688            proxy_base_pass: None,
689            proxy_default_country: None,
690        }
691    }
692}
/// Serde default for `RendererConfig::page_timeout_ms`: 30 seconds, in
/// milliseconds.
fn default_page_timeout() -> u64 {
    const PAGE_TIMEOUT_MS: u64 = 30_000;
    PAGE_TIMEOUT_MS
}
696
697impl RendererConfig {
698    /// Resolved per-tier nav timeout in milliseconds. Resolution rules:
699    ///   1. If the explicit per-tier field is set, use it verbatim.
700    ///   2. Otherwise fall back to `page_timeout_ms` (which itself defaults
701    ///      to 30s for backward compatibility with pre-multi-tier configs).
702    ///
703    /// New deployments are encouraged to set the per-tier knobs to 15/20/45s
704    /// (see config.docker.toml) — these match the bench-tuned values that
705    /// recover slow gov sites in the chrome tier without giving the http
706    /// tier permission to hog the request budget.
707    pub fn http_timeout(&self) -> u64 {
708        self.http_timeout_ms.unwrap_or(self.page_timeout_ms)
709    }
710    pub fn lightpanda_timeout(&self) -> u64 {
711        self.lightpanda_timeout_ms.unwrap_or(self.page_timeout_ms)
712    }
713    pub fn chrome_timeout(&self) -> u64 {
714        self.chrome_timeout_ms.unwrap_or(self.page_timeout_ms)
715    }
716    pub fn chrome_proxy_timeout(&self) -> u64 {
717        self.chrome_proxy_timeout_ms
718            .unwrap_or_else(|| self.chrome_timeout().saturating_add(15_000))
719    }
720
721    /// Compose the DataImpulse-style proxy credentials for a single request.
722    ///
723    /// Resolution order for the country suffix:
724    /// 1. `country` argument (per-request override)
725    /// 2. `self.proxy_default_country` (server default)
726    /// 3. No suffix → DataImpulse global pool
727    ///
728    /// Returns `None` when no base credentials are configured — caller treats
729    /// this as "no auth required". An invalid country code (wrong length,
730    /// non-alphabetic) silently falls through to the default; that keeps a
731    /// malformed `?country=` query from creating an unauthenticated request
732    /// while still letting through a well-known default.
733    pub fn effective_proxy_credentials(&self, country: Option<&str>) -> Option<(String, String)> {
734        let user = self.proxy_base_user.as_ref()?;
735        let pass = self.proxy_base_pass.as_ref()?;
736        let cc = country
737            .or(self.proxy_default_country.as_deref())
738            .map(|s| s.trim().to_lowercase())
739            .filter(|s| s.len() == 2 && s.chars().all(|c| c.is_ascii_alphabetic()));
740        Some(match cc {
741            Some(cc) => (format!("{user}__cr.{cc}"), pass.clone()),
742            None => (user.clone(), pass.clone()),
743        })
744    }
745
746    /// Number of active CDP tiers (lightpanda + playwright + chrome) under
747    /// the current `mode`. Mirrors the predicate used at runtime in
748    /// `crw-renderer/src/lib.rs` when constructing the renderer ladder:
749    /// `want(mode) && config.<tier>.is_some()`.
750    ///
751    /// Returns `0` when the binary is built without the `cdp` feature — in
752    /// that case no JS renderer can be constructed regardless of the config,
753    /// so the deadline auto-extension policy must collapse to HTTP-only.
754    pub fn cdp_tier_count(&self) -> usize {
755        if !cfg!(feature = "cdp") {
756            return 0;
757        }
758        let want =
759            |m: RendererMode| -> bool { matches!(self.mode, RendererMode::Auto) || self.mode == m };
760        let mut n = 0;
761        if want(RendererMode::Lightpanda) && self.lightpanda.is_some() {
762            n += 1;
763        }
764        if want(RendererMode::Playwright) && self.playwright.is_some() {
765            n += 1;
766        }
767        if want(RendererMode::Chrome) && self.chrome.is_some() {
768            n += 1;
769        }
770        n
771    }
772
773    /// Minimum request deadline budget (ms) required so that every configured
774    /// tier can use its full allowance when fallback exhausts the chain.
775    /// Sums the per-tier timeouts and adds [`CDP_TIER_OVERHEAD_MS`] for each
776    /// active CDP tier, matching the runtime ladder built in
777    /// `crw-renderer/src/lib.rs`.
778    pub fn min_deadline_for_full_ladder_ms(&self) -> u64 {
779        let want =
780            |m: RendererMode| -> bool { matches!(self.mode, RendererMode::Auto) || self.mode == m };
781
782        let mut sum: u64 = 0;
783        // HTTP prefetch runs ahead of any JS tier (content-type sniffing,
784        // direct PDF/binary handling) regardless of pinned mode. Skipped only
785        // when mode is `None` (no fetching at all).
786        if !matches!(self.mode, RendererMode::None) {
787            sum = sum.saturating_add(self.http_timeout());
788        }
789
790        // CDP tiers only contribute when the binary was built with the `cdp`
791        // feature; otherwise no JS renderer is constructable at runtime and
792        // including their budgets would over-extend the deadline.
793        if !cfg!(feature = "cdp") {
794            return sum;
795        }
796
797        let mut cdp_tier_count: u64 = 0;
798        if want(RendererMode::Lightpanda) && self.lightpanda.is_some() {
799            sum = sum.saturating_add(self.lightpanda_timeout());
800            cdp_tier_count += 1;
801        }
802        if want(RendererMode::Playwright) && self.playwright.is_some() {
803            sum = sum.saturating_add(self.chrome_timeout());
804            cdp_tier_count += 1;
805        }
806        if want(RendererMode::Chrome) && self.chrome.is_some() {
807            sum = sum.saturating_add(self.chrome_timeout());
808            cdp_tier_count += 1;
809        }
810        sum.saturating_add(cdp_tier_count.saturating_mul(CDP_TIER_OVERHEAD_MS))
811    }
812}
/// Fallback for `RendererConfig::pool_size`.
fn default_pool_size() -> usize {
    const DEFAULT_POOL_SIZE: usize = 4;
    DEFAULT_POOL_SIZE
}
816
/// Connection settings for one CDP renderer endpoint. `RendererConfig` holds
/// one of these per tier (e.g. its `lightpanda` and `chrome` fields).
#[derive(Debug, Clone, Deserialize)]
pub struct CdpEndpoint {
    /// URL of the endpoint, e.g. `ws://chrome:9222`. Required: there is no
    /// serde default, so a tier table without `ws_url` fails to deserialize.
    // NOTE(review): presumably always a WebSocket (`ws://` / `wss://`) URL;
    // the scheme is not validated here.
    pub ws_url: String,
}
821
/// Stealth mode configuration for evading bot detection.
///
/// Embedded in [`CrawlerConfig`] as its `stealth` field. Every field has a
/// serde default, so the table may be omitted or only partially specified.
#[derive(Debug, Clone, Deserialize)]
pub struct StealthConfig {
    /// Enable stealth mode globally. Default: `false`.
    #[serde(default)]
    pub enabled: bool,
    /// Custom user-agent pool. Empty = use built-in pool. Default: empty.
    // NOTE(review): the built-in pool is presumably `BUILTIN_UA_POOL`;
    // confirm against the consumer of this field.
    #[serde(default)]
    pub user_agents: Vec<String>,
    /// Jitter factor for rate limiting (0.0–1.0, default 0.2 = ±20%).
    // NOTE(review): the 0.0–1.0 range is not enforced in this file.
    #[serde(default = "default_jitter")]
    pub jitter_factor: f64,
    /// Inject realistic browser headers (Accept, Sec-Fetch-*, etc.).
    /// Default: `true`.
    #[serde(default = "default_true")]
    pub inject_headers: bool,
}
838
839impl Default for StealthConfig {
840    fn default() -> Self {
841        Self {
842            enabled: false,
843            user_agents: vec![],
844            jitter_factor: default_jitter(),
845            inject_headers: true,
846        }
847    }
848}
849
/// Fallback for `StealthConfig::jitter_factor` (0.2, documented as ±20%).
fn default_jitter() -> f64 {
    0.2_f64
}
853
/// Built-in realistic user-agent pool used when stealth is enabled.
///
/// Five desktop entries: Chrome 131 on Windows, macOS and Linux, Firefox 133
/// on Windows, and Safari 18.2 on macOS. The macOS Chrome entry is the same
/// string that `default_ua()` returns.
// NOTE(review): the browser versions are hard-coded and will age; refresh
// them together with `default_ua()`.
pub const BUILTIN_UA_POOL: &[&str] = &[
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15",
];
862
/// `[crawler]` section — crawl limits, politeness and identity settings.
/// Every field has a serde default, so the section may be omitted or only
/// partially specified.
#[derive(Debug, Clone, Deserialize)]
pub struct CrawlerConfig {
    /// Concurrency limit. Default: `10`. Also used by
    /// `AppConfig::effective_request_timeout_secs` to size the worst-case
    /// number of search-enrichment batches.
    #[serde(default = "default_concurrency")]
    pub max_concurrency: usize,
    /// Request rate limit. Default: `10.0`.
    // NOTE(review): presumably a global (not per-host) rate; confirm against
    // the limiter that consumes it.
    #[serde(default = "default_rps")]
    pub requests_per_second: f64,
    /// Whether robots.txt is honored. Default: `true`.
    #[serde(default = "default_true")]
    pub respect_robots_txt: bool,
    /// `User-Agent` string. Default: a desktop Chrome 131 UA (see
    /// `default_ua`).
    #[serde(default = "default_ua")]
    pub user_agent: String,
    /// Crawl depth used when a request does not specify one. Default: `2`.
    #[serde(default = "default_depth")]
    pub default_max_depth: u32,
    /// Page cap used when a request does not specify one. Default: `100`.
    #[serde(default = "default_max_pages")]
    pub default_max_pages: u32,
    /// Proxy URL for crawler requests. Supports HTTP, HTTPS, and SOCKS5
    /// (e.g. "http://proxy:8080" or "socks5://user:pass@proxy:1080").
    #[serde(default)]
    pub proxy: Option<String>,
    /// TTL in seconds for completed crawl jobs before cleanup (default: 3600)
    #[serde(default = "default_job_ttl")]
    pub job_ttl_secs: u64,
    /// Stealth settings; defaults to `StealthConfig::default()` (disabled).
    #[serde(default)]
    pub stealth: StealthConfig,
    /// Floor for the per-host limiter interval, in milliseconds. When a host
    /// advertises `Crawl-delay` in robots.txt, the higher of the two wins.
    /// Default `0` — robots.txt is the authoritative source, this is a
    /// per-deployment safety net.
    #[serde(default)]
    pub per_host_min_interval_ms: u64,
    /// Maximum concurrent in-flight requests against a single eTLD+1.
    /// Default `1` — strict ethics posture; operators raise consciously via
    /// config when scraping their own infrastructure.
    #[serde(default = "default_per_host_max_concurrent")]
    pub per_host_max_concurrent: u32,
}
898
/// Fallback for `CrawlerConfig::per_host_max_concurrent`: one in-flight
/// request per host.
fn default_per_host_max_concurrent() -> u32 {
    1_u32
}
902
903impl Default for CrawlerConfig {
904    fn default() -> Self {
905        Self {
906            max_concurrency: default_concurrency(),
907            requests_per_second: default_rps(),
908            respect_robots_txt: true,
909            user_agent: default_ua(),
910            default_max_depth: default_depth(),
911            default_max_pages: default_max_pages(),
912            proxy: None,
913            job_ttl_secs: default_job_ttl(),
914            stealth: StealthConfig::default(),
915            per_host_min_interval_ms: 0,
916            per_host_max_concurrent: default_per_host_max_concurrent(),
917        }
918    }
919}
920
/// Fallback for `CrawlerConfig::max_concurrency`.
fn default_concurrency() -> usize {
    const DEFAULT_MAX_CONCURRENCY: usize = 10;
    DEFAULT_MAX_CONCURRENCY
}
/// Fallback for `CrawlerConfig::requests_per_second`.
fn default_rps() -> f64 {
    10.0_f64
}
/// Serde default helper for boolean fields that are on unless configured
/// otherwise (e.g. `CrawlerConfig::respect_robots_txt`,
/// `StealthConfig::inject_headers`). `#[serde(default)]` alone would yield
/// `false`.
fn default_true() -> bool {
    true
}
/// Fallback for `CrawlerConfig::user_agent`: a modern desktop Chrome UA.
///
/// The legacy "CRW/0.1" was rejected by UA-filtering sites (opencorporates,
/// killeenisd, wsj) returning 403/404. Keep this in sync with the Sec-Ch-Ua
/// client hint in `crw-renderer/src/http_only.rs`.
fn default_ua() -> String {
    concat!(
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 ",
        "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    )
    .to_owned()
}
/// Fallback for `CrawlerConfig::default_max_depth`.
fn default_depth() -> u32 {
    2_u32
}
/// Fallback for `CrawlerConfig::default_max_pages`.
fn default_max_pages() -> u32 {
    100_u32
}
/// Fallback for `CrawlerConfig::job_ttl_secs`: one hour, in seconds.
fn default_job_ttl() -> u64 {
    60 * 60
}
947
/// `[extraction]` section — output format, content narrowing, optional LLM
/// settings and the thresholds that trigger renderer escalation. Every field
/// has a serde default.
#[derive(Debug, Clone, Deserialize)]
pub struct ExtractionConfig {
    /// Output format used when a request does not specify one.
    /// Default: `"markdown"`.
    #[serde(default = "default_format")]
    pub default_format: String,
    /// Default: `true`.
    // NOTE(review): presumably restricts output to the page's main content
    // (readability-style narrowing); confirm against the extractor.
    #[serde(default = "default_true_ext")]
    pub only_main_content: bool,
    /// LLM provider settings; `None` (the default) when no `llm` table is
    /// configured.
    #[serde(default)]
    pub llm: Option<LlmConfig>,
    /// Hostname → CSS selector overrides applied before readability narrowing.
    /// Match is exact host (no wildcard); user-supplied selector still wins.
    #[serde(default)]
    pub domain_selectors: std::collections::HashMap<String, String>,
    /// LLM fallback policy; defaults to `LlmFallbackConfig::default()`
    /// (disabled).
    #[serde(default)]
    pub llm_fallback: LlmFallbackConfig,
    /// Bytes below which an HTTP-tier extraction is treated as "thin"
    /// and triggers a JS-renderer escalation. Default 100.
    #[serde(default = "default_http_retry_threshold")]
    pub http_retry_threshold_bytes: usize,
    /// Bytes below which a LightPanda-tier extraction is treated as
    /// "thin" and triggers a Chrome escalation. Default 2000 (LP often
    /// returns SPA husks of 90–500B that pass HTML-shape checks).
    #[serde(default = "default_lightpanda_retry_threshold")]
    pub lightpanda_retry_threshold_bytes: usize,
}
972
/// Fallback for `ExtractionConfig::http_retry_threshold_bytes`.
fn default_http_retry_threshold() -> usize {
    100_usize
}
976
/// Fallback for `ExtractionConfig::lightpanda_retry_threshold_bytes`.
fn default_lightpanda_retry_threshold() -> usize {
    2_000
}
980
981impl Default for ExtractionConfig {
982    fn default() -> Self {
983        Self {
984            default_format: default_format(),
985            only_main_content: true,
986            llm: None,
987            domain_selectors: std::collections::HashMap::new(),
988            llm_fallback: LlmFallbackConfig::default(),
989            http_retry_threshold_bytes: default_http_retry_threshold(),
990            lightpanda_retry_threshold_bytes: default_lightpanda_retry_threshold(),
991        }
992    }
993}
994
/// LLM fallback policy, embedded in [`ExtractionConfig`] as `llm_fallback`.
/// Every field has a serde default.
#[derive(Debug, Clone, Deserialize)]
pub struct LlmFallbackConfig {
    /// Master switch for the LLM fallback. Default: `false`.
    #[serde(default)]
    pub enable: bool,
    /// Score below which DOM-based extraction is considered poor enough to
    /// invoke the LLM. Default: `0.3`.
    #[serde(default = "default_llm_quality_threshold")]
    pub quality_threshold: f32,
    /// Default: `100_000`.
    // NOTE(review): presumably the byte cap on HTML handed to the LLM;
    // confirm against the fallback implementation.
    #[serde(default = "default_llm_max_html_bytes")]
    pub max_html_bytes: usize,
    /// When true (and `enable` is true), invoke the LLM on every page rather
    /// than only when DOM-based extraction scores below `quality_threshold`.
    /// Mirrors the "LLM as primary extractor" pattern used by Reader-LM,
    /// Firecrawl, and similar services. Higher cost, higher recall.
    #[serde(default)]
    pub always_run: bool,
}
1010
1011impl Default for LlmFallbackConfig {
1012    fn default() -> Self {
1013        Self {
1014            enable: false,
1015            quality_threshold: default_llm_quality_threshold(),
1016            max_html_bytes: default_llm_max_html_bytes(),
1017            always_run: false,
1018        }
1019    }
1020}
1021
/// Fallback for `LlmFallbackConfig::quality_threshold`.
fn default_llm_quality_threshold() -> f32 {
    0.3_f32
}
/// Fallback for `max_html_bytes` on both `LlmFallbackConfig` and
/// `LlmConfig`: 100 000 bytes.
fn default_llm_max_html_bytes() -> usize {
    100 * 1_000
}
1028
/// LLM provider settings, carried by `ExtractionConfig::llm`.
///
/// `api_key` is the only field without a serde default, so it must be
/// present whenever this table is configured.
// NOTE(review): `Debug` is derived, so `{:?}` output includes `api_key` in
// clear text — avoid logging this struct wholesale.
#[derive(Debug, Clone, Deserialize)]
pub struct LlmConfig {
    /// Provider identifier. Default: `"anthropic"`. `"azure"` additionally
    /// requires `azure_api_version`.
    #[serde(default = "default_llm_provider")]
    pub provider: String,
    /// Provider API key. Required when deserializing; `LlmConfig::default()`
    /// uses an empty string.
    pub api_key: String,
    /// Model identifier. Default: `"claude-sonnet-4-20250514"`.
    #[serde(default = "default_llm_model")]
    pub model: String,
    /// Default: `None`.
    // NOTE(review): presumably overrides the provider's API base URL;
    // confirm against the LLM client.
    #[serde(default)]
    pub base_url: Option<String>,
    /// Default: `4096`.
    // NOTE(review): presumably the completion-token cap sent to the
    // provider; confirm against the LLM client.
    #[serde(default = "default_llm_max_tokens")]
    pub max_tokens: u32,
    /// Azure OpenAI API version (e.g. "2024-05-01-preview"). Required when
    /// `provider = "azure"`; ignored otherwise.
    #[serde(default)]
    pub azure_api_version: Option<String>,
    /// Max parallel LLM calls for fan-out (e.g. per-result search summaries).
    /// Bounded to avoid hitting provider rate limits. Default: `4`.
    #[serde(default = "default_llm_max_concurrency")]
    pub max_concurrency: usize,
    /// Byte cap on content sent to the LLM in a single call. Content beyond
    /// the cap is truncated on a UTF-8 char boundary. Default: `100_000`.
    #[serde(default = "default_llm_max_html_bytes")]
    pub max_html_bytes: usize,
    /// When set, opencore refuses LLM-touching requests that lack this header
    /// AND do not supply `llm_api_key` in the body. SaaS deploys set this so
    /// direct public callers can't access LLM features.
    #[serde(default)]
    pub require_byok_header: Option<String>,
    /// Sampling temperature for the LLM call. `None` (default) sends no
    /// `temperature` key, preserving each provider's default (DeepSeek = 1) and
    /// current prod behavior. The benchmark/eval harness sets `0.0` (with a
    /// seed) to make answers deterministic so a real +2-3pp lever is
    /// distinguishable from sampling noise. Prod stays `None` until temp=0 is
    /// proven not to raise abstention.
    #[serde(default)]
    pub temperature: Option<f32>,
}
1066
1067impl Default for LlmConfig {
1068    fn default() -> Self {
1069        Self {
1070            provider: default_llm_provider(),
1071            api_key: String::new(),
1072            model: default_llm_model(),
1073            base_url: None,
1074            max_tokens: default_llm_max_tokens(),
1075            azure_api_version: None,
1076            max_concurrency: default_llm_max_concurrency(),
1077            max_html_bytes: default_llm_max_html_bytes(),
1078            require_byok_header: None,
1079            temperature: None,
1080        }
1081    }
1082}
1083
/// Fallback for `LlmConfig::max_concurrency`.
fn default_llm_max_concurrency() -> usize {
    4_usize
}
1087
/// Fallback for `LlmConfig::provider`.
fn default_llm_provider() -> String {
    String::from("anthropic")
}
/// Fallback for `LlmConfig::model`.
fn default_llm_model() -> String {
    String::from("claude-sonnet-4-20250514")
}
/// Fallback for `LlmConfig::max_tokens`.
fn default_llm_max_tokens() -> u32 {
    4 * 1_024
}
1097
/// Fallback for `ExtractionConfig::default_format`.
fn default_format() -> String {
    String::from("markdown")
}
/// Serde default helper for `ExtractionConfig::only_main_content`, which is
/// on unless configured otherwise.
fn default_true_ext() -> bool {
    true
}
1104
/// `[auth]` section of the application config.
#[derive(Debug, Clone, Default, Deserialize)]
pub struct AuthConfig {
    /// Configured API keys. Default: empty.
    // NOTE(review): presumably the keys the server accepts from callers, with
    // an empty list meaning "auth disabled"; confirm against the auth
    // middleware.
    #[serde(default)]
    pub api_keys: Vec<String>,
}
1110
/// Path of the per-user config file written by `crw setup`.
///
/// Resolution order:
/// 1. `$CRW_USER_CONFIG_DIR/config.toml` — an override honored so tests don't
///    have to monkey-patch `$HOME`.
/// 2. `$HOME/.config/crw/config.toml`.
///
/// A variable that is set but empty is treated as unset: joining onto an
/// empty base would silently produce a path relative to the current working
/// directory. Values are read as `OsString`, so non-Unicode directories are
/// honored rather than ignored.
///
/// Returns `None` if neither variable yields a usable directory (e.g. a
/// headless container with no `$HOME`).
pub fn user_config_path() -> Option<std::path::PathBuf> {
    use std::path::PathBuf;

    let non_empty_env = |key: &str| std::env::var_os(key).filter(|value| !value.is_empty());

    let config_dir = non_empty_env("CRW_USER_CONFIG_DIR")
        .map(PathBuf::from)
        .or_else(|| {
            non_empty_env("HOME").map(|home| PathBuf::from(home).join(".config").join("crw"))
        })?;
    Some(config_dir.join("config.toml"))
}
1127
1128impl AppConfig {
1129    /// Load config from config.default.toml + per-user config + environment
1130    /// variable overrides.
1131    ///
1132    /// Precedence (highest wins):
1133    ///   1. `CRW_*` env vars (CI/Docker)
1134    ///   2. `$CRW_CONFIG` file (or `config.local.toml` in cwd)
1135    ///   3. `~/.config/crw/config.toml` (written by `crw setup`)
1136    ///   4. `config.default.toml` (bundled defaults)
1137    ///
1138    /// Env stays on top so a one-off `CRW_FOO=bar crw …` always wins over
1139    /// whatever the user has saved, matching how every other shell tool works.
1140    pub fn load() -> Result<Self, config::ConfigError> {
1141        let mut builder = config::Config::builder()
1142            .add_source(config::File::with_name("config.default").required(false));
1143
1144        // User-level config — written atomically by `crw setup`. Optional, so
1145        // a never-configured machine simply reads defaults + env.
1146        if let Some(user_cfg) = user_config_path()
1147            && user_cfg.exists()
1148        {
1149            builder = builder.add_source(config::File::from(user_cfg).required(false));
1150        }
1151
1152        // Load optional override config file (e.g. config.docker.toml in containers).
1153        if let Ok(extra) = std::env::var("CRW_CONFIG") {
1154            builder = builder.add_source(config::File::with_name(&extra).required(true));
1155        } else {
1156            builder = builder.add_source(config::File::with_name("config.local").required(false));
1157        }
1158
1159        let cfg = builder
1160            .add_source(
1161                config::Environment::with_prefix("CRW")
1162                    .prefix_separator("_")
1163                    .separator("__")
1164                    .try_parsing(true),
1165            )
1166            .build()?;
1167        cfg.try_deserialize()
1168    }
1169
1170    /// Compute the effective end-to-end request deadline (ms). Implements the
1171    /// issue-#35 auto-extension policy:
1172    ///
1173    /// 1. If the caller supplied an explicit `requested_deadline_ms`, return it
1174    ///    verbatim — operators trust the request budget over our heuristic.
1175    /// 2. Otherwise, when `request.auto_extend_deadline_for_ladder` is on,
1176    ///    return `max(deadline_ms_default, ladder_min + wait_for_extra)`.
1177    ///    `ladder_min` covers the configured tier ladder; `wait_for_extra`
1178    ///    compensates for callers that bumped `wait_for_ms` above the default
1179    ///    SPA budget (8s) — without it, a long `wait_for` would silently
1180    ///    re-clamp inside CDP.
1181    /// 3. When the policy is disabled, return `deadline_ms_default` unchanged.
1182    ///
1183    /// `wait_for_ms` is the per-request override (ScrapeRequest::wait_for /
1184    /// CrawlRequest::wait_for); pass `None` for sub-fetches that don't
1185    /// surface a wait_for to the caller (search/map enrichment).
1186    pub fn effective_deadline_ms(
1187        &self,
1188        requested_deadline_ms: Option<u64>,
1189        wait_for_ms: Option<u64>,
1190    ) -> u64 {
1191        if let Some(explicit) = requested_deadline_ms {
1192            return explicit;
1193        }
1194        let default_ms = self.request.deadline_ms_default;
1195        if !self.request.auto_extend_deadline_for_ladder {
1196            return default_ms;
1197        }
1198        // Issue #35 is specifically about CDP tier overhead silently clamping
1199        // chrome_timeout_ms. HTTP-only deployments don't suffer the same
1200        // problem (the HTTP renderer respects deadline.remaining without the
1201        // extra fetch/challenge/stability overhead). Skip the extension when
1202        // no CDP tiers are configured so HTTP-only users keep the strict
1203        // operator-configured default.
1204        if self.renderer.cdp_tier_count() == 0 {
1205            return default_ms;
1206        }
1207        let ladder_min = self.renderer.min_deadline_for_full_ladder_ms();
1208        // Mirrors crw_renderer::cdp::SPA_SELECTOR_MAX_MS. The CDP module
1209        // adds `wait_for_ms.unwrap_or(SPA_SELECTOR_MAX_MS)` to its internal
1210        // timeout, so when the caller exceeds the default we need to extend
1211        // the deadline per active CDP tier.
1212        const SPA_DEFAULT_MS: u64 = 8_000;
1213        // Clamp `wait_for_ms` to MAX_WAIT_FOR_MS so the inner deadline never
1214        // exceeds the Tower envelope, which is sized off the same constant in
1215        // `effective_request_timeout_secs`. A pathological caller passing
1216        // `wait_for: 600_000` without `deadlineMs` would otherwise be cancelled
1217        // by Tower before the inner CDP loop noticed the bigger budget.
1218        let extra = if let Some(w) = wait_for_ms {
1219            let bounded = w.min(MAX_WAIT_FOR_MS);
1220            let per_tier = bounded.saturating_sub(SPA_DEFAULT_MS);
1221            per_tier.saturating_mul(self.renderer.cdp_tier_count() as u64)
1222        } else {
1223            0
1224        };
1225        default_ms.max(ladder_min.saturating_add(extra))
1226    }
1227
1228    /// Tower middleware outer timeout (seconds). Must accommodate the longest
1229    /// legitimate handler runtime so a healthy request isn't cancelled by the
1230    /// outer layer before the inner deadline fires.
1231    ///
1232    /// Covers the three route envelopes:
1233    /// - `/scrape`, `/mcp` — auto-extended scrape deadline.
1234    /// - `/search` — SearXNG fetch + bounded enrichment fan-out
1235    ///   (`ceil(max_limit / max_concurrency)` batches × scrape_ms).
1236    /// - `/crawl/jobs/:id`, `/map` — handler-side caps up to 300s.
1237    ///
1238    /// When auto-extend is disabled, returns the operator-configured baseline
1239    /// unchanged.
1240    pub fn effective_request_timeout_secs(&self) -> u64 {
1241        let baseline = self.server.request_timeout_secs;
1242        if !self.request.auto_extend_deadline_for_ladder {
1243            return baseline;
1244        }
1245        const OUTER_BUFFER_SECS: u64 = 5;
1246        // `/map` handler caps `req.timeout.unwrap_or(120).min(300)`; the outer
1247        // must cover the upper bound so callers passing `timeout=300` aren't
1248        // cancelled mid-flight.
1249        const MAP_REQUEST_TIMEOUT_CEILING_MS: u64 = 300_000;
1250        // Cover the worst-case implicit scrape: caller bumps `wait_for` to the
1251        // configured maximum without supplying `deadlineMs`. The same
1252        // [`MAX_WAIT_FOR_MS`] constant is used inside `effective_deadline_ms`
1253        // to clamp the inner extension, so the inner deadline can never
1254        // exceed this outer envelope.
1255        let scrape_ms = self.effective_deadline_ms(None, Some(MAX_WAIT_FOR_MS));
1256
1257        // Search enrichment: bounded by max_concurrency. Worst case sequential
1258        // batching with low concurrency: ceil(max_limit / max_concurrency)
1259        // batches each bounded by scrape_ms.
1260        let conc = (self.crawler.max_concurrency.max(1)) as u64;
1261        let max_results = self.search.max_limit as u64;
1262        let enrich_batches = max_results.div_ceil(conc);
1263        let search_enrichment_ms = enrich_batches.saturating_mul(scrape_ms);
1264        let search_ms = self.search.timeout_ms.saturating_add(search_enrichment_ms);
1265
1266        let max_handler_ms = scrape_ms.max(search_ms).max(MAP_REQUEST_TIMEOUT_CEILING_MS);
1267        let needed_secs = max_handler_ms
1268            .div_ceil(1_000)
1269            .saturating_add(OUTER_BUFFER_SECS);
1270        baseline.max(needed_secs)
1271    }
1272}
1273
1274#[cfg(test)]
1275mod tests {
1276    use super::*;
1277
    /// Env var tests modify process-wide state; serialize them to avoid cross-test
    /// interference (e.g. `force_js` alias + `render_js_default` direct both set).
    ///
    /// Each env-mutating test acquires this lock first and keeps the guard
    /// alive for its whole body. The `()` payload carries no data; the mutex
    /// is used purely for mutual exclusion.
    static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());
1281
1282    fn clear_renderer_env() {
1283        for k in [
1284            "CRW_RENDERER__MODE",
1285            "CRW_RENDERER__FORCE_JS",
1286            "CRW_RENDERER__RENDER_JS_DEFAULT",
1287            "CRW_RENDERER__LIGHTPANDA__WS_URL",
1288            "CRW_SERVER__PORT",
1289        ] {
1290            unsafe { std::env::remove_var(k) };
1291        }
1292    }
1293
1294    #[test]
1295    fn renderer_mode_parses_variants() {
1296        #[derive(Deserialize)]
1297        struct Wrap {
1298            mode: RendererMode,
1299        }
1300        let cases = [
1301            ("mode = \"auto\"", RendererMode::Auto),
1302            ("mode = \"none\"", RendererMode::None),
1303            ("mode = \"lightpanda\"", RendererMode::Lightpanda),
1304            ("mode = \"chrome\"", RendererMode::Chrome),
1305            ("mode = \"playwright\"", RendererMode::Playwright),
1306        ];
1307        for (toml_str, expected) in cases {
1308            let w: Wrap = toml::from_str(toml_str).unwrap();
1309            assert_eq!(w.mode, expected, "toml: {toml_str}");
1310        }
1311    }
1312
1313    #[test]
1314    fn renderer_mode_bogus_errors() {
1315        #[derive(Deserialize)]
1316        struct Wrap {
1317            #[allow(dead_code)]
1318            mode: RendererMode,
1319        }
1320        let err: Result<Wrap, _> = toml::from_str("mode = \"bogus\"");
1321        assert!(err.is_err(), "bogus mode should fail to parse");
1322    }
1323
1324    #[test]
1325    fn renderer_config_default_mode_is_auto() {
1326        let cfg = RendererConfig::default();
1327        assert_eq!(cfg.mode, RendererMode::Auto);
1328        assert_eq!(cfg.render_js_default, None);
1329    }
1330
1331    #[test]
1332    fn render_js_default_force_js_alias() {
1333        let cfg: RendererConfig = toml::from_str("force_js = true").unwrap();
1334        assert_eq!(cfg.render_js_default, Some(true));
1335    }
1336
1337    #[test]
1338    fn render_js_default_direct_field() {
1339        let cfg: RendererConfig = toml::from_str("render_js_default = false").unwrap();
1340        assert_eq!(cfg.render_js_default, Some(false));
1341    }
1342
1343    #[test]
1344    fn env_var_renderer_mode_chrome() {
1345        let _g = ENV_LOCK.lock().unwrap();
1346        clear_renderer_env();
1347        unsafe { std::env::set_var("CRW_RENDERER__MODE", "chrome") };
1348        let cfg = AppConfig::load().unwrap();
1349        clear_renderer_env();
1350        assert_eq!(cfg.renderer.mode, RendererMode::Chrome);
1351    }
1352
1353    #[test]
1354    fn env_var_force_js_alias_works() {
1355        let _g = ENV_LOCK.lock().unwrap();
1356        clear_renderer_env();
1357        unsafe { std::env::set_var("CRW_RENDERER__FORCE_JS", "true") };
1358        let cfg = AppConfig::load().unwrap();
1359        clear_renderer_env();
1360        assert_eq!(cfg.renderer.render_js_default, Some(true));
1361    }
1362
1363    #[test]
1364    fn env_var_render_js_default_direct() {
1365        let _g = ENV_LOCK.lock().unwrap();
1366        clear_renderer_env();
1367        unsafe { std::env::set_var("CRW_RENDERER__RENDER_JS_DEFAULT", "true") };
1368        let cfg = AppConfig::load().unwrap();
1369        clear_renderer_env();
1370        assert_eq!(cfg.renderer.render_js_default, Some(true));
1371    }
1372
1373    #[test]
1374    fn request_config_defaults_match_plan() {
1375        let r = RequestConfig::default();
1376        assert_eq!(r.deadline_ms_default, 8000);
1377        assert!(r.auto_extend_deadline_for_ladder);
1378    }
1379
1380    #[test]
1381    fn default_app_config_enables_auto_extend() {
1382        // Programmatic Default must mirror serde defaults — issue #35.
1383        let cfg = AppConfig::default();
1384        assert!(cfg.request.auto_extend_deadline_for_ladder);
1385        assert_eq!(cfg.request.deadline_ms_default, 8000);
1386    }
1387
1388    fn renderer_with_chrome_only(chrome_ms: u64) -> RendererConfig {
1389        RendererConfig {
1390            mode: RendererMode::Chrome,
1391            page_timeout_ms: chrome_ms,
1392            chrome_timeout_ms: Some(chrome_ms),
1393            chrome: Some(CdpEndpoint {
1394                ws_url: "ws://chrome:9222".into(),
1395            }),
1396            ..Default::default()
1397        }
1398    }
1399
1400    #[test]
1401    #[cfg(feature = "cdp")]
1402    fn min_deadline_full_ladder_chrome_only() {
1403        // chrome-only mode: http (page_timeout) + chrome + 1 * 28000.
1404        let r = renderer_with_chrome_only(30_000);
1405        // page_timeout_ms is set to chrome_ms here, so http_timeout() → 30s.
1406        assert_eq!(
1407            r.min_deadline_for_full_ladder_ms(),
1408            30_000 + 30_000 + 28_000
1409        );
1410    }
1411
1412    #[test]
1413    #[cfg(feature = "cdp")]
1414    fn min_deadline_full_ladder_auto_three_tiers() {
1415        let r = RendererConfig {
1416            mode: RendererMode::Auto,
1417            page_timeout_ms: 15_000,
1418            http_timeout_ms: Some(15_000),
1419            lightpanda_timeout_ms: Some(2_500),
1420            chrome_timeout_ms: Some(30_000),
1421            lightpanda: Some(CdpEndpoint {
1422                ws_url: "ws://lp:9222".into(),
1423            }),
1424            chrome: Some(CdpEndpoint {
1425                ws_url: "ws://chrome:9222".into(),
1426            }),
1427            ..Default::default()
1428        };
1429        // http(15) + lp(2.5) + chrome(30) + 2*28 = 47.5 + 56 = 103_500.
1430        assert_eq!(
1431            r.min_deadline_for_full_ladder_ms(),
1432            15_000 + 2_500 + 30_000 + 2 * 28_000
1433        );
1434        assert_eq!(r.cdp_tier_count(), 2);
1435    }
1436
1437    #[test]
1438    fn effective_deadline_explicit_bypasses_auto_extend() {
1439        let mut cfg = AppConfig::default();
1440        cfg.request.auto_extend_deadline_for_ladder = true;
1441        cfg.renderer = renderer_with_chrome_only(30_000);
1442        // Explicit override beats both default and ladder_min.
1443        assert_eq!(cfg.effective_deadline_ms(Some(5_000), None), 5_000);
1444        assert_eq!(cfg.effective_deadline_ms(Some(500_000), None), 500_000);
1445    }
1446
1447    #[test]
1448    #[cfg(feature = "cdp")]
1449    fn effective_deadline_auto_extend_raises_to_ladder_min() {
1450        let mut cfg = AppConfig::default();
1451        cfg.request.auto_extend_deadline_for_ladder = true;
1452        cfg.request.deadline_ms_default = 8_000;
1453        cfg.renderer = renderer_with_chrome_only(30_000);
1454        let expected = cfg.renderer.min_deadline_for_full_ladder_ms();
1455        assert!(expected > 8_000);
1456        assert_eq!(cfg.effective_deadline_ms(None, None), expected);
1457    }
1458
1459    #[test]
1460    fn effective_deadline_default_wins_when_higher_than_ladder() {
1461        let mut cfg = AppConfig::default();
1462        cfg.request.auto_extend_deadline_for_ladder = true;
1463        cfg.request.deadline_ms_default = 1_000_000;
1464        cfg.renderer = renderer_with_chrome_only(30_000);
1465        assert_eq!(cfg.effective_deadline_ms(None, None), 1_000_000);
1466    }
1467
1468    #[test]
1469    fn effective_deadline_auto_extend_disabled_returns_baseline() {
1470        let mut cfg = AppConfig::default();
1471        cfg.request.auto_extend_deadline_for_ladder = false;
1472        cfg.request.deadline_ms_default = 8_000;
1473        cfg.renderer = renderer_with_chrome_only(30_000);
1474        assert_eq!(cfg.effective_deadline_ms(None, None), 8_000);
1475    }
1476
1477    #[test]
1478    #[cfg(feature = "cdp")]
1479    fn effective_deadline_extends_for_long_wait_for() {
1480        let mut cfg = AppConfig::default();
1481        cfg.request.auto_extend_deadline_for_ladder = true;
1482        cfg.request.deadline_ms_default = 8_000;
1483        cfg.renderer = renderer_with_chrome_only(30_000);
1484        let base = cfg.renderer.min_deadline_for_full_ladder_ms();
1485        let tier_count = cfg.renderer.cdp_tier_count() as u64;
1486        // wait_for = 20000 → per-tier extra = 12000 over SPA_DEFAULT_MS (8000).
1487        let with_wait = cfg.effective_deadline_ms(None, Some(20_000));
1488        assert_eq!(with_wait, base + 12_000 * tier_count);
1489        // wait_for below SPA default → no extra.
1490        assert_eq!(cfg.effective_deadline_ms(None, Some(2_000)), base);
1491    }
1492
1493    #[test]
1494    fn effective_request_timeout_covers_map_ceiling() {
1495        let mut cfg = AppConfig::default();
1496        cfg.request.auto_extend_deadline_for_ladder = true;
1497        cfg.request.deadline_ms_default = 8_000;
1498        cfg.renderer = renderer_with_chrome_only(30_000);
1499        cfg.search.timeout_ms = 15_000;
1500        cfg.crawler.max_concurrency = 10;
1501        cfg.search.max_limit = 20;
1502        cfg.server.request_timeout_secs = 60;
1503        // Map ceiling 300s + 5s buffer = 305s minimum.
1504        assert!(cfg.effective_request_timeout_secs() >= 305);
1505    }
1506
1507    #[test]
1508    fn effective_request_timeout_disabled_returns_baseline() {
1509        let mut cfg = AppConfig::default();
1510        cfg.request.auto_extend_deadline_for_ladder = false;
1511        cfg.server.request_timeout_secs = 60;
1512        assert_eq!(cfg.effective_request_timeout_secs(), 60);
1513    }
1514
1515    #[test]
1516    fn effective_request_timeout_respects_operator_override() {
1517        let mut cfg = AppConfig::default();
1518        cfg.request.auto_extend_deadline_for_ladder = true;
1519        cfg.server.request_timeout_secs = 600; // operator-configured high
1520        cfg.renderer = renderer_with_chrome_only(30_000);
1521        // Operator's explicit 600s should win over the auto-computed 305s.
1522        assert_eq!(cfg.effective_request_timeout_secs(), 600);
1523    }
1524
1525    #[test]
1526    fn effective_request_timeout_search_sequential_batching() {
1527        // Low concurrency forces ceil(max_limit/conc) batches → larger search_ms.
1528        let mut cfg = AppConfig::default();
1529        cfg.request.auto_extend_deadline_for_ladder = true;
1530        cfg.request.deadline_ms_default = 8_000;
1531        cfg.renderer = renderer_with_chrome_only(30_000);
1532        cfg.search.timeout_ms = 15_000;
1533        cfg.search.max_limit = 20;
1534        cfg.crawler.max_concurrency = 1;
1535        cfg.server.request_timeout_secs = 60;
1536        // The Tower envelope must cover the worst-case implicit scrape with
1537        // `wait_for` bumped to MAX_WAIT_FOR_MS (60s), because callers can do
1538        // that without supplying `deadlineMs`. Mirror that in the expected.
1539        let secs = cfg.effective_request_timeout_secs();
1540        let scrape_ms = cfg.effective_deadline_ms(None, Some(60_000));
1541        let expected_search_ms = 15_000 + 20 * scrape_ms;
1542        let expected_max_ms = scrape_ms.max(expected_search_ms).max(300_000);
1543        let expected_secs = expected_max_ms.div_ceil(1_000) + 5;
1544        assert_eq!(secs, 60u64.max(expected_secs));
1545    }
1546
1547    #[test]
1548    #[cfg(not(feature = "cdp"))]
1549    fn cdp_tier_count_zero_without_cdp_feature() {
1550        // Even when chrome/lightpanda are configured, a binary built without
1551        // the `cdp` feature can never construct a JS renderer. The deadline
1552        // policy must observe that and collapse to HTTP-only behavior.
1553        let r = RendererConfig {
1554            mode: RendererMode::Auto,
1555            page_timeout_ms: 15_000,
1556            chrome_timeout_ms: Some(30_000),
1557            chrome: Some(CdpEndpoint {
1558                ws_url: "ws://chrome:9222".into(),
1559            }),
1560            lightpanda: Some(CdpEndpoint {
1561                ws_url: "ws://lp:9222".into(),
1562            }),
1563            ..Default::default()
1564        };
1565        assert_eq!(r.cdp_tier_count(), 0);
1566        // Only the HTTP tier contributes to the ladder budget.
1567        assert_eq!(r.min_deadline_for_full_ladder_ms(), 15_000);
1568    }
1569
1570    #[test]
1571    fn effective_deadline_skipped_for_http_only_mode() {
1572        // P2 from codex review: HTTP-only deployments don't suffer the CDP
1573        // clamping problem (no fetch/challenge/stability overhead). The
1574        // auto-extension must NOT silently bump their default from 8s to 30s
1575        // just because page_timeout_ms defaults high.
1576        let mut cfg = AppConfig::default();
1577        cfg.request.auto_extend_deadline_for_ladder = true;
1578        cfg.request.deadline_ms_default = 8_000;
1579        cfg.renderer = RendererConfig {
1580            mode: RendererMode::Auto,
1581            page_timeout_ms: 30_000,
1582            // No CDP endpoints configured.
1583            lightpanda: None,
1584            playwright: None,
1585            chrome: None,
1586            ..Default::default()
1587        };
1588        assert_eq!(cfg.renderer.cdp_tier_count(), 0);
1589        assert_eq!(cfg.effective_deadline_ms(None, None), 8_000);
1590        assert_eq!(cfg.effective_deadline_ms(None, Some(30_000)), 8_000);
1591    }
1592
1593    #[test]
1594    #[cfg(feature = "cdp")]
1595    fn min_deadline_full_ladder_playwright_only() {
1596        // Playwright tier contributes one chrome_timeout + one CDP overhead,
1597        // matching the runtime predicate in `crw-renderer/src/lib.rs`.
1598        let r = RendererConfig {
1599            mode: RendererMode::Playwright,
1600            page_timeout_ms: 15_000,
1601            http_timeout_ms: Some(15_000),
1602            chrome_timeout_ms: Some(30_000),
1603            playwright: Some(CdpEndpoint {
1604                ws_url: "ws://playwright:9222".into(),
1605            }),
1606            ..Default::default()
1607        };
1608        assert_eq!(r.cdp_tier_count(), 1);
1609        // http(15) + chrome-equivalent(30) + 1 * 28 overhead.
1610        assert_eq!(
1611            r.min_deadline_for_full_ladder_ms(),
1612            15_000 + 30_000 + 28_000
1613        );
1614    }
1615
1616    #[test]
1617    fn renderer_phase_toggles_default_off_or_safe() {
1618        let r = RendererConfig::default();
1619        assert!(!r.chrome_intercept_resources);
1620        assert!(!r.chrome_intercept_stylesheets);
1621        assert!(r.chrome_host_intercept_disable.is_empty());
1622        assert_eq!(r.chrome_nav_budget_ms, 12_000);
1623        assert!(!r.chrome_context_pool_enabled);
1624        assert!(!r.use_predictor);
1625    }
1626
1627    #[test]
1628    fn crawler_per_host_limiter_defaults() {
1629        let c = CrawlerConfig::default();
1630        assert_eq!(c.per_host_min_interval_ms, 0);
1631        assert_eq!(c.per_host_max_concurrent, 1);
1632    }
1633
1634    #[test]
1635    fn env_var_overrides_toml_defaults() {
1636        let _g = ENV_LOCK.lock().unwrap();
1637        clear_renderer_env();
1638        unsafe {
1639            std::env::set_var("CRW_SERVER__PORT", "4444");
1640            std::env::set_var("CRW_RENDERER__LIGHTPANDA__WS_URL", "ws://test:9999/");
1641        }
1642        let cfg = AppConfig::load().unwrap();
1643        clear_renderer_env();
1644
1645        assert_eq!(cfg.server.port, 4444, "env var should override server.port");
1646        assert_eq!(
1647            cfg.renderer.lightpanda.as_ref().unwrap().ws_url,
1648            "ws://test:9999/",
1649            "env var should override renderer.lightpanda.ws_url"
1650        );
1651    }
1652
1653    #[test]
1654    fn user_config_path_honors_override_env() {
1655        let _g = ENV_LOCK.lock().unwrap();
1656        let tmp = std::env::temp_dir().join(format!("crw-cfg-test-{}", std::process::id()));
1657        unsafe {
1658            std::env::set_var("CRW_USER_CONFIG_DIR", &tmp);
1659        }
1660        let p = user_config_path().unwrap();
1661        unsafe {
1662            std::env::remove_var("CRW_USER_CONFIG_DIR");
1663        }
1664        assert_eq!(p, tmp.join("config.toml"));
1665    }
1666
1667    #[test]
1668    fn user_config_file_is_picked_up_by_load() {
1669        let _g = ENV_LOCK.lock().unwrap();
1670        clear_renderer_env();
1671        let tmp = std::env::temp_dir().join(format!("crw-load-test-{}", std::process::id()));
1672        std::fs::create_dir_all(&tmp).unwrap();
1673        let cfg_path = tmp.join("config.toml");
1674        std::fs::write(
1675            &cfg_path,
1676            r#"
1677[client]
1678api_url = "https://api.example.com"
1679api_key = "test-key-123"
1680
1681[search]
1682searxng_url = "http://localhost:9999"
1683
1684[extraction.llm]
1685provider = "deepseek"
1686api_key = "sk-test"
1687model = "deepseek-chat"
1688"#,
1689        )
1690        .unwrap();
1691
1692        unsafe {
1693            std::env::set_var("CRW_USER_CONFIG_DIR", &tmp);
1694        }
1695        let cfg = AppConfig::load().unwrap();
1696        unsafe {
1697            std::env::remove_var("CRW_USER_CONFIG_DIR");
1698        }
1699        std::fs::remove_dir_all(&tmp).ok();
1700
1701        assert_eq!(
1702            cfg.client.api_url.as_deref(),
1703            Some("https://api.example.com")
1704        );
1705        assert_eq!(cfg.client.api_key.as_deref(), Some("test-key-123"));
1706        assert_eq!(
1707            cfg.search.searxng_url.as_deref(),
1708            Some("http://localhost:9999")
1709        );
1710        let llm = cfg.extraction.llm.expect("llm config present");
1711        assert_eq!(llm.provider, "deepseek");
1712        assert_eq!(llm.api_key, "sk-test");
1713    }
1714
1715    #[test]
1716    fn env_var_beats_user_config() {
1717        let _g = ENV_LOCK.lock().unwrap();
1718        clear_renderer_env();
1719        let tmp = std::env::temp_dir().join(format!("crw-prec-test-{}", std::process::id()));
1720        std::fs::create_dir_all(&tmp).unwrap();
1721        std::fs::write(
1722            tmp.join("config.toml"),
1723            r#"
1724[search]
1725searxng_url = "http://from-file:8080"
1726"#,
1727        )
1728        .unwrap();
1729
1730        unsafe {
1731            std::env::set_var("CRW_USER_CONFIG_DIR", &tmp);
1732            std::env::set_var("CRW_SEARCH__SEARXNG_URL", "http://from-env:8080");
1733        }
1734        let cfg = AppConfig::load().unwrap();
1735        unsafe {
1736            std::env::remove_var("CRW_USER_CONFIG_DIR");
1737            std::env::remove_var("CRW_SEARCH__SEARXNG_URL");
1738        }
1739        std::fs::remove_dir_all(&tmp).ok();
1740
1741        assert_eq!(
1742            cfg.search.searxng_url.as_deref(),
1743            Some("http://from-env:8080"),
1744            "env var must win over user config file"
1745        );
1746    }
1747
1748    #[test]
1749    fn effective_proxy_credentials_appends_country_suffix() {
1750        let cfg = RendererConfig {
1751            proxy_base_user: Some("abc".into()),
1752            proxy_base_pass: Some("pw".into()),
1753            proxy_default_country: Some("de".into()),
1754            ..Default::default()
1755        };
1756        let (u, p) = cfg.effective_proxy_credentials(Some("us")).unwrap();
1757        assert_eq!(u, "abc__cr.us");
1758        assert_eq!(p, "pw");
1759        // Per-request wins over default.
1760        let (u, _) = cfg.effective_proxy_credentials(Some("GB")).unwrap();
1761        assert_eq!(u, "abc__cr.gb", "uppercase input is normalized");
1762        // Default country used when per-request omits it.
1763        let (u, _) = cfg.effective_proxy_credentials(None).unwrap();
1764        assert_eq!(u, "abc__cr.de");
1765    }
1766
1767    #[test]
1768    fn effective_proxy_credentials_invalid_country_uses_global_pool() {
1769        let cfg = RendererConfig {
1770            proxy_base_user: Some("abc".into()),
1771            proxy_base_pass: Some("pw".into()),
1772            ..Default::default()
1773        };
1774        // 3-letter ISO code → rejected, no suffix (global pool).
1775        let (u, _) = cfg.effective_proxy_credentials(Some("usa")).unwrap();
1776        assert_eq!(u, "abc");
1777        // Digits → rejected.
1778        let (u, _) = cfg.effective_proxy_credentials(Some("u1")).unwrap();
1779        assert_eq!(u, "abc");
1780        // Empty string after trim → rejected.
1781        let (u, _) = cfg.effective_proxy_credentials(Some("  ")).unwrap();
1782        assert_eq!(u, "abc");
1783    }
1784
1785    #[test]
1786    fn effective_proxy_credentials_no_base_returns_none() {
1787        let cfg = RendererConfig::default();
1788        assert!(cfg.effective_proxy_credentials(Some("us")).is_none());
1789
1790        let only_user = RendererConfig {
1791            proxy_base_user: Some("abc".into()),
1792            ..Default::default()
1793        };
1794        assert!(only_user.effective_proxy_credentials(Some("us")).is_none());
1795    }
1796}