crw_core/
config.rs

1use serde::Deserialize;
2
3#[derive(Debug, Clone, Deserialize, Default)]
4pub struct AppConfig {
5    #[serde(default)]
6    pub server: ServerConfig,
7    #[serde(default)]
8    pub renderer: RendererConfig,
9    #[serde(default)]
10    pub crawler: CrawlerConfig,
11    #[serde(default)]
12    pub extraction: ExtractionConfig,
13    #[serde(default)]
14    pub auth: AuthConfig,
15    #[serde(default)]
16    pub request: RequestConfig,
17    #[serde(default)]
18    pub search: SearchConfig,
19    #[serde(default)]
20    pub map: MapConfig,
21    /// `[document]` — binary-document (PDF) parsing knobs.
22    #[serde(default)]
23    pub document: DocumentConfig,
24    /// `[client]` — settings for the local CLI/MCP when it proxies to the
25    /// hosted SaaS. Written by `crw setup` into the user-config file.
26    #[serde(default)]
27    pub client: ClientConfig,
28}
29
30/// `[client]` — cloud-proxy credentials populated by `crw setup` and read by
31/// `crw mcp` / `crw-mcp`. Both fields are `Option` so an unconfigured user runs
32/// in local mode without surprise overrides.
33#[derive(Debug, Clone, Default, Deserialize)]
34pub struct ClientConfig {
35    /// Base URL of the hosted CRW API, e.g. `https://api.fastcrw.com`.
36    #[serde(default)]
37    pub api_url: Option<String>,
38    /// API key for the hosted CRW API.
39    #[serde(default)]
40    pub api_key: Option<String>,
41}
42
43/// `[document]` section — controls PDF (and future binary-document) parsing.
44/// All knobs honor `CRW_DOCUMENT__*` env overrides.
45#[derive(Debug, Clone, Deserialize)]
46#[serde(default)]
47pub struct DocumentConfig {
48    /// Master switch for document parsing at runtime (independent of the
49    /// compile-time `pdf` cargo feature). When `false`, PDFs are left unparsed.
50    pub enabled: bool,
51    /// Cap on the number of pages converted per document. `0` = no limit.
52    pub max_pages: usize,
53    /// Best-effort extraction from scanned/image PDFs (no OCR; usually empty).
54    pub attempt_scanned: bool,
55    /// Maximum upload size in bytes for `POST /v2/parse`. Defaults to 50 MB,
56    /// matching the HTTP renderer's response cap.
57    pub max_upload_bytes: usize,
58    /// Maximum number of concurrent uploads being parsed at once — bounds peak
59    /// memory (each in-flight upload buffers up to `max_upload_bytes`).
60    pub upload_concurrency: usize,
61    /// Process-wide cap on concurrent PDF parses across ALL surfaces (URL
62    /// scrape, crawl, batch, upload). Bounds peak CPU + decompressed memory: a
63    /// malicious PDF can decompress far beyond its on-wire size, so this is the
64    /// primary memory-DoS guard. Independent of `upload_concurrency` (which
65    /// only bounds upload body buffering).
66    pub max_concurrent_parses: usize,
67    /// Wall-clock timeout (ms) for a single PDF parse. A parse exceeding this
68    /// returns a timeout error to the caller; protects against pathological
69    /// documents that spin the parser. `0` disables the timeout.
70    pub parse_timeout_ms: u64,
71    /// Decompression-bomb guard: maximum total DECOMPRESSED bytes a document's
72    /// FlateDecode streams may inflate to. Checked in bounded memory BEFORE the
73    /// parser runs, so a small file that explodes to many GB is rejected with
74    /// `pdf_too_large` having allocated only kilobytes. This is the primary
75    /// guard against OOM-crashing the host. `0` disables it. Default 100 MiB —
76    /// huge for text extraction (millions of words) yet tiny next to host RAM.
77    /// Raise only if you must parse image-heavy PDFs.
78    pub max_decompressed_bytes: usize,
79    /// Run each PDF parse in an isolated child PROCESS (Unix only) instead of
80    /// in-process. The child gets a hard OS memory ceiling (`RLIMIT_AS`) and CPU
81    /// limit, inherits no env/secrets, and is killed on timeout. A crash, OOM,
82    /// or even a hypothetical parser RCE is contained to the child — the main
83    /// server (scrape/crawl) keeps running. Costs ~1-3ms spawn overhead per
84    /// parse. Recommended for hosts that accept untrusted uploads. Default off.
85    pub sandbox: bool,
86    /// Hard address-space limit (bytes) for a sandbox child (`RLIMIT_AS`). The
87    /// child is aborted by the OS if it allocates beyond this — the ultimate
88    /// backstop against memory-DoS even if the decompression guard is bypassed.
89    /// Default 512 MiB.
90    pub sandbox_memory_bytes: u64,
91}
92
93impl Default for DocumentConfig {
94    fn default() -> Self {
95        Self {
96            enabled: true,
97            max_pages: 0,
98            attempt_scanned: false,
99            max_upload_bytes: 52_428_800, // 50 MiB
100            upload_concurrency: 4,
101            max_concurrent_parses: 4,
102            parse_timeout_ms: 30_000,
103            max_decompressed_bytes: 104_857_600, // 100 MiB
104            sandbox: false,
105            sandbox_memory_bytes: 536_870_912, // 512 MiB
106        }
107    }
108}
109
110/// `[map]` section — currently only carries `[map.url_filter]`.
111#[derive(Debug, Clone, Deserialize, Default)]
112pub struct MapConfig {
113    #[serde(default)]
114    pub url_filter: MapUrlFilterConfig,
115}
116
117/// `[map.url_filter]` — raw TOML view of the filter knobs. Conversion to
118/// the runtime `UrlFilterCfg` lives in `crw-crawl` (which can see both this
119/// type and the filter module). Keeping this struct dependency-free here
120/// avoids a cycle (`crw-core` does not depend on `crw-crawl`).
121#[derive(Debug, Clone, Deserialize)]
122pub struct MapUrlFilterConfig {
123    /// Tier B — strip tracking params. Default: `true`.
124    #[serde(default = "default_true_filter")]
125    pub strip_tracking_params: bool,
126    /// Tier A — drop action URLs entirely. Default: `true`.
127    #[serde(default = "default_true_filter")]
128    pub drop_action_urls: bool,
129    /// When `true`, `.gov`/`.mil` hosts run Tier A too. Default `false`.
130    #[serde(default)]
131    pub gov_tld_drop_actions: bool,
132    /// Additive on top of `DEFAULT_TRACKING_PARAMS`.
133    #[serde(default)]
134    pub extra_tracking_params: Vec<String>,
135    /// Additive on top of `DEFAULT_ACTION_PARAMS`.
136    #[serde(default)]
137    pub extra_action_params: Vec<String>,
138    /// Additive on top of `ALWAYS_PRESERVE`.
139    #[serde(default)]
140    pub extra_preserve_params: Vec<String>,
141}
142
143impl Default for MapUrlFilterConfig {
144    fn default() -> Self {
145        Self {
146            strip_tracking_params: true,
147            drop_action_urls: true,
148            gov_tld_drop_actions: false,
149            extra_tracking_params: Vec::new(),
150            extra_action_params: Vec::new(),
151            extra_preserve_params: Vec::new(),
152        }
153    }
154}
155
/// Serde default for the `[map.url_filter]` flags that are on by default.
fn default_true_filter() -> bool {
    true
}
159
/// Per-tier CDP overhead in milliseconds — sum of SPA selector poll budget,
/// challenge retry budget, content-stability budget, and fetch overhead.
/// Mirrors the constants in `crw-renderer::cdp`. Drift between the two is
/// regression-tested by `crates/crw-server/tests/cdp_constants_test.rs`
/// (gated behind `feature = "cdp"`).
///
/// Used by [`RendererConfig::min_deadline_for_full_ladder_ms`] so the request
/// deadline accommodates each CDP tier's outer fetch timeout, not just its
/// configured `page_timeout`.
pub const CDP_TIER_OVERHEAD_MS: u64 = 28_000;
170
/// Hard upper bound (milliseconds) on the per-request `wait_for_ms` budget.
/// The Tower outer timeout is sized so a worst-case implicit scrape (no
/// `deadlineMs`, `wait_for` at this maximum) still completes inside it; values
/// above this are clamped by [`AppConfig::effective_deadline_ms`] so the inner
/// deadline can never escape the outer envelope. Documented as `(0, 60000]` in
/// `types.rs::ScrapeRequest::wait_for`.
pub const MAX_WAIT_FOR_MS: u64 = 60_000;
178
179/// Configuration for the `/v1/search` endpoint and its SearXNG backend.
180///
181/// When `searxng_url` is unset the endpoint returns HTTP 503 with
182/// `error_code: "search_disabled"` — the route remains mounted so that
183/// startup doesn't have to know whether search will ever be configured.
184#[derive(Debug, Clone, Deserialize)]
185pub struct SearchConfig {
186    /// Master switch. Defaults to `true`; set to `false` to refuse all
187    /// `/v1/search` requests even if `searxng_url` is configured.
188    #[serde(default = "default_true_search")]
189    pub enabled: bool,
190    /// Base URL of the SearXNG instance (e.g. `http://searxng:8080`).
191    /// `None` (the default) disables the endpoint with a clear error.
192    #[serde(default)]
193    pub searxng_url: Option<String>,
194    /// End-to-end timeout for the SearXNG call in milliseconds.
195    #[serde(default = "default_search_timeout_ms")]
196    pub timeout_ms: u64,
197    /// Default `limit` when the request omits it.
198    #[serde(default = "default_search_limit")]
199    pub default_limit: u32,
200    /// Hard cap on `limit` per request. SaaS uses 20.
201    #[serde(default = "default_search_max_limit")]
202    pub max_limit: u32,
203    /// SearXNG engines invoked when the request includes `categories: ["research"]`.
204    /// Defaults match the SaaS implementation.
205    #[serde(default = "default_research_engines")]
206    pub research_engines: Vec<String>,
207    /// SearXNG engines invoked when the request includes `categories: ["github"]`.
208    #[serde(default = "default_github_engines")]
209    pub github_engines: Vec<String>,
210    /// Re-rank the flat result pool for the LLM answer / summarize path
211    /// (RRF + junk/coverage/geo filter + BM25 + domain dedupe) instead of the
212    /// raw SearXNG-score sort. Defaults to `true`. The plain (non-LLM) path is
213    /// unaffected and keeps SaaS byte-parity regardless of this flag.
214    #[serde(default = "default_true_search")]
215    pub rerank_enabled: bool,
216    /// Multi-query expansion for the LLM answer / summarize path: before the
217    /// SearXNG fetch, generate an entity/keyword-focused rewrite of the query,
218    /// fetch both the original and the rewrite, and UNION the candidate pools
219    /// (recall can only increase — the original's results are always kept).
220    /// Targets "retrieval-miss" failures where the answer's source never
221    /// surfaced for the user's phrasing. Costs one extra small LLM call + one
222    /// extra SearXNG fetch. Defaults to `false` (gated); the plain path and the
223    /// answer layer are untouched, so precision/SaaS-parity are preserved.
224    #[serde(default)]
225    pub query_expand: bool,
226    /// Number of LLM-generated query rewrites to fetch + union when
227    /// `query_expand` is on. `1` reproduces the original single-variant
228    /// behavior. Higher values request more DIVERSE reformulations
229    /// (abbreviation/acronym-expanded, keyword-focused) and fetch their pools
230    /// in parallel, raising recall on retrieval-miss queries (e.g. an
231    /// unexpanded acronym whose page never surfaced) at the cost of one extra
232    /// SearXNG fetch each. Clamped to `MAX_QUERY_EXPAND_VARIANTS` in the route.
233    #[serde(default = "default_query_expand_variants")]
234    pub query_expand_variants: usize,
235    /// Adaptive multi-round retrieval (the "evidence-scout" loop). When the
236    /// round-1 answer ABSTAINS (sources lacked the fact), an LLM scout reads the
237    /// round-1 evidence and emits targeted follow-up queries (acronym-expanded,
238    /// exact-entity, predicate/date-specific); their results are scraped, unioned
239    /// into the pool, and the answer is re-synthesized ONCE. Bounded (one extra
240    /// round, capped follow-up queries) so worst-case stays within the request
241    /// deadline. Only fires on abstention, so ~most queries keep the single-shot
242    /// fast path. Recall-only + monotone-safe: a still-abstaining round-2 is
243    /// discarded, keeping round-1. Targets "the answer page never entered the
244    /// first pool" — the dominant remaining miss. Defaults to `false` (gated).
245    #[serde(default)]
246    pub multi_round: bool,
247    /// Passage-level relevance gate for the LLM answer path: split each scraped
248    /// source into passages and feed the answer LLM only the query-relevant
249    /// ones (DeepSeek-scored, no new ML deps). Subtractive — removes noise, never
250    /// adds sources or forces commits; falls back to the full source on any
251    /// failure (byte-identical to off), so it is monotone-safe. Defaults to
252    /// `false` (gated); answer prompt + plain path untouched.
253    #[serde(default)]
254    pub passage_select: bool,
255    /// Page-2 fallback for the LLM answer / summarize path: if the reranked
256    /// (junk-filtered, deduped) candidate pool comes back thinner than the
257    /// answer needs (`< answer_top_n`), fetch the SAME query's SearXNG page 2
258    /// once and union it in, then re-rank. The trigger is evaluated POST-rerank,
259    /// so a junk-heavy first page does not suppress it; the extra fetch only
260    /// fires on already-under-yielding queries (QPS never doubles across the
261    /// corpus). Recall-only + abstention is untouched (a sparse page1+page2 pool
262    /// still abstains). Defaults to `false` (gated); requires `rerank_enabled`.
263    #[serde(default)]
264    pub page2_fallback: bool,
265    /// Calibrated answer path (gated): reduce recoverable OVER-abstentions by
266    /// (a) feeding more sources to the answer LLM by default (top_n 5->8, so the
267    /// answer in result #6-8 or behind a failed top-5 scrape still reaches it)
268    /// and (b) swapping the answer prompt's abstention rule for an anti-hedge
269    /// variant — commit when the sources DO contain the answer (even indirectly
270    /// / one inference step), abstain ONLY when they genuinely lack it. The
271    /// "use ONLY sources" grounding is untouched, so this is the precise inverse
272    /// of the cycle-1 blunt "always commit" failure (which forced commits on
273    /// no-source cases). Default false; A/B with an INCORRECT-guard before flip.
274    #[serde(default)]
275    pub answer_calibrated: bool,
276    /// Moat-hardening abstention (gated). Appends a clause making the answer
277    /// model (a) REJECT a false/unverifiable premise instead of answering as
278    /// though it were true, (b) report when sources CONFLICT rather than picking
279    /// one confidently, and (c) abstain when not confident. Targets the
280    /// adversarial failure SealQA Seal-0 exposed: 32% confident-WRONG
281    /// (hallucination) on conflicting-source / false-premise questions, where
282    /// the "use ONLY sources" rule alone is insufficient. Complements (does not
283    /// replace) `answer_calibrated`. Default false; A/B requires Seal-0
284    /// hallucination DOWN with SimpleQA accuracy NOT regressed before flip.
285    #[serde(default)]
286    pub answer_guarded: bool,
287    /// Use SearXNG structured sources (gated, W0). SearXNG's `infoboxes[]` /
288    /// `answers[]` arrays carry Wikidata/Wikipedia knowledge-panel facts
289    /// (entity attributes like religion/capital/director) that the `results[]`
290    /// transform path discards. With this on, those facts are parsed and pinned
291    /// as a high-trust source at the FRONT of the answer pool (still
292    /// UNTRUSTED-wrapped — widens evidence, never bypasses the safety wrapper).
293    /// Targets the obscure-entity recall gap (PopQA). Default false; A/B on
294    /// diag500 gold-in-sources with the wrong-non-abstain invariant before flip.
295    #[serde(default)]
296    pub use_structured_sources: bool,
297    /// Deterministic Wikidata entity-relation lookup (gated, W3). For
298    /// `<relation> of <entity>` questions (PopQA's obscure long tail that web
299    /// search can't surface), classify -> wbsearchentities -> property fetch and
300    /// pin the fact as a structured source (UNTRUSTED-wrapped, runs in parallel
301    /// with SearXNG, 3s-bounded, any error falls through). Free open data, no
302    /// AI, no SPARQL hot-path. Default false; A/B on diag500 PopQA accuracy +
303    /// the wrong-non-abstain invariant before flip.
304    #[serde(default)]
305    pub wikidata_lookup: bool,
306    /// Snippet fallback for the LLM answer path (gated): when a top-N result's
307    /// scrape failed (empty `markdown`), the result is normally dropped from the
308    /// answer pool — if it was the answer-bearing page, crw abstains though
309    /// retrieval succeeded (diagnosed Pattern A). With this on, such results
310    /// fall back to their SearXNG `description` snippet as a thin source instead
311    /// of vanishing. The snippet is verbatim upstream text, so it cannot inject
312    /// a fact not already present — near-zero INCORRECT exposure. Default false.
313    #[serde(default)]
314    pub snippet_fallback: bool,
315    /// Relevance gate for the LLM answer / summarize re-rank (gated). After the
316    /// lexical-core junk/coverage/geo filters, keep only the rows that cover the
317    /// MOST important (non-stopword) query terms present in the pool, so a
318    /// partial-match homonym ("best pizza in REDMOND" for "best pizza in
319    /// belgrade", coverage 1/2) is evicted the instant a full-match row
320    /// ("pizza … belgrade", 2/2) is present. Ranks on the query's OWN tokens —
321    /// no geo/country/IP signal — so it holds for self-hosted deployments in any
322    /// region. Monotone-safe (degrade fallback applies first; never empties a
323    /// non-empty pool). Requires `rerank_enabled`. Default false; A/B against
324    /// the frozen rerank benchmark before flip.
325    #[serde(default)]
326    pub rerank_relevance: bool,
327    /// List-format answers for the LLM answer path (gated). When the query has
328    /// list intent ("best/top X in Y", "recommend …", "list of …"), the answer
329    /// prompt's prose directive is swapped for a ranked-list directive so the
330    /// model emits up to 10 named options (`N. <name> — <why>`) instead of a
331    /// 3–6 sentence paragraph. A deterministic classifier (`is_list_intent`)
332    /// decides per query; factual/non-list queries are untouched. The "use ONLY
333    /// sources" grounding, the abstention rule, and the `===CITATIONS===` block
334    /// are preserved (no fabrication, citation moat intact). Default false; A/B
335    /// against the answer-accuracy benchmark before flip.
336    #[serde(default)]
337    pub answer_list_format: bool,
338}
339
340impl Default for SearchConfig {
341    fn default() -> Self {
342        Self {
343            enabled: true,
344            searxng_url: None,
345            timeout_ms: default_search_timeout_ms(),
346            default_limit: default_search_limit(),
347            max_limit: default_search_max_limit(),
348            research_engines: default_research_engines(),
349            github_engines: default_github_engines(),
350            rerank_enabled: true,
351            query_expand: false,
352            query_expand_variants: default_query_expand_variants(),
353            multi_round: false,
354            passage_select: false,
355            page2_fallback: false,
356            answer_calibrated: false,
357            answer_guarded: false,
358            use_structured_sources: false,
359            wikidata_lookup: false,
360            snippet_fallback: false,
361            rerank_relevance: false,
362            answer_list_format: false,
363        }
364    }
365}
366
/// Serde default for `SearchConfig::query_expand_variants`: a single rewrite.
fn default_query_expand_variants() -> usize {
    1
}
/// Serde default for the `[search]` flags that are on by default
/// (`enabled`, `rerank_enabled`).
fn default_true_search() -> bool {
    true
}
/// Serde default for `SearchConfig::timeout_ms`: 15 seconds, in milliseconds.
fn default_search_timeout_ms() -> u64 {
    15_000
}
/// Serde default for `SearchConfig::default_limit`.
fn default_search_limit() -> u32 {
    5
}
/// Serde default for `SearchConfig::max_limit`.
fn default_search_max_limit() -> u32 {
    20
}
/// Serde default for `SearchConfig::research_engines`: the SearXNG engine
/// names queried for `categories: ["research"]`, in this order.
fn default_research_engines() -> Vec<String> {
    const ENGINES: [&str; 4] = ["arxiv", "crossref", "google scholar", "semantic scholar"];
    ENGINES.iter().map(|name| (*name).to_owned()).collect()
}
/// Serde default for `SearchConfig::github_engines`: only the `github` engine.
fn default_github_engines() -> Vec<String> {
    vec![String::from("github")]
}
393
394/// Per-request defaults that apply to every scrape, crawl, or map call when
395/// the caller does not specify an override. Currently only governs the
396/// end-to-end deadline budget (see `crw-core/src/deadline.rs`).
397#[derive(Debug, Clone, Deserialize)]
398pub struct RequestConfig {
399    /// Default end-to-end deadline budget in milliseconds when a request does
400    /// not specify `deadlineMs`. The SLO p95 latency metric is computed only
401    /// over requests with `deadline_ms <= 8000`; longer values land in a
402    /// separate slow-path histogram.
403    #[serde(default = "default_deadline_ms")]
404    pub deadline_ms_default: u64,
405    /// When `true` (default), an implicit deadline (no per-request `deadlineMs`)
406    /// is auto-extended to `max(deadline_ms_default, ladder_min)` where
407    /// `ladder_min = sum(http+lightpanda+chrome timeouts) + N_cdp_tiers * 28s`.
408    /// This prevents `chrome_timeout_ms = 30000` from appearing inert when
409    /// `deadline_ms_default` is small (issue #35).
410    ///
411    /// Set to `false` to enforce a strict SLO regardless of tier sizing —
412    /// requests that would have completed under the extended budget will
413    /// instead time out at `deadline_ms_default`.
414    #[serde(default = "default_true_request")]
415    pub auto_extend_deadline_for_ladder: bool,
416}
417
418impl Default for RequestConfig {
419    fn default() -> Self {
420        Self {
421            deadline_ms_default: default_deadline_ms(),
422            auto_extend_deadline_for_ladder: true,
423        }
424    }
425}
426
/// Serde default for `RequestConfig::auto_extend_deadline_for_ladder`.
fn default_true_request() -> bool {
    true
}
430
/// Serde default for `RequestConfig::deadline_ms_default`: 8 seconds, in
/// milliseconds.
fn default_deadline_ms() -> u64 {
    8_000
}
434
435#[derive(Debug, Clone, Deserialize)]
436pub struct ServerConfig {
437    #[serde(default = "default_host")]
438    pub host: String,
439    #[serde(default = "default_port")]
440    pub port: u16,
441    #[serde(default = "default_request_timeout")]
442    pub request_timeout_secs: u64,
443    /// Maximum requests per second (global). 0 = unlimited.
444    #[serde(default = "default_rate_limit_rps")]
445    pub rate_limit_rps: u64,
446}
447
448impl Default for ServerConfig {
449    fn default() -> Self {
450        Self {
451            host: default_host(),
452            port: default_port(),
453            request_timeout_secs: default_request_timeout(),
454            rate_limit_rps: default_rate_limit_rps(),
455        }
456    }
457}
458
/// Serde default for `ServerConfig::rate_limit_rps` (requests per second).
fn default_rate_limit_rps() -> u64 {
    10
}
462
/// Serde default for `ServerConfig::host`: the IPv4 wildcard address.
fn default_host() -> String {
    String::from("0.0.0.0")
}
/// Serde default for `ServerConfig::port`.
fn default_port() -> u16 {
    3000
}
/// Serde default for `ServerConfig::request_timeout_secs` (seconds).
fn default_request_timeout() -> u64 {
    60
}
472
473/// Selects which JS renderer(s) the [`FallbackRenderer`] will build.
474///
475/// - `Auto` (default): try every configured CDP endpoint (Lightpanda, Playwright, Chrome)
476///   in order. If none is configured, JS rendering is disabled but HTTP still works.
477/// - `None`: HTTP-only. Never attempt JS rendering.
478/// - `Lightpanda` / `Chrome` / `Playwright`: require the matching `[renderer.<name>]`
479///   endpoint; fail startup if missing. Only the named backend is used.
480///
481/// [`FallbackRenderer`]: https://docs.rs/crw-renderer/latest/crw_renderer/struct.FallbackRenderer.html
482#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Deserialize)]
483#[serde(rename_all = "lowercase")]
484pub enum RendererMode {
485    #[default]
486    Auto,
487    None,
488    Lightpanda,
489    Chrome,
490    Playwright,
491}
492
493#[derive(Debug, Clone, Deserialize)]
494pub struct RendererConfig {
495    #[serde(default)]
496    pub mode: RendererMode,
497    /// Generic per-page navigation timeout. Used as the fallback when no
498    /// per-tier override is configured. Kept for backward compatibility — the
499    /// per-tier knobs below are preferred for new deployments.
500    #[serde(default = "default_page_timeout")]
501    pub page_timeout_ms: u64,
502    /// Override for the HTTP-only fetcher request timeout. Falls back to
503    /// `page_timeout_ms` when unset. HTTP responses arrive quickly when they
504    /// arrive at all, so 15s is generous and keeps slow upstreams from
505    /// hogging the request budget that should be spent on JS retries.
506    #[serde(default)]
507    pub http_timeout_ms: Option<u64>,
508    /// Override for the LightPanda CDP renderer. LightPanda completes most
509    /// renders in <10s; if it stalls past 20s it almost always means an
510    /// adversarial page that Chrome will render anyway, so failing fast and
511    /// escalating beats waiting it out.
512    #[serde(default)]
513    pub lightpanda_timeout_ms: Option<u64>,
514    /// Override for the full-Chromium tier. Chrome is the slow path
515    /// (gov/legal SPAs need 30–40s for `networkidle`); the larger budget here
516    /// recovers ~6 URLs per fc-wins iteration without affecting the fast path.
517    #[serde(default)]
518    pub chrome_timeout_ms: Option<u64>,
519    #[serde(default = "default_pool_size")]
520    pub pool_size: usize,
521    /// If set, applies to every request that doesn't specify `renderJs` explicitly.
522    /// `Some(true)` = force JS rendering; `Some(false)` = skip JS; `None` = auto-detect.
523    ///
524    /// Accepts the `force_js` alias for backward compatibility.
525    #[serde(default, alias = "force_js")]
526    pub render_js_default: Option<bool>,
527    #[serde(default)]
528    pub lightpanda: Option<CdpEndpoint>,
529    #[serde(default)]
530    pub playwright: Option<CdpEndpoint>,
531    #[serde(default)]
532    pub chrome: Option<CdpEndpoint>,
533    /// Residential-proxy Chrome tier (opt-in 4th renderer). Same Chromium
534    /// browser as `chrome`, but egress routed through a forwarder that adds
535    /// upstream proxy auth (e.g. DataImpulse). Tried after Chrome fails —
536    /// covers IP-blocked targets where the browser fingerprint is fine but
537    /// the VPS egress IP is flagged.
538    #[serde(default)]
539    pub chrome_proxy: Option<CdpEndpoint>,
540    /// Per-tier nav timeout override for `chrome_proxy`. When unset, defaults
541    /// to `chrome_timeout() + 15_000` — the proxy hop adds latency, so the
542    /// fallback tier needs more headroom than direct Chrome.
543    #[serde(default)]
544    pub chrome_proxy_timeout_ms: Option<u64>,
545    /// Enable Chrome resource interception (`Fetch.enable` blocking of media,
546    /// fonts, trackers). Default `false`; flipped after the CDP-fake suite
547    /// validates pump + cleanup behaviour. See plan Phase 2.
548    #[serde(default)]
549    pub chrome_intercept_resources: bool,
550    /// Additionally block `stylesheet` requests when interception is enabled.
551    /// Default `false` — kept off in v1 because some extractors depend on
552    /// CSS-driven visibility / lazy-content triggers.
553    #[serde(default)]
554    pub chrome_intercept_stylesheets: bool,
555    /// Per-host opt-out for chrome interception. Hosts in this list run with
556    /// interception disabled even when `chrome_intercept_resources = true`.
557    #[serde(default)]
558    pub chrome_host_intercept_disable: Vec<String>,
559    /// Hard chrome-tier navigation budget in ms. Wraps `wait_for_page_ready`
560    /// in an inner race; on budget hit the renderer snapshots whatever DOM is
561    /// present and returns `truncated = true`. Calibrated as
562    /// `p90(successful chrome renders)` clamped to `[8_000, 12_000]`.
563    #[serde(default = "default_chrome_nav_budget_ms")]
564    pub chrome_nav_budget_ms: u64,
565    /// Enable the bounded browser-context pool. Default `false`; v1 ships
566    /// `RECYCLE_AFTER_NAV = 1` (recreate every release) before optimising to
567    /// reuse-with-clearing. See plan Phase 4. **Gated off when
568    /// `chrome_backend = "browserless"`** — browserless v2's
569    /// `Target.createBrowserContext` semantics with long-lived sessions are
570    /// unproven; lib.rs forces this to `false` with a WARN log in that case.
571    #[serde(default)]
572    pub chrome_context_pool_enabled: bool,
573    /// Per-knob pool configuration. Read only when
574    /// `chrome_context_pool_enabled = true` AND backend is `Vanilla`.
575    #[serde(default)]
576    pub chrome_pool: ChromePoolConfig,
577    /// Which Chrome backend the WS URL points at. **Explicit** — never sniff
578    /// from URL substrings (k8s svc names, port-forwards, custom routes break
579    /// substring detection per plan §C2). Default `Vanilla`.
580    #[serde(default)]
581    pub chrome_backend: ChromeBackend,
582    /// Enable the success-ratio renderer predictor in `HostPreferences`.
583    /// Default `false`; flipped after the predictor replay harness gates
584    /// on the 1k bench (false-skip < 2 %, false-escalate < 5 %, churn < 3 / 1k).
585    #[serde(default)]
586    pub use_predictor: bool,
587    /// Engine escalation policy (firecrawl-shaped: race + on-error). When
588    /// disabled (default), the renderer keeps its current ladder unchanged.
589    #[serde(default)]
590    pub escalation: EscalationConfig,
591    /// Anti-bot detection policy (crawl4ai 3-tier classifier).
592    #[serde(default)]
593    pub antibot: AntibotConfig,
594    /// DataImpulse residential-proxy base username (without `__cr.<cc>`
595    /// country suffix). When set alongside [`proxy_base_pass`], the engine
596    /// drives Chrome's proxy auth via CDP `Fetch.authRequired` and composes
597    /// the country-suffixed username per request. Read only by the
598    /// `chrome_proxy` tier. None = no upstream proxy auth (chrome_proxy
599    /// tier still functional only if a no-auth or pre-authed proxy is in
600    /// front of Chrome).
601    #[serde(default)]
602    pub proxy_base_user: Option<String>,
603    /// DataImpulse base password — see [`proxy_base_user`].
604    #[serde(default)]
605    pub proxy_base_pass: Option<String>,
606    /// Fallback country code used when a request omits `country`. Lowercased
607    /// 2-letter ISO 3166-1 alpha-2 (e.g. "us"). None = global pool (no suffix).
608    #[serde(default)]
609    pub proxy_default_country: Option<String>,
610}
611
612/// Engine escalation policy — adds `ChromeStealth` and `ChromeStealthProxy`
613/// tiers behind a feature flag. See `plans/recall-next-tier.md` Phase 2.
614#[derive(Debug, Clone, Deserialize)]
615pub struct EscalationConfig {
616    /// Master switch. Default `false` — current ladder runs unchanged.
617    #[serde(default)]
618    pub enabled: bool,
619    /// Per-tier waterfall trigger in ms. If the current engine hasn't returned
620    /// after this long, the next tier is started in parallel (firecrawl
621    /// `WaterfallNextEngineSignal`).
622    #[serde(default = "default_waterfall_timeout_ms")]
623    pub waterfall_timeout_ms: u64,
624    /// Hard global cap across the whole ladder.
625    #[serde(default = "default_escalation_global_timeout_ms")]
626    pub global_timeout_ms: u64,
627    /// Send `?proxy=residential&proxyCountry=…` to browserless on the
628    /// `ChromeStealthProxy` tier. Off by default — bears cost.
629    #[serde(default)]
630    pub residential_proxy: bool,
631    /// Country code passed to browserless when `residential_proxy = true`.
632    #[serde(default = "default_proxy_country")]
633    pub proxy_country: String,
634}
635
636impl Default for EscalationConfig {
637    fn default() -> Self {
638        Self {
639            enabled: false,
640            waterfall_timeout_ms: default_waterfall_timeout_ms(),
641            global_timeout_ms: default_escalation_global_timeout_ms(),
642            residential_proxy: false,
643            proxy_country: default_proxy_country(),
644        }
645    }
646}
647
/// Default `waterfall_timeout_ms`: 8 seconds, expressed in milliseconds.
fn default_waterfall_timeout_ms() -> u64 {
    8 * 1_000
}
/// Default escalation `global_timeout_ms`: 60 seconds, expressed in milliseconds.
fn default_escalation_global_timeout_ms() -> u64 {
    60 * 1_000
}
/// Default escalation `proxy_country`.
fn default_proxy_country() -> String {
    String::from("us")
}
657
658/// Anti-bot classifier policy. Default: detect+log only; escalation requires
659/// `escalate_on_signal = true` AND `escalation.enabled = true`.
660#[derive(Debug, Clone, Deserialize)]
661pub struct AntibotConfig {
662    /// Run the classifier on every fetch result. Cheap; default on.
663    #[serde(default = "default_true")]
664    pub enabled: bool,
665    /// When the classifier returns a non-`None` signal, advance to the next
666    /// engine tier (requires `escalation.enabled`).
667    #[serde(default)]
668    pub escalate_on_signal: bool,
669    /// When the classifier flags a block during the renderer failover loop,
670    /// treat the result as a soft failure so the loop advances to the next
671    /// tier — ending at `chrome_proxy` (residential). Default `true`. Set
672    /// `false` to keep the classifier running (error_code + telemetry) while
673    /// disabling in-loop escalation — the one-line kill switch.
674    #[serde(default = "default_true")]
675    pub escalate_in_failover: bool,
676}
677
678impl Default for AntibotConfig {
679    fn default() -> Self {
680        Self {
681            enabled: true,
682            escalate_on_signal: false,
683            escalate_in_failover: true,
684        }
685    }
686}
687
/// Default `chrome_nav_budget_ms`: 12 seconds, expressed in milliseconds.
fn default_chrome_nav_budget_ms() -> u64 {
    12 * 1_000
}
691
692/// Per-knob configuration for the bounded browser-context pool. Loaded under
693/// `[renderer.chrome_pool]`. Inactive unless
694/// `chrome_context_pool_enabled = true` AND `chrome_backend = "vanilla"`.
695#[derive(Debug, Clone, Deserialize)]
696pub struct ChromePoolConfig {
697    /// Pool size. `None` → `max(2, num_cpus / 2)`. Caps simultaneous
698    /// in-flight chrome requests per pool.
699    #[serde(default)]
700    pub size: Option<usize>,
701    /// Recycle policy: v1 always recreates the context after each release.
702    /// Reserved for a future "reuse N navigations then recreate" mode.
703    #[serde(default = "default_recycle_after_navs")]
704    pub recycle_after_navs: u32,
705    /// Idle slots older than this are health-checked on next acquire.
706    #[serde(default = "default_idle_timeout_secs")]
707    pub idle_timeout_secs: u64,
708    /// `Browser.getVersion` probe deadline (idle-slot liveness).
709    #[serde(default = "default_health_check_secs")]
710    pub health_check_secs: u64,
711    /// SIGTERM drain window before phase 3 force-close.
712    #[serde(default = "default_shutdown_drain_secs")]
713    pub shutdown_drain_secs: u64,
714}
715
716impl Default for ChromePoolConfig {
717    fn default() -> Self {
718        Self {
719            size: None,
720            recycle_after_navs: default_recycle_after_navs(),
721            idle_timeout_secs: default_idle_timeout_secs(),
722            health_check_secs: default_health_check_secs(),
723            shutdown_drain_secs: default_shutdown_drain_secs(),
724        }
725    }
726}
727
/// Default `recycle_after_navs`: recreate the context after one navigation.
fn default_recycle_after_navs() -> u32 {
    1
}
/// Default `idle_timeout_secs`: five minutes.
fn default_idle_timeout_secs() -> u64 {
    5 * 60
}
/// Default `health_check_secs`: one minute.
fn default_health_check_secs() -> u64 {
    60
}
/// Default `shutdown_drain_secs`: thirty seconds.
fn default_shutdown_drain_secs() -> u64 {
    30
}
740
741/// Chrome backend kind. Set explicitly under `[renderer]` as
742/// `chrome_backend = "vanilla"` or `chrome_backend = "browserless"`. **Never
743/// inferred from URL substrings** — k8s service names, port-forwards, and
744/// custom routes break substring detection. See plan §C2.
745#[derive(Debug, Clone, Copy, Default, Deserialize, PartialEq, Eq)]
746#[serde(rename_all = "lowercase")]
747pub enum ChromeBackend {
748    /// chromedp/headless-shell or vanilla Chrome with `/json/version`. Pool
749    /// is enabled here when `chrome_context_pool_enabled = true`.
750    #[default]
751    Vanilla,
752    /// Browserless v2 / commercial CDP endpoint. Pool is **gated off** in v1
753    /// — see plan §"Out of scope (v1)".
754    Browserless,
755}
756
757impl Default for RendererConfig {
758    fn default() -> Self {
759        Self {
760            mode: RendererMode::default(),
761            page_timeout_ms: default_page_timeout(),
762            http_timeout_ms: None,
763            lightpanda_timeout_ms: None,
764            chrome_timeout_ms: None,
765            pool_size: default_pool_size(),
766            render_js_default: None,
767            lightpanda: None,
768            playwright: None,
769            chrome: None,
770            chrome_proxy: None,
771            chrome_proxy_timeout_ms: None,
772            chrome_intercept_resources: false,
773            chrome_intercept_stylesheets: false,
774            chrome_host_intercept_disable: Vec::new(),
775            chrome_nav_budget_ms: default_chrome_nav_budget_ms(),
776            chrome_context_pool_enabled: false,
777            chrome_pool: ChromePoolConfig::default(),
778            chrome_backend: ChromeBackend::default(),
779            use_predictor: false,
780            escalation: EscalationConfig::default(),
781            antibot: AntibotConfig::default(),
782            proxy_base_user: None,
783            proxy_base_pass: None,
784            proxy_default_country: None,
785        }
786    }
787}
/// Default `page_timeout_ms`: 30 seconds, expressed in milliseconds.
fn default_page_timeout() -> u64 {
    30 * 1_000
}
791
792impl RendererConfig {
793    /// Resolved per-tier nav timeout in milliseconds. Resolution rules:
794    ///   1. If the explicit per-tier field is set, use it verbatim.
795    ///   2. Otherwise fall back to `page_timeout_ms` (which itself defaults
796    ///      to 30s for backward compatibility with pre-multi-tier configs).
797    ///
798    /// New deployments are encouraged to set the per-tier knobs to 15/20/45s
799    /// (see config.docker.toml) — these match the bench-tuned values that
800    /// recover slow gov sites in the chrome tier without giving the http
801    /// tier permission to hog the request budget.
802    pub fn http_timeout(&self) -> u64 {
803        self.http_timeout_ms.unwrap_or(self.page_timeout_ms)
804    }
805    pub fn lightpanda_timeout(&self) -> u64 {
806        self.lightpanda_timeout_ms.unwrap_or(self.page_timeout_ms)
807    }
808    pub fn chrome_timeout(&self) -> u64 {
809        self.chrome_timeout_ms.unwrap_or(self.page_timeout_ms)
810    }
811    pub fn chrome_proxy_timeout(&self) -> u64 {
812        self.chrome_proxy_timeout_ms
813            .unwrap_or_else(|| self.chrome_timeout().saturating_add(15_000))
814    }
815
816    /// Compose the DataImpulse-style proxy credentials for a single request.
817    ///
818    /// Resolution order for the country suffix:
819    /// 1. `country` argument (per-request override)
820    /// 2. `self.proxy_default_country` (server default)
821    /// 3. No suffix → DataImpulse global pool
822    ///
823    /// Returns `None` when no base credentials are configured — caller treats
824    /// this as "no auth required". An invalid country code (wrong length,
825    /// non-alphabetic) silently falls through to the default; that keeps a
826    /// malformed `?country=` query from creating an unauthenticated request
827    /// while still letting through a well-known default.
828    pub fn effective_proxy_credentials(&self, country: Option<&str>) -> Option<(String, String)> {
829        let user = self.proxy_base_user.as_ref()?;
830        let pass = self.proxy_base_pass.as_ref()?;
831        let cc = country
832            .or(self.proxy_default_country.as_deref())
833            .map(|s| s.trim().to_lowercase())
834            .filter(|s| s.len() == 2 && s.chars().all(|c| c.is_ascii_alphabetic()));
835        Some(match cc {
836            Some(cc) => (format!("{user}__cr.{cc}"), pass.clone()),
837            None => (user.clone(), pass.clone()),
838        })
839    }
840
841    /// Number of active CDP tiers (lightpanda + playwright + chrome) under
842    /// the current `mode`. Mirrors the predicate used at runtime in
843    /// `crw-renderer/src/lib.rs` when constructing the renderer ladder:
844    /// `want(mode) && config.<tier>.is_some()`.
845    ///
846    /// Returns `0` when the binary is built without the `cdp` feature — in
847    /// that case no JS renderer can be constructed regardless of the config,
848    /// so the deadline auto-extension policy must collapse to HTTP-only.
849    pub fn cdp_tier_count(&self) -> usize {
850        if !cfg!(feature = "cdp") {
851            return 0;
852        }
853        let want =
854            |m: RendererMode| -> bool { matches!(self.mode, RendererMode::Auto) || self.mode == m };
855        let mut n = 0;
856        if want(RendererMode::Lightpanda) && self.lightpanda.is_some() {
857            n += 1;
858        }
859        if want(RendererMode::Playwright) && self.playwright.is_some() {
860            n += 1;
861        }
862        if want(RendererMode::Chrome) && self.chrome.is_some() {
863            n += 1;
864        }
865        n
866    }
867
868    /// Minimum request deadline budget (ms) required so that every configured
869    /// tier can use its full allowance when fallback exhausts the chain.
870    /// Sums the per-tier timeouts and adds [`CDP_TIER_OVERHEAD_MS`] for each
871    /// active CDP tier, matching the runtime ladder built in
872    /// `crw-renderer/src/lib.rs`.
873    pub fn min_deadline_for_full_ladder_ms(&self) -> u64 {
874        let want =
875            |m: RendererMode| -> bool { matches!(self.mode, RendererMode::Auto) || self.mode == m };
876
877        let mut sum: u64 = 0;
878        // HTTP prefetch runs ahead of any JS tier (content-type sniffing,
879        // direct PDF/binary handling) regardless of pinned mode. Skipped only
880        // when mode is `None` (no fetching at all).
881        if !matches!(self.mode, RendererMode::None) {
882            sum = sum.saturating_add(self.http_timeout());
883        }
884
885        // CDP tiers only contribute when the binary was built with the `cdp`
886        // feature; otherwise no JS renderer is constructable at runtime and
887        // including their budgets would over-extend the deadline.
888        if !cfg!(feature = "cdp") {
889            return sum;
890        }
891
892        let mut cdp_tier_count: u64 = 0;
893        if want(RendererMode::Lightpanda) && self.lightpanda.is_some() {
894            sum = sum.saturating_add(self.lightpanda_timeout());
895            cdp_tier_count += 1;
896        }
897        if want(RendererMode::Playwright) && self.playwright.is_some() {
898            sum = sum.saturating_add(self.chrome_timeout());
899            cdp_tier_count += 1;
900        }
901        if want(RendererMode::Chrome) && self.chrome.is_some() {
902            sum = sum.saturating_add(self.chrome_timeout());
903            cdp_tier_count += 1;
904        }
905        sum.saturating_add(cdp_tier_count.saturating_mul(CDP_TIER_OVERHEAD_MS))
906    }
907}
/// Default renderer `pool_size`.
fn default_pool_size() -> usize {
    const POOL_SIZE: usize = 4;
    POOL_SIZE
}
911
912#[derive(Debug, Clone, Deserialize)]
913pub struct CdpEndpoint {
914    pub ws_url: String,
915}
916
917/// Stealth mode configuration for evading bot detection.
918#[derive(Debug, Clone, Deserialize)]
919pub struct StealthConfig {
920    /// Enable stealth mode globally.
921    #[serde(default)]
922    pub enabled: bool,
923    /// Custom user-agent pool. Empty = use built-in pool.
924    #[serde(default)]
925    pub user_agents: Vec<String>,
926    /// Jitter factor for rate limiting (0.0–1.0, default 0.2 = ±20%).
927    #[serde(default = "default_jitter")]
928    pub jitter_factor: f64,
929    /// Inject realistic browser headers (Accept, Sec-Fetch-*, etc.).
930    #[serde(default = "default_true")]
931    pub inject_headers: bool,
932}
933
934impl Default for StealthConfig {
935    fn default() -> Self {
936        Self {
937            enabled: false,
938            user_agents: vec![],
939            jitter_factor: default_jitter(),
940            inject_headers: true,
941        }
942    }
943}
944
/// Default stealth `jitter_factor` (±20%).
fn default_jitter() -> f64 {
    const JITTER: f64 = 0.2;
    JITTER
}
948
/// Realistic user-agent strings shipped with the crate; this is the pool
/// used when stealth is enabled.
pub const BUILTIN_UA_POOL: &[&str] = &[
    // Chrome 131 — Windows
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    // Chrome 131 — macOS
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    // Chrome 131 — Linux
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    // Firefox 133 — Windows
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
    // Safari 18.2 — macOS
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15",
];
957
958#[derive(Debug, Clone, Deserialize)]
959pub struct CrawlerConfig {
960    #[serde(default = "default_concurrency")]
961    pub max_concurrency: usize,
962    #[serde(default = "default_rps")]
963    pub requests_per_second: f64,
964    #[serde(default = "default_true")]
965    pub respect_robots_txt: bool,
966    #[serde(default = "default_ua")]
967    pub user_agent: String,
968    #[serde(default = "default_depth")]
969    pub default_max_depth: u32,
970    #[serde(default = "default_max_pages")]
971    pub default_max_pages: u32,
972    /// Proxy URL for crawler requests. Supports HTTP, HTTPS, and SOCKS5
973    /// (e.g. "http://proxy:8080" or "socks5://user:pass@proxy:1080").
974    #[serde(default)]
975    pub proxy: Option<String>,
976    /// TTL in seconds for completed crawl jobs before cleanup (default: 3600)
977    #[serde(default = "default_job_ttl")]
978    pub job_ttl_secs: u64,
979    #[serde(default)]
980    pub stealth: StealthConfig,
981    /// Floor for the per-host limiter interval, in milliseconds. When a host
982    /// advertises `Crawl-delay` in robots.txt, the higher of the two wins.
983    /// Default `0` — robots.txt is the authoritative source, this is a
984    /// per-deployment safety net.
985    #[serde(default)]
986    pub per_host_min_interval_ms: u64,
987    /// Maximum concurrent in-flight requests against a single eTLD+1.
988    /// Default `1` — strict ethics posture; operators raise consciously via
989    /// config when scraping their own infrastructure.
990    #[serde(default = "default_per_host_max_concurrent")]
991    pub per_host_max_concurrent: u32,
992}
993
/// Default `per_host_max_concurrent`: one in-flight request per host.
fn default_per_host_max_concurrent() -> u32 {
    const ONE_AT_A_TIME: u32 = 1;
    ONE_AT_A_TIME
}
997
998impl Default for CrawlerConfig {
999    fn default() -> Self {
1000        Self {
1001            max_concurrency: default_concurrency(),
1002            requests_per_second: default_rps(),
1003            respect_robots_txt: true,
1004            user_agent: default_ua(),
1005            default_max_depth: default_depth(),
1006            default_max_pages: default_max_pages(),
1007            proxy: None,
1008            job_ttl_secs: default_job_ttl(),
1009            stealth: StealthConfig::default(),
1010            per_host_min_interval_ms: 0,
1011            per_host_max_concurrent: default_per_host_max_concurrent(),
1012        }
1013    }
1014}
1015
/// Default crawler `max_concurrency`.
fn default_concurrency() -> usize {
    const CONCURRENCY: usize = 10;
    CONCURRENCY
}
/// Default crawler `requests_per_second`.
fn default_rps() -> f64 {
    const RPS: f64 = 10.0;
    RPS
}
/// Shared serde default for boolean fields that are on unless configured off.
fn default_true() -> bool {
    true
}
/// Default crawler `user_agent`.
fn default_ua() -> String {
    // Modern Chrome UA. The legacy "CRW/0.1" was rejected by UA-filtering sites
    // (opencorporates, killeenisd, wsj) returning 403/404. Kept in sync with the
    // Sec-Ch-Ua client hint in `crw-renderer/src/http_only.rs`.
    const CHROME_UA: &str = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";
    String::from(CHROME_UA)
}
/// Default `default_max_depth`.
fn default_depth() -> u32 {
    const DEPTH: u32 = 2;
    DEPTH
}
/// Default `default_max_pages`.
fn default_max_pages() -> u32 {
    const PAGES: u32 = 100;
    PAGES
}
/// Default `job_ttl_secs`: one hour.
fn default_job_ttl() -> u64 {
    60 * 60
}
1042
1043#[derive(Debug, Clone, Deserialize)]
1044pub struct ExtractionConfig {
1045    #[serde(default = "default_format")]
1046    pub default_format: String,
1047    #[serde(default = "default_true_ext")]
1048    pub only_main_content: bool,
1049    #[serde(default)]
1050    pub llm: Option<LlmConfig>,
1051    /// Hostname → CSS selector overrides applied before readability narrowing.
1052    /// Match is exact host (no wildcard); user-supplied selector still wins.
1053    #[serde(default)]
1054    pub domain_selectors: std::collections::HashMap<String, String>,
1055    #[serde(default)]
1056    pub llm_fallback: LlmFallbackConfig,
1057    /// Bytes below which an HTTP-tier extraction is treated as "thin"
1058    /// and triggers a JS-renderer escalation. Default 100.
1059    #[serde(default = "default_http_retry_threshold")]
1060    pub http_retry_threshold_bytes: usize,
1061    /// Bytes below which a LightPanda-tier extraction is treated as
1062    /// "thin" and triggers a Chrome escalation. Default 2000 (LP often
1063    /// returns SPA husks of 90–500B that pass HTML-shape checks).
1064    #[serde(default = "default_lightpanda_retry_threshold")]
1065    pub lightpanda_retry_threshold_bytes: usize,
1066}
1067
/// Default `http_retry_threshold_bytes`.
fn default_http_retry_threshold() -> usize {
    const THIN_HTTP_BYTES: usize = 100;
    THIN_HTTP_BYTES
}

/// Default `lightpanda_retry_threshold_bytes`.
fn default_lightpanda_retry_threshold() -> usize {
    const THIN_LIGHTPANDA_BYTES: usize = 2_000;
    THIN_LIGHTPANDA_BYTES
}
1075
1076impl Default for ExtractionConfig {
1077    fn default() -> Self {
1078        Self {
1079            default_format: default_format(),
1080            only_main_content: true,
1081            llm: None,
1082            domain_selectors: std::collections::HashMap::new(),
1083            llm_fallback: LlmFallbackConfig::default(),
1084            http_retry_threshold_bytes: default_http_retry_threshold(),
1085            lightpanda_retry_threshold_bytes: default_lightpanda_retry_threshold(),
1086        }
1087    }
1088}
1089
1090#[derive(Debug, Clone, Deserialize)]
1091pub struct LlmFallbackConfig {
1092    #[serde(default)]
1093    pub enable: bool,
1094    #[serde(default = "default_llm_quality_threshold")]
1095    pub quality_threshold: f32,
1096    #[serde(default = "default_llm_max_html_bytes")]
1097    pub max_html_bytes: usize,
1098    /// When true (and `enable` is true), invoke the LLM on every page rather
1099    /// than only when DOM-based extraction scores below `quality_threshold`.
1100    /// Mirrors the "LLM as primary extractor" pattern used by Reader-LM,
1101    /// Firecrawl, and similar services. Higher cost, higher recall.
1102    #[serde(default)]
1103    pub always_run: bool,
1104}
1105
1106impl Default for LlmFallbackConfig {
1107    fn default() -> Self {
1108        Self {
1109            enable: false,
1110            quality_threshold: default_llm_quality_threshold(),
1111            max_html_bytes: default_llm_max_html_bytes(),
1112            always_run: false,
1113        }
1114    }
1115}
1116
/// Default LLM-fallback `quality_threshold`.
fn default_llm_quality_threshold() -> f32 {
    const THRESHOLD: f32 = 0.3;
    THRESHOLD
}
/// Default `max_html_bytes` (shared by the LLM and LLM-fallback configs).
fn default_llm_max_html_bytes() -> usize {
    const MAX_BYTES: usize = 100_000;
    MAX_BYTES
}
1123
1124#[derive(Debug, Clone, Deserialize)]
1125pub struct LlmConfig {
1126    #[serde(default = "default_llm_provider")]
1127    pub provider: String,
1128    pub api_key: String,
1129    #[serde(default = "default_llm_model")]
1130    pub model: String,
1131    #[serde(default)]
1132    pub base_url: Option<String>,
1133    #[serde(default = "default_llm_max_tokens")]
1134    pub max_tokens: u32,
1135    /// Azure OpenAI API version (e.g. "2024-05-01-preview"). Required when
1136    /// `provider = "azure"`; ignored otherwise.
1137    #[serde(default)]
1138    pub azure_api_version: Option<String>,
1139    /// Max parallel LLM calls for fan-out (e.g. per-result search summaries).
1140    /// Bounded to avoid hitting provider rate limits.
1141    #[serde(default = "default_llm_max_concurrency")]
1142    pub max_concurrency: usize,
1143    /// Byte cap on content sent to the LLM in a single call. Content beyond
1144    /// the cap is truncated on a UTF-8 char boundary.
1145    #[serde(default = "default_llm_max_html_bytes")]
1146    pub max_html_bytes: usize,
1147    /// When set, opencore refuses LLM-touching requests that lack this header
1148    /// AND do not supply `llm_api_key` in the body. SaaS deploys set this so
1149    /// direct public callers can't access LLM features.
1150    #[serde(default)]
1151    pub require_byok_header: Option<String>,
1152    /// Sampling temperature for the LLM call. `None` (default) sends no
1153    /// `temperature` key, preserving each provider's default (DeepSeek = 1) and
1154    /// current prod behavior. The benchmark/eval harness sets `0.0` (with a
1155    /// seed) to make answers deterministic so a real +2-3pp lever is
1156    /// distinguishable from sampling noise. Prod stays `None` until temp=0 is
1157    /// proven not to raise abstention.
1158    #[serde(default)]
1159    pub temperature: Option<f32>,
1160}
1161
1162impl Default for LlmConfig {
1163    fn default() -> Self {
1164        Self {
1165            provider: default_llm_provider(),
1166            api_key: String::new(),
1167            model: default_llm_model(),
1168            base_url: None,
1169            max_tokens: default_llm_max_tokens(),
1170            azure_api_version: None,
1171            max_concurrency: default_llm_max_concurrency(),
1172            max_html_bytes: default_llm_max_html_bytes(),
1173            require_byok_header: None,
1174            temperature: None,
1175        }
1176    }
1177}
1178
/// Default LLM `max_concurrency`.
fn default_llm_max_concurrency() -> usize {
    const PARALLEL_CALLS: usize = 4;
    PARALLEL_CALLS
}

/// Default LLM `provider`.
fn default_llm_provider() -> String {
    String::from("anthropic")
}
/// Default LLM `model`.
fn default_llm_model() -> String {
    String::from("claude-sonnet-4-20250514")
}
/// Default LLM `max_tokens`.
fn default_llm_max_tokens() -> u32 {
    const MAX_TOKENS: u32 = 4_096;
    MAX_TOKENS
}

/// Default extraction `default_format`.
fn default_format() -> String {
    String::from("markdown")
}
/// Serde default for `ExtractionConfig::only_main_content`.
fn default_true_ext() -> bool {
    true
}
1199
1200/// Custom deserializer for Vec<String> that accepts:
1201/// - TOML array: `api_keys = ["key1", "key2"]`
1202/// - JSON array: `["key1", "key2"]` (for env vars)
1203/// - Comma-separated: `key1,key2` (for simple env var usage)
1204fn deserialize_string_vec<'de, D>(deserializer: D) -> Result<Vec<String>, D::Error>
1205where
1206    D: serde::Deserializer<'de>,
1207{
1208    #[derive(serde::Deserialize)]
1209    #[serde(untagged)]
1210    enum StringOrVec {
1211        Vec(Vec<String>),
1212        Str(String),
1213    }
1214
1215    match StringOrVec::deserialize(deserializer)? {
1216        StringOrVec::Vec(v) => Ok(v),
1217        StringOrVec::Str(s) => {
1218            let s = s.trim();
1219            // Try JSON array first
1220            if s.starts_with('[') {
1221                serde_json::from_str(s).map_err(serde::de::Error::custom)
1222            } else {
1223                // Comma-separated fallback
1224                Ok(s.split(',')
1225                    .map(|s| s.trim().to_string())
1226                    .filter(|s| !s.is_empty())
1227                    .collect())
1228            }
1229        }
1230    }
1231}
1232
1233#[derive(Debug, Clone, Default, Deserialize)]
1234pub struct AuthConfig {
1235    #[serde(default, deserialize_with = "deserialize_string_vec")]
1236    pub api_keys: Vec<String>,
1237}
1238
/// Path of the per-user config file written by `crw setup`.
///
/// Resolution:
/// 1. `$CRW_USER_CONFIG_DIR/config.toml` when that variable is set — used by
///    tests so they don't have to monkey-patch `$HOME`.
/// 2. `$HOME/.config/crw/config.toml` otherwise.
///
/// Returns `None` if the home directory cannot be resolved (e.g. headless
/// container with no `$HOME`).
///
/// Both variables are read as raw OS strings, so a directory whose name is
/// not valid UTF-8 is still honored. An empty value is treated as unset;
/// joining onto an empty base would otherwise produce a path relative to the
/// current working directory.
pub fn user_config_path() -> Option<std::path::PathBuf> {
    // Read a variable, mapping "set but empty" to `None`.
    let non_empty = |name: &str| std::env::var_os(name).filter(|value| !value.is_empty());

    if let Some(dir) = non_empty("CRW_USER_CONFIG_DIR") {
        return Some(std::path::PathBuf::from(dir).join("config.toml"));
    }
    let home = non_empty("HOME")?;
    Some(
        std::path::PathBuf::from(home)
            .join(".config")
            .join("crw")
            .join("config.toml"),
    )
}
1255
1256impl AppConfig {
1257    /// Load config from config.default.toml + per-user config + environment
1258    /// variable overrides.
1259    ///
1260    /// Precedence (highest wins):
1261    ///   1. `CRW_*` env vars (CI/Docker)
1262    ///   2. `$CRW_CONFIG` file (or `config.local.toml` in cwd)
1263    ///   3. `~/.config/crw/config.toml` (written by `crw setup`)
1264    ///   4. `config.default.toml` (bundled defaults)
1265    ///
1266    /// Env stays on top so a one-off `CRW_FOO=bar crw …` always wins over
1267    /// whatever the user has saved, matching how every other shell tool works.
1268    pub fn load() -> Result<Self, config::ConfigError> {
1269        let mut builder = config::Config::builder()
1270            .add_source(config::File::with_name("config.default").required(false));
1271
1272        // User-level config — written atomically by `crw setup`. Optional, so
1273        // a never-configured machine simply reads defaults + env.
1274        if let Some(user_cfg) = user_config_path()
1275            && user_cfg.exists()
1276        {
1277            builder = builder.add_source(config::File::from(user_cfg).required(false));
1278        }
1279
1280        // Load optional override config file (e.g. config.docker.toml in containers).
1281        if let Ok(extra) = std::env::var("CRW_CONFIG") {
1282            builder = builder.add_source(config::File::with_name(&extra).required(true));
1283        } else {
1284            builder = builder.add_source(config::File::with_name("config.local").required(false));
1285        }
1286
1287        let cfg = builder
1288            .add_source(
1289                config::Environment::with_prefix("CRW")
1290                    .prefix_separator("_")
1291                    .separator("__")
1292                    .try_parsing(true),
1293            )
1294            .build()?;
1295        cfg.try_deserialize()
1296    }
1297
1298    /// Compute the effective end-to-end request deadline (ms). Implements the
1299    /// issue-#35 auto-extension policy:
1300    ///
1301    /// 1. If the caller supplied an explicit `requested_deadline_ms`, return it
1302    ///    verbatim — operators trust the request budget over our heuristic.
1303    /// 2. Otherwise, when `request.auto_extend_deadline_for_ladder` is on,
1304    ///    return `max(deadline_ms_default, ladder_min + wait_for_extra)`.
1305    ///    `ladder_min` covers the configured tier ladder; `wait_for_extra`
1306    ///    compensates for callers that bumped `wait_for_ms` above the default
1307    ///    SPA budget (8s) — without it, a long `wait_for` would silently
1308    ///    re-clamp inside CDP.
1309    /// 3. When the policy is disabled, return `deadline_ms_default` unchanged.
1310    ///
1311    /// `wait_for_ms` is the per-request override (ScrapeRequest::wait_for /
1312    /// CrawlRequest::wait_for); pass `None` for sub-fetches that don't
1313    /// surface a wait_for to the caller (search/map enrichment).
1314    pub fn effective_deadline_ms(
1315        &self,
1316        requested_deadline_ms: Option<u64>,
1317        wait_for_ms: Option<u64>,
1318    ) -> u64 {
1319        if let Some(explicit) = requested_deadline_ms {
1320            return explicit;
1321        }
1322        let default_ms = self.request.deadline_ms_default;
1323        if !self.request.auto_extend_deadline_for_ladder {
1324            return default_ms;
1325        }
1326        // Issue #35 is specifically about CDP tier overhead silently clamping
1327        // chrome_timeout_ms. HTTP-only deployments don't suffer the same
1328        // problem (the HTTP renderer respects deadline.remaining without the
1329        // extra fetch/challenge/stability overhead). Skip the extension when
1330        // no CDP tiers are configured so HTTP-only users keep the strict
1331        // operator-configured default.
1332        if self.renderer.cdp_tier_count() == 0 {
1333            return default_ms;
1334        }
1335        let ladder_min = self.renderer.min_deadline_for_full_ladder_ms();
1336        // Mirrors crw_renderer::cdp::SPA_SELECTOR_MAX_MS. The CDP module
1337        // adds `wait_for_ms.unwrap_or(SPA_SELECTOR_MAX_MS)` to its internal
1338        // timeout, so when the caller exceeds the default we need to extend
1339        // the deadline per active CDP tier.
1340        const SPA_DEFAULT_MS: u64 = 8_000;
1341        // Clamp `wait_for_ms` to MAX_WAIT_FOR_MS so the inner deadline never
1342        // exceeds the Tower envelope, which is sized off the same constant in
1343        // `effective_request_timeout_secs`. A pathological caller passing
1344        // `wait_for: 600_000` without `deadlineMs` would otherwise be cancelled
1345        // by Tower before the inner CDP loop noticed the bigger budget.
1346        let extra = if let Some(w) = wait_for_ms {
1347            let bounded = w.min(MAX_WAIT_FOR_MS);
1348            let per_tier = bounded.saturating_sub(SPA_DEFAULT_MS);
1349            per_tier.saturating_mul(self.renderer.cdp_tier_count() as u64)
1350        } else {
1351            0
1352        };
1353        default_ms.max(ladder_min.saturating_add(extra))
1354    }
1355
1356    /// Tower middleware outer timeout (seconds). Must accommodate the longest
1357    /// legitimate handler runtime so a healthy request isn't cancelled by the
1358    /// outer layer before the inner deadline fires.
1359    ///
1360    /// Covers the three route envelopes:
1361    /// - `/scrape`, `/mcp` — auto-extended scrape deadline.
1362    /// - `/search` — SearXNG fetch + bounded enrichment fan-out
1363    ///   (`ceil(max_limit / max_concurrency)` batches × scrape_ms).
1364    /// - `/crawl/jobs/:id`, `/map` — handler-side caps up to 300s.
1365    ///
1366    /// When auto-extend is disabled, returns the operator-configured baseline
1367    /// unchanged.
1368    pub fn effective_request_timeout_secs(&self) -> u64 {
1369        let baseline = self.server.request_timeout_secs;
1370        if !self.request.auto_extend_deadline_for_ladder {
1371            return baseline;
1372        }
1373        const OUTER_BUFFER_SECS: u64 = 5;
1374        // `/map` handler caps `req.timeout.unwrap_or(120).min(300)`; the outer
1375        // must cover the upper bound so callers passing `timeout=300` aren't
1376        // cancelled mid-flight.
1377        const MAP_REQUEST_TIMEOUT_CEILING_MS: u64 = 300_000;
1378        // Cover the worst-case implicit scrape: caller bumps `wait_for` to the
1379        // configured maximum without supplying `deadlineMs`. The same
1380        // [`MAX_WAIT_FOR_MS`] constant is used inside `effective_deadline_ms`
1381        // to clamp the inner extension, so the inner deadline can never
1382        // exceed this outer envelope.
1383        let scrape_ms = self.effective_deadline_ms(None, Some(MAX_WAIT_FOR_MS));
1384
1385        // Search enrichment: bounded by max_concurrency. Worst case sequential
1386        // batching with low concurrency: ceil(max_limit / max_concurrency)
1387        // batches each bounded by scrape_ms.
1388        let conc = (self.crawler.max_concurrency.max(1)) as u64;
1389        let max_results = self.search.max_limit as u64;
1390        let enrich_batches = max_results.div_ceil(conc);
1391        let search_enrichment_ms = enrich_batches.saturating_mul(scrape_ms);
1392        let search_ms = self.search.timeout_ms.saturating_add(search_enrichment_ms);
1393
1394        let max_handler_ms = scrape_ms.max(search_ms).max(MAP_REQUEST_TIMEOUT_CEILING_MS);
1395        let needed_secs = max_handler_ms
1396            .div_ceil(1_000)
1397            .saturating_add(OUTER_BUFFER_SECS);
1398        baseline.max(needed_secs)
1399    }
1400}
1401
1402#[cfg(test)]
1403mod tests {
1404    use super::*;
1405
1406    /// Env var tests modify process-wide state; serialize them to avoid cross-test
1407    /// interference (e.g. `force_js` alias + `render_js_default` direct both set).
1408    static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());
1409
1410    fn clear_renderer_env() {
1411        for k in [
1412            "CRW_RENDERER__MODE",
1413            "CRW_RENDERER__FORCE_JS",
1414            "CRW_RENDERER__RENDER_JS_DEFAULT",
1415            "CRW_RENDERER__LIGHTPANDA__WS_URL",
1416            "CRW_SERVER__PORT",
1417        ] {
1418            unsafe { std::env::remove_var(k) };
1419        }
1420    }
1421
1422    #[test]
1423    fn renderer_mode_parses_variants() {
1424        #[derive(Deserialize)]
1425        struct Wrap {
1426            mode: RendererMode,
1427        }
1428        let cases = [
1429            ("mode = \"auto\"", RendererMode::Auto),
1430            ("mode = \"none\"", RendererMode::None),
1431            ("mode = \"lightpanda\"", RendererMode::Lightpanda),
1432            ("mode = \"chrome\"", RendererMode::Chrome),
1433            ("mode = \"playwright\"", RendererMode::Playwright),
1434        ];
1435        for (toml_str, expected) in cases {
1436            let w: Wrap = toml::from_str(toml_str).unwrap();
1437            assert_eq!(w.mode, expected, "toml: {toml_str}");
1438        }
1439    }
1440
1441    #[test]
1442    fn renderer_mode_bogus_errors() {
1443        #[derive(Deserialize)]
1444        struct Wrap {
1445            #[allow(dead_code)]
1446            mode: RendererMode,
1447        }
1448        let err: Result<Wrap, _> = toml::from_str("mode = \"bogus\"");
1449        assert!(err.is_err(), "bogus mode should fail to parse");
1450    }
1451
1452    #[test]
1453    fn renderer_config_default_mode_is_auto() {
1454        let cfg = RendererConfig::default();
1455        assert_eq!(cfg.mode, RendererMode::Auto);
1456        assert_eq!(cfg.render_js_default, None);
1457    }
1458
1459    #[test]
1460    fn render_js_default_force_js_alias() {
1461        let cfg: RendererConfig = toml::from_str("force_js = true").unwrap();
1462        assert_eq!(cfg.render_js_default, Some(true));
1463    }
1464
1465    #[test]
1466    fn render_js_default_direct_field() {
1467        let cfg: RendererConfig = toml::from_str("render_js_default = false").unwrap();
1468        assert_eq!(cfg.render_js_default, Some(false));
1469    }
1470
1471    #[test]
1472    fn env_var_renderer_mode_chrome() {
1473        let _g = ENV_LOCK.lock().unwrap();
1474        clear_renderer_env();
1475        unsafe { std::env::set_var("CRW_RENDERER__MODE", "chrome") };
1476        let cfg = AppConfig::load().unwrap();
1477        clear_renderer_env();
1478        assert_eq!(cfg.renderer.mode, RendererMode::Chrome);
1479    }
1480
1481    #[test]
1482    fn env_var_force_js_alias_works() {
1483        let _g = ENV_LOCK.lock().unwrap();
1484        clear_renderer_env();
1485        unsafe { std::env::set_var("CRW_RENDERER__FORCE_JS", "true") };
1486        let cfg = AppConfig::load().unwrap();
1487        clear_renderer_env();
1488        assert_eq!(cfg.renderer.render_js_default, Some(true));
1489    }
1490
1491    #[test]
1492    fn env_var_render_js_default_direct() {
1493        let _g = ENV_LOCK.lock().unwrap();
1494        clear_renderer_env();
1495        unsafe { std::env::set_var("CRW_RENDERER__RENDER_JS_DEFAULT", "true") };
1496        let cfg = AppConfig::load().unwrap();
1497        clear_renderer_env();
1498        assert_eq!(cfg.renderer.render_js_default, Some(true));
1499    }
1500
1501    #[test]
1502    fn request_config_defaults_match_plan() {
1503        let r = RequestConfig::default();
1504        assert_eq!(r.deadline_ms_default, 8000);
1505        assert!(r.auto_extend_deadline_for_ladder);
1506    }
1507
1508    #[test]
1509    fn default_app_config_enables_auto_extend() {
1510        // Programmatic Default must mirror serde defaults — issue #35.
1511        let cfg = AppConfig::default();
1512        assert!(cfg.request.auto_extend_deadline_for_ladder);
1513        assert_eq!(cfg.request.deadline_ms_default, 8000);
1514    }
1515
1516    fn renderer_with_chrome_only(chrome_ms: u64) -> RendererConfig {
1517        RendererConfig {
1518            mode: RendererMode::Chrome,
1519            page_timeout_ms: chrome_ms,
1520            chrome_timeout_ms: Some(chrome_ms),
1521            chrome: Some(CdpEndpoint {
1522                ws_url: "ws://chrome:9222".into(),
1523            }),
1524            ..Default::default()
1525        }
1526    }
1527
1528    #[test]
1529    #[cfg(feature = "cdp")]
1530    fn min_deadline_full_ladder_chrome_only() {
1531        // chrome-only mode: http (page_timeout) + chrome + 1 * 28000.
1532        let r = renderer_with_chrome_only(30_000);
1533        // page_timeout_ms is set to chrome_ms here, so http_timeout() → 30s.
1534        assert_eq!(
1535            r.min_deadline_for_full_ladder_ms(),
1536            30_000 + 30_000 + 28_000
1537        );
1538    }
1539
1540    #[test]
1541    #[cfg(feature = "cdp")]
1542    fn min_deadline_full_ladder_auto_three_tiers() {
1543        let r = RendererConfig {
1544            mode: RendererMode::Auto,
1545            page_timeout_ms: 15_000,
1546            http_timeout_ms: Some(15_000),
1547            lightpanda_timeout_ms: Some(2_500),
1548            chrome_timeout_ms: Some(30_000),
1549            lightpanda: Some(CdpEndpoint {
1550                ws_url: "ws://lp:9222".into(),
1551            }),
1552            chrome: Some(CdpEndpoint {
1553                ws_url: "ws://chrome:9222".into(),
1554            }),
1555            ..Default::default()
1556        };
1557        // http(15) + lp(2.5) + chrome(30) + 2*28 = 47.5 + 56 = 103_500.
1558        assert_eq!(
1559            r.min_deadline_for_full_ladder_ms(),
1560            15_000 + 2_500 + 30_000 + 2 * 28_000
1561        );
1562        assert_eq!(r.cdp_tier_count(), 2);
1563    }
1564
1565    #[test]
1566    fn effective_deadline_explicit_bypasses_auto_extend() {
1567        let mut cfg = AppConfig::default();
1568        cfg.request.auto_extend_deadline_for_ladder = true;
1569        cfg.renderer = renderer_with_chrome_only(30_000);
1570        // Explicit override beats both default and ladder_min.
1571        assert_eq!(cfg.effective_deadline_ms(Some(5_000), None), 5_000);
1572        assert_eq!(cfg.effective_deadline_ms(Some(500_000), None), 500_000);
1573    }
1574
1575    #[test]
1576    #[cfg(feature = "cdp")]
1577    fn effective_deadline_auto_extend_raises_to_ladder_min() {
1578        let mut cfg = AppConfig::default();
1579        cfg.request.auto_extend_deadline_for_ladder = true;
1580        cfg.request.deadline_ms_default = 8_000;
1581        cfg.renderer = renderer_with_chrome_only(30_000);
1582        let expected = cfg.renderer.min_deadline_for_full_ladder_ms();
1583        assert!(expected > 8_000);
1584        assert_eq!(cfg.effective_deadline_ms(None, None), expected);
1585    }
1586
1587    #[test]
1588    fn effective_deadline_default_wins_when_higher_than_ladder() {
1589        let mut cfg = AppConfig::default();
1590        cfg.request.auto_extend_deadline_for_ladder = true;
1591        cfg.request.deadline_ms_default = 1_000_000;
1592        cfg.renderer = renderer_with_chrome_only(30_000);
1593        assert_eq!(cfg.effective_deadline_ms(None, None), 1_000_000);
1594    }
1595
1596    #[test]
1597    fn effective_deadline_auto_extend_disabled_returns_baseline() {
1598        let mut cfg = AppConfig::default();
1599        cfg.request.auto_extend_deadline_for_ladder = false;
1600        cfg.request.deadline_ms_default = 8_000;
1601        cfg.renderer = renderer_with_chrome_only(30_000);
1602        assert_eq!(cfg.effective_deadline_ms(None, None), 8_000);
1603    }
1604
1605    #[test]
1606    #[cfg(feature = "cdp")]
1607    fn effective_deadline_extends_for_long_wait_for() {
1608        let mut cfg = AppConfig::default();
1609        cfg.request.auto_extend_deadline_for_ladder = true;
1610        cfg.request.deadline_ms_default = 8_000;
1611        cfg.renderer = renderer_with_chrome_only(30_000);
1612        let base = cfg.renderer.min_deadline_for_full_ladder_ms();
1613        let tier_count = cfg.renderer.cdp_tier_count() as u64;
1614        // wait_for = 20000 → per-tier extra = 12000 over SPA_DEFAULT_MS (8000).
1615        let with_wait = cfg.effective_deadline_ms(None, Some(20_000));
1616        assert_eq!(with_wait, base + 12_000 * tier_count);
1617        // wait_for below SPA default → no extra.
1618        assert_eq!(cfg.effective_deadline_ms(None, Some(2_000)), base);
1619    }
1620
1621    #[test]
1622    fn effective_request_timeout_covers_map_ceiling() {
1623        let mut cfg = AppConfig::default();
1624        cfg.request.auto_extend_deadline_for_ladder = true;
1625        cfg.request.deadline_ms_default = 8_000;
1626        cfg.renderer = renderer_with_chrome_only(30_000);
1627        cfg.search.timeout_ms = 15_000;
1628        cfg.crawler.max_concurrency = 10;
1629        cfg.search.max_limit = 20;
1630        cfg.server.request_timeout_secs = 60;
1631        // Map ceiling 300s + 5s buffer = 305s minimum.
1632        assert!(cfg.effective_request_timeout_secs() >= 305);
1633    }
1634
1635    #[test]
1636    fn effective_request_timeout_disabled_returns_baseline() {
1637        let mut cfg = AppConfig::default();
1638        cfg.request.auto_extend_deadline_for_ladder = false;
1639        cfg.server.request_timeout_secs = 60;
1640        assert_eq!(cfg.effective_request_timeout_secs(), 60);
1641    }
1642
1643    #[test]
1644    fn effective_request_timeout_respects_operator_override() {
1645        let mut cfg = AppConfig::default();
1646        cfg.request.auto_extend_deadline_for_ladder = true;
1647        cfg.server.request_timeout_secs = 600; // operator-configured high
1648        cfg.renderer = renderer_with_chrome_only(30_000);
1649        // Operator's explicit 600s should win over the auto-computed 305s.
1650        assert_eq!(cfg.effective_request_timeout_secs(), 600);
1651    }
1652
1653    #[test]
1654    fn effective_request_timeout_search_sequential_batching() {
1655        // Low concurrency forces ceil(max_limit/conc) batches → larger search_ms.
1656        let mut cfg = AppConfig::default();
1657        cfg.request.auto_extend_deadline_for_ladder = true;
1658        cfg.request.deadline_ms_default = 8_000;
1659        cfg.renderer = renderer_with_chrome_only(30_000);
1660        cfg.search.timeout_ms = 15_000;
1661        cfg.search.max_limit = 20;
1662        cfg.crawler.max_concurrency = 1;
1663        cfg.server.request_timeout_secs = 60;
1664        // The Tower envelope must cover the worst-case implicit scrape with
1665        // `wait_for` bumped to MAX_WAIT_FOR_MS (60s), because callers can do
1666        // that without supplying `deadlineMs`. Mirror that in the expected.
1667        let secs = cfg.effective_request_timeout_secs();
1668        let scrape_ms = cfg.effective_deadline_ms(None, Some(60_000));
1669        let expected_search_ms = 15_000 + 20 * scrape_ms;
1670        let expected_max_ms = scrape_ms.max(expected_search_ms).max(300_000);
1671        let expected_secs = expected_max_ms.div_ceil(1_000) + 5;
1672        assert_eq!(secs, 60u64.max(expected_secs));
1673    }
1674
1675    #[test]
1676    #[cfg(not(feature = "cdp"))]
1677    fn cdp_tier_count_zero_without_cdp_feature() {
1678        // Even when chrome/lightpanda are configured, a binary built without
1679        // the `cdp` feature can never construct a JS renderer. The deadline
1680        // policy must observe that and collapse to HTTP-only behavior.
1681        let r = RendererConfig {
1682            mode: RendererMode::Auto,
1683            page_timeout_ms: 15_000,
1684            chrome_timeout_ms: Some(30_000),
1685            chrome: Some(CdpEndpoint {
1686                ws_url: "ws://chrome:9222".into(),
1687            }),
1688            lightpanda: Some(CdpEndpoint {
1689                ws_url: "ws://lp:9222".into(),
1690            }),
1691            ..Default::default()
1692        };
1693        assert_eq!(r.cdp_tier_count(), 0);
1694        // Only the HTTP tier contributes to the ladder budget.
1695        assert_eq!(r.min_deadline_for_full_ladder_ms(), 15_000);
1696    }
1697
1698    #[test]
1699    fn effective_deadline_skipped_for_http_only_mode() {
1700        // P2 from codex review: HTTP-only deployments don't suffer the CDP
1701        // clamping problem (no fetch/challenge/stability overhead). The
1702        // auto-extension must NOT silently bump their default from 8s to 30s
1703        // just because page_timeout_ms defaults high.
1704        let mut cfg = AppConfig::default();
1705        cfg.request.auto_extend_deadline_for_ladder = true;
1706        cfg.request.deadline_ms_default = 8_000;
1707        cfg.renderer = RendererConfig {
1708            mode: RendererMode::Auto,
1709            page_timeout_ms: 30_000,
1710            // No CDP endpoints configured.
1711            lightpanda: None,
1712            playwright: None,
1713            chrome: None,
1714            ..Default::default()
1715        };
1716        assert_eq!(cfg.renderer.cdp_tier_count(), 0);
1717        assert_eq!(cfg.effective_deadline_ms(None, None), 8_000);
1718        assert_eq!(cfg.effective_deadline_ms(None, Some(30_000)), 8_000);
1719    }
1720
1721    #[test]
1722    #[cfg(feature = "cdp")]
1723    fn min_deadline_full_ladder_playwright_only() {
1724        // Playwright tier contributes one chrome_timeout + one CDP overhead,
1725        // matching the runtime predicate in `crw-renderer/src/lib.rs`.
1726        let r = RendererConfig {
1727            mode: RendererMode::Playwright,
1728            page_timeout_ms: 15_000,
1729            http_timeout_ms: Some(15_000),
1730            chrome_timeout_ms: Some(30_000),
1731            playwright: Some(CdpEndpoint {
1732                ws_url: "ws://playwright:9222".into(),
1733            }),
1734            ..Default::default()
1735        };
1736        assert_eq!(r.cdp_tier_count(), 1);
1737        // http(15) + chrome-equivalent(30) + 1 * 28 overhead.
1738        assert_eq!(
1739            r.min_deadline_for_full_ladder_ms(),
1740            15_000 + 30_000 + 28_000
1741        );
1742    }
1743
1744    #[test]
1745    fn renderer_phase_toggles_default_off_or_safe() {
1746        let r = RendererConfig::default();
1747        assert!(!r.chrome_intercept_resources);
1748        assert!(!r.chrome_intercept_stylesheets);
1749        assert!(r.chrome_host_intercept_disable.is_empty());
1750        assert_eq!(r.chrome_nav_budget_ms, 12_000);
1751        assert!(!r.chrome_context_pool_enabled);
1752        assert!(!r.use_predictor);
1753    }
1754
1755    #[test]
1756    fn crawler_per_host_limiter_defaults() {
1757        let c = CrawlerConfig::default();
1758        assert_eq!(c.per_host_min_interval_ms, 0);
1759        assert_eq!(c.per_host_max_concurrent, 1);
1760    }
1761
1762    #[test]
1763    fn env_var_overrides_toml_defaults() {
1764        let _g = ENV_LOCK.lock().unwrap();
1765        clear_renderer_env();
1766        unsafe {
1767            std::env::set_var("CRW_SERVER__PORT", "4444");
1768            std::env::set_var("CRW_RENDERER__LIGHTPANDA__WS_URL", "ws://test:9999/");
1769        }
1770        let cfg = AppConfig::load().unwrap();
1771        clear_renderer_env();
1772
1773        assert_eq!(cfg.server.port, 4444, "env var should override server.port");
1774        assert_eq!(
1775            cfg.renderer.lightpanda.as_ref().unwrap().ws_url,
1776            "ws://test:9999/",
1777            "env var should override renderer.lightpanda.ws_url"
1778        );
1779    }
1780
1781    #[test]
1782    fn user_config_path_honors_override_env() {
1783        let _g = ENV_LOCK.lock().unwrap();
1784        let tmp = std::env::temp_dir().join(format!("crw-cfg-test-{}", std::process::id()));
1785        unsafe {
1786            std::env::set_var("CRW_USER_CONFIG_DIR", &tmp);
1787        }
1788        let p = user_config_path().unwrap();
1789        unsafe {
1790            std::env::remove_var("CRW_USER_CONFIG_DIR");
1791        }
1792        assert_eq!(p, tmp.join("config.toml"));
1793    }
1794
1795    #[test]
1796    fn user_config_file_is_picked_up_by_load() {
1797        let _g = ENV_LOCK.lock().unwrap();
1798        clear_renderer_env();
1799        let tmp = std::env::temp_dir().join(format!("crw-load-test-{}", std::process::id()));
1800        std::fs::create_dir_all(&tmp).unwrap();
1801        let cfg_path = tmp.join("config.toml");
1802        std::fs::write(
1803            &cfg_path,
1804            r#"
1805[client]
1806api_url = "https://api.example.com"
1807api_key = "test-key-123"
1808
1809[search]
1810searxng_url = "http://localhost:9999"
1811
1812[extraction.llm]
1813provider = "deepseek"
1814api_key = "sk-test"
1815model = "deepseek-chat"
1816"#,
1817        )
1818        .unwrap();
1819
1820        unsafe {
1821            std::env::set_var("CRW_USER_CONFIG_DIR", &tmp);
1822        }
1823        let cfg = AppConfig::load().unwrap();
1824        unsafe {
1825            std::env::remove_var("CRW_USER_CONFIG_DIR");
1826        }
1827        std::fs::remove_dir_all(&tmp).ok();
1828
1829        assert_eq!(
1830            cfg.client.api_url.as_deref(),
1831            Some("https://api.example.com")
1832        );
1833        assert_eq!(cfg.client.api_key.as_deref(), Some("test-key-123"));
1834        assert_eq!(
1835            cfg.search.searxng_url.as_deref(),
1836            Some("http://localhost:9999")
1837        );
1838        let llm = cfg.extraction.llm.expect("llm config present");
1839        assert_eq!(llm.provider, "deepseek");
1840        assert_eq!(llm.api_key, "sk-test");
1841    }
1842
1843    #[test]
1844    fn env_var_beats_user_config() {
1845        let _g = ENV_LOCK.lock().unwrap();
1846        clear_renderer_env();
1847        let tmp = std::env::temp_dir().join(format!("crw-prec-test-{}", std::process::id()));
1848        std::fs::create_dir_all(&tmp).unwrap();
1849        std::fs::write(
1850            tmp.join("config.toml"),
1851            r#"
1852[search]
1853searxng_url = "http://from-file:8080"
1854"#,
1855        )
1856        .unwrap();
1857
1858        unsafe {
1859            std::env::set_var("CRW_USER_CONFIG_DIR", &tmp);
1860            std::env::set_var("CRW_SEARCH__SEARXNG_URL", "http://from-env:8080");
1861        }
1862        let cfg = AppConfig::load().unwrap();
1863        unsafe {
1864            std::env::remove_var("CRW_USER_CONFIG_DIR");
1865            std::env::remove_var("CRW_SEARCH__SEARXNG_URL");
1866        }
1867        std::fs::remove_dir_all(&tmp).ok();
1868
1869        assert_eq!(
1870            cfg.search.searxng_url.as_deref(),
1871            Some("http://from-env:8080"),
1872            "env var must win over user config file"
1873        );
1874    }
1875
1876    #[test]
1877    fn effective_proxy_credentials_appends_country_suffix() {
1878        let cfg = RendererConfig {
1879            proxy_base_user: Some("abc".into()),
1880            proxy_base_pass: Some("pw".into()),
1881            proxy_default_country: Some("de".into()),
1882            ..Default::default()
1883        };
1884        let (u, p) = cfg.effective_proxy_credentials(Some("us")).unwrap();
1885        assert_eq!(u, "abc__cr.us");
1886        assert_eq!(p, "pw");
1887        // Per-request wins over default.
1888        let (u, _) = cfg.effective_proxy_credentials(Some("GB")).unwrap();
1889        assert_eq!(u, "abc__cr.gb", "uppercase input is normalized");
1890        // Default country used when per-request omits it.
1891        let (u, _) = cfg.effective_proxy_credentials(None).unwrap();
1892        assert_eq!(u, "abc__cr.de");
1893    }
1894
1895    #[test]
1896    fn effective_proxy_credentials_invalid_country_uses_global_pool() {
1897        let cfg = RendererConfig {
1898            proxy_base_user: Some("abc".into()),
1899            proxy_base_pass: Some("pw".into()),
1900            ..Default::default()
1901        };
1902        // 3-letter ISO code → rejected, no suffix (global pool).
1903        let (u, _) = cfg.effective_proxy_credentials(Some("usa")).unwrap();
1904        assert_eq!(u, "abc");
1905        // Digits → rejected.
1906        let (u, _) = cfg.effective_proxy_credentials(Some("u1")).unwrap();
1907        assert_eq!(u, "abc");
1908        // Empty string after trim → rejected.
1909        let (u, _) = cfg.effective_proxy_credentials(Some("  ")).unwrap();
1910        assert_eq!(u, "abc");
1911    }
1912
1913    #[test]
1914    fn effective_proxy_credentials_no_base_returns_none() {
1915        let cfg = RendererConfig::default();
1916        assert!(cfg.effective_proxy_credentials(Some("us")).is_none());
1917
1918        let only_user = RendererConfig {
1919            proxy_base_user: Some("abc".into()),
1920            ..Default::default()
1921        };
1922        assert!(only_user.effective_proxy_credentials(Some("us")).is_none());
1923    }
1924}
crw_core/config.rs

crw_core/
config.rs