// crw_core/config.rs
use serde::Deserialize;
2
3#[derive(Debug, Clone, Deserialize, Default)]
4pub struct AppConfig {
5 #[serde(default)]
6 pub server: ServerConfig,
7 #[serde(default)]
8 pub renderer: RendererConfig,
9 #[serde(default)]
10 pub crawler: CrawlerConfig,
11 #[serde(default)]
12 pub extraction: ExtractionConfig,
13 #[serde(default)]
14 pub auth: AuthConfig,
15 #[serde(default)]
16 pub request: RequestConfig,
17 #[serde(default)]
18 pub search: SearchConfig,
19 #[serde(default)]
20 pub map: MapConfig,
21 /// `[document]` — binary-document (PDF) parsing knobs.
22 #[serde(default)]
23 pub document: DocumentConfig,
24 /// `[client]` — settings for the local CLI/MCP when it proxies to the
25 /// hosted SaaS. Written by `crw setup` into the user-config file.
26 #[serde(default)]
27 pub client: ClientConfig,
28}
29
30/// `[client]` — cloud-proxy credentials populated by `crw setup` and read by
31/// `crw mcp` / `crw-mcp`. Both fields are `Option` so an unconfigured user runs
32/// in local mode without surprise overrides.
33#[derive(Debug, Clone, Default, Deserialize)]
34pub struct ClientConfig {
35 /// Base URL of the hosted CRW API, e.g. `https://api.fastcrw.com`.
36 #[serde(default)]
37 pub api_url: Option<String>,
38 /// API key for the hosted CRW API.
39 #[serde(default)]
40 pub api_key: Option<String>,
41}
42
43/// `[document]` section — controls PDF (and future binary-document) parsing.
44/// All knobs honor `CRW_DOCUMENT__*` env overrides.
45#[derive(Debug, Clone, Deserialize)]
46#[serde(default)]
47pub struct DocumentConfig {
48 /// Master switch for document parsing at runtime (independent of the
49 /// compile-time `pdf` cargo feature). When `false`, PDFs are left unparsed.
50 pub enabled: bool,
51 /// Cap on the number of pages converted per document. `0` = no limit.
52 pub max_pages: usize,
53 /// Best-effort extraction from scanned/image PDFs (no OCR; usually empty).
54 pub attempt_scanned: bool,
55 /// Maximum upload size in bytes for `POST /v2/parse`. Defaults to 50 MB,
56 /// matching the HTTP renderer's response cap.
57 pub max_upload_bytes: usize,
58 /// Maximum number of concurrent uploads being parsed at once — bounds peak
59 /// memory (each in-flight upload buffers up to `max_upload_bytes`).
60 pub upload_concurrency: usize,
61 /// Process-wide cap on concurrent PDF parses across ALL surfaces (URL
62 /// scrape, crawl, batch, upload). Bounds peak CPU + decompressed memory: a
63 /// malicious PDF can decompress far beyond its on-wire size, so this is the
64 /// primary memory-DoS guard. Independent of `upload_concurrency` (which
65 /// only bounds upload body buffering).
66 pub max_concurrent_parses: usize,
67 /// Wall-clock timeout (ms) for a single PDF parse. A parse exceeding this
68 /// returns a timeout error to the caller; protects against pathological
69 /// documents that spin the parser. `0` disables the timeout.
70 pub parse_timeout_ms: u64,
71 /// Decompression-bomb guard: maximum total DECOMPRESSED bytes a document's
72 /// FlateDecode streams may inflate to. Checked in bounded memory BEFORE the
73 /// parser runs, so a small file that explodes to many GB is rejected with
74 /// `pdf_too_large` having allocated only kilobytes. This is the primary
75 /// guard against OOM-crashing the host. `0` disables it. Default 100 MiB —
76 /// huge for text extraction (millions of words) yet tiny next to host RAM.
77 /// Raise only if you must parse image-heavy PDFs.
78 pub max_decompressed_bytes: usize,
79 /// Run each PDF parse in an isolated child PROCESS (Unix only) instead of
80 /// in-process. The child gets a hard OS memory ceiling (`RLIMIT_AS`) and CPU
81 /// limit, inherits no env/secrets, and is killed on timeout. A crash, OOM,
82 /// or even a hypothetical parser RCE is contained to the child — the main
83 /// server (scrape/crawl) keeps running. Costs ~1-3ms spawn overhead per
84 /// parse. Recommended for hosts that accept untrusted uploads. Default off.
85 pub sandbox: bool,
86 /// Hard address-space limit (bytes) for a sandbox child (`RLIMIT_AS`). The
87 /// child is aborted by the OS if it allocates beyond this — the ultimate
88 /// backstop against memory-DoS even if the decompression guard is bypassed.
89 /// Default 512 MiB.
90 pub sandbox_memory_bytes: u64,
91}
92
93impl Default for DocumentConfig {
94 fn default() -> Self {
95 Self {
96 enabled: true,
97 max_pages: 0,
98 attempt_scanned: false,
99 max_upload_bytes: 52_428_800, // 50 MiB
100 upload_concurrency: 4,
101 max_concurrent_parses: 4,
102 parse_timeout_ms: 30_000,
103 max_decompressed_bytes: 104_857_600, // 100 MiB
104 sandbox: false,
105 sandbox_memory_bytes: 536_870_912, // 512 MiB
106 }
107 }
108}
109
110/// `[map]` section — currently only carries `[map.url_filter]`.
111#[derive(Debug, Clone, Deserialize, Default)]
112pub struct MapConfig {
113 #[serde(default)]
114 pub url_filter: MapUrlFilterConfig,
115}
116
117/// `[map.url_filter]` — raw TOML view of the filter knobs. Conversion to
118/// the runtime `UrlFilterCfg` lives in `crw-crawl` (which can see both this
119/// type and the filter module). Keeping this struct dependency-free here
120/// avoids a cycle (`crw-core` does not depend on `crw-crawl`).
121#[derive(Debug, Clone, Deserialize)]
122pub struct MapUrlFilterConfig {
123 /// Tier B — strip tracking params. Default: `true`.
124 #[serde(default = "default_true_filter")]
125 pub strip_tracking_params: bool,
126 /// Tier A — drop action URLs entirely. Default: `true`.
127 #[serde(default = "default_true_filter")]
128 pub drop_action_urls: bool,
129 /// When `true`, `.gov`/`.mil` hosts run Tier A too. Default `false`.
130 #[serde(default)]
131 pub gov_tld_drop_actions: bool,
132 /// Additive on top of `DEFAULT_TRACKING_PARAMS`.
133 #[serde(default)]
134 pub extra_tracking_params: Vec<String>,
135 /// Additive on top of `DEFAULT_ACTION_PARAMS`.
136 #[serde(default)]
137 pub extra_action_params: Vec<String>,
138 /// Additive on top of `ALWAYS_PRESERVE`.
139 #[serde(default)]
140 pub extra_preserve_params: Vec<String>,
141}
142
143impl Default for MapUrlFilterConfig {
144 fn default() -> Self {
145 Self {
146 strip_tracking_params: true,
147 drop_action_urls: true,
148 gov_tld_drop_actions: false,
149 extra_tracking_params: Vec::new(),
150 extra_action_params: Vec::new(),
151 extra_preserve_params: Vec::new(),
152 }
153 }
154}
155
/// Serde default for the `MapUrlFilterConfig` switches that are on by default.
fn default_true_filter() -> bool {
    true
}
159
/// Per-tier CDP overhead in milliseconds — sum of SPA selector poll budget,
/// challenge retry budget, content-stability budget, and fetch overhead.
/// Mirrors the constants in `crw-renderer::cdp`. The drift between the two
/// is regression-tested by `crates/crw-server/tests/cdp_constants_test.rs`
/// (gated behind `feature = "cdp"`).
///
/// Used by [`RendererConfig::min_deadline_for_full_ladder_ms`] so the request
/// deadline accommodates each CDP tier's outer fetch timeout, not just its
/// configured `page_timeout`.
pub const CDP_TIER_OVERHEAD_MS: u64 = 28_000;
170
/// Hard upper bound on the per-request `wait_for_ms` budget. The Tower outer
/// timeout is sized so a worst-case implicit scrape (no `deadlineMs`,
/// `wait_for` at this maximum) still completes inside it; values above this
/// are clamped by [`AppConfig::effective_deadline_ms`] so the inner deadline
/// can never escape the outer envelope. Documented as `(0, 60000]` in
/// `types.rs::ScrapeRequest::wait_for`.
pub const MAX_WAIT_FOR_MS: u64 = 60_000;
178
179/// Configuration for the `/v1/search` endpoint and its SearXNG backend.
180///
181/// When `searxng_url` is unset the endpoint returns HTTP 503 with
182/// `error_code: "search_disabled"` — the route remains mounted so that
183/// startup doesn't have to know whether search will ever be configured.
184#[derive(Debug, Clone, Deserialize)]
185pub struct SearchConfig {
186 /// Master switch. Defaults to `true`; set to `false` to refuse all
187 /// `/v1/search` requests even if `searxng_url` is configured.
188 #[serde(default = "default_true_search")]
189 pub enabled: bool,
190 /// Base URL of the SearXNG instance (e.g. `http://searxng:8080`).
191 /// `None` (the default) disables the endpoint with a clear error.
192 #[serde(default)]
193 pub searxng_url: Option<String>,
194 /// End-to-end timeout for the SearXNG call in milliseconds.
195 #[serde(default = "default_search_timeout_ms")]
196 pub timeout_ms: u64,
197 /// Default `limit` when the request omits it.
198 #[serde(default = "default_search_limit")]
199 pub default_limit: u32,
200 /// Hard cap on `limit` per request. SaaS uses 20.
201 #[serde(default = "default_search_max_limit")]
202 pub max_limit: u32,
203 /// SearXNG engines invoked when the request includes `categories: ["research"]`.
204 /// Defaults match the SaaS implementation.
205 #[serde(default = "default_research_engines")]
206 pub research_engines: Vec<String>,
207 /// SearXNG engines invoked when the request includes `categories: ["github"]`.
208 #[serde(default = "default_github_engines")]
209 pub github_engines: Vec<String>,
210 /// Re-rank the flat result pool for the LLM answer / summarize path
211 /// (RRF + junk/coverage/geo filter + BM25 + domain dedupe) instead of the
212 /// raw SearXNG-score sort. Defaults to `true`. The plain (non-LLM) path is
213 /// unaffected and keeps SaaS byte-parity regardless of this flag.
214 #[serde(default = "default_true_search")]
215 pub rerank_enabled: bool,
216 /// Multi-query expansion for the LLM answer / summarize path: before the
217 /// SearXNG fetch, generate an entity/keyword-focused rewrite of the query,
218 /// fetch both the original and the rewrite, and UNION the candidate pools
219 /// (recall can only increase — the original's results are always kept).
220 /// Targets "retrieval-miss" failures where the answer's source never
221 /// surfaced for the user's phrasing. Costs one extra small LLM call + one
222 /// extra SearXNG fetch. Defaults to `false` (gated); the plain path and the
223 /// answer layer are untouched, so precision/SaaS-parity are preserved.
224 #[serde(default)]
225 pub query_expand: bool,
226 /// Number of LLM-generated query rewrites to fetch + union when
227 /// `query_expand` is on. `1` reproduces the original single-variant
228 /// behavior. Higher values request more DIVERSE reformulations
229 /// (abbreviation/acronym-expanded, keyword-focused) and fetch their pools
230 /// in parallel, raising recall on retrieval-miss queries (e.g. an
231 /// unexpanded acronym whose page never surfaced) at the cost of one extra
232 /// SearXNG fetch each. Clamped to `MAX_QUERY_EXPAND_VARIANTS` in the route.
233 #[serde(default = "default_query_expand_variants")]
234 pub query_expand_variants: usize,
235 /// Adaptive multi-round retrieval (the "evidence-scout" loop). When the
236 /// round-1 answer ABSTAINS (sources lacked the fact), an LLM scout reads the
237 /// round-1 evidence and emits targeted follow-up queries (acronym-expanded,
238 /// exact-entity, predicate/date-specific); their results are scraped, unioned
239 /// into the pool, and the answer is re-synthesized ONCE. Bounded (one extra
240 /// round, capped follow-up queries) so worst-case stays within the request
241 /// deadline. Only fires on abstention, so ~most queries keep the single-shot
242 /// fast path. Recall-only + monotone-safe: a still-abstaining round-2 is
243 /// discarded, keeping round-1. Targets "the answer page never entered the
244 /// first pool" — the dominant remaining miss. Defaults to `false` (gated).
245 #[serde(default)]
246 pub multi_round: bool,
247 /// Passage-level relevance gate for the LLM answer path: split each scraped
248 /// source into passages and feed the answer LLM only the query-relevant
249 /// ones (DeepSeek-scored, no new ML deps). Subtractive — removes noise, never
250 /// adds sources or forces commits; falls back to the full source on any
251 /// failure (byte-identical to off), so it is monotone-safe. Defaults to
252 /// `false` (gated); answer prompt + plain path untouched.
253 #[serde(default)]
254 pub passage_select: bool,
255 /// Page-2 fallback for the LLM answer / summarize path: if the reranked
256 /// (junk-filtered, deduped) candidate pool comes back thinner than the
257 /// answer needs (`< answer_top_n`), fetch the SAME query's SearXNG page 2
258 /// once and union it in, then re-rank. The trigger is evaluated POST-rerank,
259 /// so a junk-heavy first page does not suppress it; the extra fetch only
260 /// fires on already-under-yielding queries (QPS never doubles across the
261 /// corpus). Recall-only + abstention is untouched (a sparse page1+page2 pool
262 /// still abstains). Defaults to `false` (gated); requires `rerank_enabled`.
263 #[serde(default)]
264 pub page2_fallback: bool,
265 /// Calibrated answer path (gated): reduce recoverable OVER-abstentions by
266 /// (a) feeding more sources to the answer LLM by default (top_n 5->8, so the
267 /// answer in result #6-8 or behind a failed top-5 scrape still reaches it)
268 /// and (b) swapping the answer prompt's abstention rule for an anti-hedge
269 /// variant — commit when the sources DO contain the answer (even indirectly
270 /// / one inference step), abstain ONLY when they genuinely lack it. The
271 /// "use ONLY sources" grounding is untouched, so this is the precise inverse
272 /// of the cycle-1 blunt "always commit" failure (which forced commits on
273 /// no-source cases). Default false; A/B with an INCORRECT-guard before flip.
274 #[serde(default)]
275 pub answer_calibrated: bool,
276 /// Moat-hardening abstention (gated). Appends a clause making the answer
277 /// model (a) REJECT a false/unverifiable premise instead of answering as
278 /// though it were true, (b) report when sources CONFLICT rather than picking
279 /// one confidently, and (c) abstain when not confident. Targets the
280 /// adversarial failure SealQA Seal-0 exposed: 32% confident-WRONG
281 /// (hallucination) on conflicting-source / false-premise questions, where
282 /// the "use ONLY sources" rule alone is insufficient. Complements (does not
283 /// replace) `answer_calibrated`. Default false; A/B requires Seal-0
284 /// hallucination DOWN with SimpleQA accuracy NOT regressed before flip.
285 #[serde(default)]
286 pub answer_guarded: bool,
287 /// Use SearXNG structured sources (gated, W0). SearXNG's `infoboxes[]` /
288 /// `answers[]` arrays carry Wikidata/Wikipedia knowledge-panel facts
289 /// (entity attributes like religion/capital/director) that the `results[]`
290 /// transform path discards. With this on, those facts are parsed and pinned
291 /// as a high-trust source at the FRONT of the answer pool (still
292 /// UNTRUSTED-wrapped — widens evidence, never bypasses the safety wrapper).
293 /// Targets the obscure-entity recall gap (PopQA). Default false; A/B on
294 /// diag500 gold-in-sources with the wrong-non-abstain invariant before flip.
295 #[serde(default)]
296 pub use_structured_sources: bool,
297 /// Deterministic Wikidata entity-relation lookup (gated, W3). For
298 /// `<relation> of <entity>` questions (PopQA's obscure long tail that web
299 /// search can't surface), classify -> wbsearchentities -> property fetch and
300 /// pin the fact as a structured source (UNTRUSTED-wrapped, runs in parallel
301 /// with SearXNG, 3s-bounded, any error falls through). Free open data, no
302 /// AI, no SPARQL hot-path. Default false; A/B on diag500 PopQA accuracy +
303 /// the wrong-non-abstain invariant before flip.
304 #[serde(default)]
305 pub wikidata_lookup: bool,
306 /// Snippet fallback for the LLM answer path (gated): when a top-N result's
307 /// scrape failed (empty `markdown`), the result is normally dropped from the
308 /// answer pool — if it was the answer-bearing page, crw abstains though
309 /// retrieval succeeded (diagnosed Pattern A). With this on, such results
310 /// fall back to their SearXNG `description` snippet as a thin source instead
311 /// of vanishing. The snippet is verbatim upstream text, so it cannot inject
312 /// a fact not already present — near-zero INCORRECT exposure. Default false.
313 #[serde(default)]
314 pub snippet_fallback: bool,
315 /// Relevance gate for the LLM answer / summarize re-rank (gated). After the
316 /// lexical-core junk/coverage/geo filters, keep only the rows that cover the
317 /// MOST important (non-stopword) query terms present in the pool, so a
318 /// partial-match homonym ("best pizza in REDMOND" for "best pizza in
319 /// belgrade", coverage 1/2) is evicted the instant a full-match row
320 /// ("pizza … belgrade", 2/2) is present. Ranks on the query's OWN tokens —
321 /// no geo/country/IP signal — so it holds for self-hosted deployments in any
322 /// region. Monotone-safe (degrade fallback applies first; never empties a
323 /// non-empty pool). Requires `rerank_enabled`. Default false; A/B against
324 /// the frozen rerank benchmark before flip.
325 #[serde(default)]
326 pub rerank_relevance: bool,
327 /// List-format answers for the LLM answer path (gated). When the query has
328 /// list intent ("best/top X in Y", "recommend …", "list of …"), the answer
329 /// prompt's prose directive is swapped for a ranked-list directive so the
330 /// model emits up to 10 named options (`N. <name> — <why>`) instead of a
331 /// 3–6 sentence paragraph. A deterministic classifier (`is_list_intent`)
332 /// decides per query; factual/non-list queries are untouched. The "use ONLY
333 /// sources" grounding, the abstention rule, and the `===CITATIONS===` block
334 /// are preserved (no fabrication, citation moat intact). Default false; A/B
335 /// against the answer-accuracy benchmark before flip.
336 #[serde(default)]
337 pub answer_list_format: bool,
338}
339
340impl Default for SearchConfig {
341 fn default() -> Self {
342 Self {
343 enabled: true,
344 searxng_url: None,
345 timeout_ms: default_search_timeout_ms(),
346 default_limit: default_search_limit(),
347 max_limit: default_search_max_limit(),
348 research_engines: default_research_engines(),
349 github_engines: default_github_engines(),
350 rerank_enabled: true,
351 query_expand: false,
352 query_expand_variants: default_query_expand_variants(),
353 multi_round: false,
354 passage_select: false,
355 page2_fallback: false,
356 answer_calibrated: false,
357 answer_guarded: false,
358 use_structured_sources: false,
359 wikidata_lookup: false,
360 snippet_fallback: false,
361 rerank_relevance: false,
362 answer_list_format: false,
363 }
364 }
365}
366
/// Serde default for `SearchConfig::query_expand_variants` (a single rewrite).
fn default_query_expand_variants() -> usize {
    1
}
/// Serde default for the `SearchConfig` switches that are on by default
/// (`enabled`, `rerank_enabled`).
fn default_true_search() -> bool {
    true
}
/// Serde default for `SearchConfig::timeout_ms`: 15 seconds, in milliseconds.
fn default_search_timeout_ms() -> u64 {
    15_000
}
/// Serde default for `SearchConfig::default_limit`.
fn default_search_limit() -> u32 {
    5
}
/// Serde default for `SearchConfig::max_limit`.
fn default_search_max_limit() -> u32 {
    20
}
/// Serde default for `SearchConfig::research_engines`.
fn default_research_engines() -> Vec<String> {
    vec![
        "arxiv".into(),
        "crossref".into(),
        "google scholar".into(),
        "semantic scholar".into(),
    ]
}
/// Serde default for `SearchConfig::github_engines`.
fn default_github_engines() -> Vec<String> {
    vec!["github".into()]
}
393
394/// Per-request defaults that apply to every scrape, crawl, or map call when
395/// the caller does not specify an override. Currently only governs the
396/// end-to-end deadline budget (see `crw-core/src/deadline.rs`).
397#[derive(Debug, Clone, Deserialize)]
398pub struct RequestConfig {
399 /// Default end-to-end deadline budget in milliseconds when a request does
400 /// not specify `deadlineMs`. The SLO p95 latency metric is computed only
401 /// over requests with `deadline_ms <= 8000`; longer values land in a
402 /// separate slow-path histogram.
403 #[serde(default = "default_deadline_ms")]
404 pub deadline_ms_default: u64,
405 /// When `true` (default), an implicit deadline (no per-request `deadlineMs`)
406 /// is auto-extended to `max(deadline_ms_default, ladder_min)` where
407 /// `ladder_min = sum(http+lightpanda+chrome timeouts) + N_cdp_tiers * 28s`.
408 /// This prevents `chrome_timeout_ms = 30000` from appearing inert when
409 /// `deadline_ms_default` is small (issue #35).
410 ///
411 /// Set to `false` to enforce a strict SLO regardless of tier sizing —
412 /// requests that would have completed under the extended budget will
413 /// instead time out at `deadline_ms_default`.
414 #[serde(default = "default_true_request")]
415 pub auto_extend_deadline_for_ladder: bool,
416}
417
418impl Default for RequestConfig {
419 fn default() -> Self {
420 Self {
421 deadline_ms_default: default_deadline_ms(),
422 auto_extend_deadline_for_ladder: true,
423 }
424 }
425}
426
/// Serde default for `RequestConfig::auto_extend_deadline_for_ladder`.
fn default_true_request() -> bool {
    true
}
430
/// Serde default for `RequestConfig::deadline_ms_default`: 8 seconds, in
/// milliseconds.
fn default_deadline_ms() -> u64 {
    8_000
}
434
435#[derive(Debug, Clone, Deserialize)]
436pub struct ServerConfig {
437 #[serde(default = "default_host")]
438 pub host: String,
439 #[serde(default = "default_port")]
440 pub port: u16,
441 #[serde(default = "default_request_timeout")]
442 pub request_timeout_secs: u64,
443 /// Maximum requests per second (global). 0 = unlimited.
444 #[serde(default = "default_rate_limit_rps")]
445 pub rate_limit_rps: u64,
446}
447
448impl Default for ServerConfig {
449 fn default() -> Self {
450 Self {
451 host: default_host(),
452 port: default_port(),
453 request_timeout_secs: default_request_timeout(),
454 rate_limit_rps: default_rate_limit_rps(),
455 }
456 }
457}
458
/// Serde default for `ServerConfig::rate_limit_rps`.
fn default_rate_limit_rps() -> u64 {
    10
}
462
/// Serde default for `ServerConfig::host`.
fn default_host() -> String {
    "0.0.0.0".into()
}
/// Serde default for `ServerConfig::port`.
fn default_port() -> u16 {
    3000
}
/// Serde default for `ServerConfig::request_timeout_secs`.
fn default_request_timeout() -> u64 {
    60
}
472
473/// Selects which JS renderer(s) the [`FallbackRenderer`] will build.
474///
475/// - `Auto` (default): try every configured CDP endpoint (Lightpanda, Playwright, Chrome)
476/// in order. If none is configured, JS rendering is disabled but HTTP still works.
477/// - `None`: HTTP-only. Never attempt JS rendering.
478/// - `Lightpanda` / `Chrome` / `Playwright`: require the matching `[renderer.<name>]`
479/// endpoint; fail startup if missing. Only the named backend is used.
480///
481/// [`FallbackRenderer`]: https://docs.rs/crw-renderer/latest/crw_renderer/struct.FallbackRenderer.html
482#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Deserialize)]
483#[serde(rename_all = "lowercase")]
484pub enum RendererMode {
485 #[default]
486 Auto,
487 None,
488 Lightpanda,
489 Chrome,
490 Playwright,
491}
492
493#[derive(Debug, Clone, Deserialize)]
494pub struct RendererConfig {
495 #[serde(default)]
496 pub mode: RendererMode,
497 /// Generic per-page navigation timeout. Used as the fallback when no
498 /// per-tier override is configured. Kept for backward compatibility — the
499 /// per-tier knobs below are preferred for new deployments.
500 #[serde(default = "default_page_timeout")]
501 pub page_timeout_ms: u64,
502 /// Override for the HTTP-only fetcher request timeout. Falls back to
503 /// `page_timeout_ms` when unset. HTTP responses arrive quickly when they
504 /// arrive at all, so 15s is generous and keeps slow upstreams from
505 /// hogging the request budget that should be spent on JS retries.
506 #[serde(default)]
507 pub http_timeout_ms: Option<u64>,
508 /// Override for the LightPanda CDP renderer. LightPanda completes most
509 /// renders in <10s; if it stalls past 20s it almost always means an
510 /// adversarial page that Chrome will render anyway, so failing fast and
511 /// escalating beats waiting it out.
512 #[serde(default)]
513 pub lightpanda_timeout_ms: Option<u64>,
514 /// Override for the full-Chromium tier. Chrome is the slow path
515 /// (gov/legal SPAs need 30–40s for `networkidle`); the larger budget here
516 /// recovers ~6 URLs per fc-wins iteration without affecting the fast path.
517 #[serde(default)]
518 pub chrome_timeout_ms: Option<u64>,
519 #[serde(default = "default_pool_size")]
520 pub pool_size: usize,
521 /// If set, applies to every request that doesn't specify `renderJs` explicitly.
522 /// `Some(true)` = force JS rendering; `Some(false)` = skip JS; `None` = auto-detect.
523 ///
524 /// Accepts the `force_js` alias for backward compatibility.
525 #[serde(default, alias = "force_js")]
526 pub render_js_default: Option<bool>,
527 #[serde(default)]
528 pub lightpanda: Option<CdpEndpoint>,
529 #[serde(default)]
530 pub playwright: Option<CdpEndpoint>,
531 #[serde(default)]
532 pub chrome: Option<CdpEndpoint>,
533 /// Residential-proxy Chrome tier (opt-in 4th renderer). Same Chromium
534 /// browser as `chrome`, but egress routed through a forwarder that adds
535 /// upstream proxy auth (e.g. DataImpulse). Tried after Chrome fails —
536 /// covers IP-blocked targets where the browser fingerprint is fine but
537 /// the VPS egress IP is flagged.
538 #[serde(default)]
539 pub chrome_proxy: Option<CdpEndpoint>,
540 /// Per-tier nav timeout override for `chrome_proxy`. When unset, defaults
541 /// to `chrome_timeout() + 15_000` — the proxy hop adds latency, so the
542 /// fallback tier needs more headroom than direct Chrome.
543 #[serde(default)]
544 pub chrome_proxy_timeout_ms: Option<u64>,
545 /// Enable Chrome resource interception (`Fetch.enable` blocking of media,
546 /// fonts, trackers). Default `false`; flipped after the CDP-fake suite
547 /// validates pump + cleanup behaviour. See plan Phase 2.
548 #[serde(default)]
549 pub chrome_intercept_resources: bool,
550 /// Additionally block `stylesheet` requests when interception is enabled.
551 /// Default `false` — kept off in v1 because some extractors depend on
552 /// CSS-driven visibility / lazy-content triggers.
553 #[serde(default)]
554 pub chrome_intercept_stylesheets: bool,
555 /// Per-host opt-out for chrome interception. Hosts in this list run with
556 /// interception disabled even when `chrome_intercept_resources = true`.
557 #[serde(default)]
558 pub chrome_host_intercept_disable: Vec<String>,
559 /// Hard chrome-tier navigation budget in ms. Wraps `wait_for_page_ready`
560 /// in an inner race; on budget hit the renderer snapshots whatever DOM is
561 /// present and returns `truncated = true`. Calibrated as
562 /// `p90(successful chrome renders)` clamped to `[8_000, 12_000]`.
563 #[serde(default = "default_chrome_nav_budget_ms")]
564 pub chrome_nav_budget_ms: u64,
565 /// Enable the bounded browser-context pool. Default `false`; v1 ships
566 /// `RECYCLE_AFTER_NAV = 1` (recreate every release) before optimising to
567 /// reuse-with-clearing. See plan Phase 4. **Gated off when
568 /// `chrome_backend = "browserless"`** — browserless v2's
569 /// `Target.createBrowserContext` semantics with long-lived sessions are
570 /// unproven; lib.rs forces this to `false` with a WARN log in that case.
571 #[serde(default)]
572 pub chrome_context_pool_enabled: bool,
573 /// Per-knob pool configuration. Read only when
574 /// `chrome_context_pool_enabled = true` AND backend is `Vanilla`.
575 #[serde(default)]
576 pub chrome_pool: ChromePoolConfig,
577 /// Which Chrome backend the WS URL points at. **Explicit** — never sniff
578 /// from URL substrings (k8s svc names, port-forwards, custom routes break
579 /// substring detection per plan §C2). Default `Vanilla`.
580 #[serde(default)]
581 pub chrome_backend: ChromeBackend,
582 /// Enable the success-ratio renderer predictor in `HostPreferences`.
583 /// Default `false`; flipped after the predictor replay harness gates
584 /// on the 1k bench (false-skip < 2 %, false-escalate < 5 %, churn < 3 / 1k).
585 #[serde(default)]
586 pub use_predictor: bool,
587 /// Engine escalation policy (firecrawl-shaped: race + on-error). When
588 /// disabled (default), the renderer keeps its current ladder unchanged.
589 #[serde(default)]
590 pub escalation: EscalationConfig,
591 /// Anti-bot detection policy (crawl4ai 3-tier classifier).
592 #[serde(default)]
593 pub antibot: AntibotConfig,
594 /// DataImpulse residential-proxy base username (without `__cr.<cc>`
595 /// country suffix). When set alongside [`proxy_base_pass`], the engine
596 /// drives Chrome's proxy auth via CDP `Fetch.authRequired` and composes
597 /// the country-suffixed username per request. Read only by the
598 /// `chrome_proxy` tier. None = no upstream proxy auth (chrome_proxy
599 /// tier still functional only if a no-auth or pre-authed proxy is in
600 /// front of Chrome).
601 #[serde(default)]
602 pub proxy_base_user: Option<String>,
603 /// DataImpulse base password — see [`proxy_base_user`].
604 #[serde(default)]
605 pub proxy_base_pass: Option<String>,
606 /// Fallback country code used when a request omits `country`. Lowercased
607 /// 2-letter ISO 3166-1 alpha-2 (e.g. "us"). None = global pool (no suffix).
608 #[serde(default)]
609 pub proxy_default_country: Option<String>,
610}
611
612/// Engine escalation policy — adds `ChromeStealth` and `ChromeStealthProxy`
613/// tiers behind a feature flag. See `plans/recall-next-tier.md` Phase 2.
614#[derive(Debug, Clone, Deserialize)]
615pub struct EscalationConfig {
616 /// Master switch. Default `false` — current ladder runs unchanged.
617 #[serde(default)]
618 pub enabled: bool,
619 /// Per-tier waterfall trigger in ms. If the current engine hasn't returned
620 /// after this long, the next tier is started in parallel (firecrawl
621 /// `WaterfallNextEngineSignal`).
622 #[serde(default = "default_waterfall_timeout_ms")]
623 pub waterfall_timeout_ms: u64,
624 /// Hard global cap across the whole ladder.
625 #[serde(default = "default_escalation_global_timeout_ms")]
626 pub global_timeout_ms: u64,
627 /// Send `?proxy=residential&proxyCountry=…` to browserless on the
628 /// `ChromeStealthProxy` tier. Off by default — bears cost.
629 #[serde(default)]
630 pub residential_proxy: bool,
631 /// Country code passed to browserless when `residential_proxy = true`.
632 #[serde(default = "default_proxy_country")]
633 pub proxy_country: String,
634}
635
636impl Default for EscalationConfig {
637 fn default() -> Self {
638 Self {
639 enabled: false,
640 waterfall_timeout_ms: default_waterfall_timeout_ms(),
641 global_timeout_ms: default_escalation_global_timeout_ms(),
642 residential_proxy: false,
643 proxy_country: default_proxy_country(),
644 }
645 }
646}
647
/// Serde default for `EscalationConfig::waterfall_timeout_ms`.
fn default_waterfall_timeout_ms() -> u64 {
    8_000
}
/// Serde default for `EscalationConfig::global_timeout_ms`.
fn default_escalation_global_timeout_ms() -> u64 {
    60_000
}
/// Serde default for `EscalationConfig::proxy_country`.
fn default_proxy_country() -> String {
    "us".to_string()
}
657
658/// Anti-bot classifier policy. Default: detect+log only; escalation requires
659/// `escalate_on_signal = true` AND `escalation.enabled = true`.
660#[derive(Debug, Clone, Deserialize)]
661pub struct AntibotConfig {
662 /// Run the classifier on every fetch result. Cheap; default on.
663 #[serde(default = "default_true")]
664 pub enabled: bool,
665 /// When the classifier returns a non-`None` signal, advance to the next
666 /// engine tier (requires `escalation.enabled`).
667 #[serde(default)]
668 pub escalate_on_signal: bool,
669 /// When the classifier flags a block during the renderer failover loop,
670 /// treat the result as a soft failure so the loop advances to the next
671 /// tier — ending at `chrome_proxy` (residential). Default `true`. Set
672 /// `false` to keep the classifier running (error_code + telemetry) while
673 /// disabling in-loop escalation — the one-line kill switch.
674 #[serde(default = "default_true")]
675 pub escalate_in_failover: bool,
676}
677
678impl Default for AntibotConfig {
679 fn default() -> Self {
680 Self {
681 enabled: true,
682 escalate_on_signal: false,
683 escalate_in_failover: true,
684 }
685 }
686}
687
/// Serde default for `RendererConfig::chrome_nav_budget_ms`.
fn default_chrome_nav_budget_ms() -> u64 {
    12_000
}
691
/// Per-knob configuration for the bounded browser-context pool. Loaded under
/// `[renderer.chrome_pool]`. Inactive unless
/// `chrome_context_pool_enabled = true` AND `chrome_backend = "vanilla"`.
///
/// Every field is optional in the config file; omitted fields take the
/// defaults noted on each field.
#[derive(Debug, Clone, Deserialize)]
pub struct ChromePoolConfig {
    /// Pool size. `None` → `max(2, num_cpus / 2)`. Caps simultaneous
    /// in-flight chrome requests per pool.
    #[serde(default)]
    pub size: Option<usize>,
    /// Recycle policy: v1 always recreates the context after each release.
    /// Reserved for a future "reuse N navigations then recreate" mode.
    /// Default `1`.
    #[serde(default = "default_recycle_after_navs")]
    pub recycle_after_navs: u32,
    /// Idle slots older than this are health-checked on next acquire.
    /// Default `300` seconds.
    #[serde(default = "default_idle_timeout_secs")]
    pub idle_timeout_secs: u64,
    /// `Browser.getVersion` probe deadline (idle-slot liveness).
    /// Default `60` seconds.
    #[serde(default = "default_health_check_secs")]
    pub health_check_secs: u64,
    /// SIGTERM drain window before phase 3 force-close. Default `30` seconds.
    #[serde(default = "default_shutdown_drain_secs")]
    pub shutdown_drain_secs: u64,
}
715
716impl Default for ChromePoolConfig {
717 fn default() -> Self {
718 Self {
719 size: None,
720 recycle_after_navs: default_recycle_after_navs(),
721 idle_timeout_secs: default_idle_timeout_secs(),
722 health_check_secs: default_health_check_secs(),
723 shutdown_drain_secs: default_shutdown_drain_secs(),
724 }
725 }
726}
727
/// Default for `ChromePoolConfig::recycle_after_navs`: one navigation.
fn default_recycle_after_navs() -> u32 {
    const NAVS_BEFORE_RECYCLE: u32 = 1;
    NAVS_BEFORE_RECYCLE
}
/// Default for `ChromePoolConfig::idle_timeout_secs`: 300 seconds.
fn default_idle_timeout_secs() -> u64 {
    const IDLE_TIMEOUT_SECS: u64 = 300;
    IDLE_TIMEOUT_SECS
}
/// Default for `ChromePoolConfig::health_check_secs`: 60 seconds.
fn default_health_check_secs() -> u64 {
    const HEALTH_CHECK_SECS: u64 = 60;
    HEALTH_CHECK_SECS
}
/// Default for `ChromePoolConfig::shutdown_drain_secs`: 30 seconds.
fn default_shutdown_drain_secs() -> u64 {
    const SHUTDOWN_DRAIN_SECS: u64 = 30;
    SHUTDOWN_DRAIN_SECS
}
740
/// Chrome backend kind. Set explicitly under `[renderer]` as
/// `chrome_backend = "vanilla"` or `chrome_backend = "browserless"`. **Never
/// inferred from URL substrings** — k8s service names, port-forwards, and
/// custom routes break substring detection. See plan §C2.
///
/// Variant names are matched in lowercase (`rename_all = "lowercase"`);
/// when the key is omitted the `#[default]` variant, `Vanilla`, is used.
#[derive(Debug, Clone, Copy, Default, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
pub enum ChromeBackend {
    /// chromedp/headless-shell or vanilla Chrome with `/json/version`. Pool
    /// is enabled here when `chrome_context_pool_enabled = true`.
    #[default]
    Vanilla,
    /// Browserless v2 / commercial CDP endpoint. Pool is **gated off** in v1
    /// — see plan §"Out of scope (v1)".
    Browserless,
}
756
757impl Default for RendererConfig {
758 fn default() -> Self {
759 Self {
760 mode: RendererMode::default(),
761 page_timeout_ms: default_page_timeout(),
762 http_timeout_ms: None,
763 lightpanda_timeout_ms: None,
764 chrome_timeout_ms: None,
765 pool_size: default_pool_size(),
766 render_js_default: None,
767 lightpanda: None,
768 playwright: None,
769 chrome: None,
770 chrome_proxy: None,
771 chrome_proxy_timeout_ms: None,
772 chrome_intercept_resources: false,
773 chrome_intercept_stylesheets: false,
774 chrome_host_intercept_disable: Vec::new(),
775 chrome_nav_budget_ms: default_chrome_nav_budget_ms(),
776 chrome_context_pool_enabled: false,
777 chrome_pool: ChromePoolConfig::default(),
778 chrome_backend: ChromeBackend::default(),
779 use_predictor: false,
780 escalation: EscalationConfig::default(),
781 antibot: AntibotConfig::default(),
782 proxy_base_user: None,
783 proxy_base_pass: None,
784 proxy_default_country: None,
785 }
786 }
787}
/// Default for `RendererConfig::page_timeout_ms`: 30 000 ms.
fn default_page_timeout() -> u64 {
    const PAGE_TIMEOUT_MS: u64 = 30_000;
    PAGE_TIMEOUT_MS
}
791
792impl RendererConfig {
793 /// Resolved per-tier nav timeout in milliseconds. Resolution rules:
794 /// 1. If the explicit per-tier field is set, use it verbatim.
795 /// 2. Otherwise fall back to `page_timeout_ms` (which itself defaults
796 /// to 30s for backward compatibility with pre-multi-tier configs).
797 ///
798 /// New deployments are encouraged to set the per-tier knobs to 15/20/45s
799 /// (see config.docker.toml) — these match the bench-tuned values that
800 /// recover slow gov sites in the chrome tier without giving the http
801 /// tier permission to hog the request budget.
802 pub fn http_timeout(&self) -> u64 {
803 self.http_timeout_ms.unwrap_or(self.page_timeout_ms)
804 }
805 pub fn lightpanda_timeout(&self) -> u64 {
806 self.lightpanda_timeout_ms.unwrap_or(self.page_timeout_ms)
807 }
808 pub fn chrome_timeout(&self) -> u64 {
809 self.chrome_timeout_ms.unwrap_or(self.page_timeout_ms)
810 }
811 pub fn chrome_proxy_timeout(&self) -> u64 {
812 self.chrome_proxy_timeout_ms
813 .unwrap_or_else(|| self.chrome_timeout().saturating_add(15_000))
814 }
815
816 /// Compose the DataImpulse-style proxy credentials for a single request.
817 ///
818 /// Resolution order for the country suffix:
819 /// 1. `country` argument (per-request override)
820 /// 2. `self.proxy_default_country` (server default)
821 /// 3. No suffix → DataImpulse global pool
822 ///
823 /// Returns `None` when no base credentials are configured — caller treats
824 /// this as "no auth required". An invalid country code (wrong length,
825 /// non-alphabetic) silently falls through to the default; that keeps a
826 /// malformed `?country=` query from creating an unauthenticated request
827 /// while still letting through a well-known default.
828 pub fn effective_proxy_credentials(&self, country: Option<&str>) -> Option<(String, String)> {
829 let user = self.proxy_base_user.as_ref()?;
830 let pass = self.proxy_base_pass.as_ref()?;
831 let cc = country
832 .or(self.proxy_default_country.as_deref())
833 .map(|s| s.trim().to_lowercase())
834 .filter(|s| s.len() == 2 && s.chars().all(|c| c.is_ascii_alphabetic()));
835 Some(match cc {
836 Some(cc) => (format!("{user}__cr.{cc}"), pass.clone()),
837 None => (user.clone(), pass.clone()),
838 })
839 }
840
841 /// Number of active CDP tiers (lightpanda + playwright + chrome) under
842 /// the current `mode`. Mirrors the predicate used at runtime in
843 /// `crw-renderer/src/lib.rs` when constructing the renderer ladder:
844 /// `want(mode) && config.<tier>.is_some()`.
845 ///
846 /// Returns `0` when the binary is built without the `cdp` feature — in
847 /// that case no JS renderer can be constructed regardless of the config,
848 /// so the deadline auto-extension policy must collapse to HTTP-only.
849 pub fn cdp_tier_count(&self) -> usize {
850 if !cfg!(feature = "cdp") {
851 return 0;
852 }
853 let want =
854 |m: RendererMode| -> bool { matches!(self.mode, RendererMode::Auto) || self.mode == m };
855 let mut n = 0;
856 if want(RendererMode::Lightpanda) && self.lightpanda.is_some() {
857 n += 1;
858 }
859 if want(RendererMode::Playwright) && self.playwright.is_some() {
860 n += 1;
861 }
862 if want(RendererMode::Chrome) && self.chrome.is_some() {
863 n += 1;
864 }
865 n
866 }
867
868 /// Minimum request deadline budget (ms) required so that every configured
869 /// tier can use its full allowance when fallback exhausts the chain.
870 /// Sums the per-tier timeouts and adds [`CDP_TIER_OVERHEAD_MS`] for each
871 /// active CDP tier, matching the runtime ladder built in
872 /// `crw-renderer/src/lib.rs`.
873 pub fn min_deadline_for_full_ladder_ms(&self) -> u64 {
874 let want =
875 |m: RendererMode| -> bool { matches!(self.mode, RendererMode::Auto) || self.mode == m };
876
877 let mut sum: u64 = 0;
878 // HTTP prefetch runs ahead of any JS tier (content-type sniffing,
879 // direct PDF/binary handling) regardless of pinned mode. Skipped only
880 // when mode is `None` (no fetching at all).
881 if !matches!(self.mode, RendererMode::None) {
882 sum = sum.saturating_add(self.http_timeout());
883 }
884
885 // CDP tiers only contribute when the binary was built with the `cdp`
886 // feature; otherwise no JS renderer is constructable at runtime and
887 // including their budgets would over-extend the deadline.
888 if !cfg!(feature = "cdp") {
889 return sum;
890 }
891
892 let mut cdp_tier_count: u64 = 0;
893 if want(RendererMode::Lightpanda) && self.lightpanda.is_some() {
894 sum = sum.saturating_add(self.lightpanda_timeout());
895 cdp_tier_count += 1;
896 }
897 if want(RendererMode::Playwright) && self.playwright.is_some() {
898 sum = sum.saturating_add(self.chrome_timeout());
899 cdp_tier_count += 1;
900 }
901 if want(RendererMode::Chrome) && self.chrome.is_some() {
902 sum = sum.saturating_add(self.chrome_timeout());
903 cdp_tier_count += 1;
904 }
905 sum.saturating_add(cdp_tier_count.saturating_mul(CDP_TIER_OVERHEAD_MS))
906 }
907}
/// Value `RendererConfig::default()` assigns to `pool_size`: 4.
fn default_pool_size() -> usize {
    const POOL_SIZE: usize = 4;
    POOL_SIZE
}
911
/// Connection details for one CDP renderer endpoint (used for the
/// `lightpanda`, `playwright`, and `chrome` tiers of `RendererConfig`).
#[derive(Debug, Clone, Deserialize)]
pub struct CdpEndpoint {
    /// WebSocket URL of the endpoint, e.g. `ws://chrome:9222`. Required:
    /// the field has no serde default.
    pub ws_url: String,
}
916
/// Stealth mode configuration for evading bot detection.
///
/// All fields are optional in the config file; an empty table yields
/// stealth disabled, an empty user-agent list, jitter `0.2`, and header
/// injection on.
#[derive(Debug, Clone, Deserialize)]
pub struct StealthConfig {
    /// Enable stealth mode globally. Default `false`.
    #[serde(default)]
    pub enabled: bool,
    /// Custom user-agent pool. Empty = use built-in pool.
    #[serde(default)]
    pub user_agents: Vec<String>,
    /// Jitter factor for rate limiting (0.0–1.0, default 0.2 = ±20%).
    #[serde(default = "default_jitter")]
    pub jitter_factor: f64,
    /// Inject realistic browser headers (Accept, Sec-Fetch-*, etc.).
    /// Default `true`.
    #[serde(default = "default_true")]
    pub inject_headers: bool,
}
933
934impl Default for StealthConfig {
935 fn default() -> Self {
936 Self {
937 enabled: false,
938 user_agents: vec![],
939 jitter_factor: default_jitter(),
940 inject_headers: true,
941 }
942 }
943}
944
/// Default for `StealthConfig::jitter_factor`: `0.2`.
fn default_jitter() -> f64 {
    const JITTER_FACTOR: f64 = 0.2;
    JITTER_FACTOR
}
948
/// Built-in realistic user-agent pool used when stealth is enabled.
///
/// Five desktop browser strings: Chrome 131 on Windows, macOS, and Linux,
/// Firefox 133 on Windows, and Safari 18.2 on macOS.
pub const BUILTIN_UA_POOL: &[&str] = &[
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15",
];
957
/// `[crawler]` section — crawl limits, politeness, identity, and proxy
/// settings. Every field is optional in the config file.
#[derive(Debug, Clone, Deserialize)]
pub struct CrawlerConfig {
    /// Concurrency limit. Default `10`. Also used by
    /// `AppConfig::effective_request_timeout_secs` to size the search
    /// enrichment batches.
    #[serde(default = "default_concurrency")]
    pub max_concurrency: usize,
    /// Request rate limit, in requests per second. Default `10.0`.
    #[serde(default = "default_rps")]
    pub requests_per_second: f64,
    /// Whether robots.txt is honored. Default `true`.
    #[serde(default = "default_true")]
    pub respect_robots_txt: bool,
    /// User-Agent string. Defaults to a modern Chrome-on-macOS UA.
    #[serde(default = "default_ua")]
    pub user_agent: String,
    /// Default maximum crawl depth. Default `2`.
    #[serde(default = "default_depth")]
    pub default_max_depth: u32,
    /// Default maximum number of pages per crawl. Default `100`.
    #[serde(default = "default_max_pages")]
    pub default_max_pages: u32,
    /// Proxy URL for crawler requests. Supports HTTP, HTTPS, and SOCKS5
    /// (e.g. "http://proxy:8080" or "socks5://user:pass@proxy:1080").
    #[serde(default)]
    pub proxy: Option<String>,
    /// TTL in seconds for completed crawl jobs before cleanup (default: 3600)
    #[serde(default = "default_job_ttl")]
    pub job_ttl_secs: u64,
    /// Stealth (anti-bot-detection) settings; see [`StealthConfig`].
    #[serde(default)]
    pub stealth: StealthConfig,
    /// Floor for the per-host limiter interval, in milliseconds. When a host
    /// advertises `Crawl-delay` in robots.txt, the higher of the two wins.
    /// Default `0` — robots.txt is the authoritative source, this is a
    /// per-deployment safety net.
    #[serde(default)]
    pub per_host_min_interval_ms: u64,
    /// Maximum concurrent in-flight requests against a single eTLD+1.
    /// Default `1` — strict ethics posture; operators raise consciously via
    /// config when scraping their own infrastructure.
    #[serde(default = "default_per_host_max_concurrent")]
    pub per_host_max_concurrent: u32,
}
993
/// Default for `CrawlerConfig::per_host_max_concurrent`: one in-flight
/// request per host.
fn default_per_host_max_concurrent() -> u32 {
    const PER_HOST_MAX_CONCURRENT: u32 = 1;
    PER_HOST_MAX_CONCURRENT
}
997
998impl Default for CrawlerConfig {
999 fn default() -> Self {
1000 Self {
1001 max_concurrency: default_concurrency(),
1002 requests_per_second: default_rps(),
1003 respect_robots_txt: true,
1004 user_agent: default_ua(),
1005 default_max_depth: default_depth(),
1006 default_max_pages: default_max_pages(),
1007 proxy: None,
1008 job_ttl_secs: default_job_ttl(),
1009 stealth: StealthConfig::default(),
1010 per_host_min_interval_ms: 0,
1011 per_host_max_concurrent: default_per_host_max_concurrent(),
1012 }
1013 }
1014}
1015
/// Default for `CrawlerConfig::max_concurrency`: 10.
fn default_concurrency() -> usize {
    const MAX_CONCURRENCY: usize = 10;
    MAX_CONCURRENCY
}
/// Default for `CrawlerConfig::requests_per_second`: 10.0.
fn default_rps() -> f64 {
    const REQUESTS_PER_SECOND: f64 = 10.0;
    REQUESTS_PER_SECOND
}
/// Serde default supplier for boolean fields that are on unless explicitly
/// disabled (serde's plain `default` would yield `false`).
fn default_true() -> bool {
    const ON: bool = true;
    ON
}
/// Default for `CrawlerConfig::user_agent`.
fn default_ua() -> String {
    // Modern Chrome UA. The legacy "CRW/0.1" was rejected by UA-filtering sites
    // (opencorporates, killeenisd, wsj) returning 403/404. Kept in sync with the
    // Sec-Ch-Ua client hint in `crw-renderer/src/http_only.rs`.
    String::from(concat!(
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 ",
        "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    ))
}
/// Default for `CrawlerConfig::default_max_depth`: 2.
fn default_depth() -> u32 {
    const MAX_DEPTH: u32 = 2;
    MAX_DEPTH
}
/// Default for `CrawlerConfig::default_max_pages`: 100.
fn default_max_pages() -> u32 {
    const MAX_PAGES: u32 = 100;
    MAX_PAGES
}
/// Default for `CrawlerConfig::job_ttl_secs`: 3600 seconds.
fn default_job_ttl() -> u64 {
    const JOB_TTL_SECS: u64 = 3_600;
    JOB_TTL_SECS
}
1042
/// `[extraction]` section — output format, content narrowing, LLM settings,
/// and the "thin content" thresholds that drive renderer escalation. Every
/// field is optional in the config file.
#[derive(Debug, Clone, Deserialize)]
pub struct ExtractionConfig {
    /// Output format used when none is specified. Default `"markdown"`.
    #[serde(default = "default_format")]
    pub default_format: String,
    /// Whether extraction keeps only the main content. Default `true`.
    #[serde(default = "default_true_ext")]
    pub only_main_content: bool,
    /// Optional LLM settings; `None` when no `llm` table is configured.
    #[serde(default)]
    pub llm: Option<LlmConfig>,
    /// Hostname → CSS selector overrides applied before readability narrowing.
    /// Match is exact host (no wildcard); user-supplied selector still wins.
    #[serde(default)]
    pub domain_selectors: std::collections::HashMap<String, String>,
    /// LLM fallback policy; see [`LlmFallbackConfig`]. Disabled by default.
    #[serde(default)]
    pub llm_fallback: LlmFallbackConfig,
    /// Bytes below which an HTTP-tier extraction is treated as "thin"
    /// and triggers a JS-renderer escalation. Default 100.
    #[serde(default = "default_http_retry_threshold")]
    pub http_retry_threshold_bytes: usize,
    /// Bytes below which a LightPanda-tier extraction is treated as
    /// "thin" and triggers a Chrome escalation. Default 2000 (LP often
    /// returns SPA husks of 90–500B that pass HTML-shape checks).
    #[serde(default = "default_lightpanda_retry_threshold")]
    pub lightpanda_retry_threshold_bytes: usize,
}
1067
/// Default for `ExtractionConfig::http_retry_threshold_bytes`: 100 bytes.
fn default_http_retry_threshold() -> usize {
    const HTTP_THIN_BYTES: usize = 100;
    HTTP_THIN_BYTES
}
1071
/// Default for `ExtractionConfig::lightpanda_retry_threshold_bytes`:
/// 2000 bytes.
fn default_lightpanda_retry_threshold() -> usize {
    const LIGHTPANDA_THIN_BYTES: usize = 2_000;
    LIGHTPANDA_THIN_BYTES
}
1075
1076impl Default for ExtractionConfig {
1077 fn default() -> Self {
1078 Self {
1079 default_format: default_format(),
1080 only_main_content: true,
1081 llm: None,
1082 domain_selectors: std::collections::HashMap::new(),
1083 llm_fallback: LlmFallbackConfig::default(),
1084 http_retry_threshold_bytes: default_http_retry_threshold(),
1085 lightpanda_retry_threshold_bytes: default_lightpanda_retry_threshold(),
1086 }
1087 }
1088}
1089
/// LLM fallback policy for extraction. Every field is optional in the config
/// file; the fallback is off by default.
#[derive(Debug, Clone, Deserialize)]
pub struct LlmFallbackConfig {
    /// Master switch. Default `false`.
    #[serde(default)]
    pub enable: bool,
    /// DOM-based extraction scoring below this value triggers the LLM (unless
    /// `always_run` is set). Default `0.3`.
    #[serde(default = "default_llm_quality_threshold")]
    pub quality_threshold: f32,
    /// HTML size limit in bytes. Default `100_000`.
    #[serde(default = "default_llm_max_html_bytes")]
    pub max_html_bytes: usize,
    /// When true (and `enable` is true), invoke the LLM on every page rather
    /// than only when DOM-based extraction scores below `quality_threshold`.
    /// Mirrors the "LLM as primary extractor" pattern used by Reader-LM,
    /// Firecrawl, and similar services. Higher cost, higher recall.
    #[serde(default)]
    pub always_run: bool,
}
1105
1106impl Default for LlmFallbackConfig {
1107 fn default() -> Self {
1108 Self {
1109 enable: false,
1110 quality_threshold: default_llm_quality_threshold(),
1111 max_html_bytes: default_llm_max_html_bytes(),
1112 always_run: false,
1113 }
1114 }
1115}
1116
/// Default for `LlmFallbackConfig::quality_threshold`: `0.3`.
fn default_llm_quality_threshold() -> f32 {
    const QUALITY_THRESHOLD: f32 = 0.3;
    QUALITY_THRESHOLD
}
/// Default `max_html_bytes` for both `LlmFallbackConfig` and `LlmConfig`:
/// 100 000 bytes.
fn default_llm_max_html_bytes() -> usize {
    const MAX_HTML_BYTES: usize = 100_000;
    MAX_HTML_BYTES
}
1123
/// LLM provider settings (the optional `llm` table of `ExtractionConfig`).
/// Every field except `api_key` is optional in the config file.
#[derive(Debug, Clone, Deserialize)]
pub struct LlmConfig {
    /// Provider identifier. Default `"anthropic"`; `"azure"` additionally
    /// requires `azure_api_version`.
    #[serde(default = "default_llm_provider")]
    pub provider: String,
    /// Provider API key. Has no serde default, so a configured `llm` table
    /// without it fails to deserialize.
    pub api_key: String,
    /// Model name. Default `"claude-sonnet-4-20250514"`.
    #[serde(default = "default_llm_model")]
    pub model: String,
    /// Optional base URL override for the provider API.
    #[serde(default)]
    pub base_url: Option<String>,
    /// Token limit per LLM call. Default `4096`.
    #[serde(default = "default_llm_max_tokens")]
    pub max_tokens: u32,
    /// Azure OpenAI API version (e.g. "2024-05-01-preview"). Required when
    /// `provider = "azure"`; ignored otherwise.
    #[serde(default)]
    pub azure_api_version: Option<String>,
    /// Max parallel LLM calls for fan-out (e.g. per-result search summaries).
    /// Bounded to avoid hitting provider rate limits. Default `4`.
    #[serde(default = "default_llm_max_concurrency")]
    pub max_concurrency: usize,
    /// Byte cap on content sent to the LLM in a single call. Content beyond
    /// the cap is truncated on a UTF-8 char boundary. Default `100_000`.
    #[serde(default = "default_llm_max_html_bytes")]
    pub max_html_bytes: usize,
    /// When set, opencore refuses LLM-touching requests that lack this header
    /// AND do not supply `llm_api_key` in the body. SaaS deploys set this so
    /// direct public callers can't access LLM features.
    #[serde(default)]
    pub require_byok_header: Option<String>,
    /// Sampling temperature for the LLM call. `None` (default) sends no
    /// `temperature` key, preserving each provider's default (DeepSeek = 1) and
    /// current prod behavior. The benchmark/eval harness sets `0.0` (with a
    /// seed) to make answers deterministic so a real +2-3pp lever is
    /// distinguishable from sampling noise. Prod stays `None` until temp=0 is
    /// proven not to raise abstention.
    #[serde(default)]
    pub temperature: Option<f32>,
}
1161
1162impl Default for LlmConfig {
1163 fn default() -> Self {
1164 Self {
1165 provider: default_llm_provider(),
1166 api_key: String::new(),
1167 model: default_llm_model(),
1168 base_url: None,
1169 max_tokens: default_llm_max_tokens(),
1170 azure_api_version: None,
1171 max_concurrency: default_llm_max_concurrency(),
1172 max_html_bytes: default_llm_max_html_bytes(),
1173 require_byok_header: None,
1174 temperature: None,
1175 }
1176 }
1177}
1178
/// Default for `LlmConfig::max_concurrency`: 4 parallel calls.
fn default_llm_max_concurrency() -> usize {
    const LLM_MAX_CONCURRENCY: usize = 4;
    LLM_MAX_CONCURRENCY
}
1182
/// Default for `LlmConfig::provider`: `"anthropic"`.
fn default_llm_provider() -> String {
    String::from("anthropic")
}
/// Default for `LlmConfig::model`: `"claude-sonnet-4-20250514"`.
fn default_llm_model() -> String {
    String::from("claude-sonnet-4-20250514")
}
/// Default for `LlmConfig::max_tokens`: 4096.
fn default_llm_max_tokens() -> u32 {
    const LLM_MAX_TOKENS: u32 = 4_096;
    LLM_MAX_TOKENS
}
1192
/// Default for `ExtractionConfig::default_format`: `"markdown"`.
fn default_format() -> String {
    String::from("markdown")
}
/// Serde default for `ExtractionConfig::only_main_content`: `true`.
fn default_true_ext() -> bool {
    const ONLY_MAIN_CONTENT: bool = true;
    ONLY_MAIN_CONTENT
}
1199
1200/// Custom deserializer for Vec<String> that accepts:
1201/// - TOML array: `api_keys = ["key1", "key2"]`
1202/// - JSON array: `["key1", "key2"]` (for env vars)
1203/// - Comma-separated: `key1,key2` (for simple env var usage)
1204fn deserialize_string_vec<'de, D>(deserializer: D) -> Result<Vec<String>, D::Error>
1205where
1206 D: serde::Deserializer<'de>,
1207{
1208 #[derive(serde::Deserialize)]
1209 #[serde(untagged)]
1210 enum StringOrVec {
1211 Vec(Vec<String>),
1212 Str(String),
1213 }
1214
1215 match StringOrVec::deserialize(deserializer)? {
1216 StringOrVec::Vec(v) => Ok(v),
1217 StringOrVec::Str(s) => {
1218 let s = s.trim();
1219 // Try JSON array first
1220 if s.starts_with('[') {
1221 serde_json::from_str(s).map_err(serde::de::Error::custom)
1222 } else {
1223 // Comma-separated fallback
1224 Ok(s.split(',')
1225 .map(|s| s.trim().to_string())
1226 .filter(|s| !s.is_empty())
1227 .collect())
1228 }
1229 }
1230 }
1231}
1232
/// `[auth]` section — API keys.
#[derive(Debug, Clone, Default, Deserialize)]
pub struct AuthConfig {
    /// API keys. Defaults to an empty list. Parsed by
    /// `deserialize_string_vec`, so it may be given as a TOML array, a JSON
    /// array string, or a comma-separated string.
    #[serde(default, deserialize_with = "deserialize_string_vec")]
    pub api_keys: Vec<String>,
}
1238
/// Path of the per-user config file written by `crw setup`. Returns `None` if
/// the home directory cannot be resolved (e.g. headless container with no
/// `$HOME`). Honors `$CRW_USER_CONFIG_DIR` for tests so we don't have to
/// monkey-patch `$HOME`.
///
/// Resolution:
/// 1. `$CRW_USER_CONFIG_DIR/config.toml` when that variable is set and
///    non-empty.
/// 2. `$HOME/.config/crw/config.toml` when `$HOME` is set and non-empty.
/// 3. `None` otherwise.
///
/// Both variables are read as OS strings, so non-UTF-8 directories are
/// honored. An empty value is treated as unset; otherwise the result would
/// be a path relative to the current working directory.
pub fn user_config_path() -> Option<std::path::PathBuf> {
    let non_empty_var = |key: &str| std::env::var_os(key).filter(|value| !value.is_empty());

    if let Some(dir) = non_empty_var("CRW_USER_CONFIG_DIR") {
        return Some(std::path::PathBuf::from(dir).join("config.toml"));
    }
    let home = non_empty_var("HOME")?;
    Some(
        std::path::PathBuf::from(home)
            .join(".config")
            .join("crw")
            .join("config.toml"),
    )
}
1255
1256impl AppConfig {
1257 /// Load config from config.default.toml + per-user config + environment
1258 /// variable overrides.
1259 ///
1260 /// Precedence (highest wins):
1261 /// 1. `CRW_*` env vars (CI/Docker)
1262 /// 2. `$CRW_CONFIG` file (or `config.local.toml` in cwd)
1263 /// 3. `~/.config/crw/config.toml` (written by `crw setup`)
1264 /// 4. `config.default.toml` (bundled defaults)
1265 ///
1266 /// Env stays on top so a one-off `CRW_FOO=bar crw …` always wins over
1267 /// whatever the user has saved, matching how every other shell tool works.
1268 pub fn load() -> Result<Self, config::ConfigError> {
1269 let mut builder = config::Config::builder()
1270 .add_source(config::File::with_name("config.default").required(false));
1271
1272 // User-level config — written atomically by `crw setup`. Optional, so
1273 // a never-configured machine simply reads defaults + env.
1274 if let Some(user_cfg) = user_config_path()
1275 && user_cfg.exists()
1276 {
1277 builder = builder.add_source(config::File::from(user_cfg).required(false));
1278 }
1279
1280 // Load optional override config file (e.g. config.docker.toml in containers).
1281 if let Ok(extra) = std::env::var("CRW_CONFIG") {
1282 builder = builder.add_source(config::File::with_name(&extra).required(true));
1283 } else {
1284 builder = builder.add_source(config::File::with_name("config.local").required(false));
1285 }
1286
1287 let cfg = builder
1288 .add_source(
1289 config::Environment::with_prefix("CRW")
1290 .prefix_separator("_")
1291 .separator("__")
1292 .try_parsing(true),
1293 )
1294 .build()?;
1295 cfg.try_deserialize()
1296 }
1297
1298 /// Compute the effective end-to-end request deadline (ms). Implements the
1299 /// issue-#35 auto-extension policy:
1300 ///
1301 /// 1. If the caller supplied an explicit `requested_deadline_ms`, return it
1302 /// verbatim — operators trust the request budget over our heuristic.
1303 /// 2. Otherwise, when `request.auto_extend_deadline_for_ladder` is on,
1304 /// return `max(deadline_ms_default, ladder_min + wait_for_extra)`.
1305 /// `ladder_min` covers the configured tier ladder; `wait_for_extra`
1306 /// compensates for callers that bumped `wait_for_ms` above the default
1307 /// SPA budget (8s) — without it, a long `wait_for` would silently
1308 /// re-clamp inside CDP.
1309 /// 3. When the policy is disabled, return `deadline_ms_default` unchanged.
1310 ///
1311 /// `wait_for_ms` is the per-request override (ScrapeRequest::wait_for /
1312 /// CrawlRequest::wait_for); pass `None` for sub-fetches that don't
1313 /// surface a wait_for to the caller (search/map enrichment).
1314 pub fn effective_deadline_ms(
1315 &self,
1316 requested_deadline_ms: Option<u64>,
1317 wait_for_ms: Option<u64>,
1318 ) -> u64 {
1319 if let Some(explicit) = requested_deadline_ms {
1320 return explicit;
1321 }
1322 let default_ms = self.request.deadline_ms_default;
1323 if !self.request.auto_extend_deadline_for_ladder {
1324 return default_ms;
1325 }
1326 // Issue #35 is specifically about CDP tier overhead silently clamping
1327 // chrome_timeout_ms. HTTP-only deployments don't suffer the same
1328 // problem (the HTTP renderer respects deadline.remaining without the
1329 // extra fetch/challenge/stability overhead). Skip the extension when
1330 // no CDP tiers are configured so HTTP-only users keep the strict
1331 // operator-configured default.
1332 if self.renderer.cdp_tier_count() == 0 {
1333 return default_ms;
1334 }
1335 let ladder_min = self.renderer.min_deadline_for_full_ladder_ms();
1336 // Mirrors crw_renderer::cdp::SPA_SELECTOR_MAX_MS. The CDP module
1337 // adds `wait_for_ms.unwrap_or(SPA_SELECTOR_MAX_MS)` to its internal
1338 // timeout, so when the caller exceeds the default we need to extend
1339 // the deadline per active CDP tier.
1340 const SPA_DEFAULT_MS: u64 = 8_000;
1341 // Clamp `wait_for_ms` to MAX_WAIT_FOR_MS so the inner deadline never
1342 // exceeds the Tower envelope, which is sized off the same constant in
1343 // `effective_request_timeout_secs`. A pathological caller passing
1344 // `wait_for: 600_000` without `deadlineMs` would otherwise be cancelled
1345 // by Tower before the inner CDP loop noticed the bigger budget.
1346 let extra = if let Some(w) = wait_for_ms {
1347 let bounded = w.min(MAX_WAIT_FOR_MS);
1348 let per_tier = bounded.saturating_sub(SPA_DEFAULT_MS);
1349 per_tier.saturating_mul(self.renderer.cdp_tier_count() as u64)
1350 } else {
1351 0
1352 };
1353 default_ms.max(ladder_min.saturating_add(extra))
1354 }
1355
1356 /// Tower middleware outer timeout (seconds). Must accommodate the longest
1357 /// legitimate handler runtime so a healthy request isn't cancelled by the
1358 /// outer layer before the inner deadline fires.
1359 ///
1360 /// Covers the three route envelopes:
1361 /// - `/scrape`, `/mcp` — auto-extended scrape deadline.
1362 /// - `/search` — SearXNG fetch + bounded enrichment fan-out
1363 /// (`ceil(max_limit / max_concurrency)` batches × scrape_ms).
1364 /// - `/crawl/jobs/:id`, `/map` — handler-side caps up to 300s.
1365 ///
1366 /// When auto-extend is disabled, returns the operator-configured baseline
1367 /// unchanged.
1368 pub fn effective_request_timeout_secs(&self) -> u64 {
1369 let baseline = self.server.request_timeout_secs;
1370 if !self.request.auto_extend_deadline_for_ladder {
1371 return baseline;
1372 }
1373 const OUTER_BUFFER_SECS: u64 = 5;
1374 // `/map` handler caps `req.timeout.unwrap_or(120).min(300)`; the outer
1375 // must cover the upper bound so callers passing `timeout=300` aren't
1376 // cancelled mid-flight.
1377 const MAP_REQUEST_TIMEOUT_CEILING_MS: u64 = 300_000;
1378 // Cover the worst-case implicit scrape: caller bumps `wait_for` to the
1379 // configured maximum without supplying `deadlineMs`. The same
1380 // [`MAX_WAIT_FOR_MS`] constant is used inside `effective_deadline_ms`
1381 // to clamp the inner extension, so the inner deadline can never
1382 // exceed this outer envelope.
1383 let scrape_ms = self.effective_deadline_ms(None, Some(MAX_WAIT_FOR_MS));
1384
1385 // Search enrichment: bounded by max_concurrency. Worst case sequential
1386 // batching with low concurrency: ceil(max_limit / max_concurrency)
1387 // batches each bounded by scrape_ms.
1388 let conc = (self.crawler.max_concurrency.max(1)) as u64;
1389 let max_results = self.search.max_limit as u64;
1390 let enrich_batches = max_results.div_ceil(conc);
1391 let search_enrichment_ms = enrich_batches.saturating_mul(scrape_ms);
1392 let search_ms = self.search.timeout_ms.saturating_add(search_enrichment_ms);
1393
1394 let max_handler_ms = scrape_ms.max(search_ms).max(MAP_REQUEST_TIMEOUT_CEILING_MS);
1395 let needed_secs = max_handler_ms
1396 .div_ceil(1_000)
1397 .saturating_add(OUTER_BUFFER_SECS);
1398 baseline.max(needed_secs)
1399 }
1400}
1401
1402#[cfg(test)]
1403mod tests {
1404 use super::*;
1405
1406 /// Env var tests modify process-wide state; serialize them to avoid cross-test
1407 /// interference (e.g. `force_js` alias + `render_js_default` direct both set).
1408 static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());
1409
1410 fn clear_renderer_env() {
1411 for k in [
1412 "CRW_RENDERER__MODE",
1413 "CRW_RENDERER__FORCE_JS",
1414 "CRW_RENDERER__RENDER_JS_DEFAULT",
1415 "CRW_RENDERER__LIGHTPANDA__WS_URL",
1416 "CRW_SERVER__PORT",
1417 ] {
1418 unsafe { std::env::remove_var(k) };
1419 }
1420 }
1421
1422 #[test]
1423 fn renderer_mode_parses_variants() {
1424 #[derive(Deserialize)]
1425 struct Wrap {
1426 mode: RendererMode,
1427 }
1428 let cases = [
1429 ("mode = \"auto\"", RendererMode::Auto),
1430 ("mode = \"none\"", RendererMode::None),
1431 ("mode = \"lightpanda\"", RendererMode::Lightpanda),
1432 ("mode = \"chrome\"", RendererMode::Chrome),
1433 ("mode = \"playwright\"", RendererMode::Playwright),
1434 ];
1435 for (toml_str, expected) in cases {
1436 let w: Wrap = toml::from_str(toml_str).unwrap();
1437 assert_eq!(w.mode, expected, "toml: {toml_str}");
1438 }
1439 }
1440
1441 #[test]
1442 fn renderer_mode_bogus_errors() {
1443 #[derive(Deserialize)]
1444 struct Wrap {
1445 #[allow(dead_code)]
1446 mode: RendererMode,
1447 }
1448 let err: Result<Wrap, _> = toml::from_str("mode = \"bogus\"");
1449 assert!(err.is_err(), "bogus mode should fail to parse");
1450 }
1451
1452 #[test]
1453 fn renderer_config_default_mode_is_auto() {
1454 let cfg = RendererConfig::default();
1455 assert_eq!(cfg.mode, RendererMode::Auto);
1456 assert_eq!(cfg.render_js_default, None);
1457 }
1458
1459 #[test]
1460 fn render_js_default_force_js_alias() {
1461 let cfg: RendererConfig = toml::from_str("force_js = true").unwrap();
1462 assert_eq!(cfg.render_js_default, Some(true));
1463 }
1464
1465 #[test]
1466 fn render_js_default_direct_field() {
1467 let cfg: RendererConfig = toml::from_str("render_js_default = false").unwrap();
1468 assert_eq!(cfg.render_js_default, Some(false));
1469 }
1470
1471 #[test]
1472 fn env_var_renderer_mode_chrome() {
1473 let _g = ENV_LOCK.lock().unwrap();
1474 clear_renderer_env();
1475 unsafe { std::env::set_var("CRW_RENDERER__MODE", "chrome") };
1476 let cfg = AppConfig::load().unwrap();
1477 clear_renderer_env();
1478 assert_eq!(cfg.renderer.mode, RendererMode::Chrome);
1479 }
1480
1481 #[test]
1482 fn env_var_force_js_alias_works() {
1483 let _g = ENV_LOCK.lock().unwrap();
1484 clear_renderer_env();
1485 unsafe { std::env::set_var("CRW_RENDERER__FORCE_JS", "true") };
1486 let cfg = AppConfig::load().unwrap();
1487 clear_renderer_env();
1488 assert_eq!(cfg.renderer.render_js_default, Some(true));
1489 }
1490
1491 #[test]
1492 fn env_var_render_js_default_direct() {
1493 let _g = ENV_LOCK.lock().unwrap();
1494 clear_renderer_env();
1495 unsafe { std::env::set_var("CRW_RENDERER__RENDER_JS_DEFAULT", "true") };
1496 let cfg = AppConfig::load().unwrap();
1497 clear_renderer_env();
1498 assert_eq!(cfg.renderer.render_js_default, Some(true));
1499 }
1500
1501 #[test]
1502 fn request_config_defaults_match_plan() {
1503 let r = RequestConfig::default();
1504 assert_eq!(r.deadline_ms_default, 8000);
1505 assert!(r.auto_extend_deadline_for_ladder);
1506 }
1507
1508 #[test]
1509 fn default_app_config_enables_auto_extend() {
1510 // Programmatic Default must mirror serde defaults — issue #35.
1511 let cfg = AppConfig::default();
1512 assert!(cfg.request.auto_extend_deadline_for_ladder);
1513 assert_eq!(cfg.request.deadline_ms_default, 8000);
1514 }
1515
1516 fn renderer_with_chrome_only(chrome_ms: u64) -> RendererConfig {
1517 RendererConfig {
1518 mode: RendererMode::Chrome,
1519 page_timeout_ms: chrome_ms,
1520 chrome_timeout_ms: Some(chrome_ms),
1521 chrome: Some(CdpEndpoint {
1522 ws_url: "ws://chrome:9222".into(),
1523 }),
1524 ..Default::default()
1525 }
1526 }
1527
1528 #[test]
1529 #[cfg(feature = "cdp")]
1530 fn min_deadline_full_ladder_chrome_only() {
1531 // chrome-only mode: http (page_timeout) + chrome + 1 * 28000.
1532 let r = renderer_with_chrome_only(30_000);
1533 // page_timeout_ms is set to chrome_ms here, so http_timeout() → 30s.
1534 assert_eq!(
1535 r.min_deadline_for_full_ladder_ms(),
1536 30_000 + 30_000 + 28_000
1537 );
1538 }
1539
1540 #[test]
1541 #[cfg(feature = "cdp")]
1542 fn min_deadline_full_ladder_auto_three_tiers() {
1543 let r = RendererConfig {
1544 mode: RendererMode::Auto,
1545 page_timeout_ms: 15_000,
1546 http_timeout_ms: Some(15_000),
1547 lightpanda_timeout_ms: Some(2_500),
1548 chrome_timeout_ms: Some(30_000),
1549 lightpanda: Some(CdpEndpoint {
1550 ws_url: "ws://lp:9222".into(),
1551 }),
1552 chrome: Some(CdpEndpoint {
1553 ws_url: "ws://chrome:9222".into(),
1554 }),
1555 ..Default::default()
1556 };
1557 // http(15) + lp(2.5) + chrome(30) + 2*28 = 47.5 + 56 = 103_500.
1558 assert_eq!(
1559 r.min_deadline_for_full_ladder_ms(),
1560 15_000 + 2_500 + 30_000 + 2 * 28_000
1561 );
1562 assert_eq!(r.cdp_tier_count(), 2);
1563 }
1564
1565 #[test]
1566 fn effective_deadline_explicit_bypasses_auto_extend() {
1567 let mut cfg = AppConfig::default();
1568 cfg.request.auto_extend_deadline_for_ladder = true;
1569 cfg.renderer = renderer_with_chrome_only(30_000);
1570 // Explicit override beats both default and ladder_min.
1571 assert_eq!(cfg.effective_deadline_ms(Some(5_000), None), 5_000);
1572 assert_eq!(cfg.effective_deadline_ms(Some(500_000), None), 500_000);
1573 }
1574
#[test]
#[cfg(feature = "cdp")]
fn effective_deadline_auto_extend_raises_to_ladder_min() {
    let mut cfg = AppConfig::default();
    cfg.renderer = renderer_with_chrome_only(30_000);
    cfg.request.deadline_ms_default = 8_000;
    cfg.request.auto_extend_deadline_for_ladder = true;

    let ladder_min_ms = cfg.renderer.min_deadline_for_full_ladder_ms();
    // Guard: the ladder minimum has to exceed the 8s default, otherwise the
    // assertion below would not demonstrate any extension.
    assert!(ladder_min_ms > 8_000);
    // With no explicit deadline, the default is lifted to the ladder minimum.
    assert_eq!(cfg.effective_deadline_ms(None, None), ladder_min_ms);
}
1586
#[test]
fn effective_deadline_default_wins_when_higher_than_ladder() {
    let mut cfg = AppConfig::default();
    cfg.renderer = renderer_with_chrome_only(30_000);
    cfg.request.deadline_ms_default = 1_000_000;
    cfg.request.auto_extend_deadline_for_ladder = true;

    // Auto-extend only ever raises the deadline: a configured default of
    // 1_000_000 ms is returned unchanged.
    let effective_ms = cfg.effective_deadline_ms(None, None);
    assert_eq!(effective_ms, 1_000_000);
}
1595
#[test]
fn effective_deadline_auto_extend_disabled_returns_baseline() {
    let mut cfg = AppConfig::default();
    cfg.renderer = renderer_with_chrome_only(30_000);
    cfg.request.deadline_ms_default = 8_000;
    cfg.request.auto_extend_deadline_for_ladder = false;

    // With the switch off, the configured default comes back untouched even
    // though a Chrome tier is configured.
    let effective_ms = cfg.effective_deadline_ms(None, None);
    assert_eq!(effective_ms, 8_000);
}
1604
#[test]
#[cfg(feature = "cdp")]
fn effective_deadline_extends_for_long_wait_for() {
    let mut cfg = AppConfig::default();
    cfg.renderer = renderer_with_chrome_only(30_000);
    cfg.request.deadline_ms_default = 8_000;
    cfg.request.auto_extend_deadline_for_ladder = true;

    let ladder_min_ms = cfg.renderer.min_deadline_for_full_ladder_ms();
    let cdp_tiers = cfg.renderer.cdp_tier_count() as u64;

    // wait_for = 20_000 is 12_000 above SPA_DEFAULT_MS (8_000); that excess
    // is added once per CDP tier.
    let extended_ms = cfg.effective_deadline_ms(None, Some(20_000));
    assert_eq!(extended_ms, ladder_min_ms + 12_000 * cdp_tiers);

    // A wait_for below the SPA default adds nothing on top of the ladder min.
    let short_wait_ms = cfg.effective_deadline_ms(None, Some(2_000));
    assert_eq!(short_wait_ms, ladder_min_ms);
}
1620
#[test]
fn effective_request_timeout_covers_map_ceiling() {
    let mut cfg = AppConfig::default();
    cfg.renderer = renderer_with_chrome_only(30_000);
    cfg.server.request_timeout_secs = 60;
    cfg.crawler.max_concurrency = 10;
    cfg.search.max_limit = 20;
    cfg.search.timeout_ms = 15_000;
    cfg.request.deadline_ms_default = 8_000;
    cfg.request.auto_extend_deadline_for_ladder = true;

    // The map ceiling (300s) plus the 5s buffer sets a floor of 305s.
    let timeout_secs = cfg.effective_request_timeout_secs();
    assert!(timeout_secs >= 305);
}
1634
#[test]
fn effective_request_timeout_disabled_returns_baseline() {
    let mut cfg = AppConfig::default();
    cfg.server.request_timeout_secs = 60;
    cfg.request.auto_extend_deadline_for_ladder = false;

    // Auto-extend off: the configured server timeout is returned as-is.
    let timeout_secs = cfg.effective_request_timeout_secs();
    assert_eq!(timeout_secs, 60);
}
1642
#[test]
fn effective_request_timeout_respects_operator_override() {
    let mut cfg = AppConfig::default();
    cfg.renderer = renderer_with_chrome_only(30_000);
    // Operator deliberately configured a high server timeout.
    cfg.server.request_timeout_secs = 600;
    cfg.request.auto_extend_deadline_for_ladder = true;

    // The explicit 600s must not be lowered to the auto-computed 305s.
    let timeout_secs = cfg.effective_request_timeout_secs();
    assert_eq!(timeout_secs, 600);
}
1652
#[test]
fn effective_request_timeout_search_sequential_batching() {
    // With max_concurrency = 1 the search runs ceil(max_limit / conc) = 20
    // sequential batches, which inflates the search budget.
    let mut cfg = AppConfig::default();
    cfg.renderer = renderer_with_chrome_only(30_000);
    cfg.server.request_timeout_secs = 60;
    cfg.crawler.max_concurrency = 1;
    cfg.search.max_limit = 20;
    cfg.search.timeout_ms = 15_000;
    cfg.request.deadline_ms_default = 8_000;
    cfg.request.auto_extend_deadline_for_ladder = true;

    let actual_secs = cfg.effective_request_timeout_secs();

    // The Tower envelope has to cover the worst-case implicit scrape, i.e.
    // one where the caller pushed `wait_for` to MAX_WAIT_FOR_MS (60s) without
    // supplying `deadlineMs`. Build the expectation from that same scrape
    // budget.
    let worst_scrape_ms = cfg.effective_deadline_ms(None, Some(60_000));
    let search_budget_ms = 15_000 + 20 * worst_scrape_ms;
    let widest_ms = worst_scrape_ms.max(search_budget_ms).max(300_000);
    let auto_secs = widest_ms.div_ceil(1_000) + 5;
    assert_eq!(actual_secs, 60u64.max(auto_secs));
}
1674
#[test]
#[cfg(not(feature = "cdp"))]
fn cdp_tier_count_zero_without_cdp_feature() {
    // A binary built without the `cdp` feature can never construct a JS
    // renderer, even if chrome/lightpanda endpoints are present in the
    // config. The deadline policy has to see that and fall back to
    // HTTP-only behavior.
    let chrome_endpoint = CdpEndpoint {
        ws_url: "ws://chrome:9222".into(),
    };
    let lightpanda_endpoint = CdpEndpoint {
        ws_url: "ws://lp:9222".into(),
    };
    let renderer = RendererConfig {
        mode: RendererMode::Auto,
        page_timeout_ms: 15_000,
        chrome_timeout_ms: Some(30_000),
        chrome: Some(chrome_endpoint),
        lightpanda: Some(lightpanda_endpoint),
        ..RendererConfig::default()
    };

    assert_eq!(renderer.cdp_tier_count(), 0);
    // The ladder budget is just the HTTP tier (page_timeout_ms).
    assert_eq!(renderer.min_deadline_for_full_ladder_ms(), 15_000);
}
1697
#[test]
fn effective_deadline_skipped_for_http_only_mode() {
    // Raised as P2 in the codex review: an HTTP-only deployment has no CDP
    // clamping problem (no fetch/challenge/stability overhead), so
    // auto-extension must NOT quietly lift its default from 8s to 30s merely
    // because page_timeout_ms defaults high.
    let http_only_renderer = RendererConfig {
        mode: RendererMode::Auto,
        page_timeout_ms: 30_000,
        // Deliberately no CDP endpoints of any kind.
        chrome: None,
        playwright: None,
        lightpanda: None,
        ..RendererConfig::default()
    };

    let mut cfg = AppConfig::default();
    cfg.renderer = http_only_renderer;
    cfg.request.deadline_ms_default = 8_000;
    cfg.request.auto_extend_deadline_for_ladder = true;

    assert_eq!(cfg.renderer.cdp_tier_count(), 0);
    // Neither the plain call nor a long wait_for moves the 8s default.
    let without_wait_ms = cfg.effective_deadline_ms(None, None);
    assert_eq!(without_wait_ms, 8_000);
    let with_wait_ms = cfg.effective_deadline_ms(None, Some(30_000));
    assert_eq!(with_wait_ms, 8_000);
}
1720
#[test]
#[cfg(feature = "cdp")]
fn min_deadline_full_ladder_playwright_only() {
    // The Playwright tier is budgeted as one chrome_timeout plus one CDP
    // overhead, mirroring the runtime predicate in
    // `crw-renderer/src/lib.rs`.
    let playwright_endpoint = CdpEndpoint {
        ws_url: "ws://playwright:9222".into(),
    };
    let renderer = RendererConfig {
        mode: RendererMode::Playwright,
        page_timeout_ms: 15_000,
        http_timeout_ms: Some(15_000),
        chrome_timeout_ms: Some(30_000),
        playwright: Some(playwright_endpoint),
        ..RendererConfig::default()
    };

    assert_eq!(renderer.cdp_tier_count(), 1);
    // 15_000 (http) + 30_000 (chrome-equivalent) + 1 * 28_000 (overhead).
    let expected_ms = 15_000 + 30_000 + 28_000;
    assert_eq!(renderer.min_deadline_for_full_ladder_ms(), expected_ms);
}
1743
#[test]
fn renderer_phase_toggles_default_off_or_safe() {
    let defaults = RendererConfig::default();

    // Request interception is off and no host is exempted from it.
    assert!(!defaults.chrome_intercept_resources);
    assert!(!defaults.chrome_intercept_stylesheets);
    assert!(defaults.chrome_host_intercept_disable.is_empty());

    // Navigation budget default.
    assert_eq!(defaults.chrome_nav_budget_ms, 12_000);

    // Context pool and predictor are both opt-in.
    assert!(!defaults.chrome_context_pool_enabled);
    assert!(!defaults.use_predictor);
}
1754
#[test]
fn crawler_per_host_limiter_defaults() {
    // Default per-host limiter: zero minimum interval, one concurrent
    // request per host.
    let defaults = CrawlerConfig::default();
    assert_eq!(defaults.per_host_max_concurrent, 1);
    assert_eq!(defaults.per_host_min_interval_ms, 0);
}
1761
#[test]
fn env_var_overrides_toml_defaults() {
    // Recover the guard if an earlier env test panicked while holding the
    // lock; otherwise one failure would cascade into a PoisonError here.
    let _env_guard = ENV_LOCK
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner());
    clear_renderer_env();
    // SAFETY: environment mutation is serialized through ENV_LOCK, held for
    // the whole test. Assumes every env-touching test takes the same lock.
    unsafe {
        std::env::set_var("CRW_SERVER__PORT", "4444");
        std::env::set_var("CRW_RENDERER__LIGHTPANDA__WS_URL", "ws://test:9999/");
    }

    // Keep the result unexamined until the environment is restored, so a
    // failing load cannot leak these overrides into later tests.
    let loaded = AppConfig::load();
    clear_renderer_env();
    // Also drop the two variables set above explicitly, so cleanup does not
    // depend on which names `clear_renderer_env` (defined elsewhere) covers.
    // SAFETY: still under ENV_LOCK, see above.
    unsafe {
        std::env::remove_var("CRW_SERVER__PORT");
        std::env::remove_var("CRW_RENDERER__LIGHTPANDA__WS_URL");
    }
    let cfg = loaded.unwrap();

    assert_eq!(cfg.server.port, 4444, "env var should override server.port");
    assert_eq!(
        cfg.renderer.lightpanda.as_ref().unwrap().ws_url,
        "ws://test:9999/",
        "env var should override renderer.lightpanda.ws_url"
    );
}
1780
#[test]
fn user_config_path_honors_override_env() {
    // Recover the guard if an earlier env test panicked while holding the
    // lock; otherwise one failure would cascade into a PoisonError here.
    let _env_guard = ENV_LOCK
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner());
    // Process-id suffix keeps the directory name unique per test process.
    let tmp = std::env::temp_dir().join(format!("crw-cfg-test-{}", std::process::id()));
    // SAFETY: environment mutation is serialized through ENV_LOCK, held for
    // the whole test. Assumes every env-touching test takes the same lock.
    unsafe {
        std::env::set_var("CRW_USER_CONFIG_DIR", &tmp);
    }
    // Restore the environment before unwrapping, so a failure here cannot
    // leave CRW_USER_CONFIG_DIR set for later tests.
    let resolved = user_config_path();
    // SAFETY: still under ENV_LOCK, see above.
    unsafe {
        std::env::remove_var("CRW_USER_CONFIG_DIR");
    }
    let p = resolved.unwrap();
    assert_eq!(p, tmp.join("config.toml"));
}
1794
#[test]
fn user_config_file_is_picked_up_by_load() {
    // Recover the guard if an earlier env test panicked while holding the
    // lock; otherwise one failure would cascade into a PoisonError here.
    let _env_guard = ENV_LOCK
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner());
    clear_renderer_env();
    // Process-id suffix keeps the directory name unique per test process.
    let tmp = std::env::temp_dir().join(format!("crw-load-test-{}", std::process::id()));
    std::fs::create_dir_all(&tmp).unwrap();
    let cfg_path = tmp.join("config.toml");
    std::fs::write(
        &cfg_path,
        r#"
[client]
api_url = "https://api.example.com"
api_key = "test-key-123"

[search]
searxng_url = "http://localhost:9999"

[extraction.llm]
provider = "deepseek"
api_key = "sk-test"
model = "deepseek-chat"
"#,
    )
    .unwrap();

    // SAFETY: environment mutation is serialized through ENV_LOCK, held for
    // the whole test. Assumes every env-touching test takes the same lock.
    unsafe {
        std::env::set_var("CRW_USER_CONFIG_DIR", &tmp);
    }
    // Hold the result and clean up first: a failing load must not leave the
    // override variable or the temp directory behind for later tests.
    let loaded = AppConfig::load();
    // SAFETY: still under ENV_LOCK, see above.
    unsafe {
        std::env::remove_var("CRW_USER_CONFIG_DIR");
    }
    // Best-effort removal; a leftover temp dir must not fail the test.
    std::fs::remove_dir_all(&tmp).ok();
    let cfg = loaded.unwrap();

    assert_eq!(
        cfg.client.api_url.as_deref(),
        Some("https://api.example.com")
    );
    assert_eq!(cfg.client.api_key.as_deref(), Some("test-key-123"));
    assert_eq!(
        cfg.search.searxng_url.as_deref(),
        Some("http://localhost:9999")
    );
    let llm = cfg.extraction.llm.expect("llm config present");
    assert_eq!(llm.provider, "deepseek");
    assert_eq!(llm.api_key, "sk-test");
}
1842
#[test]
fn env_var_beats_user_config() {
    // Recover the guard if an earlier env test panicked while holding the
    // lock; otherwise one failure would cascade into a PoisonError here.
    let _env_guard = ENV_LOCK
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner());
    clear_renderer_env();
    // Process-id suffix keeps the directory name unique per test process.
    let tmp = std::env::temp_dir().join(format!("crw-prec-test-{}", std::process::id()));
    std::fs::create_dir_all(&tmp).unwrap();
    std::fs::write(
        tmp.join("config.toml"),
        r#"
[search]
searxng_url = "http://from-file:8080"
"#,
    )
    .unwrap();

    // SAFETY: environment mutation is serialized through ENV_LOCK, held for
    // the whole test. Assumes every env-touching test takes the same lock.
    unsafe {
        std::env::set_var("CRW_USER_CONFIG_DIR", &tmp);
        std::env::set_var("CRW_SEARCH__SEARXNG_URL", "http://from-env:8080");
    }
    // Hold the result and clean up first: a failing load must not leave the
    // two variables or the temp directory behind for later tests.
    let loaded = AppConfig::load();
    // SAFETY: still under ENV_LOCK, see above.
    unsafe {
        std::env::remove_var("CRW_USER_CONFIG_DIR");
        std::env::remove_var("CRW_SEARCH__SEARXNG_URL");
    }
    // Best-effort removal; a leftover temp dir must not fail the test.
    std::fs::remove_dir_all(&tmp).ok();
    let cfg = loaded.unwrap();

    assert_eq!(
        cfg.search.searxng_url.as_deref(),
        Some("http://from-env:8080"),
        "env var must win over user config file"
    );
}
1875
#[test]
fn effective_proxy_credentials_appends_country_suffix() {
    let renderer = RendererConfig {
        proxy_default_country: Some("de".into()),
        proxy_base_pass: Some("pw".into()),
        proxy_base_user: Some("abc".into()),
        ..RendererConfig::default()
    };

    // A per-request country takes precedence over the configured default
    // ("de"); the password is passed through unchanged.
    let (user, pass) = renderer.effective_proxy_credentials(Some("us")).unwrap();
    assert_eq!(user, "abc__cr.us");
    assert_eq!(pass, "pw");

    // Upper-case input is lower-cased in the suffix.
    let (user, _) = renderer.effective_proxy_credentials(Some("GB")).unwrap();
    assert_eq!(user, "abc__cr.gb", "uppercase input is normalized");

    // With no per-request country the default country is used.
    let (user, _) = renderer.effective_proxy_credentials(None).unwrap();
    assert_eq!(user, "abc__cr.de");
}
1894
#[test]
fn effective_proxy_credentials_invalid_country_uses_global_pool() {
    let renderer = RendererConfig {
        proxy_base_pass: Some("pw".into()),
        proxy_base_user: Some("abc".into()),
        ..RendererConfig::default()
    };
    // Resolves credentials for `country` and returns only the username part.
    let user_for = |country: &str| renderer.effective_proxy_credentials(Some(country)).unwrap().0;

    // A rejected country yields the bare base user, i.e. the global pool.
    // Three-letter ISO code.
    assert_eq!(user_for("usa"), "abc");
    // Contains a digit.
    assert_eq!(user_for("u1"), "abc");
    // Nothing left once trimmed.
    assert_eq!(user_for(" "), "abc");
}
1912
#[test]
fn effective_proxy_credentials_no_base_returns_none() {
    // No base credentials at all → no proxy credentials.
    let unconfigured = RendererConfig::default();
    let creds = unconfigured.effective_proxy_credentials(Some("us"));
    assert!(creds.is_none());

    // A base user without a base password is still incomplete.
    let user_without_pass = RendererConfig {
        proxy_base_user: Some("abc".into()),
        ..RendererConfig::default()
    };
    let creds = user_without_pass.effective_proxy_credentials(Some("us"));
    assert!(creds.is_none());
}
1924}