Skip to main content

rover/config/
mod.rs

1//! Configuration loading.
2//!
3//! M1 covers a tiny subset of the full schema documented in PRD §12.
4//! Subsequent milestones extend this struct.
5
6pub mod edit;
7pub mod provenance;
8
9use serde::{Deserialize, Serialize};
10use std::path::{Path, PathBuf};
11use std::time::Duration;
12use thiserror::Error;
13
14#[derive(Debug, Error)]
15pub enum ConfigError {
16    #[error("failed to read config at {path}: {source}")]
17    Read {
18        path: String,
19        source: std::io::Error,
20    },
21
22    #[error("failed to parse config at {path}: {source}")]
23    Parse {
24        path: String,
25        source: toml::de::Error,
26    },
27
28    #[error("invalid config at {path}: {message}")]
29    Invalid { path: String, message: String },
30}
31
32#[derive(Debug, Clone, Default, Deserialize, Serialize)]
33#[serde(deny_unknown_fields)]
34pub struct Config {
35    #[serde(default)]
36    pub fetch: FetchConfig,
37
38    #[serde(default)]
39    pub ssrf: SsrfConfig,
40
41    #[serde(default)]
42    pub debug: DebugConfig,
43
44    #[serde(default)]
45    pub cache: CacheConfig,
46
47    #[serde(default)]
48    pub tokenizer: TokenizerConfig,
49
50    #[serde(default)]
51    pub mcp: McpConfig,
52
53    #[serde(default)]
54    pub output: OutputConfig,
55
56    #[serde(default)]
57    pub rate_limit: RateLimitConfig,
58
59    #[serde(default)]
60    pub robots: RobotsConfig,
61
62    #[serde(default)]
63    pub summarization: SummarizationConfig,
64
65    #[serde(default)]
66    pub backends: std::collections::HashMap<String, BackendConfig>,
67
68    #[serde(default)]
69    pub headless: HeadlessConfig,
70
71    #[serde(default)]
72    pub image_captions: ImageCaptionsConfig,
73
74    #[serde(default)]
75    pub captioners: std::collections::BTreeMap<String, CaptionerConfig>,
76
77    #[serde(default)]
78    pub prompt_injection: PromptInjectionConfig,
79}
80
81#[derive(Debug, Clone, Deserialize, Serialize)]
82#[serde(deny_unknown_fields)]
83pub struct FetchConfig {
84    #[serde(default = "default_user_agent")]
85    pub user_agent: String,
86
87    /// Request timeout in seconds. Stored as u64 for TOML friendliness.
88    #[serde(default = "default_timeout_secs")]
89    pub timeout_secs: u64,
90}
91
92impl Default for FetchConfig {
93    fn default() -> Self {
94        Self {
95            user_agent: default_user_agent(),
96            timeout_secs: default_timeout_secs(),
97        }
98    }
99}
100
101impl FetchConfig {
102    pub fn timeout(&self) -> Duration {
103        Duration::from_secs(self.timeout_secs)
104    }
105}
106
107impl Config {
108    /// Apply CLI / MCP override flags onto an already-loaded config.
109    ///
110    /// Centralises the override logic shared by `rover fetch`, `rover mcp`, and
111    /// (M6) `rover batch`. Bypasses `config::validate`; concurrency widths are
112    /// clamped to >=1 to avoid `Semaphore::new(0)` silently hanging on acquire
113    /// (regression fix from M5 commit 02bd7e8).
114    pub fn apply_overrides(
115        &mut self,
116        rate_limit_rpm: Option<u32>,
117        per_host_concurrency: Option<u32>,
118        global_concurrency: Option<u32>,
119        max_retries: Option<u8>,
120        ignore_robots: bool,
121    ) {
122        if let Some(v) = rate_limit_rpm {
123            self.rate_limit.requests_per_minute_per_domain = v;
124        }
125        if let Some(v) = per_host_concurrency {
126            self.rate_limit.per_domain_concurrency = v.max(1);
127        }
128        if let Some(v) = global_concurrency {
129            self.rate_limit.global_concurrency = v.max(1);
130        }
131        if let Some(v) = max_retries {
132            self.rate_limit.max_retries = v;
133        }
134        if ignore_robots {
135            self.robots.respect = false;
136        }
137    }
138
139    /// Test-only convenience for swapping the SSRF level on an
140    /// already-loaded config. Production callers go through TOML.
141    #[cfg(any(test, feature = "test-loopback"))]
142    pub fn with_ssrf_level(mut self, level: &str) -> Self {
143        self.ssrf.level = level.to_string();
144        self
145    }
146}
147
148fn default_user_agent() -> String {
149    format!(
150        "Rover/{} (+https://github.com/aaronbassett/rover)",
151        env!("CARGO_PKG_VERSION")
152    )
153}
154
/// Default request timeout for `[fetch]`, in seconds.
fn default_timeout_secs() -> u64 {
    const DEFAULT_TIMEOUT_SECS: u64 = 15;
    DEFAULT_TIMEOUT_SECS
}
158
159/// Cache configuration. All durations are parsed by `humantime` (e.g. "1h",
160/// "5m", "7d", "30s"). Defaults follow PRD §12.
161#[derive(Debug, Clone, Deserialize, Serialize)]
162#[serde(deny_unknown_fields)]
163pub struct CacheConfig {
164    #[serde(default = "default_cache_default_ttl", with = "humantime_serde")]
165    pub default_ttl: Duration,
166
167    #[serde(default = "default_cache_min_ttl", with = "humantime_serde")]
168    pub min_ttl: Duration,
169
170    #[serde(default = "default_cache_max_ttl", with = "humantime_serde")]
171    pub max_ttl: Duration,
172
173    /// Stale-while-revalidate grace window. When a cache entry expired no
174    /// more than this long ago, `fetch_with_cache` may serve the stale row
175    /// and queue a background `revalidate` task. Beyond this window the
176    /// row is treated as a cache miss and re-fetched synchronously, so
177    /// callers never receive arbitrarily old content from the cache.
178    /// Default: 5 minutes.
179    #[serde(default = "default_cache_swr_window", with = "humantime_serde")]
180    pub stale_while_revalidate_window: Duration,
181
182    #[serde(default)]
183    pub override_no_store: bool,
184
185    #[serde(default)]
186    pub override_no_store_domains: Vec<String>,
187
188    /// When true, store the gzipped raw HTML alongside the extracted Markdown.
189    /// Disabled by default to keep the database small.
190    #[serde(default)]
191    pub store_raw_html: bool,
192}
193
194impl Default for CacheConfig {
195    fn default() -> Self {
196        Self {
197            default_ttl: default_cache_default_ttl(),
198            min_ttl: default_cache_min_ttl(),
199            max_ttl: default_cache_max_ttl(),
200            stale_while_revalidate_window: default_cache_swr_window(),
201            override_no_store: false,
202            override_no_store_domains: vec![],
203            store_raw_html: false,
204        }
205    }
206}
207
/// Default `cache.default_ttl`: 15 minutes.
///
/// Deliberately shorter than the earlier 1h: without an explicit
/// `Cache-Control` max-age, stale or attacker-influenced content that made it
/// into the cache only survives briefly before the next revalidation. Origins
/// that want longer caching can ask for it through response headers.
fn default_cache_default_ttl() -> Duration {
    const MINUTES: u64 = 15;
    Duration::from_secs(MINUTES * 60)
}

/// Default `cache.min_ttl`: 300 seconds.
fn default_cache_min_ttl() -> Duration {
    const SECS: u64 = 300;
    Duration::from_secs(SECS)
}

/// Default `cache.max_ttl`: 7 days.
fn default_cache_max_ttl() -> Duration {
    const DAYS: u64 = 7;
    Duration::from_secs(DAYS * 86400)
}

/// Default `cache.stale_while_revalidate_window`: 5 minutes.
fn default_cache_swr_window() -> Duration {
    const MINUTES: u64 = 5;
    Duration::from_secs(MINUTES * 60)
}
227
228/// Tokenizer configuration. The `default` family is used for token counting
229/// in the frontmatter and the MCP layer when callers don't specify one.
230#[derive(Debug, Clone, Deserialize, Serialize)]
231#[serde(deny_unknown_fields)]
232pub struct TokenizerConfig {
233    #[serde(default = "default_tokenizer")]
234    pub default: crate::tokenizer::Tokenizer,
235}
236
237impl Default for TokenizerConfig {
238    fn default() -> Self {
239        Self {
240            default: default_tokenizer(),
241        }
242    }
243}
244
245fn default_tokenizer() -> crate::tokenizer::Tokenizer {
246    crate::tokenizer::Tokenizer::O200k
247}
248
249/// MCP server configuration. Durations are parsed by `humantime`
250/// (e.g. "5s", "60s", "2m"). Both intervals must be non-zero.
251#[derive(Debug, Clone, Deserialize, Serialize)]
252#[serde(deny_unknown_fields)]
253pub struct McpConfig {
254    #[serde(default = "default_heartbeat_interval", with = "humantime_serde")]
255    pub heartbeat_interval: Duration,
256
257    #[serde(default = "default_reap_threshold", with = "humantime_serde")]
258    pub reap_threshold: Duration,
259}
260
261impl Default for McpConfig {
262    fn default() -> Self {
263        Self {
264            heartbeat_interval: default_heartbeat_interval(),
265            reap_threshold: default_reap_threshold(),
266        }
267    }
268}
269
/// Default `mcp.heartbeat_interval`: 5 seconds.
fn default_heartbeat_interval() -> Duration {
    const SECS: u64 = 5;
    Duration::from_secs(SECS)
}

/// Default `mcp.reap_threshold`: 60 seconds.
fn default_reap_threshold() -> Duration {
    const SECS: u64 = 60;
    Duration::from_secs(SECS)
}
277
278/// Output configuration. When `dir` is `None`, `ROVER_OUTPUT_DIR` (if set)
279/// takes precedence, otherwise the platform `data_local_dir()/rover/output`
280/// default applies. See `OutputPaths::resolve`.
281#[derive(Debug, Clone, Default, Deserialize, Serialize)]
282#[serde(deny_unknown_fields)]
283pub struct OutputConfig {
284    #[serde(default)]
285    pub dir: Option<std::path::PathBuf>,
286}
287
288/// Per-domain pacing knobs. All HTTP-bound code paths run through a single
289/// `Pacer` built from this struct at startup. See M5 design spec §3 and §4.
290#[derive(Debug, Clone, Deserialize, Serialize)]
291#[serde(deny_unknown_fields)]
292pub struct RateLimitConfig {
293    #[serde(default = "default_rpm_per_domain")]
294    pub requests_per_minute_per_domain: u32,
295
296    #[serde(default = "default_per_domain_concurrency")]
297    pub per_domain_concurrency: u32,
298
299    #[serde(default = "default_global_concurrency")]
300    pub global_concurrency: u32,
301
302    #[serde(default = "default_max_retries")]
303    pub max_retries: u8,
304
305    #[serde(default = "default_initial_backoff", with = "humantime_serde")]
306    pub initial_backoff: Duration,
307
308    #[serde(default = "default_max_backoff", with = "humantime_serde")]
309    pub max_backoff: Duration,
310
311    #[serde(default = "default_retry_after_ceiling", with = "humantime_serde")]
312    pub retry_after_ceiling: Duration,
313
314    /// Deterministic seed for the backoff jitter RNG. `None` (default) means
315    /// entropy; set in tests to make timing assertions reproducible.
316    #[serde(default)]
317    pub jitter_seed: Option<u64>,
318
319    /// Threshold (seconds) above which a server-provided `Retry-After`
320    /// converts a synchronous fetch into a deferred `retry` task instead of
321    /// sleeping in-line. See M6 design §3.
322    #[serde(default = "default_deferred_threshold_secs")]
323    pub deferred_retry_threshold_secs: u64,
324}
325
326impl Default for RateLimitConfig {
327    fn default() -> Self {
328        Self {
329            requests_per_minute_per_domain: default_rpm_per_domain(),
330            per_domain_concurrency: default_per_domain_concurrency(),
331            global_concurrency: default_global_concurrency(),
332            max_retries: default_max_retries(),
333            initial_backoff: default_initial_backoff(),
334            max_backoff: default_max_backoff(),
335            retry_after_ceiling: default_retry_after_ceiling(),
336            jitter_seed: None,
337            deferred_retry_threshold_secs: default_deferred_threshold_secs(),
338        }
339    }
340}
341
/// Default `rate_limit.requests_per_minute_per_domain`.
fn default_rpm_per_domain() -> u32 {
    60_u32
}

/// Default `rate_limit.per_domain_concurrency`.
fn default_per_domain_concurrency() -> u32 {
    2_u32
}

/// Default `rate_limit.global_concurrency`.
fn default_global_concurrency() -> u32 {
    8_u32
}

/// Default `rate_limit.max_retries`.
fn default_max_retries() -> u8 {
    3_u8
}

/// Default `rate_limit.initial_backoff`: half a second.
fn default_initial_backoff() -> Duration {
    const MILLIS: u64 = 500;
    Duration::from_millis(MILLIS)
}

/// Default `rate_limit.max_backoff`: 30 seconds.
fn default_max_backoff() -> Duration {
    const SECS: u64 = 30;
    Duration::from_secs(SECS)
}

/// Default `rate_limit.retry_after_ceiling`: 300 seconds.
fn default_retry_after_ceiling() -> Duration {
    const SECS: u64 = 300;
    Duration::from_secs(SECS)
}

/// Default `rate_limit.deferred_retry_threshold_secs`.
fn default_deferred_threshold_secs() -> u64 {
    30_u64
}
366
367/// Robots.txt fetch + respect knobs.
368#[derive(Debug, Clone, Deserialize, Serialize)]
369#[serde(deny_unknown_fields)]
370pub struct RobotsConfig {
371    #[serde(default = "default_respect")]
372    pub respect: bool,
373
374    /// Hosts for which robots.txt is not fetched and rules are not enforced.
375    /// Lowercased in-place by `validate`.
376    #[serde(default)]
377    pub ignore_domains: Vec<String>,
378
379    /// Used when the robots.txt HTTP response has no `Cache-Control: max-age`.
380    #[serde(default = "default_robots_ttl", with = "humantime_serde")]
381    pub default_ttl: Duration,
382
383    /// Used when robots.txt fetch failed with 5xx or transport error (fail-closed).
384    /// Short by design so a recovered server is picked up quickly.
385    #[serde(default = "default_robots_failure_ttl", with = "humantime_serde")]
386    pub failure_ttl: Duration,
387}
388
389impl Default for RobotsConfig {
390    fn default() -> Self {
391        Self {
392            respect: default_respect(),
393            ignore_domains: Vec::new(),
394            default_ttl: default_robots_ttl(),
395            failure_ttl: default_robots_failure_ttl(),
396        }
397    }
398}
399
/// Default `robots.respect`: off.
///
/// Rover acts as an agent's browser rather than a spider or scraper: it
/// fetches, one at a time, the page a user or agent explicitly asked for.
/// robots.txt governs automated crawling, so enforcement is opt-in. Set
/// `robots.respect = true` to turn it back on; otherwise only the rate limits
/// apply.
fn default_respect() -> bool {
    false
}

/// Default `robots.default_ttl`: 24 hours.
fn default_robots_ttl() -> Duration {
    const HOURS: u64 = 24;
    Duration::from_secs(HOURS * 3600)
}

/// Default `robots.failure_ttl`: 5 minutes.
fn default_robots_failure_ttl() -> Duration {
    const MINUTES: u64 = 5;
    Duration::from_secs(MINUTES * 60)
}
413
414/// Top-level `[summarization]` section.
415#[derive(Debug, Clone, Deserialize, Serialize)]
416#[serde(deny_unknown_fields)]
417pub struct SummarizationConfig {
418    #[serde(default = "default_summarization_backend")]
419    pub default_backend: String,
420
421    #[serde(default = "default_summarization_mode")]
422    pub default_mode: String,
423
424    #[serde(default = "default_summarization_style")]
425    pub default_style: String,
426
427    #[serde(default = "default_summarization_fallback")]
428    pub fallback_to_extractive: bool,
429
430    /// Per-table summarization defaults consumed by the
431    /// `TablesMode::Summarize` hook in `mcp::tools::fetch`. Lives under
432    /// `[summarization.tables]` in the config file.
433    #[serde(default)]
434    pub tables: TablesSummarizationConfig,
435}
436
437impl Default for SummarizationConfig {
438    fn default() -> Self {
439        Self {
440            default_backend: default_summarization_backend(),
441            default_mode: default_summarization_mode(),
442            default_style: default_summarization_style(),
443            fallback_to_extractive: default_summarization_fallback(),
444            tables: TablesSummarizationConfig::default(),
445        }
446    }
447}
448
/// Default `summarization.default_backend`.
fn default_summarization_backend() -> String {
    String::from("default")
}

/// Default `summarization.default_mode`.
fn default_summarization_mode() -> String {
    String::from("abstractive")
}

/// Default `summarization.default_style`.
fn default_summarization_style() -> String {
    String::from("prose")
}

/// Default `summarization.fallback_to_extractive`.
fn default_summarization_fallback() -> bool {
    true
}
461
462/// `[summarization.tables]` block. Controls the per-table summarize
463/// defaults used by the `TablesMode::Summarize` hook.
464#[derive(Debug, Clone, Deserialize, Serialize)]
465#[serde(deny_unknown_fields)]
466pub struct TablesSummarizationConfig {
467    #[serde(default = "default_tables_target_tokens")]
468    pub target_tokens: usize,
469    #[serde(default = "default_tables_focus")]
470    pub focus: String,
471}
472
473impl Default for TablesSummarizationConfig {
474    fn default() -> Self {
475        Self {
476            target_tokens: default_tables_target_tokens(),
477            focus: default_tables_focus(),
478        }
479    }
480}
481
/// Default `summarization.tables.target_tokens`.
fn default_tables_target_tokens() -> usize {
    150_usize
}

/// Default `summarization.tables.focus`.
fn default_tables_focus() -> String {
    String::from("Describe what this table shows. Highlight any extreme values or notable rows.")
}
488
489/// One `[backends.<name>]` block. Free-form `kind`/`provider` strings —
490/// validation lives in `summarizer::registry::build` where the parsed
491/// values are matched against the typed enum.
492#[derive(Debug, Clone, Deserialize, Serialize, Default)]
493#[serde(deny_unknown_fields)]
494pub struct BackendConfig {
495    pub kind: String,
496    #[serde(default)]
497    pub provider: Option<String>,
498    #[serde(default)]
499    pub model: Option<String>,
500    #[serde(default)]
501    pub base_url: Option<String>,
502    #[serde(default)]
503    pub api_key_env: Option<String>,
504}
505
506/// `[headless]` configuration block. M9 adds browser/headless-fetch knobs.
507#[derive(Debug, Clone, Deserialize, Serialize)]
508#[serde(deny_unknown_fields)]
509pub struct HeadlessConfig {
510    #[serde(default = "default_headless_max_concurrent")]
511    pub max_concurrent: usize,
512
513    /// Path to a Chrome/Chromium executable. Empty string means auto-detect.
514    #[serde(default)]
515    pub chrome_executable: String,
516
517    /// Fulfill image requests with empty 200 (saves bandwidth + render time).
518    #[serde(default = "default_block_images")]
519    pub block_images: bool,
520
521    /// Fulfill font requests with empty 200.
522    #[serde(default = "default_block_fonts")]
523    pub block_fonts: bool,
524
525    /// Fulfill audio/video/track requests with empty 200.
526    #[serde(default = "default_block_media")]
527    pub block_media: bool,
528
529    /// Fulfill CSS requests with empty 200. Default `false` — many SPAs need
530    /// layout to render correctly.
531    #[serde(default)]
532    pub block_css: bool,
533
534    /// Fulfill third-party analytics/tracker requests with empty 200.
535    #[serde(default = "default_block_third_party")]
536    pub block_third_party: bool,
537
538    /// Disable service workers at browser init via CDP bypass. Honored by
539    /// `HeadlessRenderer` setup (not by the intercept handler).
540    #[serde(default = "default_block_service_workers")]
541    pub block_service_workers: bool,
542
543    /// Default wait condition: `"domcontentloaded"` or `"networkidle0"`
544    /// (wait for the network to fully settle — captures post-load XHR content).
545    #[serde(default = "default_headless_wait")]
546    pub default_wait: String,
547
548    /// Per-render timeout in seconds (covers the wait phase).
549    #[serde(default = "default_headless_timeout_secs")]
550    pub timeout_secs: u64,
551
552    /// Whether `HeadlessMode::Auto` should run the SPA detection heuristic.
553    #[serde(default = "default_auto_detect_spa")]
554    pub auto_detect_spa: bool,
555
556    /// In `Auto` mode, the delay (in seconds) before escalating to a headless
557    /// render once the plain HTTP fetch is in — i.e. between detecting that a
558    /// render is needed (an unrendered SPA, or a bot-protection challenge) and
559    /// launching/driving the browser. Gives the origin a breather between the
560    /// lightweight fetch and the heavier browser hit. `0` disables the pause.
561    /// Does not apply to `On` mode, which has no detection step.
562    #[serde(default = "default_headless_launch_delay_secs")]
563    pub launch_delay_secs: u64,
564}
565
566impl HeadlessConfig {
567    /// Render timeout as a `Duration`.
568    pub fn timeout(&self) -> std::time::Duration {
569        std::time::Duration::from_secs(self.timeout_secs)
570    }
571
572    /// Auto-mode pre-render escalation delay as a `Duration`.
573    pub fn launch_delay(&self) -> std::time::Duration {
574        std::time::Duration::from_secs(self.launch_delay_secs)
575    }
576}
577
578impl Default for HeadlessConfig {
579    fn default() -> Self {
580        Self {
581            max_concurrent: default_headless_max_concurrent(),
582            chrome_executable: String::new(),
583            block_images: default_block_images(),
584            block_fonts: default_block_fonts(),
585            block_media: default_block_media(),
586            block_css: false,
587            block_third_party: default_block_third_party(),
588            block_service_workers: default_block_service_workers(),
589            default_wait: default_headless_wait(),
590            timeout_secs: default_headless_timeout_secs(),
591            auto_detect_spa: default_auto_detect_spa(),
592            launch_delay_secs: default_headless_launch_delay_secs(),
593        }
594    }
595}
596
/// Default `headless.max_concurrent`.
fn default_headless_max_concurrent() -> usize {
    4_usize
}

/// Default `headless.default_wait`.
fn default_headless_wait() -> String {
    String::from("domcontentloaded")
}

/// Default `headless.timeout_secs`.
fn default_headless_timeout_secs() -> u64 {
    15_u64
}

/// Default `headless.launch_delay_secs`.
fn default_headless_launch_delay_secs() -> u64 {
    2_u64
}

/// Default `headless.auto_detect_spa`: on.
fn default_auto_detect_spa() -> bool {
    const ENABLED: bool = true;
    ENABLED
}

/// Default `headless.block_images`: on.
fn default_block_images() -> bool {
    const ENABLED: bool = true;
    ENABLED
}

/// Default `headless.block_fonts`: on.
fn default_block_fonts() -> bool {
    const ENABLED: bool = true;
    ENABLED
}

/// Default `headless.block_media`: on.
fn default_block_media() -> bool {
    const ENABLED: bool = true;
    ENABLED
}

/// Default `headless.block_third_party`: on.
fn default_block_third_party() -> bool {
    const ENABLED: bool = true;
    ENABLED
}

/// Default `headless.block_service_workers`: on.
fn default_block_service_workers() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
636
637/// `[image_captions]` defaults block.
638#[derive(Debug, Clone, Deserialize, Serialize)]
639#[serde(default, deny_unknown_fields)]
640pub struct ImageCaptionsConfig {
641    pub default: Option<String>,
642    pub max_tokens: usize,
643    pub max_per_page: usize,
644    pub min_width: u32,
645    pub min_height: u32,
646    #[serde(deserialize_with = "humanbytes_to_u64")]
647    pub max_bytes: u64,
648    pub max_concurrent: usize,
649}
650
651impl Default for ImageCaptionsConfig {
652    fn default() -> Self {
653        Self {
654            default: None,
655            max_tokens: 50,
656            max_per_page: 10,
657            min_width: 200,
658            min_height: 200,
659            max_bytes: 10 * 1024 * 1024,
660            max_concurrent: 2,
661        }
662    }
663}
664
665/// `[captioners.<name>]` block. Mirrors `BackendConfig` (M7).
666#[derive(Debug, Clone, Default, Deserialize, Serialize)]
667#[serde(default, deny_unknown_fields)]
668pub struct CaptionerConfig {
669    pub kind: String,
670    pub provider: Option<String>,
671    pub model: Option<String>,
672    pub base_url: Option<String>,
673    pub api_key_env: Option<String>,
674}
675
/// Parse a human-readable byte size such as "10MiB", "1.5GiB" or "1000" into
/// a raw `u64` byte count.
///
/// A bare unsigned integer is taken as a byte count. Otherwise the input is
/// split at its first ASCII letter into a decimal number and a unit. Units
/// are case-insensitive: `B`; decimal `K`/`KB`, `M`/`MB`, `G`/`GB`; binary
/// `KiB`, `MiB`, `GiB`. A fractional result is truncated toward zero.
///
/// # Errors
///
/// Returns a message when the input has no unit and is not an integer, when
/// the number does not parse, when the number is negative or not finite, when
/// the unit is unknown, or when the result does not fit in a `u64`.
pub fn parse_human_bytes(s: &str) -> Result<u64, String> {
    let s = s.trim();
    if let Ok(n) = s.parse::<u64>() {
        return Ok(n);
    }
    let (num_str, unit) = s
        .find(|c: char| c.is_ascii_alphabetic())
        .map(|i| s.split_at(i))
        .ok_or_else(|| format!("invalid size: {s}"))?;
    let num: f64 = num_str
        .trim()
        .parse()
        .map_err(|_| format!("invalid size number: {num_str}"))?;
    // A float-to-integer `as` cast saturates silently, so without this guard
    // "-5MB" would become 0 and an overlong digit string (parsed as infinity)
    // would become `u64::MAX`.
    if !num.is_finite() || num < 0.0 {
        return Err(format!(
            "size must be a non-negative finite number: {num_str}"
        ));
    }
    let mult: u64 = match unit.trim().to_ascii_uppercase().as_str() {
        "B" => 1,
        "K" | "KB" => 1_000,
        "KIB" => 1_024,
        "M" | "MB" => 1_000_000,
        "MIB" => 1_024 * 1_024,
        "G" | "GB" => 1_000_000_000,
        "GIB" => 1_024 * 1_024 * 1_024,
        other => return Err(format!("unknown size unit: {other}")),
    };
    let bytes = num * mult as f64;
    // `u64::MAX as f64` rounds up to 2^64, the first value that no longer
    // fits, hence `>=`.
    if bytes >= u64::MAX as f64 {
        return Err(format!("size out of range: {s}"));
    }
    Ok(bytes as u64)
}
703
704fn humanbytes_to_u64<'de, D>(d: D) -> Result<u64, D::Error>
705where
706    D: serde::Deserializer<'de>,
707{
708    use serde::de::Error as _;
709    let v = toml::Value::deserialize(d)?;
710    match v {
711        toml::Value::Integer(n) if n >= 0 => Ok(n as u64),
712        toml::Value::String(s) => parse_human_bytes(&s).map_err(D::Error::custom),
713        other => Err(D::Error::custom(format!(
714            "expected integer bytes or humansize string, got {other:?}",
715        ))),
716    }
717}
718
719/// Top-level `[ssrf]` section. M8 introduces this — earlier milestones
720/// hardcoded `SsrfLevel::Strict`. The `level` field is a free-form string
721/// here so the file accepts unknown levels with a typed error from the
722/// fetcher rather than a serde error; `validate_url`/`validate_addresses`
723/// reject malformed levels at first use.
724#[derive(Debug, Clone, Deserialize, Serialize)]
725#[serde(deny_unknown_fields)]
726pub struct SsrfConfig {
727    #[serde(default = "default_ssrf_level")]
728    pub level: String,
729
730    #[serde(default = "default_ssrf_project_root")]
731    pub project_root: std::path::PathBuf,
732}
733
734impl Default for SsrfConfig {
735    fn default() -> Self {
736        Self {
737            level: default_ssrf_level(),
738            project_root: default_ssrf_project_root(),
739        }
740    }
741}
742
/// Default `ssrf.level`.
fn default_ssrf_level() -> String {
    String::from("strict")
}

/// Default `ssrf.project_root`: the relative path `.`.
fn default_ssrf_project_root() -> std::path::PathBuf {
    ".".into()
}
750
751/// Top-level `[prompt_injection]` section. `level` and `model` are free-form
752/// strings here (mirroring `SsrfConfig.level`); `guard::GuardConfig::from_config`
753/// parses them into typed enums at first use, surfacing a typed error rather
754/// than a serde error.
755#[derive(Debug, Clone, Deserialize, Serialize)]
756#[serde(deny_unknown_fields)]
757pub struct PromptInjectionConfig {
758    #[serde(default = "default_pi_level")]
759    pub level: String,
760
761    #[serde(default = "default_pi_model")]
762    pub model: String,
763
764    #[serde(default = "default_pi_model_threshold")]
765    pub model_threshold: f64,
766
767    #[serde(default)]
768    pub allowlist: PromptInjectionAllowlist,
769
770    #[serde(default)]
771    pub agent_overrides: PromptInjectionOverrides,
772}
773
774impl Default for PromptInjectionConfig {
775    fn default() -> Self {
776        Self {
777            level: default_pi_level(),
778            model: default_pi_model(),
779            model_threshold: default_pi_model_threshold(),
780            allowlist: PromptInjectionAllowlist::default(),
781            agent_overrides: PromptInjectionOverrides::default(),
782        }
783    }
784}
785
786/// Per-method URL-glob allowlists. A URL matching the glob list skips that
787/// method on OUTPUT for that URL. A bare `"*"` disables the method entirely.
788#[derive(Debug, Clone, Default, Deserialize, Serialize)]
789#[serde(deny_unknown_fields)]
790pub struct PromptInjectionAllowlist {
791    #[serde(default)]
792    pub wrap: Vec<String>,
793    #[serde(default)]
794    pub patterns: Vec<String>,
795    #[serde(default)]
796    pub model: Vec<String>,
797}
798
799/// Per-method agent-override grants (default: all deny). The MCP `security`
800/// arg is honored for a method only when its grant here is `true`.
801#[derive(Debug, Clone, Default, Deserialize, Serialize)]
802#[serde(deny_unknown_fields)]
803pub struct PromptInjectionOverrides {
804    #[serde(default)]
805    pub wrap: bool,
806    #[serde(default)]
807    pub patterns: bool,
808    #[serde(default)]
809    pub model: bool,
810    #[serde(default)]
811    pub level: bool,
812}
813
/// Default `prompt_injection.level`.
fn default_pi_level() -> String {
    String::from("moderate")
}

/// Default `prompt_injection.model`.
fn default_pi_model() -> String {
    String::from("disabled")
}

/// Default `prompt_injection.model_threshold`.
fn default_pi_model_threshold() -> f64 {
    const THRESHOLD: f64 = 0.9;
    THRESHOLD
}
823
824/// Top-level `[debug]` section. M8 introduces this for HAR recording and
825/// log-level overrides.
826///
827/// `har_body_cap` accepts either a raw integer (bytes) or a humansize
828/// string like "64KiB" / "1MiB" via a custom deserializer. The internal
829/// representation is `u64` bytes.
830#[derive(Debug, Clone, Deserialize, Serialize)]
831#[serde(deny_unknown_fields)]
832pub struct DebugConfig {
833    #[serde(default = "default_debug_har_path")]
834    pub har_path: String,
835
836    #[serde(
837        default = "default_debug_har_body_cap",
838        deserialize_with = "deserialize_humansize"
839    )]
840    pub har_body_cap: u64,
841
842    #[serde(default = "default_debug_log_level")]
843    pub log_level: String,
844}
845
846impl Default for DebugConfig {
847    fn default() -> Self {
848        Self {
849            har_path: default_debug_har_path(),
850            har_body_cap: default_debug_har_body_cap(),
851            log_level: default_debug_log_level(),
852        }
853    }
854}
855
/// Default `debug.har_path`: the empty string.
fn default_debug_har_path() -> String {
    String::default()
}

/// Default `debug.har_body_cap`: 64 KiB, expressed in bytes.
fn default_debug_har_body_cap() -> u64 {
    const KIB: u64 = 1024;
    64 * KIB
}

/// Default `debug.log_level`.
fn default_debug_log_level() -> String {
    String::from("info")
}
867
868fn deserialize_humansize<'de, D>(deserializer: D) -> Result<u64, D::Error>
869where
870    D: serde::Deserializer<'de>,
871{
872    use serde::de::Error as _;
873    let v = toml::Value::deserialize(deserializer)?;
874    match v {
875        toml::Value::Integer(n) if n >= 0 => Ok(n as u64),
876        toml::Value::String(s) => parse_humansize(&s).map_err(D::Error::custom),
877        other => Err(D::Error::custom(format!(
878            "expected integer bytes or humansize string, got {other:?}",
879        ))),
880    }
881}
882
/// Parse a size such as "64KiB", "1MiB" or "4096" into a byte count.
///
/// The input is split at its first alphabetic character into an unsigned
/// integer and a suffix. Accepted suffixes are exact-case: none or `B`
/// (bytes), `KiB`, `MiB`, `GiB`. Fractions and decimal units (`KB`, ...) are
/// not accepted.
///
/// # Errors
///
/// Returns a message when the number does not parse as `u64`, when the suffix
/// is unknown, or when the resulting byte count would overflow `u64`.
fn parse_humansize(s: &str) -> Result<u64, String> {
    let s = s.trim();
    let (num_part, suffix) = match s.find(|c: char| c.is_alphabetic()) {
        Some(i) => s.split_at(i),
        None => (s, ""),
    };
    let n: u64 = num_part
        .trim()
        .parse()
        .map_err(|_| format!("invalid number in `{s}`"))?;
    let mult: u64 = match suffix.trim() {
        "" | "B" => 1,
        "KiB" => 1024,
        "MiB" => 1024 * 1024,
        "GiB" => 1024 * 1024 * 1024,
        other => {
            return Err(format!(
                "unknown size suffix `{other}` (expected KiB|MiB|GiB)"
            ));
        }
    };
    // A plain `n * mult` panics in debug builds and wraps in release builds
    // when the configured value is absurdly large; report it instead.
    n.checked_mul(mult)
        .ok_or_else(|| format!("size `{s}` overflows u64"))
}
906
907/// Load config. If `path` is provided, the file must exist and parse cleanly.
908/// If `path` is None, return defaults.
909pub fn load(path: Option<&Path>) -> Result<Config, ConfigError> {
910    let Some(path) = path else {
911        return Ok(Config::default());
912    };
913
914    let bytes = std::fs::read_to_string(path).map_err(|source| ConfigError::Read {
915        path: path.display().to_string(),
916        source,
917    })?;
918    let mut cfg: Config = toml::from_str(&bytes).map_err(|source| ConfigError::Parse {
919        path: path.display().to_string(),
920        source,
921    })?;
922    validate(&mut cfg).map_err(|message| ConfigError::Invalid {
923        path: path.display().to_string(),
924        message,
925    })?;
926    Ok(cfg)
927}
928
/// Ordered config-file candidates searched when `--config` is absent.
///
/// When `ROVER_CONFIG` is set to a non-empty value it designates the sole
/// candidate (an explicit redirect should not silently fall through to other
/// locations). An empty value is treated as unset: the empty path can never
/// name a config file, so honoring it would only mask the real defaults.
/// Otherwise the platform config dir (`<config_dir>/rover/rover.toml`) is
/// tried first, then a project-local `./rover.toml`.
///
/// The returned list always contains at least one path.
fn config_candidates_from(
    rover_config_env: Option<&str>,
    config_dir: Option<&Path>,
) -> Vec<PathBuf> {
    if let Some(p) = rover_config_env.filter(|p| !p.is_empty()) {
        return vec![PathBuf::from(p)];
    }
    let mut candidates = Vec::with_capacity(2);
    if let Some(dir) = config_dir {
        candidates.push(dir.join("rover").join("rover.toml"));
    }
    candidates.push(PathBuf::from("rover.toml"));
    candidates
}
949
950fn config_candidates() -> Vec<PathBuf> {
951    config_candidates_from(
952        std::env::var("ROVER_CONFIG").ok().as_deref(),
953        dirs::config_dir().as_deref(),
954    )
955}
956
957/// The canonical config path: where `rover config set` creates a new file, and
958/// where `rover config show` reports when no file exists yet. This is the first
959/// (highest-precedence) candidate, regardless of whether it exists on disk.
960pub fn default_config_path() -> PathBuf {
961    config_candidates()
962        .into_iter()
963        .next()
964        .expect("config_candidates always yields at least one path")
965}
966
967/// The first existing config file among the ordered candidates, or `None` when
968/// none exists (built-in defaults apply).
969///
970/// Shared by the runtime subcommands and by `config show` / `config set` so all
971/// of them agree on which file is "the active config" — closing the footgun
972/// where `config set` wrote a file the runtime never read.
973pub fn resolve_existing_config_path() -> Option<PathBuf> {
974    config_candidates().into_iter().find(|p| p.is_file())
975}
976
977/// Load the effective config, resolving the default path when `--config` is
978/// absent.
979///
980/// - `Some(path)`: an explicitly requested file. It MUST exist and parse — a
981///   typo in `--config` fails loudly rather than silently falling back to
982///   defaults.
983/// - `None`: search the default candidates (`ROVER_CONFIG`, then the platform
984///   config dir, then `./rover.toml`) and load the first that exists; if none
985///   exists, fall back to built-in defaults (the config file is optional).
986///
987/// Runtime subcommands call this instead of [`load`] so a saved config file is
988/// honored without requiring `--config` on every invocation.
989pub fn load_resolved(explicit: Option<&Path>) -> Result<Config, ConfigError> {
990    if let Some(path) = explicit {
991        tracing::debug!(path = %path.display(), "loading config from --config");
992        return load(Some(path));
993    }
994    match resolve_existing_config_path() {
995        Some(path) => {
996            tracing::debug!(path = %path.display(), "loading config from resolved default path");
997            load(Some(&path))
998        }
999        None => {
1000            tracing::debug!("no config file found at any default path; using built-in defaults");
1001            Ok(Config::default())
1002        }
1003    }
1004}
1005
1006/// Pure core shared with the public [`load_resolved`], with the resolved
1007/// "active config" path injected so both branches are unit-testable without
1008/// touching process env or the real config dir.
1009#[cfg(test)]
1010fn load_resolved_from(
1011    explicit: Option<&Path>,
1012    resolved_existing: Option<&Path>,
1013) -> Result<Config, ConfigError> {
1014    match (explicit, resolved_existing) {
1015        (Some(path), _) => load(Some(path)),
1016        (None, Some(path)) => load(Some(path)),
1017        (None, None) => Ok(Config::default()),
1018    }
1019}
1020
1021fn validate(cfg: &mut Config) -> Result<(), String> {
1022    if cfg.fetch.timeout_secs == 0 {
1023        return Err("fetch.timeout_secs must be > 0".to_string());
1024    }
1025    if cfg.cache.min_ttl > cfg.cache.default_ttl {
1026        return Err(format!(
1027            "cache.min_ttl ({:?}) must be <= cache.default_ttl ({:?})",
1028            cfg.cache.min_ttl, cfg.cache.default_ttl
1029        ));
1030    }
1031    if cfg.cache.default_ttl > cfg.cache.max_ttl {
1032        return Err(format!(
1033            "cache.default_ttl ({:?}) must be <= cache.max_ttl ({:?})",
1034            cfg.cache.default_ttl, cfg.cache.max_ttl
1035        ));
1036    }
1037    for d in &mut cfg.cache.override_no_store_domains {
1038        d.make_ascii_lowercase();
1039    }
1040    if cfg.mcp.heartbeat_interval.is_zero() {
1041        return Err("mcp.heartbeat_interval must be > 0".to_string());
1042    }
1043    if cfg.mcp.reap_threshold.is_zero() {
1044        return Err("mcp.reap_threshold must be > 0".to_string());
1045    }
1046
1047    // RateLimitConfig
1048    if cfg.rate_limit.requests_per_minute_per_domain == 0 {
1049        return Err("rate_limit.requests_per_minute_per_domain must be > 0".to_string());
1050    }
1051    if cfg.rate_limit.requests_per_minute_per_domain > 6000 {
1052        return Err(format!(
1053            "rate_limit.requests_per_minute_per_domain ({}) exceeds sanity cap 6000 (100 req/s)",
1054            cfg.rate_limit.requests_per_minute_per_domain
1055        ));
1056    }
1057    if cfg.rate_limit.per_domain_concurrency == 0 {
1058        return Err("rate_limit.per_domain_concurrency must be > 0".to_string());
1059    }
1060    if cfg.rate_limit.global_concurrency == 0 {
1061        return Err("rate_limit.global_concurrency must be > 0".to_string());
1062    }
1063    if cfg.rate_limit.max_retries > 10 {
1064        return Err(format!(
1065            "rate_limit.max_retries ({}) exceeds sanity cap 10",
1066            cfg.rate_limit.max_retries
1067        ));
1068    }
1069    if cfg.rate_limit.initial_backoff > cfg.rate_limit.max_backoff {
1070        return Err(format!(
1071            "rate_limit.initial_backoff ({:?}) must be <= max_backoff ({:?})",
1072            cfg.rate_limit.initial_backoff, cfg.rate_limit.max_backoff
1073        ));
1074    }
1075    if cfg.rate_limit.retry_after_ceiling.is_zero() {
1076        return Err("rate_limit.retry_after_ceiling must be > 0".to_string());
1077    }
1078
1079    // RobotsConfig
1080    for d in &mut cfg.robots.ignore_domains {
1081        d.make_ascii_lowercase();
1082    }
1083    if cfg.robots.failure_ttl > cfg.robots.default_ttl {
1084        return Err(format!(
1085            "robots.failure_ttl ({:?}) must be <= robots.default_ttl ({:?})",
1086            cfg.robots.failure_ttl, cfg.robots.default_ttl
1087        ));
1088    }
1089
1090    Ok(())
1091}
1092
1093#[cfg(test)]
1094mod tests {
1095    use super::*;
1096    use std::io::Write;
1097
1098    #[test]
1099    fn apply_overrides_clamps_concurrency_minimum() {
1100        let mut cfg = Config::default();
1101        cfg.apply_overrides(None, Some(0), Some(0), None, false);
1102        assert_eq!(cfg.rate_limit.per_domain_concurrency, 1);
1103        assert_eq!(cfg.rate_limit.global_concurrency, 1);
1104    }
1105
1106    #[test]
1107    fn apply_overrides_leaves_unset_fields_untouched() {
1108        let mut cfg = Config::default();
1109        let baseline_rpm = cfg.rate_limit.requests_per_minute_per_domain;
1110        let baseline_retries = cfg.rate_limit.max_retries;
1111        let baseline_respect = cfg.robots.respect;
1112        cfg.apply_overrides(None, None, None, None, false);
1113        assert_eq!(cfg.rate_limit.requests_per_minute_per_domain, baseline_rpm);
1114        assert_eq!(cfg.rate_limit.max_retries, baseline_retries);
1115        assert_eq!(cfg.robots.respect, baseline_respect);
1116    }
1117
1118    #[test]
1119    fn apply_overrides_disables_robots_when_requested() {
1120        let mut cfg = Config::default();
1121        // Start enabled so the assertion proves the override flips it, not just
1122        // that it matches the (now off-by-default) baseline.
1123        cfg.robots.respect = true;
1124        cfg.apply_overrides(None, None, None, None, true);
1125        assert!(!cfg.robots.respect);
1126    }
1127
1128    #[test]
1129    fn apply_overrides_sets_explicit_values() {
1130        let mut cfg = Config::default();
1131        cfg.apply_overrides(Some(30), Some(4), Some(16), Some(5), false);
1132        assert_eq!(cfg.rate_limit.requests_per_minute_per_domain, 30);
1133        assert_eq!(cfg.rate_limit.per_domain_concurrency, 4);
1134        assert_eq!(cfg.rate_limit.global_concurrency, 16);
1135        assert_eq!(cfg.rate_limit.max_retries, 5);
1136    }
1137
1138    #[test]
1139    fn default_config_has_sensible_values() {
1140        let cfg = Config::default();
1141        assert!(cfg.fetch.user_agent.starts_with("Rover/"));
1142        assert_eq!(cfg.fetch.timeout_secs, 15);
1143
1144        // Cache defaults per PRD §12 (default_ttl tightened to 15m).
1145        assert_eq!(cfg.cache.default_ttl, Duration::from_secs(15 * 60));
1146        assert_eq!(cfg.cache.min_ttl, Duration::from_secs(300));
1147        assert_eq!(cfg.cache.max_ttl, Duration::from_secs(7 * 86400));
1148        assert!(!cfg.cache.override_no_store);
1149        assert!(cfg.cache.override_no_store_domains.is_empty());
1150        assert!(!cfg.cache.store_raw_html);
1151    }
1152
1153    #[test]
1154    fn load_with_no_path_returns_default() {
1155        let cfg = load(None).unwrap();
1156        assert_eq!(cfg.fetch.timeout_secs, 15);
1157    }
1158
1159    #[test]
1160    fn load_from_file_overrides_defaults() {
1161        let mut file = tempfile::NamedTempFile::new().unwrap();
1162        writeln!(
1163            file,
1164            r#"
1165[fetch]
1166user_agent = "test-ua"
1167timeout_secs = 5
1168"#
1169        )
1170        .unwrap();
1171
1172        let cfg = load(Some(file.path())).unwrap();
1173        assert_eq!(cfg.fetch.user_agent, "test-ua");
1174        assert_eq!(cfg.fetch.timeout_secs, 5);
1175    }
1176
1177    #[test]
1178    fn load_missing_file_errors() {
1179        let result = load(Some(Path::new("/no/such/path/__rover_test__.toml")));
1180        assert!(matches!(result, Err(ConfigError::Read { .. })));
1181    }
1182
1183    #[test]
1184    fn load_malformed_toml_errors() {
1185        let mut file = tempfile::NamedTempFile::new().unwrap();
1186        writeln!(file, "not = valid = toml").unwrap();
1187        let result = load(Some(file.path()));
1188        assert!(matches!(result, Err(ConfigError::Parse { .. })));
1189    }
1190
1191    #[test]
1192    fn load_unknown_field_errors() {
1193        let mut file = tempfile::NamedTempFile::new().unwrap();
1194        writeln!(
1195            file,
1196            r#"
1197[fetch]
1198unknown_field = "x"
1199"#
1200        )
1201        .unwrap();
1202        let result = load(Some(file.path()));
1203        assert!(matches!(result, Err(ConfigError::Parse { .. })));
1204    }
1205
1206    #[test]
1207    fn load_unknown_field_in_cache_errors() {
1208        let mut file = tempfile::NamedTempFile::new().unwrap();
1209        writeln!(
1210            file,
1211            r#"
1212[cache]
1213unknown_field = "x"
1214"#
1215        )
1216        .unwrap();
1217        let result = load(Some(file.path()));
1218        assert!(matches!(result, Err(ConfigError::Parse { .. })));
1219    }
1220
1221    #[test]
1222    fn load_rejects_zero_timeout() {
1223        let mut file = tempfile::NamedTempFile::new().unwrap();
1224        writeln!(
1225            file,
1226            r#"
1227[fetch]
1228timeout_secs = 0
1229"#
1230        )
1231        .unwrap();
1232        let result = load(Some(file.path()));
1233        assert!(matches!(result, Err(ConfigError::Invalid { .. })));
1234    }
1235
1236    #[test]
1237    fn load_cache_overrides() {
1238        let mut file = tempfile::NamedTempFile::new().unwrap();
1239        writeln!(
1240            file,
1241            r#"
1242[cache]
1243default_ttl = "30m"
1244min_ttl = "1m"
1245max_ttl = "1d"
1246override_no_store = true
1247override_no_store_domains = ["docs.example.com"]
1248store_raw_html = true
1249"#
1250        )
1251        .unwrap();
1252
1253        let cfg = load(Some(file.path())).unwrap();
1254        assert_eq!(cfg.cache.default_ttl, Duration::from_secs(30 * 60));
1255        assert_eq!(cfg.cache.min_ttl, Duration::from_secs(60));
1256        assert_eq!(cfg.cache.max_ttl, Duration::from_secs(86400));
1257        assert!(cfg.cache.override_no_store);
1258        assert_eq!(
1259            cfg.cache.override_no_store_domains,
1260            vec!["docs.example.com".to_string()]
1261        );
1262        assert!(cfg.cache.store_raw_html);
1263    }
1264
1265    #[test]
1266    fn load_rejects_min_greater_than_default() {
1267        let mut file = tempfile::NamedTempFile::new().unwrap();
1268        writeln!(
1269            file,
1270            r#"
1271[cache]
1272default_ttl = "1m"
1273min_ttl = "10m"
1274"#
1275        )
1276        .unwrap();
1277        let result = load(Some(file.path()));
1278        assert!(matches!(result, Err(ConfigError::Invalid { .. })));
1279    }
1280
1281    #[test]
1282    fn load_rejects_default_greater_than_max() {
1283        let mut file = tempfile::NamedTempFile::new().unwrap();
1284        writeln!(
1285            file,
1286            r#"
1287[cache]
1288default_ttl = "10d"
1289max_ttl = "1d"
1290"#
1291        )
1292        .unwrap();
1293        let result = load(Some(file.path()));
1294        assert!(matches!(result, Err(ConfigError::Invalid { .. })));
1295    }
1296
1297    #[test]
1298    fn override_no_store_domains_normalized_to_lowercase() {
1299        let mut file = tempfile::NamedTempFile::new().unwrap();
1300        writeln!(
1301            file,
1302            r#"
1303[cache]
1304override_no_store_domains = ["DOCS.example.COM", "CDN.foo.com"]
1305"#
1306        )
1307        .unwrap();
1308        let cfg = load(Some(file.path())).unwrap();
1309        assert_eq!(
1310            cfg.cache.override_no_store_domains,
1311            vec!["docs.example.com".to_string(), "cdn.foo.com".to_string()]
1312        );
1313    }
1314
1315    #[test]
1316    fn load_accepts_equal_ttls() {
1317        let mut file = tempfile::NamedTempFile::new().unwrap();
1318        writeln!(
1319            file,
1320            r#"
1321[cache]
1322default_ttl = "1h"
1323min_ttl = "1h"
1324max_ttl = "1h"
1325"#
1326        )
1327        .unwrap();
1328        let cfg = load(Some(file.path())).unwrap();
1329        assert_eq!(cfg.cache.default_ttl, Duration::from_secs(3600));
1330    }
1331
1332    #[test]
1333    fn default_tokenizer_is_o200k() {
1334        let cfg = Config::default();
1335        assert_eq!(cfg.tokenizer.default, crate::tokenizer::Tokenizer::O200k);
1336    }
1337
1338    #[test]
1339    fn default_mcp_intervals() {
1340        let cfg = Config::default();
1341        assert_eq!(cfg.mcp.heartbeat_interval, Duration::from_secs(5));
1342        assert_eq!(cfg.mcp.reap_threshold, Duration::from_secs(60));
1343    }
1344
1345    #[test]
1346    fn load_tokenizer_override() {
1347        let mut file = tempfile::NamedTempFile::new().unwrap();
1348        writeln!(
1349            file,
1350            r#"
1351[tokenizer]
1352default = "claude"
1353"#
1354        )
1355        .unwrap();
1356        let cfg = load(Some(file.path())).unwrap();
1357        assert_eq!(cfg.tokenizer.default, crate::tokenizer::Tokenizer::Claude);
1358    }
1359
1360    #[test]
1361    fn load_unknown_tokenizer_errors() {
1362        let mut file = tempfile::NamedTempFile::new().unwrap();
1363        writeln!(
1364            file,
1365            r#"
1366[tokenizer]
1367default = "gpt-5"
1368"#
1369        )
1370        .unwrap();
1371        let result = load(Some(file.path()));
1372        assert!(matches!(result, Err(ConfigError::Parse { .. })));
1373    }
1374
1375    #[test]
1376    fn load_mcp_overrides() {
1377        let mut file = tempfile::NamedTempFile::new().unwrap();
1378        writeln!(
1379            file,
1380            r#"
1381[mcp]
1382heartbeat_interval = "10s"
1383reap_threshold = "2m"
1384"#
1385        )
1386        .unwrap();
1387        let cfg = load(Some(file.path())).unwrap();
1388        assert_eq!(cfg.mcp.heartbeat_interval, Duration::from_secs(10));
1389        assert_eq!(cfg.mcp.reap_threshold, Duration::from_secs(120));
1390    }
1391
1392    #[test]
1393    fn load_output_dir_override() {
1394        let mut file = tempfile::NamedTempFile::new().unwrap();
1395        writeln!(
1396            file,
1397            r#"
1398[output]
1399dir = "/tmp/rover-out"
1400"#
1401        )
1402        .unwrap();
1403        let cfg = load(Some(file.path())).unwrap();
1404        assert_eq!(
1405            cfg.output.dir.as_deref().unwrap().to_str(),
1406            Some("/tmp/rover-out")
1407        );
1408    }
1409
1410    #[test]
1411    fn load_rejects_zero_heartbeat() {
1412        let mut file = tempfile::NamedTempFile::new().unwrap();
1413        writeln!(
1414            file,
1415            r#"
1416[mcp]
1417heartbeat_interval = "0s"
1418"#
1419        )
1420        .unwrap();
1421        let result = load(Some(file.path()));
1422        assert!(matches!(result, Err(ConfigError::Invalid { .. })));
1423    }
1424
1425    #[test]
1426    fn default_rate_limit_matches_prd() {
1427        let cfg = Config::default();
1428        assert_eq!(cfg.rate_limit.requests_per_minute_per_domain, 60);
1429        assert_eq!(cfg.rate_limit.per_domain_concurrency, 2);
1430        assert_eq!(cfg.rate_limit.global_concurrency, 8);
1431        assert_eq!(cfg.rate_limit.max_retries, 3);
1432    }
1433
1434    #[test]
1435    fn default_robots_matches_prd() {
1436        let cfg = Config::default();
1437        // Rover is an agent browser, not a crawler: robots enforcement is off
1438        // by default (opt in with `robots.respect = true`).
1439        assert!(!cfg.robots.respect);
1440        assert!(cfg.robots.ignore_domains.is_empty());
1441        assert_eq!(cfg.robots.default_ttl, Duration::from_secs(24 * 3600));
1442        assert_eq!(cfg.robots.failure_ttl, Duration::from_secs(300));
1443    }
1444
1445    #[test]
1446    fn load_rate_limit_overrides() {
1447        let mut file = tempfile::NamedTempFile::new().unwrap();
1448        writeln!(
1449            file,
1450            r#"
1451[rate_limit]
1452requests_per_minute_per_domain = 120
1453per_domain_concurrency = 4
1454global_concurrency = 16
1455max_retries = 5
1456initial_backoff = "250ms"
1457max_backoff = "60s"
1458retry_after_ceiling = "10m"
1459jitter_seed = 42
1460"#
1461        )
1462        .unwrap();
1463        let cfg = load(Some(file.path())).unwrap();
1464        assert_eq!(cfg.rate_limit.requests_per_minute_per_domain, 120);
1465        assert_eq!(cfg.rate_limit.max_retries, 5);
1466        assert_eq!(cfg.rate_limit.jitter_seed, Some(42));
1467    }
1468
1469    #[test]
1470    fn load_robots_overrides() {
1471        let mut file = tempfile::NamedTempFile::new().unwrap();
1472        writeln!(
1473            file,
1474            r#"
1475[robots]
1476respect = false
1477ignore_domains = ["FOO.example.com", "bar.example.org"]
1478default_ttl = "12h"
1479failure_ttl = "2m"
1480"#
1481        )
1482        .unwrap();
1483        let cfg = load(Some(file.path())).unwrap();
1484        assert!(!cfg.robots.respect);
1485        assert_eq!(
1486            cfg.robots.ignore_domains,
1487            vec!["foo.example.com".to_string(), "bar.example.org".to_string()]
1488        );
1489        assert_eq!(cfg.robots.default_ttl, Duration::from_secs(12 * 3600));
1490        assert_eq!(cfg.robots.failure_ttl, Duration::from_secs(120));
1491    }
1492
1493    #[test]
1494    fn load_rejects_zero_rpm() {
1495        let mut file = tempfile::NamedTempFile::new().unwrap();
1496        writeln!(
1497            file,
1498            r#"
1499[rate_limit]
1500requests_per_minute_per_domain = 0
1501"#
1502        )
1503        .unwrap();
1504        assert!(matches!(
1505            load(Some(file.path())),
1506            Err(ConfigError::Invalid { .. })
1507        ));
1508    }
1509
1510    #[test]
1511    fn load_rejects_rpm_above_sanity_cap() {
1512        let mut file = tempfile::NamedTempFile::new().unwrap();
1513        writeln!(
1514            file,
1515            r#"
1516[rate_limit]
1517requests_per_minute_per_domain = 100000
1518"#
1519        )
1520        .unwrap();
1521        assert!(matches!(
1522            load(Some(file.path())),
1523            Err(ConfigError::Invalid { .. })
1524        ));
1525    }
1526
1527    #[test]
1528    fn load_rejects_max_retries_above_10() {
1529        let mut file = tempfile::NamedTempFile::new().unwrap();
1530        writeln!(
1531            file,
1532            r#"
1533[rate_limit]
1534max_retries = 11
1535"#
1536        )
1537        .unwrap();
1538        assert!(matches!(
1539            load(Some(file.path())),
1540            Err(ConfigError::Invalid { .. })
1541        ));
1542    }
1543
1544    #[test]
1545    fn load_rejects_backoff_inversion() {
1546        let mut file = tempfile::NamedTempFile::new().unwrap();
1547        writeln!(
1548            file,
1549            r#"
1550[rate_limit]
1551initial_backoff = "10s"
1552max_backoff = "5s"
1553"#
1554        )
1555        .unwrap();
1556        assert!(matches!(
1557            load(Some(file.path())),
1558            Err(ConfigError::Invalid { .. })
1559        ));
1560    }
1561
1562    #[test]
1563    fn load_rejects_failure_ttl_above_default_ttl() {
1564        let mut file = tempfile::NamedTempFile::new().unwrap();
1565        writeln!(
1566            file,
1567            r#"
1568[robots]
1569default_ttl = "1m"
1570failure_ttl = "10m"
1571"#
1572        )
1573        .unwrap();
1574        assert!(matches!(
1575            load(Some(file.path())),
1576            Err(ConfigError::Invalid { .. })
1577        ));
1578    }
1579
1580    #[test]
1581    fn summarization_section_parses_with_defaults() {
1582        let toml = r#"
1583[summarization]
1584"#;
1585        let cfg: Config = toml::from_str(toml).unwrap();
1586        assert_eq!(cfg.summarization.default_backend, "default");
1587        assert_eq!(cfg.summarization.default_mode, "abstractive");
1588        assert_eq!(cfg.summarization.default_style, "prose");
1589        assert!(cfg.summarization.fallback_to_extractive);
1590        assert_eq!(cfg.summarization.tables.target_tokens, 150);
1591        assert!(cfg.summarization.tables.focus.contains("Describe"));
1592    }
1593
1594    #[test]
1595    fn summarization_tables_block_overrides_defaults() {
1596        let toml = r#"
1597[summarization.tables]
1598target_tokens = 250
1599focus = "Custom table focus prompt."
1600"#;
1601        let cfg: Config = toml::from_str(toml).unwrap();
1602        assert_eq!(cfg.summarization.tables.target_tokens, 250);
1603        assert_eq!(cfg.summarization.tables.focus, "Custom table focus prompt.");
1604        // Sibling defaults remain in force.
1605        assert_eq!(cfg.summarization.default_backend, "default");
1606    }
1607
1608    #[test]
1609    fn backends_section_parses_extractive_block() {
1610        let toml = r#"
1611[backends.default]
1612kind = "extractive"
1613"#;
1614        let cfg: Config = toml::from_str(toml).unwrap();
1615        assert_eq!(cfg.backends.len(), 1);
1616        let b = cfg.backends.get("default").unwrap();
1617        assert_eq!(b.kind, "extractive");
1618        assert!(b.provider.is_none());
1619    }
1620
1621    #[test]
1622    fn backends_section_parses_cloud_block_with_all_fields() {
1623        let toml = r#"
1624[backends.lm_studio]
1625kind = "cloud"
1626provider = "openai_compat"
1627base_url = "http://localhost:1234/v1"
1628model = "qwen3.5-0.8b"
1629api_key_env = "LM_KEY"
1630"#;
1631        let cfg: Config = toml::from_str(toml).unwrap();
1632        let b = cfg.backends.get("lm_studio").unwrap();
1633        assert_eq!(b.kind, "cloud");
1634        assert_eq!(b.provider.as_deref(), Some("openai_compat"));
1635        assert_eq!(b.base_url.as_deref(), Some("http://localhost:1234/v1"));
1636        assert_eq!(b.model.as_deref(), Some("qwen3.5-0.8b"));
1637        assert_eq!(b.api_key_env.as_deref(), Some("LM_KEY"));
1638    }
1639
1640    #[test]
1641    fn missing_summarization_section_yields_defaults() {
1642        let cfg: Config = toml::from_str("").unwrap();
1643        assert_eq!(cfg.summarization.default_backend, "default");
1644        assert!(cfg.backends.is_empty());
1645    }
1646
1647    #[test]
1648    fn ssrf_section_parses_with_defaults() {
1649        let toml = r#"
1650[ssrf]
1651"#;
1652        let cfg: Config = toml::from_str(toml).unwrap();
1653        assert_eq!(cfg.ssrf.level, "strict");
1654        assert_eq!(cfg.ssrf.project_root, std::path::PathBuf::from("."));
1655    }
1656
1657    #[test]
1658    fn ssrf_section_accepts_each_level() {
1659        for level in &["strict", "loopback", "project", "lan", "none"] {
1660            let toml = format!("[ssrf]\nlevel = \"{level}\"\n");
1661            let cfg: Config = toml::from_str(&toml).unwrap();
1662            assert_eq!(cfg.ssrf.level, *level);
1663        }
1664    }
1665
1666    #[test]
1667    fn ssrf_section_rejects_unknown_field() {
1668        let toml = r#"
1669[ssrf]
1670level = "strict"
1671bogus = 1
1672"#;
1673        let r: Result<Config, _> = toml::from_str(toml);
1674        assert!(r.is_err(), "expected deny_unknown_fields rejection");
1675    }
1676
1677    #[test]
1678    fn missing_ssrf_section_yields_defaults() {
1679        let cfg: Config = toml::from_str("").unwrap();
1680        assert_eq!(cfg.ssrf.level, "strict");
1681    }
1682
1683    #[test]
1684    fn debug_section_parses_with_defaults() {
1685        let cfg: Config = toml::from_str("[debug]\n").unwrap();
1686        assert_eq!(cfg.debug.har_path, "");
1687        assert_eq!(cfg.debug.har_body_cap, 64 * 1024);
1688        assert_eq!(cfg.debug.log_level, "info");
1689    }
1690
1691    #[test]
1692    fn debug_section_har_body_cap_accepts_humansize() {
1693        let cfg: Config = toml::from_str(
1694            r#"[debug]
1695har_body_cap = "1MiB"
1696"#,
1697        )
1698        .unwrap();
1699        assert_eq!(cfg.debug.har_body_cap, 1024 * 1024);
1700    }
1701
1702    #[test]
1703    fn debug_section_har_body_cap_accepts_integer_bytes() {
1704        let cfg: Config = toml::from_str(
1705            r#"[debug]
1706har_body_cap = 8192
1707"#,
1708        )
1709        .unwrap();
1710        assert_eq!(cfg.debug.har_body_cap, 8192);
1711    }
1712
1713    #[test]
1714    fn debug_section_rejects_unknown_field() {
1715        let r: Result<Config, _> = toml::from_str(
1716            r#"[debug]
1717har_path = ""
1718bogus = 1
1719"#,
1720        );
1721        assert!(r.is_err());
1722    }
1723
1724    #[test]
1725    fn image_captions_defaults_match_spec() {
1726        let c = ImageCaptionsConfig::default();
1727        assert_eq!(c.max_tokens, 50);
1728        assert_eq!(c.max_per_page, 10);
1729        assert_eq!(c.min_width, 200);
1730        assert_eq!(c.min_height, 200);
1731        assert_eq!(c.max_bytes, 10 * 1024 * 1024);
1732        assert_eq!(c.max_concurrent, 2);
1733    }
1734
1735    #[test]
1736    fn human_bytes_parses_common_forms() {
1737        assert_eq!(parse_human_bytes("1024").unwrap(), 1024);
1738        assert_eq!(parse_human_bytes("10MiB").unwrap(), 10 * 1024 * 1024);
1739        assert_eq!(parse_human_bytes("10MB").unwrap(), 10_000_000);
1740        assert_eq!(
1741            parse_human_bytes("1.5GiB").unwrap(),
1742            (1.5_f64 * 1024.0 * 1024.0 * 1024.0) as u64
1743        );
1744        assert!(parse_human_bytes("bogus").is_err());
1745    }
1746
1747    #[test]
1748    fn image_captions_deserializes_from_toml() {
1749        let toml_str = r#"
1750[image_captions]
1751default = "openai"
1752max_per_page = 5
1753min_width = 100
1754min_height = 100
1755max_bytes = "1MiB"
1756"#;
1757        let cfg: Config = toml::from_str(toml_str).unwrap();
1758        assert_eq!(cfg.image_captions.default.as_deref(), Some("openai"));
1759        assert_eq!(cfg.image_captions.max_per_page, 5);
1760        assert_eq!(cfg.image_captions.max_bytes, 1024 * 1024);
1761        assert_eq!(cfg.image_captions.max_tokens, 50);
1762    }
1763
1764    #[test]
1765    fn captioners_block_round_trips() {
1766        let toml_str = r#"
1767[captioners.openai]
1768kind = "cloud"
1769provider = "openai"
1770model = "gpt-4o-mini"
1771api_key_env = "OPENAI_API_KEY"
1772
1773[captioners.local]
1774kind = "local"
1775model = "HuggingFaceTB/SmolVLM-256M-Instruct"
1776"#;
1777        let cfg: Config = toml::from_str(toml_str).unwrap();
1778        assert_eq!(cfg.captioners.len(), 2);
1779        assert_eq!(
1780            cfg.captioners.get("openai").unwrap().provider.as_deref(),
1781            Some("openai")
1782        );
1783        assert_eq!(cfg.captioners.get("local").unwrap().kind, "local");
1784    }
1785
1786    #[test]
1787    fn headless_m9_keys_default_correctly() {
1788        let h = HeadlessConfig::default();
1789        assert_eq!(h.max_concurrent, 4);
1790        assert!(h.chrome_executable.is_empty());
1791        assert_eq!(h.launch_delay_secs, 2);
1792        assert_eq!(h.launch_delay(), std::time::Duration::from_secs(2));
1793    }
1794
1795    #[test]
1796    fn headless_launch_delay_parses_and_disables() {
1797        let cfg: Config = toml::from_str("[headless]\nlaunch_delay_secs = 0\n").unwrap();
1798        assert_eq!(cfg.headless.launch_delay_secs, 0);
1799        assert!(cfg.headless.launch_delay().is_zero());
1800        let cfg: Config = toml::from_str("[headless]\nlaunch_delay_secs = 5\n").unwrap();
1801        assert_eq!(
1802            cfg.headless.launch_delay(),
1803            std::time::Duration::from_secs(5)
1804        );
1805    }
1806
1807    #[test]
1808    fn prompt_injection_defaults_when_absent() {
1809        let cfg: Config = toml::from_str("").unwrap();
1810        assert_eq!(cfg.prompt_injection.level, "moderate");
1811        assert_eq!(cfg.prompt_injection.model, "disabled");
1812        assert!((cfg.prompt_injection.model_threshold - 0.9).abs() < f64::EPSILON);
1813        assert!(cfg.prompt_injection.allowlist.wrap.is_empty());
1814        assert!(cfg.prompt_injection.allowlist.patterns.is_empty());
1815        assert!(cfg.prompt_injection.allowlist.model.is_empty());
1816        assert!(!cfg.prompt_injection.agent_overrides.wrap);
1817        assert!(!cfg.prompt_injection.agent_overrides.patterns);
1818        assert!(!cfg.prompt_injection.agent_overrides.model);
1819        assert!(!cfg.prompt_injection.agent_overrides.level);
1820    }
1821
1822    #[test]
1823    fn prompt_injection_parses_full_block() {
1824        let toml = r#"
1825[prompt_injection]
1826level = "strict"
1827model = "deberta-base"
1828model_threshold = 0.75
1829
1830[prompt_injection.allowlist]
1831wrap = ["https://*.internal.example.com/*"]
1832patterns = ["*"]
1833model = []
1834
1835[prompt_injection.agent_overrides]
1836wrap = true
1837patterns = false
1838model = true
1839level = true
1840"#;
1841        let cfg: Config = toml::from_str(toml).unwrap();
1842        assert_eq!(cfg.prompt_injection.level, "strict");
1843        assert_eq!(cfg.prompt_injection.model, "deberta-base");
1844        assert!((cfg.prompt_injection.model_threshold - 0.75).abs() < f64::EPSILON);
1845        assert_eq!(
1846            cfg.prompt_injection.allowlist.wrap,
1847            vec!["https://*.internal.example.com/*".to_string()]
1848        );
1849        assert_eq!(
1850            cfg.prompt_injection.allowlist.patterns,
1851            vec!["*".to_string()]
1852        );
1853        assert!(cfg.prompt_injection.agent_overrides.wrap);
1854        assert!(!cfg.prompt_injection.agent_overrides.patterns);
1855        assert!(cfg.prompt_injection.agent_overrides.model);
1856        assert!(cfg.prompt_injection.agent_overrides.level);
1857    }
1858
1859    #[test]
1860    fn prompt_injection_rejects_unknown_field() {
1861        let toml = "[prompt_injection]\nbogus = 1\n";
1862        let r: Result<Config, _> = toml::from_str(toml);
1863        assert!(r.is_err(), "expected deny_unknown_fields rejection");
1864    }
1865
1866    #[test]
1867    fn config_candidates_prefers_rover_config_env_as_sole_candidate() {
1868        let c = config_candidates_from(Some("/custom/x.toml"), Some(Path::new("/cfg")));
1869        assert_eq!(c, vec![std::path::PathBuf::from("/custom/x.toml")]);
1870    }
1871
1872    #[test]
1873    fn config_candidates_searches_platform_then_cwd() {
1874        let c = config_candidates_from(None, Some(Path::new("/cfg")));
1875        assert_eq!(
1876            c,
1877            vec![
1878                std::path::PathBuf::from("/cfg/rover/rover.toml"),
1879                std::path::PathBuf::from("rover.toml"),
1880            ]
1881        );
1882    }
1883
1884    #[test]
1885    fn config_candidates_falls_back_to_cwd_rover_toml() {
1886        let c = config_candidates_from(None, None);
1887        assert_eq!(c, vec![std::path::PathBuf::from("rover.toml")]);
1888    }
1889
1890    #[test]
1891    fn resolve_existing_prefers_platform_over_cwd_candidate() {
1892        // Lay down <tmp>/rover/rover.toml and confirm it is the chosen file.
1893        let tmp = tempfile::tempdir().unwrap();
1894        let rover_dir = tmp.path().join("rover");
1895        std::fs::create_dir_all(&rover_dir).unwrap();
1896        let platform_file = rover_dir.join("rover.toml");
1897        std::fs::write(&platform_file, "[fetch]\ntimeout_secs = 3\n").unwrap();
1898
1899        let resolved = config_candidates_from(None, Some(tmp.path()))
1900            .into_iter()
1901            .find(|p| p.is_file());
1902        assert_eq!(resolved, Some(platform_file));
1903    }
1904
1905    #[test]
1906    fn resolve_existing_is_none_when_no_candidate_exists() {
1907        let tmp = tempfile::tempdir().unwrap();
1908        // tmp has no rover/rover.toml, and the crate root has no ./rover.toml.
1909        let resolved = config_candidates_from(None, Some(tmp.path()))
1910            .into_iter()
1911            .find(|p| p.is_file());
1912        assert_eq!(resolved, None);
1913    }
1914
1915    #[test]
1916    fn load_resolved_uses_explicit_path_when_present() {
1917        let mut file = tempfile::NamedTempFile::new().unwrap();
1918        writeln!(file, "[fetch]\ntimeout_secs = 7\n").unwrap();
1919        // A resolved default must be ignored when --config is supplied.
1920        let cfg = load_resolved_from(Some(file.path()), None).unwrap();
1921        assert_eq!(cfg.fetch.timeout_secs, 7);
1922    }
1923
1924    #[test]
1925    fn load_resolved_errors_when_explicit_path_missing() {
1926        // An explicit --config typo must fail loudly, NOT fall back to the
1927        // resolved default or to built-in defaults.
1928        let mut default_file = tempfile::NamedTempFile::new().unwrap();
1929        writeln!(default_file, "[fetch]\ntimeout_secs = 9\n").unwrap();
1930        let result = load_resolved_from(
1931            Some(Path::new("/no/such/__rover_explicit__.toml")),
1932            Some(default_file.path()),
1933        );
1934        assert!(matches!(result, Err(ConfigError::Read { .. })));
1935    }
1936
1937    #[test]
1938    fn load_resolved_loads_resolved_default_when_no_explicit() {
1939        let mut file = tempfile::NamedTempFile::new().unwrap();
1940        writeln!(file, "[fetch]\ntimeout_secs = 11\n").unwrap();
1941        let cfg = load_resolved_from(None, Some(file.path())).unwrap();
1942        assert_eq!(cfg.fetch.timeout_secs, 11);
1943    }
1944
1945    #[test]
1946    fn load_resolved_falls_back_to_defaults_when_nothing_resolves() {
1947        let cfg = load_resolved_from(None, None).unwrap();
1948        assert_eq!(cfg.fetch.timeout_secs, default_timeout_secs());
1949    }
1950}