Skip to main content

rover/cli/
fetch.rs

1//! `rover fetch <url>` command.
2//!
3//! As of M2, `rover fetch` runs through the cache-aware orchestrator
4//! (`fetcher::cached::fetch_with_cache`). The CLI opens (or creates) the
5//! Rover cache database, dispatches the fetch, then renders the resulting
6//! `Page` row to stdout as a frontmatter envelope.
7
8use anyhow::Context;
9use jiff::Timestamp;
10use std::path::Path;
11use url::Url;
12
13use crate::config;
14use crate::extractor::frontmatter::{PageMeta, render};
15use crate::extractor::pipeline::extract;
16use crate::fetcher::cached::{
17    CacheStatus, ExtractResult, FetchOptions, fetch_with_cache, sha256_hex,
18};
19use crate::fetcher::client::build_http_client;
20use crate::fetcher::ssrf::SsrfLevel;
21use crate::storage::Db;
22
/// Parsed arguments for the `rover fetch <url>` subcommand, consumed by [`run`].
///
/// Every `Option` field is a per-invocation override: `None` leaves the value
/// from the loaded config untouched.
#[derive(Debug, Clone)]
pub struct Args {
    /// The URL to fetch, as typed by the user. `run` parses it with
    /// `Url::parse` and fails if it is not a valid URL.
    pub url: String,

    /// Forwarded to the cache-aware fetcher as `FetchOptions::force_refresh`.
    pub force_refresh: bool,

    /// Forwarded both to the config overrides and to
    /// `FetchOptions::ignore_robots`.
    pub ignore_robots: bool,

    /// Override `[fetch] user_agent` for this request.
    pub user_agent: Option<String>,

    /// Override `[fetch] timeout_secs` (per-request timeout) for this request.
    /// `Some(0)` is rejected by `run`.
    pub timeout_secs: Option<u64>,

    /// Rate-limit override, passed to `Config::apply_overrides`.
    pub rate_limit_rpm: Option<u32>,

    /// Per-host concurrency override, passed to `Config::apply_overrides`.
    pub per_host_concurrency: Option<u32>,

    /// Global concurrency override, passed to `Config::apply_overrides`.
    pub global_concurrency: Option<u32>,

    /// Retry-count override, passed to `Config::apply_overrides`.
    pub max_retries: Option<u8>,

    /// Auto-summarize when the extracted markdown exceeds N tokens. Runs the
    /// configured `[summarization]` backend (the offline extractive backend
    /// by default) and replaces the body with a summary sized toward the
    /// budget (best-effort; may land a few tokens over). `Some(0)` is
    /// rejected by `run`.
    pub max_tokens: Option<usize>,

    /// JSON `SummarizeOpts` blob — same shape as the MCP `summarize` tool
    /// args minus the `url` field, e.g.
    /// `--summarize '{"mode":"abstractive","target_tokens":500}'`. The body is
    /// replaced with the summary. When this is given, the `--max-tokens`
    /// auto-summarize pass is skipped and the explicit summary is kept as-is.
    pub summarize: Option<String>,
}
51
52pub async fn run(args: Args, config_path: Option<&Path>) -> anyhow::Result<()> {
53    let mut cfg = config::load_resolved(config_path).context("loading config")?;
54    cfg.apply_overrides(
55        args.rate_limit_rpm,
56        args.per_host_concurrency,
57        args.global_concurrency,
58        args.max_retries,
59        args.ignore_robots,
60    );
61    // Per-call fetch-transport overrides. Applied to the loaded config before
62    // the HTTP client is built (line below) and before FetchOptions.user_agent
63    // is derived, so both the sent UA header and robots matching use them.
64    if let Some(ua) = args.user_agent {
65        cfg.fetch.user_agent = ua;
66    }
67    if let Some(t) = args.timeout_secs {
68        if t == 0 {
69            anyhow::bail!("--timeout-secs must be greater than 0");
70        }
71        cfg.fetch.timeout_secs = t;
72    }
73    let url = Url::parse(&args.url).context("parsing URL argument")?;
74    let level = SsrfLevel::parse(&cfg.ssrf.level)
75        .with_context(|| format!("invalid [ssrf] level `{}` in config", cfg.ssrf.level))?;
76    let ssrf_project_root = if level == SsrfLevel::Project {
77        let raw = &cfg.ssrf.project_root;
78        let resolved = std::fs::canonicalize(raw)
79            .with_context(|| format!("canonicalizing ssrf.project_root `{}`", raw.display()))?;
80        tracing::info!(
81            target: "rover::ssrf",
82            project_root = %resolved.display(),
83            "ssrf level=project; project_root resolved",
84        );
85        Some(resolved)
86    } else {
87        None
88    };
89
90    // Parse the optional --summarize JSON blob up front so the user gets a
91    // clean error before any network or storage I/O.
92    let summarize_opts: Option<crate::mcp::tools::fetch::InlineSummarizeArgs> =
93        match args.summarize.as_deref() {
94            Some(s) => Some(serde_json::from_str(s).context("parsing --summarize JSON")?),
95            None => None,
96        };
97    if matches!(args.max_tokens, Some(0)) {
98        anyhow::bail!("--max-tokens must be greater than 0");
99    }
100
101    let data_dir = crate::paths::data_dir();
102    std::fs::create_dir_all(&data_dir).context("creating data dir")?;
103    let db = Db::open(data_dir.join("rover.db"))
104        .await
105        .context("opening cache database")?;
106
107    let client = build_http_client(&cfg.fetch.user_agent, cfg.fetch.timeout());
108    let pacer = crate::fetcher::concurrency::Pacer::new(&cfg.rate_limit);
109
110    // Optional HAR recorder for one-shot CLI runs. We flush once at the end
111    // of this subcommand rather than running an interval task — a single
112    // `fetch` invocation produces at most a handful of round-trips.
113    let har_recorder: Option<std::sync::Arc<crate::fetcher::har::HarRecorder>> =
114        if !cfg.debug.har_path.is_empty() {
115            let path = std::path::PathBuf::from(&cfg.debug.har_path);
116            let r = crate::fetcher::har::HarRecorder::new(path, cfg.debug.har_body_cap)
117                .with_context(|| format!("opening har file at {}", cfg.debug.har_path))?;
118            Some(std::sync::Arc::new(r))
119        } else {
120            None
121        };
122
123    // M9 fix C1: honor the server-config `auto_detect_spa` flag from the CLI
124    // path too. The CLI doesn't yet expose a `--headless` flag, so the only
125    // way to opt in is via `[headless] auto_detect_spa = true` in the
126    // config. Construction is lazy — we only launch Chromium if Auto-mode
127    // ends up needing it (the cached fetcher checks SPA heuristics first).
128    let headless_mode = if cfg.headless.auto_detect_spa {
129        crate::fetcher::HeadlessMode::Auto
130    } else {
131        crate::fetcher::HeadlessMode::Off
132    };
133    // Lazily-wired headless handle: constructing it launches no browser. The
134    // cached fetcher launches Chromium on first use only when a render actually
135    // happens (SPA detected, or a bot-challenge needs bypassing). A plain
136    // reqwest fetch that never needs the browser launches nothing — and so
137    // emits none of chromiumoxide's process-teardown noise.
138    #[cfg(feature = "headless")]
139    let headless: Option<crate::fetcher::headless::HeadlessHandle> =
140        if !matches!(headless_mode, crate::fetcher::HeadlessMode::Off) {
141            Some(crate::fetcher::headless::HeadlessHandle::new(
142                cfg.headless.clone(),
143            ))
144        } else {
145            None
146        };
147
148    let result = fetch_with_cache(
149        &db,
150        &client,
151        &pacer,
152        &cfg.rate_limit,
153        &cfg.robots,
154        &url,
155        &cfg.cache,
156        FetchOptions {
157            force_refresh: args.force_refresh,
158            ssrf_level: level,
159            ssrf_project_root,
160            har_recorder: har_recorder.clone(),
161            ignore_robots: args.ignore_robots,
162            user_agent: cfg.fetch.user_agent.clone(),
163            #[cfg(feature = "headless")]
164            headless: headless.clone(),
165            headless_mode,
166            // The one-shot CLI has no background scheduler to process a
167            // queued `revalidate` task — so on an expired entry we must
168            // refresh inline. Otherwise the row would stay at its old
169            // `fetched_at` indefinitely (the task we enqueued would
170            // outlive the process).
171            synchronous_revalidation: true,
172        },
173        |body, base| {
174            let extracted =
175                extract(body, Some(base)).map_err(crate::fetcher::FetcherError::Extract)?;
176            let content_hash = format!("sha256:{}", sha256_hex(extracted.body_md.as_bytes()));
177            Ok(ExtractResult {
178                title: extracted.title,
179                body_md: extracted.body_md,
180                content_hash,
181                metadata: extracted.metadata,
182            })
183        },
184    )
185    .await;
186
187    // Tear the headless browser down (if it was ever launched) immediately —
188    // it is only used inside `fetch_with_cache`. Doing it here, before the `?`
189    // and before any later fallible step, guarantees a clean shutdown on every
190    // path (success or error) so chromiumoxide's handler task never outlives
191    // this one-shot invocation and never logs a teardown warning. A no-op when
192    // the browser was never launched.
193    #[cfg(feature = "headless")]
194    if let Some(h) = headless {
195        h.shutdown().await;
196    }
197
198    let result = result.context("fetching URL")?;
199
200    if matches!(result.cache_status, CacheStatus::Stale { .. }) {
201        tracing::warn!(
202            target: "rover::cli::fetch",
203            url = url.as_str(),
204            "serving stale cache entry (network unavailable)"
205        );
206    }
207
208    let canonical =
209        Url::parse(&result.page.canonical_url).context("parsing canonical URL from cache row")?;
210
211    // Choose the tokenizer for frontmatter `estimated_tokens` from config.
212    let family = cfg.tokenizer.default;
213    crate::tokenizer::ensure_loaded(family)
214        .await
215        .context("loading default tokenizer")?;
216    let original_tokens = crate::tokenizer::count(&result.page.extracted_md, family)
217        .context("counting tokens for frontmatter")?;
218
219    // Recover the metadata persisted in the cache row (M2 `metadata_json`).
220    // If deserialization fails (legacy rows, corrupt JSON), fall back to empty
221    // — the frontmatter still renders, just without M4 fields.
222    let metadata: crate::extractor::ExtractedMetadata = result
223        .page
224        .metadata_json
225        .as_deref()
226        .and_then(|s| serde_json::from_str(s).ok())
227        .unwrap_or_default();
228    // Extraction quality is scored on the extracted body, before any
229    // summarization — it measures extraction fidelity, not the summary.
230    let quality = crate::extractor::quality::score(
231        &result.page.extracted_md,
232        result.page.extracted_md.chars().count().max(1),
233        !metadata.is_empty(),
234        result.page.title.is_some(),
235    );
236
237    // Optional summarization. Mirrors the MCP `fetch` path: an explicit
238    // `--summarize` blob runs first, then `--max-tokens` auto-summarizes when
239    // the body is over budget. Built lazily so a plain fetch pays nothing.
240    let (body_md, tokens, summarized) = if args.max_tokens.is_some() || summarize_opts.is_some() {
241        let registry = std::sync::Arc::new(
242            crate::summarizer::registry::build(&cfg, family)
243                .context("building summarizer registry")?,
244        );
245        // Harden content fed to model backends (always-on, matching the MCP
246        // path). Prompt-free extractive backends ignore it; cloud/local model
247        // backends get cleaned, nonce-wrapped content.
248        let guard = std::sync::Arc::new(
249            crate::guard::Guard::from_config(&cfg.prompt_injection)
250                .context("building prompt-injection guard")?,
251        );
252        let summarizer = crate::summarizer::SummarizerService::new(
253            db.clone(),
254            registry,
255            cfg.summarization.fallback_to_extractive,
256        )
257        .with_guard(guard);
258        let defaults = crate::summarizer::DefaultsHint::from_config(&cfg.summarization);
259        maybe_summarize(
260            &summarizer,
261            &defaults,
262            family,
263            result.page.extracted_md.clone(),
264            original_tokens,
265            args.max_tokens,
266            summarize_opts,
267        )
268        .await?
269    } else {
270        (result.page.extracted_md.clone(), original_tokens, false)
271    };
272
273    let meta = PageMeta {
274        url: &url,
275        canonical_url: &canonical,
276        title: result.page.title.as_deref(),
277        fetched_at: Timestamp::now(),
278        body: &body_md,
279        tokens,
280        tokenizer_name: family.as_str(),
281        description: metadata.description.as_deref(),
282        author: metadata.author.as_deref(),
283        published: metadata.published.as_deref(),
284        modified: metadata.modified.as_deref(),
285        image: metadata.image.as_deref(),
286        og_type: metadata.og_type.as_deref(),
287        language: metadata.language.as_deref(),
288        schema_types: &metadata.schema_types,
289        extraction_quality: quality,
290        summarized,
291        headless_render: result.page.render_reason.as_deref(),
292        tables_transformed: &[],
293        images_seen: 0,
294        images_downloaded: 0,
295        images_failed: 0,
296        images_processed: vec![],
297        prompt_injection: None,
298    };
299
300    let envelope = render(&meta);
301    print!("{envelope}");
302
303    if let Some(r) = &har_recorder
304        && let Err(e) = r.flush().await
305    {
306        tracing::warn!(target: "rover::fetcher", error = ?e, "har flush failed");
307    }
308
309    Ok(())
310}
311
312/// Apply optional summarization to `body`, mirroring the MCP `fetch` path.
313/// An explicit `--summarize` blob runs first; then `--max-tokens` triggers
314/// auto-summarization when the body is still over budget. Returns the
315/// (possibly summarized) body, its token count, and whether summarization ran.
316async fn maybe_summarize(
317    summarizer: &crate::summarizer::SummarizerService,
318    defaults: &crate::summarizer::DefaultsHint,
319    family: crate::tokenizer::Tokenizer,
320    body: String,
321    tokens: usize,
322    max_tokens: Option<usize>,
323    summarize: Option<crate::mcp::tools::fetch::InlineSummarizeArgs>,
324) -> anyhow::Result<(String, usize, bool)> {
325    let mut body = body;
326    let mut tokens = tokens;
327    let mut summarized = false;
328
329    // Explicit `--summarize` first: the body becomes the summary.
330    if let Some(inline) = summarize {
331        let opts = summarizer.resolve_defaults(
332            inline.mode.map(Into::into),
333            inline.style.map(Into::into),
334            inline.target_tokens,
335            inline.focus,
336            inline.preserve.into_iter().map(Into::into).collect(),
337            inline.backend,
338            defaults,
339        );
340        body = compact_body(summarizer, &body, &opts).await?;
341        tokens = crate::tokenizer::count(&body, family).context("counting summary tokens")?;
342        summarized = true;
343    }
344
345    // Auto-summarize on `--max-tokens` overflow. Best-effort: the offline
346    // extractive backend budgets by summing per-sentence token counts, so the
347    // joined summary can land a few tokens over the target. The CLI budget is
348    // a target, not a hard ceiling, so we emit the result rather than failing.
349    // (If an explicit `--summarize` already ran, we keep that result.)
350    if let Some(max) = max_tokens
351        && tokens > max
352        && !summarized
353    {
354        let opts = summarizer.resolve_defaults(None, None, Some(max), None, vec![], None, defaults);
355        body = compact_body(summarizer, &body, &opts).await?;
356        tokens = crate::tokenizer::count(&body, family).context("counting summary tokens")?;
357        summarized = true;
358    }
359
360    Ok((body, tokens, summarized))
361}
362
363/// Run the summarizer over `body` and return the summary markdown.
364async fn compact_body(
365    summarizer: &crate::summarizer::SummarizerService,
366    body: &str,
367    opts: &crate::summarizer::backend::CompactOpts,
368) -> anyhow::Result<String> {
369    let content_hash = format!("sha256:{}", sha256_hex(body.as_bytes()));
370    let r = summarizer
371        .compact(&content_hash, body, opts)
372        .await
373        .context("summarizing extracted markdown")?;
374    Ok(r.summary_md)
375}
376
#[cfg(test)]
mod tests {
    use super::*;
    use crate::summarizer::{DefaultsHint, SummarizerService};
    use std::sync::Arc;

    /// The config obtained by deserializing an empty TOML document.
    fn default_config() -> crate::config::Config {
        toml::from_str("").unwrap()
    }

    /// Build a summarizer service (no guard attached) over a fresh database
    /// inside a temp dir. The `TempDir` is returned so the caller keeps the
    /// directory alive for the duration of the test.
    async fn service() -> (SummarizerService, DefaultsHint, tempfile::TempDir) {
        let scratch = tempfile::tempdir().unwrap();
        let db_path = scratch.path().join("t.db");
        let db = Db::open(db_path).await.unwrap();

        let cfg = default_config();
        let family = cfg.tokenizer.default;
        crate::tokenizer::ensure_loaded(family).await.unwrap();

        let registry = crate::summarizer::registry::build(&cfg, family).unwrap();
        let summarizer = SummarizerService::new(
            db,
            Arc::new(registry),
            cfg.summarization.fallback_to_extractive,
        );
        let hint = DefaultsHint::from_config(&cfg.summarization);
        (summarizer, hint, scratch)
    }

    /// Eighty distinct sentences: a body long enough for the extractive
    /// backend to rank and trim.
    fn long_body() -> String {
        (0..80)
            .map(|i| {
                format!(
                    "Sentence number {i} states a distinct and self-contained fact about how a rover \
                     fetches and prepares web content for an agent to reason over. "
                )
            })
            .collect()
    }

    #[tokio::test]
    #[allow(clippy::await_holding_lock)] // serialised vs. tokenizer-registry-clearing tests
    async fn passthrough_when_under_budget_and_no_summarize() {
        let _guard = crate::tokenizer::_test_mutex()
            .lock()
            .unwrap_or_else(|e| e.into_inner());
        let (summarizer, hint, _scratch) = service().await;
        let family = default_config().tokenizer.default;

        let body = "A short extracted body.".to_string();
        let tokens = crate::tokenizer::count(&body, family).unwrap();

        // Generous budget, no explicit summarize: nothing should change.
        let outcome = maybe_summarize(
            &summarizer,
            &hint,
            family,
            body.clone(),
            tokens,
            Some(10_000),
            None,
        )
        .await
        .unwrap();
        let (out, out_tokens, summarized) = outcome;

        assert!(!summarized, "should not summarize when under budget");
        assert_eq!(out, body);
        assert_eq!(out_tokens, tokens);
    }

    #[tokio::test]
    #[allow(clippy::await_holding_lock)] // serialised vs. tokenizer-registry-clearing tests
    async fn explicit_summarize_shrinks_body() {
        let _guard = crate::tokenizer::_test_mutex()
            .lock()
            .unwrap_or_else(|e| e.into_inner());
        let (summarizer, hint, _scratch) = service().await;
        let family = default_config().tokenizer.default;

        let body = long_body();
        let tokens = crate::tokenizer::count(&body, family).unwrap();
        let request = crate::mcp::tools::fetch::InlineSummarizeArgs {
            target_tokens: Some(80),
            ..Default::default()
        };

        // Explicit request only; no `--max-tokens` budget.
        let outcome = maybe_summarize(&summarizer, &hint, family, body, tokens, None, Some(request))
            .await
            .unwrap();
        let (out, out_tokens, summarized) = outcome;

        assert!(summarized);
        assert!(!out.is_empty());
        assert!(
            out_tokens < tokens,
            "summary should be smaller than the original ({out_tokens} !< {tokens})"
        );
    }

    #[tokio::test]
    #[allow(clippy::await_holding_lock)] // serialised vs. tokenizer-registry-clearing tests
    async fn max_tokens_auto_summarizes_over_budget() {
        let _guard = crate::tokenizer::_test_mutex()
            .lock()
            .unwrap_or_else(|e| e.into_inner());
        let (summarizer, hint, _scratch) = service().await;
        let family = default_config().tokenizer.default;

        let body = long_body();
        let tokens = crate::tokenizer::count(&body, family).unwrap();
        assert!(
            tokens > 400,
            "fixture should exceed the budget (got {tokens})"
        );

        // Budget only; the overflow should trigger the auto-summarize pass.
        let outcome = maybe_summarize(&summarizer, &hint, family, body, tokens, Some(400), None)
            .await
            .unwrap();
        let (out, out_tokens, summarized) = outcome;

        assert!(summarized);
        assert!(!out.is_empty());
        assert!(
            out_tokens < tokens,
            "auto-summary should be smaller than the original ({out_tokens} !< {tokens})"
        );
    }
}