Skip to main content

rover/cli/
fetch.rs

//! `rover fetch <url>` command.
//!
//! As of M2, `rover fetch` runs through the cache-aware orchestrator
//! (`fetcher::cached::fetch_with_cache`). The CLI opens (or creates) the
//! Rover cache database, dispatches the fetch, then renders the resulting
//! `Page` row to stdout as a frontmatter envelope.
7
8use anyhow::Context;
9use jiff::Timestamp;
10use std::path::Path;
11use url::Url;
12
13use crate::config;
14use crate::extractor::frontmatter::{PageMeta, render};
15use crate::extractor::pipeline::extract;
16use crate::fetcher::cached::{
17    CacheStatus, ExtractResult, FetchOptions, fetch_with_cache, sha256_hex,
18};
19use crate::fetcher::client::build_http_client;
20use crate::fetcher::ssrf::SsrfLevel;
21use crate::storage::Db;
22
/// Parsed arguments for the `rover fetch <url>` subcommand.
///
/// Every `Option` field is a per-invocation override: `None` leaves the
/// value loaded from the config file untouched.
#[derive(Debug, Clone)]
pub struct Args {
    /// The URL to fetch. Parsed with `Url::parse` before any I/O happens.
    pub url: String,

    /// Forwarded to `FetchOptions::force_refresh`.
    pub force_refresh: bool,

    /// Forwarded both to the config overrides and to
    /// `FetchOptions::ignore_robots`.
    pub ignore_robots: bool,

    /// Override `[fetch] user_agent` for this request.
    pub user_agent: Option<String>,

    /// Override `[fetch] timeout_secs` (per-request timeout) for this request.
    /// A value of `0` is rejected.
    pub timeout_secs: Option<u64>,

    /// Rate-limit override handed to `apply_overrides` on the loaded config.
    pub rate_limit_rpm: Option<u32>,

    /// Per-host concurrency override handed to `apply_overrides`.
    pub per_host_concurrency: Option<u32>,

    /// Global concurrency override handed to `apply_overrides`.
    pub global_concurrency: Option<u32>,

    /// Retry-count override handed to `apply_overrides`.
    pub max_retries: Option<u8>,

    /// Auto-summarize when the extracted markdown exceeds N tokens. Runs the
    /// configured `[summarization]` backend (the offline extractive backend
    /// by default) and replaces the body with a summary sized toward the
    /// budget (best-effort; may land a few tokens over). A value of `0` is
    /// rejected.
    pub max_tokens: Option<usize>,

    /// JSON `SummarizeOpts` blob — same shape as the MCP `summarize` tool
    /// args minus the `url` field, e.g.
    /// `--summarize '{"mode":"abstractive","target_tokens":500}'`. Applied
    /// before `--max-tokens`; the body is replaced with the summary.
    pub summarize: Option<String>,
}
51
52pub async fn run(args: Args, config_path: Option<&Path>) -> anyhow::Result<()> {
53    let mut cfg = config::load_resolved(config_path).context("loading config")?;
54    cfg.apply_overrides(
55        args.rate_limit_rpm,
56        args.per_host_concurrency,
57        args.global_concurrency,
58        args.max_retries,
59        args.ignore_robots,
60    );
61    // Per-call fetch-transport overrides. Applied to the loaded config before
62    // the HTTP client is built (line below) and before FetchOptions.user_agent
63    // is derived, so both the sent UA header and robots matching use them.
64    if let Some(ua) = args.user_agent {
65        cfg.fetch.user_agent = ua;
66    }
67    if let Some(t) = args.timeout_secs {
68        if t == 0 {
69            anyhow::bail!("--timeout-secs must be greater than 0");
70        }
71        cfg.fetch.timeout_secs = t;
72    }
73    let url = Url::parse(&args.url).context("parsing URL argument")?;
74    let level = SsrfLevel::parse(&cfg.ssrf.level)
75        .with_context(|| format!("invalid [ssrf] level `{}` in config", cfg.ssrf.level))?;
76    let ssrf_project_root = if level == SsrfLevel::Project {
77        let raw = &cfg.ssrf.project_root;
78        let resolved = std::fs::canonicalize(raw)
79            .with_context(|| format!("canonicalizing ssrf.project_root `{}`", raw.display()))?;
80        tracing::info!(
81            target: "rover::ssrf",
82            project_root = %resolved.display(),
83            "ssrf level=project; project_root resolved",
84        );
85        Some(resolved)
86    } else {
87        None
88    };
89
90    // Parse the optional --summarize JSON blob up front so the user gets a
91    // clean error before any network or storage I/O.
92    let summarize_opts: Option<crate::mcp::tools::fetch::InlineSummarizeArgs> =
93        match args.summarize.as_deref() {
94            Some(s) => Some(serde_json::from_str(s).context("parsing --summarize JSON")?),
95            None => None,
96        };
97    if matches!(args.max_tokens, Some(0)) {
98        anyhow::bail!("--max-tokens must be greater than 0");
99    }
100
101    let data_dir = crate::paths::data_dir();
102    std::fs::create_dir_all(&data_dir).context("creating data dir")?;
103    let db = Db::open(data_dir.join("rover.db"))
104        .await
105        .context("opening cache database")?;
106
107    let client = build_http_client(&cfg.fetch.user_agent, cfg.fetch.timeout());
108    let pacer = crate::fetcher::concurrency::Pacer::new(&cfg.rate_limit);
109
110    // Optional HAR recorder for one-shot CLI runs. We flush once at the end
111    // of this subcommand rather than running an interval task — a single
112    // `fetch` invocation produces at most a handful of round-trips.
113    let har_recorder: Option<std::sync::Arc<crate::fetcher::har::HarRecorder>> =
114        if !cfg.debug.har_path.is_empty() {
115            let path = std::path::PathBuf::from(&cfg.debug.har_path);
116            let r = crate::fetcher::har::HarRecorder::new(path, cfg.debug.har_body_cap)
117                .with_context(|| format!("opening har file at {}", cfg.debug.har_path))?;
118            Some(std::sync::Arc::new(r))
119        } else {
120            None
121        };
122
123    // M9 fix C1: honor the server-config `auto_detect_spa` flag from the CLI
124    // path too. The CLI doesn't yet expose a `--headless` flag, so the only
125    // way to opt in is via `[headless] auto_detect_spa = true` in the
126    // config. Construction is lazy — we only launch Chromium if Auto-mode
127    // ends up needing it (the cached fetcher checks SPA heuristics first).
128    let headless_mode = if cfg.headless.auto_detect_spa {
129        crate::fetcher::HeadlessMode::Auto
130    } else {
131        crate::fetcher::HeadlessMode::Off
132    };
133    #[cfg(feature = "headless")]
134    let headless: Option<std::sync::Arc<crate::fetcher::headless::HeadlessRenderer>> =
135        if !matches!(headless_mode, crate::fetcher::HeadlessMode::Off) {
136            let r = crate::fetcher::headless::HeadlessRenderer::new(&cfg.headless)
137                .await
138                .map(std::sync::Arc::new)
139                .context("launching headless renderer")?;
140            Some(r)
141        } else {
142            None
143        };
144
145    let result = fetch_with_cache(
146        &db,
147        &client,
148        &pacer,
149        &cfg.rate_limit,
150        &cfg.robots,
151        &url,
152        &cfg.cache,
153        FetchOptions {
154            force_refresh: args.force_refresh,
155            ssrf_level: level,
156            ssrf_project_root,
157            har_recorder: har_recorder.clone(),
158            ignore_robots: args.ignore_robots,
159            user_agent: cfg.fetch.user_agent.clone(),
160            #[cfg(feature = "headless")]
161            headless: headless.clone(),
162            headless_mode,
163            // The one-shot CLI has no background scheduler to process a
164            // queued `revalidate` task — so on an expired entry we must
165            // refresh inline. Otherwise the row would stay at its old
166            // `fetched_at` indefinitely (the task we enqueued would
167            // outlive the process).
168            synchronous_revalidation: true,
169        },
170        |body, base| {
171            let extracted =
172                extract(body, Some(base)).map_err(crate::fetcher::FetcherError::Extract)?;
173            let content_hash = format!("sha256:{}", sha256_hex(extracted.body_md.as_bytes()));
174            Ok(ExtractResult {
175                title: extracted.title,
176                body_md: extracted.body_md,
177                content_hash,
178                metadata: extracted.metadata,
179            })
180        },
181    )
182    .await
183    .context("fetching URL")?;
184
185    if matches!(result.cache_status, CacheStatus::Stale { .. }) {
186        tracing::warn!(
187            target: "rover::cli::fetch",
188            url = url.as_str(),
189            "serving stale cache entry (network unavailable)"
190        );
191    }
192
193    let canonical =
194        Url::parse(&result.page.canonical_url).context("parsing canonical URL from cache row")?;
195
196    // Choose the tokenizer for frontmatter `estimated_tokens` from config.
197    let family = cfg.tokenizer.default;
198    crate::tokenizer::ensure_loaded(family)
199        .await
200        .context("loading default tokenizer")?;
201    let original_tokens = crate::tokenizer::count(&result.page.extracted_md, family)
202        .context("counting tokens for frontmatter")?;
203
204    // Recover the metadata persisted in the cache row (M2 `metadata_json`).
205    // If deserialization fails (legacy rows, corrupt JSON), fall back to empty
206    // — the frontmatter still renders, just without M4 fields.
207    let metadata: crate::extractor::ExtractedMetadata = result
208        .page
209        .metadata_json
210        .as_deref()
211        .and_then(|s| serde_json::from_str(s).ok())
212        .unwrap_or_default();
213    // Extraction quality is scored on the extracted body, before any
214    // summarization — it measures extraction fidelity, not the summary.
215    let quality = crate::extractor::quality::score(
216        &result.page.extracted_md,
217        result.page.extracted_md.chars().count().max(1),
218        !metadata.is_empty(),
219        result.page.title.is_some(),
220    );
221
222    // Optional summarization. Mirrors the MCP `fetch` path: an explicit
223    // `--summarize` blob runs first, then `--max-tokens` auto-summarizes when
224    // the body is over budget. Built lazily so a plain fetch pays nothing.
225    let (body_md, tokens, summarized) = if args.max_tokens.is_some() || summarize_opts.is_some() {
226        let registry = std::sync::Arc::new(
227            crate::summarizer::registry::build(&cfg, family)
228                .context("building summarizer registry")?,
229        );
230        // Harden content fed to model backends (always-on, matching the MCP
231        // path). Prompt-free extractive backends ignore it; cloud/local model
232        // backends get cleaned, nonce-wrapped content.
233        let guard = std::sync::Arc::new(
234            crate::guard::Guard::from_config(&cfg.prompt_injection)
235                .context("building prompt-injection guard")?,
236        );
237        let summarizer = crate::summarizer::SummarizerService::new(
238            db.clone(),
239            registry,
240            cfg.summarization.fallback_to_extractive,
241        )
242        .with_guard(guard);
243        let defaults = crate::summarizer::DefaultsHint::from_config(&cfg.summarization);
244        maybe_summarize(
245            &summarizer,
246            &defaults,
247            family,
248            result.page.extracted_md.clone(),
249            original_tokens,
250            args.max_tokens,
251            summarize_opts,
252        )
253        .await?
254    } else {
255        (result.page.extracted_md.clone(), original_tokens, false)
256    };
257
258    let meta = PageMeta {
259        url: &url,
260        canonical_url: &canonical,
261        title: result.page.title.as_deref(),
262        fetched_at: Timestamp::now(),
263        body: &body_md,
264        tokens,
265        tokenizer_name: family.as_str(),
266        description: metadata.description.as_deref(),
267        author: metadata.author.as_deref(),
268        published: metadata.published.as_deref(),
269        modified: metadata.modified.as_deref(),
270        image: metadata.image.as_deref(),
271        og_type: metadata.og_type.as_deref(),
272        language: metadata.language.as_deref(),
273        schema_types: &metadata.schema_types,
274        extraction_quality: quality,
275        summarized,
276        tables_transformed: &[],
277        images_seen: 0,
278        images_downloaded: 0,
279        images_failed: 0,
280        images_processed: vec![],
281        prompt_injection: None,
282    };
283
284    let envelope = render(&meta);
285    print!("{envelope}");
286
287    if let Some(r) = &har_recorder
288        && let Err(e) = r.flush().await
289    {
290        tracing::warn!(target: "rover::fetcher", error = ?e, "har flush failed");
291    }
292
293    // M9 fix C1: tear down the renderer cleanly so chromiumoxide's handler
294    // task doesn't outlive this one-shot CLI invocation. `try_unwrap` is
295    // expected to succeed — `fetch_with_cache` returned, so the only other
296    // strong reference (the one we passed into `FetchOptions`) is gone.
297    #[cfg(feature = "headless")]
298    if let Some(renderer) = headless {
299        match std::sync::Arc::try_unwrap(renderer) {
300            Ok(r) => r.shutdown().await,
301            Err(_still_shared) => {
302                tracing::warn!(
303                    target: "rover::cli::fetch",
304                    "headless renderer still has outstanding Arc references at shutdown; skipping explicit shutdown",
305                );
306            }
307        }
308    }
309
310    Ok(())
311}
312
313/// Apply optional summarization to `body`, mirroring the MCP `fetch` path.
314/// An explicit `--summarize` blob runs first; then `--max-tokens` triggers
315/// auto-summarization when the body is still over budget. Returns the
316/// (possibly summarized) body, its token count, and whether summarization ran.
317async fn maybe_summarize(
318    summarizer: &crate::summarizer::SummarizerService,
319    defaults: &crate::summarizer::DefaultsHint,
320    family: crate::tokenizer::Tokenizer,
321    body: String,
322    tokens: usize,
323    max_tokens: Option<usize>,
324    summarize: Option<crate::mcp::tools::fetch::InlineSummarizeArgs>,
325) -> anyhow::Result<(String, usize, bool)> {
326    let mut body = body;
327    let mut tokens = tokens;
328    let mut summarized = false;
329
330    // Explicit `--summarize` first: the body becomes the summary.
331    if let Some(inline) = summarize {
332        let opts = summarizer.resolve_defaults(
333            inline.mode.map(Into::into),
334            inline.style.map(Into::into),
335            inline.target_tokens,
336            inline.focus,
337            inline.preserve.into_iter().map(Into::into).collect(),
338            inline.backend,
339            defaults,
340        );
341        body = compact_body(summarizer, &body, &opts).await?;
342        tokens = crate::tokenizer::count(&body, family).context("counting summary tokens")?;
343        summarized = true;
344    }
345
346    // Auto-summarize on `--max-tokens` overflow. Best-effort: the offline
347    // extractive backend budgets by summing per-sentence token counts, so the
348    // joined summary can land a few tokens over the target. The CLI budget is
349    // a target, not a hard ceiling, so we emit the result rather than failing.
350    // (If an explicit `--summarize` already ran, we keep that result.)
351    if let Some(max) = max_tokens
352        && tokens > max
353        && !summarized
354    {
355        let opts = summarizer.resolve_defaults(None, None, Some(max), None, vec![], None, defaults);
356        body = compact_body(summarizer, &body, &opts).await?;
357        tokens = crate::tokenizer::count(&body, family).context("counting summary tokens")?;
358        summarized = true;
359    }
360
361    Ok((body, tokens, summarized))
362}
363
364/// Run the summarizer over `body` and return the summary markdown.
365async fn compact_body(
366    summarizer: &crate::summarizer::SummarizerService,
367    body: &str,
368    opts: &crate::summarizer::backend::CompactOpts,
369) -> anyhow::Result<String> {
370    let content_hash = format!("sha256:{}", sha256_hex(body.as_bytes()));
371    let r = summarizer
372        .compact(&content_hash, body, opts)
373        .await
374        .context("summarizing extracted markdown")?;
375    Ok(r.summary_md)
376}
377
#[cfg(test)]
mod tests {
    use super::*;
    use crate::summarizer::{DefaultsHint, SummarizerService};
    use std::sync::Arc;

    /// The config produced by an empty TOML document, i.e. all defaults.
    fn default_config() -> crate::config::Config {
        toml::from_str("").unwrap()
    }

    /// Build a summarizer service over a throwaway database using the default
    /// config. The `TempDir` is returned so the database file outlives the
    /// test body.
    async fn service() -> (SummarizerService, DefaultsHint, tempfile::TempDir) {
        let scratch = tempfile::tempdir().unwrap();
        let config = default_config();
        let tokenizer = config.tokenizer.default;
        let database = Db::open(scratch.path().join("t.db")).await.unwrap();
        crate::tokenizer::ensure_loaded(tokenizer).await.unwrap();
        let backends = crate::summarizer::registry::build(&config, tokenizer).unwrap();
        let hint = DefaultsHint::from_config(&config.summarization);
        let svc = SummarizerService::new(
            database,
            Arc::new(backends),
            config.summarization.fallback_to_extractive,
        );
        (svc, hint, scratch)
    }

    /// A multi-sentence body the extractive backend can rank and trim.
    fn long_body() -> String {
        (0..80)
            .map(|i| {
                format!(
                    "Sentence number {i} states a distinct and self-contained fact about how a rover \
                     fetches and prepares web content for an agent to reason over. "
                )
            })
            .collect()
    }

    #[tokio::test]
    #[allow(clippy::await_holding_lock)] // serialised vs. tokenizer-registry-clearing tests
    async fn passthrough_when_under_budget_and_no_summarize() {
        let _serial = crate::tokenizer::_test_mutex()
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner());
        let (svc, defaults, _tmp) = service().await;
        let family = default_config().tokenizer.default;
        let original = "A short extracted body.".to_string();
        let original_tokens = crate::tokenizer::count(&original, family).unwrap();

        let outcome = maybe_summarize(
            &svc,
            &defaults,
            family,
            original.clone(),
            original_tokens,
            Some(10_000),
            None,
        )
        .await
        .unwrap();

        let (out, out_tokens, summarized) = outcome;
        assert!(!summarized, "should not summarize when under budget");
        assert_eq!(out, original);
        assert_eq!(out_tokens, original_tokens);
    }

    #[tokio::test]
    #[allow(clippy::await_holding_lock)] // serialised vs. tokenizer-registry-clearing tests
    async fn explicit_summarize_shrinks_body() {
        let _serial = crate::tokenizer::_test_mutex()
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner());
        let (svc, defaults, _tmp) = service().await;
        let family = default_config().tokenizer.default;
        let fixture = long_body();
        let tokens = crate::tokenizer::count(&fixture, family).unwrap();
        // Only the target size is explicit; every other option is defaulted.
        let request = crate::mcp::tools::fetch::InlineSummarizeArgs {
            target_tokens: Some(80),
            ..Default::default()
        };

        let (out, out_tokens, summarized) =
            maybe_summarize(&svc, &defaults, family, fixture, tokens, None, Some(request))
                .await
                .unwrap();

        assert!(summarized);
        assert!(!out.is_empty());
        assert!(
            out_tokens < tokens,
            "summary should be smaller than the original ({out_tokens} !< {tokens})"
        );
    }

    #[tokio::test]
    #[allow(clippy::await_holding_lock)] // serialised vs. tokenizer-registry-clearing tests
    async fn max_tokens_auto_summarizes_over_budget() {
        let _serial = crate::tokenizer::_test_mutex()
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner());
        let (svc, defaults, _tmp) = service().await;
        let family = default_config().tokenizer.default;
        let fixture = long_body();
        let tokens = crate::tokenizer::count(&fixture, family).unwrap();
        // Guard the fixture itself: the test is meaningless unless it starts
        // over the budget passed below.
        assert!(
            tokens > 400,
            "fixture should exceed the budget (got {tokens})"
        );

        let (out, out_tokens, summarized) =
            maybe_summarize(&svc, &defaults, family, fixture, tokens, Some(400), None)
                .await
                .unwrap();

        assert!(summarized);
        assert!(!out.is_empty());
        assert!(
            out_tokens < tokens,
            "auto-summary should be smaller than the original ({out_tokens} !< {tokens})"
        );
    }
}