rover-fetch 0.2.0

An MCP server for fetching and prepping web content for LLM agents.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
//! `rover fetch <url>` command.
//!
//! As of M2, `rover fetch` runs through the cache-aware orchestrator
//! (`fetcher::cached::fetch_with_cache`). The CLI opens (or creates) the
//! Rover cache database, dispatches the fetch, then renders the resulting
//! `Page` row to stdout as a frontmatter envelope.

use anyhow::Context;
use jiff::Timestamp;
use std::path::Path;
use url::Url;

use crate::config;
use crate::extractor::frontmatter::{PageMeta, render};
use crate::extractor::pipeline::extract;
use crate::fetcher::cached::{
    CacheStatus, ExtractResult, FetchOptions, fetch_with_cache, sha256_hex,
};
use crate::fetcher::client::build_http_client;
use crate::fetcher::ssrf::SsrfLevel;
use crate::storage::Db;

/// Parsed arguments for the `rover fetch <url>` subcommand, consumed by [`run`].
///
/// Every `Option` field is a per-invocation override; `None` leaves the
/// corresponding loaded-config value untouched.
#[derive(Debug, Clone)]
pub struct Args {
    /// The URL to fetch, as typed by the user. `run` parses it with
    /// `Url::parse` and fails if it is not a valid URL.
    pub url: String,

    /// Forwarded verbatim to `FetchOptions::force_refresh`.
    /// (Presumably bypasses a fresh cache entry — confirm in `fetcher::cached`.)
    pub force_refresh: bool,

    /// Forwarded both to `Config::apply_overrides` and to
    /// `FetchOptions::ignore_robots`.
    pub ignore_robots: bool,

    /// Override `[fetch] user_agent` for this request.
    pub user_agent: Option<String>,

    /// Override `[fetch] timeout_secs` (per-request timeout) for this request.
    /// `run` rejects `Some(0)`.
    pub timeout_secs: Option<u64>,

    /// Rate-limit override handed to `Config::apply_overrides`.
    pub rate_limit_rpm: Option<u32>,
    /// Per-host concurrency override handed to `Config::apply_overrides`.
    pub per_host_concurrency: Option<u32>,
    /// Global concurrency override handed to `Config::apply_overrides`.
    pub global_concurrency: Option<u32>,
    /// Retry-count override handed to `Config::apply_overrides`.
    pub max_retries: Option<u8>,

    /// Auto-summarize when the extracted markdown exceeds N tokens. Runs the
    /// configured `[summarization]` backend (the offline extractive backend
    /// by default) and replaces the body with a summary sized toward the
    /// budget (best-effort; may land a few tokens over). `run` rejects
    /// `Some(0)`.
    pub max_tokens: Option<usize>,

    /// JSON `SummarizeOpts` blob — same shape as the MCP `summarize` tool
    /// args minus the `url` field, e.g.
    /// `--summarize '{"mode":"abstractive","target_tokens":500}'`. The body
    /// is replaced with the summary. When this is given together with
    /// `--max-tokens`, the explicit summary is kept as-is and `--max-tokens`
    /// does not trigger a second summarization pass.
    pub summarize: Option<String>,
}

pub async fn run(args: Args, config_path: Option<&Path>) -> anyhow::Result<()> {
    let mut cfg = config::load_resolved(config_path).context("loading config")?;
    cfg.apply_overrides(
        args.rate_limit_rpm,
        args.per_host_concurrency,
        args.global_concurrency,
        args.max_retries,
        args.ignore_robots,
    );
    // Per-call fetch-transport overrides. Applied to the loaded config before
    // the HTTP client is built (line below) and before FetchOptions.user_agent
    // is derived, so both the sent UA header and robots matching use them.
    if let Some(ua) = args.user_agent {
        cfg.fetch.user_agent = ua;
    }
    if let Some(t) = args.timeout_secs {
        if t == 0 {
            anyhow::bail!("--timeout-secs must be greater than 0");
        }
        cfg.fetch.timeout_secs = t;
    }
    let url = Url::parse(&args.url).context("parsing URL argument")?;
    let level = SsrfLevel::parse(&cfg.ssrf.level)
        .with_context(|| format!("invalid [ssrf] level `{}` in config", cfg.ssrf.level))?;
    let ssrf_project_root = if level == SsrfLevel::Project {
        let raw = &cfg.ssrf.project_root;
        let resolved = std::fs::canonicalize(raw)
            .with_context(|| format!("canonicalizing ssrf.project_root `{}`", raw.display()))?;
        tracing::info!(
            target: "rover::ssrf",
            project_root = %resolved.display(),
            "ssrf level=project; project_root resolved",
        );
        Some(resolved)
    } else {
        None
    };

    // Parse the optional --summarize JSON blob up front so the user gets a
    // clean error before any network or storage I/O.
    let summarize_opts: Option<crate::mcp::tools::fetch::InlineSummarizeArgs> =
        match args.summarize.as_deref() {
            Some(s) => Some(serde_json::from_str(s).context("parsing --summarize JSON")?),
            None => None,
        };
    if matches!(args.max_tokens, Some(0)) {
        anyhow::bail!("--max-tokens must be greater than 0");
    }

    let data_dir = crate::paths::data_dir();
    std::fs::create_dir_all(&data_dir).context("creating data dir")?;
    let db = Db::open(data_dir.join("rover.db"))
        .await
        .context("opening cache database")?;

    let client = build_http_client(&cfg.fetch.user_agent, cfg.fetch.timeout());
    let pacer = crate::fetcher::concurrency::Pacer::new(&cfg.rate_limit);

    // Optional HAR recorder for one-shot CLI runs. We flush once at the end
    // of this subcommand rather than running an interval task — a single
    // `fetch` invocation produces at most a handful of round-trips.
    let har_recorder: Option<std::sync::Arc<crate::fetcher::har::HarRecorder>> =
        if !cfg.debug.har_path.is_empty() {
            let path = std::path::PathBuf::from(&cfg.debug.har_path);
            let r = crate::fetcher::har::HarRecorder::new(path, cfg.debug.har_body_cap)
                .with_context(|| format!("opening har file at {}", cfg.debug.har_path))?;
            Some(std::sync::Arc::new(r))
        } else {
            None
        };

    // M9 fix C1: honor the server-config `auto_detect_spa` flag from the CLI
    // path too. The CLI doesn't yet expose a `--headless` flag, so the only
    // way to opt in is via `[headless] auto_detect_spa = true` in the
    // config. Construction is lazy — we only launch Chromium if Auto-mode
    // ends up needing it (the cached fetcher checks SPA heuristics first).
    let headless_mode = if cfg.headless.auto_detect_spa {
        crate::fetcher::HeadlessMode::Auto
    } else {
        crate::fetcher::HeadlessMode::Off
    };
    #[cfg(feature = "headless")]
    let headless: Option<std::sync::Arc<crate::fetcher::headless::HeadlessRenderer>> =
        if !matches!(headless_mode, crate::fetcher::HeadlessMode::Off) {
            let r = crate::fetcher::headless::HeadlessRenderer::new(&cfg.headless)
                .await
                .map(std::sync::Arc::new)
                .context("launching headless renderer")?;
            Some(r)
        } else {
            None
        };

    let result = fetch_with_cache(
        &db,
        &client,
        &pacer,
        &cfg.rate_limit,
        &cfg.robots,
        &url,
        &cfg.cache,
        FetchOptions {
            force_refresh: args.force_refresh,
            ssrf_level: level,
            ssrf_project_root,
            har_recorder: har_recorder.clone(),
            ignore_robots: args.ignore_robots,
            user_agent: cfg.fetch.user_agent.clone(),
            #[cfg(feature = "headless")]
            headless: headless.clone(),
            headless_mode,
            // The one-shot CLI has no background scheduler to process a
            // queued `revalidate` task — so on an expired entry we must
            // refresh inline. Otherwise the row would stay at its old
            // `fetched_at` indefinitely (the task we enqueued would
            // outlive the process).
            synchronous_revalidation: true,
        },
        |body, base| {
            let extracted =
                extract(body, Some(base)).map_err(crate::fetcher::FetcherError::Extract)?;
            let content_hash = format!("sha256:{}", sha256_hex(extracted.body_md.as_bytes()));
            Ok(ExtractResult {
                title: extracted.title,
                body_md: extracted.body_md,
                content_hash,
                metadata: extracted.metadata,
            })
        },
    )
    .await
    .context("fetching URL")?;

    if matches!(result.cache_status, CacheStatus::Stale { .. }) {
        tracing::warn!(
            target: "rover::cli::fetch",
            url = url.as_str(),
            "serving stale cache entry (network unavailable)"
        );
    }

    let canonical =
        Url::parse(&result.page.canonical_url).context("parsing canonical URL from cache row")?;

    // Choose the tokenizer for frontmatter `estimated_tokens` from config.
    let family = cfg.tokenizer.default;
    crate::tokenizer::ensure_loaded(family)
        .await
        .context("loading default tokenizer")?;
    let original_tokens = crate::tokenizer::count(&result.page.extracted_md, family)
        .context("counting tokens for frontmatter")?;

    // Recover the metadata persisted in the cache row (M2 `metadata_json`).
    // If deserialization fails (legacy rows, corrupt JSON), fall back to empty
    // — the frontmatter still renders, just without M4 fields.
    let metadata: crate::extractor::ExtractedMetadata = result
        .page
        .metadata_json
        .as_deref()
        .and_then(|s| serde_json::from_str(s).ok())
        .unwrap_or_default();
    // Extraction quality is scored on the extracted body, before any
    // summarization — it measures extraction fidelity, not the summary.
    let quality = crate::extractor::quality::score(
        &result.page.extracted_md,
        result.page.extracted_md.chars().count().max(1),
        !metadata.is_empty(),
        result.page.title.is_some(),
    );

    // Optional summarization. Mirrors the MCP `fetch` path: an explicit
    // `--summarize` blob runs first, then `--max-tokens` auto-summarizes when
    // the body is over budget. Built lazily so a plain fetch pays nothing.
    let (body_md, tokens, summarized) = if args.max_tokens.is_some() || summarize_opts.is_some() {
        let registry = std::sync::Arc::new(
            crate::summarizer::registry::build(&cfg, family)
                .context("building summarizer registry")?,
        );
        // Harden content fed to model backends (always-on, matching the MCP
        // path). Prompt-free extractive backends ignore it; cloud/local model
        // backends get cleaned, nonce-wrapped content.
        let guard = std::sync::Arc::new(
            crate::guard::Guard::from_config(&cfg.prompt_injection)
                .context("building prompt-injection guard")?,
        );
        let summarizer = crate::summarizer::SummarizerService::new(
            db.clone(),
            registry,
            cfg.summarization.fallback_to_extractive,
        )
        .with_guard(guard);
        let defaults = crate::summarizer::DefaultsHint::from_config(&cfg.summarization);
        maybe_summarize(
            &summarizer,
            &defaults,
            family,
            result.page.extracted_md.clone(),
            original_tokens,
            args.max_tokens,
            summarize_opts,
        )
        .await?
    } else {
        (result.page.extracted_md.clone(), original_tokens, false)
    };

    let meta = PageMeta {
        url: &url,
        canonical_url: &canonical,
        title: result.page.title.as_deref(),
        fetched_at: Timestamp::now(),
        body: &body_md,
        tokens,
        tokenizer_name: family.as_str(),
        description: metadata.description.as_deref(),
        author: metadata.author.as_deref(),
        published: metadata.published.as_deref(),
        modified: metadata.modified.as_deref(),
        image: metadata.image.as_deref(),
        og_type: metadata.og_type.as_deref(),
        language: metadata.language.as_deref(),
        schema_types: &metadata.schema_types,
        extraction_quality: quality,
        summarized,
        tables_transformed: &[],
        images_seen: 0,
        images_downloaded: 0,
        images_failed: 0,
        images_processed: vec![],
        prompt_injection: None,
    };

    let envelope = render(&meta);
    print!("{envelope}");

    if let Some(r) = &har_recorder
        && let Err(e) = r.flush().await
    {
        tracing::warn!(target: "rover::fetcher", error = ?e, "har flush failed");
    }

    // M9 fix C1: tear down the renderer cleanly so chromiumoxide's handler
    // task doesn't outlive this one-shot CLI invocation. `try_unwrap` is
    // expected to succeed — `fetch_with_cache` returned, so the only other
    // strong reference (the one we passed into `FetchOptions`) is gone.
    #[cfg(feature = "headless")]
    if let Some(renderer) = headless {
        match std::sync::Arc::try_unwrap(renderer) {
            Ok(r) => r.shutdown().await,
            Err(_still_shared) => {
                tracing::warn!(
                    target: "rover::cli::fetch",
                    "headless renderer still has outstanding Arc references at shutdown; skipping explicit shutdown",
                );
            }
        }
    }

    Ok(())
}

/// Decide whether `body` needs summarizing and, if so, summarize it once.
///
/// Precedence:
/// * an explicit `--summarize` blob always wins — the body becomes that
///   summary and `--max-tokens` is not consulted afterwards;
/// * otherwise, `--max-tokens` triggers an auto-summary only when `tokens`
///   exceeds the budget;
/// * otherwise the body is handed back untouched.
///
/// Returns `(body, token_count, summarized)`, where `token_count` is
/// recomputed with `family` whenever a summary replaced the body.
///
/// The `--max-tokens` budget is a best-effort target rather than a hard
/// ceiling: the offline extractive backend budgets by summing per-sentence
/// token counts, so the joined summary can land a few tokens over. The
/// result is emitted as-is instead of failing.
async fn maybe_summarize(
    summarizer: &crate::summarizer::SummarizerService,
    defaults: &crate::summarizer::DefaultsHint,
    family: crate::tokenizer::Tokenizer,
    body: String,
    tokens: usize,
    max_tokens: Option<usize>,
    summarize: Option<crate::mcp::tools::fetch::InlineSummarizeArgs>,
) -> anyhow::Result<(String, usize, bool)> {
    // Pick the summarization options, or bail out early with the untouched
    // body when neither trigger applies.
    let opts = match (summarize, max_tokens) {
        // Explicit request: honor every field the caller supplied.
        (Some(inline), _) => summarizer.resolve_defaults(
            inline.mode.map(Into::into),
            inline.style.map(Into::into),
            inline.target_tokens,
            inline.focus,
            inline.preserve.into_iter().map(Into::into).collect(),
            inline.backend,
            defaults,
        ),
        // Over budget: everything defaulted except the target size.
        (None, Some(budget)) if tokens > budget => {
            summarizer.resolve_defaults(None, None, Some(budget), None, vec![], None, defaults)
        }
        _ => return Ok((body, tokens, false)),
    };

    let summary = compact_body(summarizer, &body, &opts).await?;
    let summary_tokens =
        crate::tokenizer::count(&summary, family).context("counting summary tokens")?;
    Ok((summary, summary_tokens, true))
}

/// Summarize `body` with `opts` and hand back only the summary markdown.
///
/// The body's SHA-256 digest, prefixed with `sha256:`, is passed to the
/// summarizer as the content hash alongside the text itself.
async fn compact_body(
    summarizer: &crate::summarizer::SummarizerService,
    body: &str,
    opts: &crate::summarizer::backend::CompactOpts,
) -> anyhow::Result<String> {
    let digest = sha256_hex(body.as_bytes());
    let body_hash = format!("sha256:{digest}");
    summarizer
        .compact(&body_hash, body, opts)
        .await
        .context("summarizing extracted markdown")
        .map(|outcome| outcome.summary_md)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::summarizer::{DefaultsHint, SummarizerService};
    use std::sync::Arc;

    /// The config obtained by deserializing an empty TOML document.
    fn default_config() -> crate::config::Config {
        toml::from_str("").unwrap()
    }

    /// Build a summarizer service over a database in a fresh temp dir.
    ///
    /// The `TempDir` is returned so the caller keeps it alive for the
    /// duration of the test.
    async fn service() -> (SummarizerService, DefaultsHint, tempfile::TempDir) {
        let scratch = tempfile::tempdir().unwrap();
        let db = Db::open(scratch.path().join("t.db")).await.unwrap();
        let cfg = default_config();
        let family = cfg.tokenizer.default;
        crate::tokenizer::ensure_loaded(family).await.unwrap();
        let registry = crate::summarizer::registry::build(&cfg, family).unwrap();
        let hint = DefaultsHint::from_config(&cfg.summarization);
        let svc = SummarizerService::new(
            db,
            Arc::new(registry),
            cfg.summarization.fallback_to_extractive,
        );
        (svc, hint, scratch)
    }

    /// A multi-sentence body the extractive backend can rank and trim:
    /// eighty numbered sentences, each followed by a single space.
    fn long_body() -> String {
        (0..80)
            .map(|i| {
                format!(
                    "Sentence number {i} states a distinct and self-contained fact about how a rover \
                     fetches and prepares web content for an agent to reason over. "
                )
            })
            .collect()
    }

    /// With no `--summarize` and a body under the `--max-tokens` budget, the
    /// body and its token count come back unchanged.
    #[tokio::test]
    #[allow(clippy::await_holding_lock)] // serialised vs. tokenizer-registry-clearing tests
    async fn passthrough_when_under_budget_and_no_summarize() {
        let _tok = crate::tokenizer::_test_mutex()
            .lock()
            .unwrap_or_else(|e| e.into_inner());
        let (svc, defaults, _tmp) = service().await;
        let family = default_config().tokenizer.default;
        let input = "A short extracted body.".to_string();
        let input_tokens = crate::tokenizer::count(&input, family).unwrap();

        let outcome = maybe_summarize(
            &svc,
            &defaults,
            family,
            input.clone(),
            input_tokens,
            Some(10_000),
            None,
        )
        .await
        .unwrap();

        let (out, out_tokens, summarized) = outcome;
        assert!(!summarized, "should not summarize when under budget");
        assert_eq!(out, input);
        assert_eq!(out_tokens, input_tokens);
    }

    /// An explicit inline summarize request replaces the body with a
    /// non-empty summary that is smaller than the original.
    #[tokio::test]
    #[allow(clippy::await_holding_lock)] // serialised vs. tokenizer-registry-clearing tests
    async fn explicit_summarize_shrinks_body() {
        let _tok = crate::tokenizer::_test_mutex()
            .lock()
            .unwrap_or_else(|e| e.into_inner());
        let (svc, defaults, _tmp) = service().await;
        let family = default_config().tokenizer.default;
        let input = long_body();
        let tokens = crate::tokenizer::count(&input, family).unwrap();
        let request = crate::mcp::tools::fetch::InlineSummarizeArgs {
            target_tokens: Some(80),
            ..Default::default()
        };

        let (out, out_tokens, summarized) =
            maybe_summarize(&svc, &defaults, family, input, tokens, None, Some(request))
                .await
                .unwrap();

        assert!(summarized);
        assert!(!out.is_empty());
        assert!(
            out_tokens < tokens,
            "summary should be smaller than the original ({out_tokens} !< {tokens})"
        );
    }

    /// Without `--summarize`, a body over the `--max-tokens` budget is
    /// auto-summarized down to something smaller than the original.
    #[tokio::test]
    #[allow(clippy::await_holding_lock)] // serialised vs. tokenizer-registry-clearing tests
    async fn max_tokens_auto_summarizes_over_budget() {
        let _tok = crate::tokenizer::_test_mutex()
            .lock()
            .unwrap_or_else(|e| e.into_inner());
        let (svc, defaults, _tmp) = service().await;
        let family = default_config().tokenizer.default;
        let input = long_body();
        let tokens = crate::tokenizer::count(&input, family).unwrap();
        // Guard the fixture itself: the test is meaningless unless the body
        // really is over the budget used below.
        assert!(
            tokens > 400,
            "fixture should exceed the budget (got {tokens})"
        );

        let (out, out_tokens, summarized) =
            maybe_summarize(&svc, &defaults, family, input, tokens, Some(400), None)
                .await
                .unwrap();

        assert!(summarized);
        assert!(!out.is_empty());
        assert!(
            out_tokens < tokens,
            "auto-summary should be smaller than the original ({out_tokens} !< {tokens})"
        );
    }
}