Skip to main content

rover/mcp/tools/
fetch.rs

1//! MCP `fetch` tool — wraps the M1/M2 pipeline behind a typed arg struct.
2
3use schemars::JsonSchema;
4use serde::{Deserialize, Serialize};
5use url::Url;
6
7use crate::extractor::frontmatter::{PageMeta, render as render_frontmatter};
8use crate::extractor::options::{ImagesMode, SampleStrategy, TablesMode};
9use crate::extractor::pipeline::extract;
10use crate::fetcher::cached::{ExtractResult, FetchOptions, fetch_with_cache, sha256_hex};
11use crate::mcp::envelope::{
12    CacheStatus, CountResponse, CountSingleResponse, CountSource, FetchResponse,
13};
14use crate::mcp::error::McpError;
15use crate::mcp::handler::{RoverHandler, resolve_tokenizer};
16use crate::tokenizer;
17
/// Wire-side `fetch` tool arguments.
///
/// Live in M3+M4+M7+M9: `url`, `force_refresh`, `count_only`, `tokenizer`,
/// `max_tokens`, `tables`, `images`, `metadata`, `summarize`, `headless`.
///
/// `tokenizer` is exposed as a string on the wire (rather than the
/// [`Tokenizer`] enum) so the JSON schema doesn't have to mirror the
/// enum's manual `Serialize`/`Deserialize` impls. Parsing happens inside
/// [`RoverHandler::fetch_inner`].
//
// Maintainer notes. These are plain `//` comments on purpose: schemars copies
// `///` text into the generated JSON schema, so new doc comments here would
// change what clients see.
//
// * `url` is the only required key; every other field carries
//   `#[serde(default)]`. Unknown keys are rejected (`deny_unknown_fields`).
// * NOTE(review): the milestone list above does not mention `user_agent`,
//   `timeout_secs` or `security`, which are also accepted — presumably they
//   were added later; confirm and refresh the list if so.
#[derive(Debug, Clone, Default, Serialize, Deserialize, JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct FetchArgs {
    // Target URL as sent by the caller. `fetch_inner` parses it with
    // `Url::parse` and reports a parse failure as `McpError::InvalidUrl`.
    pub url: String,

    // Forwarded verbatim as `FetchOptions::force_refresh`.
    #[serde(default)]
    pub force_refresh: bool,

    /// Override the `[fetch] user_agent` for this call only. Applies to the
    /// page request, its redirects, image sub-fetches, and robots matching.
    #[serde(default)]
    pub user_agent: Option<String>,

    /// Override the `[fetch] timeout_secs` (per-request timeout, seconds) for
    /// this call only. Must be greater than 0.
    #[serde(default)]
    pub timeout_secs: Option<u64>,

    // When true, `fetch_inner` returns `FetchOutput::Count` (token count plus
    // cache bookkeeping) instead of the full response, and an over-budget
    // body is reported as-is rather than auto-summarized.
    #[serde(default)]
    pub count_only: bool,

    // Tokenizer family name; resolved by `resolve_tokenizer` against the
    // server config when `fetch_inner` runs.
    #[serde(default)]
    pub tokenizer: Option<String>,

    // Token budget for the returned body. `Some(0)` is rejected with
    // `McpError::InvalidArgs`. An over-budget body triggers one automatic
    // summarize attempt; if that is still too large, or the caller already
    // passed `summarize`, the call fails with `McpError::MaxTokensExceeded`.
    #[serde(default)]
    pub max_tokens: Option<usize>,

    // Table post-pass selection. Omitted means `TablesMode::Embed`.
    #[serde(default)]
    pub tables: Option<TablesArg>,

    // Image post-pass selection. Omitted means `ImagesMode::AltTextOnly`.
    #[serde(default)]
    pub images: Option<ImagesArg>,

    // `MetadataArg::Skip` blanks the metadata in the response; anything else
    // (including omission) returns the metadata recovered from the cache row.
    #[serde(default)]
    pub metadata: Option<MetadataArg>,

    /// Inline summarize request. When present, the returned `markdown` is
    /// the summary of the extracted body (post tables/images passes), and
    /// `FetchResponse.summarized` is `true`. The shape mirrors
    /// [`crate::mcp::tools::summarize::SummarizeArgs`] minus the `url`.
    ///
    /// Any unset sub-field falls back to its default (mode/style/preserve/
    /// target_tokens/focus/backend from the `[summarization]` config; same
    /// defaults the standalone `summarize` tool uses).
    #[serde(default)]
    pub summarize: Option<InlineSummarizeArgs>,

    // Headless-rendering preference. When omitted (or when `mode` is unset)
    // the `[headless] auto_detect_spa` config flag picks the mode.
    #[serde(default)]
    pub headless: Option<HeadlessArg>,

    /// Optional per-call guard overrides. Each field is honored only if its
    /// corresponding `[prompt_injection.agent_overrides]` grant is `true`.
    #[serde(default)]
    pub security: Option<crate::guard::SecurityArg>,
}
82
/// Inline `summarize` sub-arg for the `fetch` tool. Re-uses the same
/// enums as the standalone `summarize` tool so a single CLI/schema source
/// of truth covers both call sites.
//
// Every key is optional and unknown keys are rejected. `fetch_inner` hands the
// fields to `summarizer.resolve_defaults(..)` together with a `DefaultsHint`
// built from the `[summarization]` config. (Plain `//` comments are used here
// so the derived JSON schema text stays unchanged.)
#[derive(Debug, Clone, Default, Serialize, Deserialize, JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct InlineSummarizeArgs {
    // Requested summary size in tokens; `None` defers to the resolved default.
    #[serde(default)]
    pub target_tokens: Option<usize>,

    // Summarization mode; converted with `Into` before resolution.
    #[serde(default)]
    pub mode: Option<crate::mcp::tools::summarize::SummarizeMode>,

    // Optional free-text focus passed through to the summarizer options.
    #[serde(default)]
    pub focus: Option<String>,

    // Preserve directives. Unlike the other fields this is a plain `Vec`, so
    // omission yields an empty list rather than `None`.
    #[serde(default)]
    pub preserve: Vec<crate::mcp::tools::summarize::SummarizePreserve>,

    // Output style; converted with `Into` before resolution.
    #[serde(default)]
    pub style: Option<crate::mcp::tools::summarize::SummarizeStyle>,

    // Backend name override passed through to `resolve_defaults`.
    #[serde(default)]
    pub backend: Option<String>,
}
107
/// Wire shape for `tables`.
///
/// Serializes via a custom flat shape (`{mode, strategy?, head?, tail?, rows?, seed?}`)
/// so that `deny_unknown_fields` semantics on the outer args still surface
/// stray keys inside the tables arg — `#[serde(flatten)]` is incompatible
/// with `deny_unknown_fields`, so we hand-roll the parser instead.
///
/// `Serialize`, `Deserialize` and `JsonSchema` are implemented by hand and all
/// go through the private `TablesArgWire` struct.
#[derive(Debug, Clone)]
pub enum TablesArg {
    /// Maps to `TablesMode::Embed`; also what an omitted `tables` arg resolves to.
    Embed,
    /// Maps to `TablesMode::Drop`.
    Drop,
    /// Maps to `TablesMode::CsvFile`.
    CsvFile,
    /// Maps to `TablesMode::Summarize`; `fetch_inner` installs a summarizer
    /// hook for the tables pass only in this mode.
    Summarize,
    /// Maps to `TablesMode::Sample` with the given row-selection strategy.
    Sample { strategy: SampleArg },
}
122
/// Row-selection strategy carried by [`TablesArg::Sample`].
///
/// Zero counts are representable here; `tables_mode` rejects them with
/// `McpError::InvalidArgs` when the arg is resolved.
#[derive(Debug, Clone)]
pub enum SampleArg {
    /// Head/tail row counts. Each defaults to 5 when omitted on the wire.
    HeadTail { head: usize, tail: usize },
    /// Seeded random sample. On the wire `rows` defaults to 10 and `seed` to 42.
    RandomSeed { rows: usize, seed: u64 },
}
128
// Flat on-the-wire form of `TablesArg`. `TablesArg`'s hand-written
// `Deserialize`/`Serialize`/`JsonSchema` impls all delegate to this struct, so
// its derive output *is* the public schema for the `tables` arg. Plain `//`
// comments are used so that schema text is not altered.
//
// Only `mode` is required. Unknown keys are rejected, but known keys that do
// not apply to the chosen mode/strategy are accepted and ignored by
// `TablesArg::deserialize`.
#[derive(Debug, Serialize, Deserialize, JsonSchema)]
#[serde(deny_unknown_fields, rename_all = "snake_case")]
struct TablesArgWire {
    mode: TablesModeWire,
    // Only consulted when `mode` is `sample`; missing means head/tail.
    #[serde(default)]
    strategy: Option<SampleStrategyWire>,
    // Head/tail strategy knobs.
    #[serde(default)]
    head: Option<usize>,
    #[serde(default)]
    tail: Option<usize>,
    // Random-seed strategy knobs.
    #[serde(default)]
    rows: Option<usize>,
    #[serde(default)]
    seed: Option<u64>,
}
144
// Wire discriminant for `TablesArgWire::mode`. `rename_all = "snake_case"`
// gives the JSON strings `embed`, `drop`, `csv_file`, `summarize`, `sample`.
#[derive(Debug, Serialize, Deserialize, JsonSchema)]
#[serde(rename_all = "snake_case")]
enum TablesModeWire {
    Embed,
    Drop,
    CsvFile,
    Summarize,
    Sample,
}
154
// Wire discriminant for `TablesArgWire::strategy`; serialized as `head_tail`
// or `random_seed` via `rename_all = "snake_case"`.
#[derive(Debug, Serialize, Deserialize, JsonSchema)]
#[serde(rename_all = "snake_case")]
enum SampleStrategyWire {
    HeadTail,
    RandomSeed,
}
161
162impl<'de> Deserialize<'de> for TablesArg {
163    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
164    where
165        D: serde::Deserializer<'de>,
166    {
167        let w = TablesArgWire::deserialize(deserializer)?;
168        match w.mode {
169            TablesModeWire::Embed => Ok(TablesArg::Embed),
170            TablesModeWire::Drop => Ok(TablesArg::Drop),
171            TablesModeWire::CsvFile => Ok(TablesArg::CsvFile),
172            TablesModeWire::Summarize => Ok(TablesArg::Summarize),
173            TablesModeWire::Sample => {
174                let strategy = w.strategy.unwrap_or(SampleStrategyWire::HeadTail);
175                let inner = match strategy {
176                    SampleStrategyWire::HeadTail => SampleArg::HeadTail {
177                        head: w.head.unwrap_or_else(default_head),
178                        tail: w.tail.unwrap_or_else(default_tail),
179                    },
180                    SampleStrategyWire::RandomSeed => SampleArg::RandomSeed {
181                        rows: w.rows.unwrap_or_else(default_random_rows),
182                        seed: w.seed.unwrap_or_else(default_random_seed),
183                    },
184                };
185                Ok(TablesArg::Sample { strategy: inner })
186            }
187        }
188    }
189}
190
191impl Serialize for TablesArg {
192    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
193    where
194        S: serde::Serializer,
195    {
196        let w = match self {
197            TablesArg::Embed => TablesArgWire {
198                mode: TablesModeWire::Embed,
199                strategy: None,
200                head: None,
201                tail: None,
202                rows: None,
203                seed: None,
204            },
205            TablesArg::Drop => TablesArgWire {
206                mode: TablesModeWire::Drop,
207                strategy: None,
208                head: None,
209                tail: None,
210                rows: None,
211                seed: None,
212            },
213            TablesArg::CsvFile => TablesArgWire {
214                mode: TablesModeWire::CsvFile,
215                strategy: None,
216                head: None,
217                tail: None,
218                rows: None,
219                seed: None,
220            },
221            TablesArg::Summarize => TablesArgWire {
222                mode: TablesModeWire::Summarize,
223                strategy: None,
224                head: None,
225                tail: None,
226                rows: None,
227                seed: None,
228            },
229            TablesArg::Sample {
230                strategy: SampleArg::HeadTail { head, tail },
231            } => TablesArgWire {
232                mode: TablesModeWire::Sample,
233                strategy: Some(SampleStrategyWire::HeadTail),
234                head: Some(*head),
235                tail: Some(*tail),
236                rows: None,
237                seed: None,
238            },
239            TablesArg::Sample {
240                strategy: SampleArg::RandomSeed { rows, seed },
241            } => TablesArgWire {
242                mode: TablesModeWire::Sample,
243                strategy: Some(SampleStrategyWire::RandomSeed),
244                head: None,
245                tail: None,
246                rows: Some(*rows),
247                seed: Some(*seed),
248            },
249        };
250        w.serialize(serializer)
251    }
252}
253
254impl JsonSchema for TablesArg {
255    fn schema_name() -> std::borrow::Cow<'static, str> {
256        "TablesArg".into()
257    }
258
259    fn schema_id() -> std::borrow::Cow<'static, str> {
260        concat!(module_path!(), "::TablesArg").into()
261    }
262
263    fn json_schema(generator: &mut schemars::SchemaGenerator) -> schemars::Schema {
264        <TablesArgWire as JsonSchema>::json_schema(generator)
265    }
266}
267
/// Head row count used when `tables.head` is omitted.
fn default_head() -> usize {
    const HEAD_ROWS: usize = 5;
    HEAD_ROWS
}

/// Tail row count used when `tables.tail` is omitted.
fn default_tail() -> usize {
    const TAIL_ROWS: usize = 5;
    TAIL_ROWS
}

/// Sample size used when `tables.rows` is omitted.
fn default_random_rows() -> usize {
    const SAMPLE_ROWS: usize = 10;
    SAMPLE_ROWS
}

/// Sampling seed used when `tables.seed` is omitted.
fn default_random_seed() -> u64 {
    const SAMPLE_SEED: u64 = 42;
    SAMPLE_SEED
}
280
// Wire shape for the `images` arg. Internally tagged on `mode`, so the JSON is
// an object such as `{"mode": "keep"}` with snake_case variant names. Only
// `caption` carries an extra key. An omitted `images` arg resolves to
// `ImagesMode::AltTextOnly` in `images_mode`.
// (Plain `//` comments so the derived JSON schema text is unchanged.)
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
#[serde(deny_unknown_fields, rename_all = "snake_case", tag = "mode")]
pub enum ImagesArg {
    Keep,
    AltTextOnly,
    Download,
    Drop,
    /// Caption images via a configured captioner. Use `[image_captions]` /
    /// `[captioners.<name>]` in config; per-call override via `captioner`.
    Caption {
        #[serde(default)]
        captioner: Option<String>,
    },
}
295
// Wire shape for the `metadata` arg: unit variants, serialized as `include` /
// `skip`. In `fetch_inner` only `Skip` has an effect — it replaces the
// response metadata with the default (empty) value; `Include` and omission
// both return the metadata recovered from the cache row.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
#[serde(deny_unknown_fields, rename_all = "snake_case")]
pub enum MetadataArg {
    Include,
    Skip,
}
302
/// Wire shape for the `headless` arg.
///
/// All fields are optional; when omitted, `mode` falls back to the server's
/// `[headless] auto_detect_spa` config key (`Auto` when true, `Off` when false).
#[derive(Debug, Clone, Default, Serialize, Deserialize, JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct HeadlessArg {
    // Explicit mode request; `None` defers to the config flag (the fallback
    // is implemented in `resolve_headless`).
    #[serde(default)]
    pub mode: Option<HeadlessModeWire>,
}
313
/// Wire variant for `headless.mode`.
//
// Serialized as `off` / `on` / `auto`. `resolve_headless` maps each variant
// one-to-one onto `crate::fetcher::cached::HeadlessMode`. `Copy` lets callers
// read it out of a borrowed `HeadlessArg` without cloning.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, JsonSchema)]
#[serde(rename_all = "snake_case")]
pub enum HeadlessModeWire {
    Off,
    On,
    Auto,
}
322
323fn tables_mode(arg: Option<&TablesArg>) -> Result<TablesMode, McpError> {
324    Ok(match arg {
325        None | Some(TablesArg::Embed) => TablesMode::Embed,
326        Some(TablesArg::Drop) => TablesMode::Drop,
327        Some(TablesArg::CsvFile) => TablesMode::CsvFile,
328        Some(TablesArg::Sample { strategy }) => match strategy {
329            SampleArg::HeadTail { head, tail } => {
330                if *head == 0 || *tail == 0 {
331                    return Err(McpError::InvalidArgs(
332                        "tables.sample head/tail must be > 0".into(),
333                    ));
334                }
335                TablesMode::Sample(SampleStrategy::HeadTail {
336                    head: *head,
337                    tail: *tail,
338                })
339            }
340            SampleArg::RandomSeed { rows, seed } => {
341                if *rows == 0 {
342                    return Err(McpError::InvalidArgs(
343                        "tables.sample rows must be > 0".into(),
344                    ));
345                }
346                TablesMode::Sample(SampleStrategy::RandomSeed {
347                    rows: *rows,
348                    seed: *seed,
349                })
350            }
351        },
352        Some(TablesArg::Summarize) => TablesMode::Summarize,
353    })
354}
355
356fn images_mode(arg: Option<&ImagesArg>) -> Result<(ImagesMode, Option<String>), McpError> {
357    Ok(match arg {
358        None | Some(ImagesArg::AltTextOnly) => (ImagesMode::AltTextOnly, None),
359        Some(ImagesArg::Keep) => (ImagesMode::Keep, None),
360        Some(ImagesArg::Download) => (ImagesMode::Download, None),
361        Some(ImagesArg::Drop) => (ImagesMode::Drop, None),
362        Some(ImagesArg::Caption { captioner }) => (ImagesMode::Caption, captioner.clone()),
363    })
364}
365
366/// Resolve `[image_captions]` defaults plus an optional per-call captioner
367/// override into the budget knobs `extractor::images::apply` consumes.
368fn build_caption_filters(
369    cfg: &crate::config::ImageCaptionsConfig,
370    override_name: Option<String>,
371) -> crate::extractor::options::ImageCaptionFilters {
372    crate::extractor::options::ImageCaptionFilters {
373        max_per_page: cfg.max_per_page,
374        min_width: cfg.min_width,
375        min_height: cfg.min_height,
376        max_bytes: cfg.max_bytes,
377        max_tokens: cfg.max_tokens,
378        captioner_override: override_name,
379    }
380}
381
382/// Convert the optional wire `headless` arg into the fetcher's `HeadlessMode`.
383///
384/// When no arg (or no `mode` sub-field) is provided, the config's
385/// `auto_detect_spa` flag drives the default: `Auto` when true, `Off` when
386/// false.
387fn resolve_headless(
388    arg: Option<&HeadlessArg>,
389    config: &crate::config::HeadlessConfig,
390) -> crate::fetcher::cached::HeadlessMode {
391    let mode = arg.and_then(|a| a.mode).map(|m| match m {
392        HeadlessModeWire::Off => crate::fetcher::cached::HeadlessMode::Off,
393        HeadlessModeWire::On => crate::fetcher::cached::HeadlessMode::On,
394        HeadlessModeWire::Auto => crate::fetcher::cached::HeadlessMode::Auto,
395    });
396    mode.unwrap_or(if config.auto_detect_spa {
397        crate::fetcher::cached::HeadlessMode::Auto
398    } else {
399        crate::fetcher::cached::HeadlessMode::Off
400    })
401}
402
/// One of the two response shapes the `fetch` tool can produce, depending
/// on `count_only`.
///
/// `JsonSchema` is implemented manually so the generated schema is rooted
/// at `type: "object"` with a `oneOf` of the two variants. The MCP spec
/// requires `outputSchema.type == "object"`, but the default schemars
/// derive for an `#[serde(untagged)]` enum emits a bare `oneOf` with no
/// root type, which rmcp's `schema_for_output` rejects at startup.
///
/// Being `untagged`, the enum serializes as the bare payload of whichever
/// variant is held, with no discriminator key.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum FetchOutput {
    /// Full fetch envelope.
    Full(FetchResponse),
    /// Token-count envelope, returned by `fetch_inner` when `count_only` is set.
    Count(CountResponse),
}
417
418impl JsonSchema for FetchOutput {
419    fn schema_name() -> std::borrow::Cow<'static, str> {
420        "FetchOutput".into()
421    }
422
423    fn schema_id() -> std::borrow::Cow<'static, str> {
424        concat!(module_path!(), "::FetchOutput").into()
425    }
426
427    fn json_schema(generator: &mut schemars::SchemaGenerator) -> schemars::Schema {
428        let full = generator.subschema_for::<FetchResponse>();
429        let count = generator.subschema_for::<CountResponse>();
430        schemars::json_schema!({
431            "type": "object",
432            "oneOf": [full, count],
433        })
434    }
435}
436
/// Per-call summarize bookkeeping carried through `fetch_inner` into the
/// response envelope.
///
/// `fetch_inner` builds one of these when a summary replaced the body, either
/// from the inline `summarize` arg or from the `max_tokens` auto-summarize path.
#[derive(Debug, Clone)]
struct SummarizeOutcome {
    /// Whether the returned body is a summary. Both construction sites visible
    /// in `fetch_inner` set this to `true`.
    summarized: bool,
    /// Fallback details reported by the summarizer, if it fell back.
    fallback: Option<crate::mcp::envelope::SummarizerFallbackInfo>,
}
444
445impl RoverHandler {
446    /// Run the summarizer against `body_md` with `opts` and lift the result
447    /// into the wire envelope's fallback shape.
448    async fn run_compact(
449        &self,
450        body_md: &str,
451        opts: &crate::summarizer::backend::CompactOpts,
452    ) -> Result<(String, Option<crate::mcp::envelope::SummarizerFallbackInfo>), McpError> {
453        let content_hash = format!(
454            "sha256:{}",
455            crate::fetcher::cached::sha256_hex(body_md.as_bytes()),
456        );
457        let r = self
458            .summarizer
459            .compact(&content_hash, body_md, opts)
460            .await?;
461        // FallbackInfo.reason is &'static str (stable enum-ish); on the wire
462        // it's String because serde Deserialize needs an owned String
463        // symmetrically.
464        let fallback = r
465            .fallback
466            .map(|f| crate::mcp::envelope::SummarizerFallbackInfo {
467                from: f.from,
468                reason: f.reason.to_string(),
469            });
470        Ok((r.summary_md, fallback))
471    }
472
473    /// Tool body, decoupled from the `#[tool]` macro for unit testing.
474    /// Task 11 wires this into the router; here it's a plain async method.
475    pub async fn fetch_inner(&self, args: FetchArgs) -> Result<FetchOutput, McpError> {
476        let url = Url::parse(&args.url).map_err(|e| McpError::InvalidUrl(e.to_string()))?;
477        if matches!(args.max_tokens, Some(0)) {
478            return Err(McpError::InvalidArgs(
479                "max_tokens must be greater than 0".into(),
480            ));
481        }
482        if matches!(args.timeout_secs, Some(0)) {
483            return Err(McpError::InvalidArgs(
484                "timeout_secs must be greater than 0".into(),
485            ));
486        }
487
488        // Per-call user_agent / timeout overrides. The shared client bakes in
489        // the configured UA + timeout, so an override needs a fresh client
490        // (still SSRF-policed — `build_http_client` installs the validating
491        // resolver). Build one only when an override is actually present;
492        // otherwise reuse the shared client.
493        let effective_ua = args
494            .user_agent
495            .clone()
496            .unwrap_or_else(|| self.config.fetch.user_agent.clone());
497        let per_call_client;
498        let client: &reqwest::Client = if args.user_agent.is_some() || args.timeout_secs.is_some() {
499            let timeout = args
500                .timeout_secs
501                .map(std::time::Duration::from_secs)
502                .unwrap_or_else(|| self.config.fetch.timeout());
503            per_call_client = crate::fetcher::client::build_http_client(&effective_ua, timeout);
504            &per_call_client
505        } else {
506            &self.client
507        };
508
509        let family = resolve_tokenizer(args.tokenizer.as_deref(), &self.config)?;
510
511        let headless_mode = resolve_headless(args.headless.as_ref(), &self.config.headless);
512
513        // M9 fix C1: lazily build a `HeadlessRenderer` the first time a
514        // request actually wants one (`On`, or `Auto` — the cached fetcher
515        // only re-renders under Auto when the SPA heuristic fires, but we
516        // still hand it the renderer so it has the option). The renderer
517        // lives in a process-shared `OnceCell` on the handler, so subsequent
518        // fetches reuse the same Chromium instance.
519        // Wrap the server's process-shared renderer cell in a handle. No
520        // browser is launched here: the cached fetcher initializes it lazily on
521        // first real use (SPA detected, bot-challenge bypass, or `On` mode), and
522        // the shared cell means one Chromium instance serves every request.
523        #[cfg(feature = "headless")]
524        let headless: Option<crate::fetcher::headless::HeadlessHandle> =
525            if !matches!(headless_mode, crate::fetcher::cached::HeadlessMode::Off) {
526                Some(crate::fetcher::headless::HeadlessHandle::with_cell(
527                    self.headless_renderer.clone(),
528                    self.config.headless.clone(),
529                ))
530            } else {
531                None
532            };
533
534        let result = fetch_with_cache(
535            &self.db,
536            client,
537            &self.pacer,
538            &self.config.rate_limit,
539            &self.config.robots,
540            &url,
541            &self.config.cache,
542            FetchOptions {
543                force_refresh: args.force_refresh,
544                ssrf_level: self.ssrf_level,
545                ssrf_project_root: self.ssrf_project_root.clone(),
546                har_recorder: self.har_recorder.clone(),
547                ignore_robots: false,
548                user_agent: effective_ua.clone(),
549                #[cfg(feature = "headless")]
550                headless,
551                headless_mode,
552                synchronous_revalidation: false,
553            },
554            |body, base| {
555                let extracted =
556                    extract(body, Some(base)).map_err(crate::fetcher::FetcherError::Extract)?;
557                let content_hash = format!("sha256:{}", sha256_hex(extracted.body_md.as_bytes()));
558                Ok(ExtractResult {
559                    title: extracted.title,
560                    body_md: extracted.body_md,
561                    content_hash,
562                    metadata: extracted.metadata,
563                })
564            },
565        )
566        .await?;
567
568        tokenizer::ensure_loaded(family).await?;
569
570        // Resolve per-request post-pass modes from the typed args.
571        let output_paths = std::sync::Arc::new(
572            crate::extractor::output::OutputPaths::resolve(self.config.output.dir.as_deref())
573                .map_err(McpError::Extractor)?,
574        );
575
576        let tables_mode_resolved = tables_mode(args.tables.as_ref())?;
577        let (images_mode_resolved, captioner_override) = images_mode(args.images.as_ref())?;
578        let caption_filters =
579            build_caption_filters(&self.config.image_captions, captioner_override);
580
581        // Run the M4 post-passes against the cached (pre-pass) body. These
582        // always run, even on cache hits: the cached `extracted_md` carries
583        // links-absolutized but no tables/images transforms.
584        let body_md = result.page.extracted_md.clone();
585        let tables_hook: Option<crate::extractor::tables::TableSummarizeHook> =
586            if matches!(tables_mode_resolved, TablesMode::Summarize) {
587                let summarizer = self.summarizer.clone();
588                let config = self.config.clone();
589                Some(std::sync::Arc::new(move |table_text: &str| {
590                    let summarizer = summarizer.clone();
591                    let config = config.clone();
592                    let table_text = table_text.to_string();
593                    Box::pin(async move {
594                        let defaults =
595                            crate::summarizer::DefaultsHint::from_config(&config.summarization);
596                        let opts = crate::summarizer::backend::CompactOpts {
597                            mode: defaults.mode,
598                            style: crate::summarizer::backend::Style::Bullet,
599                            target_tokens: Some(config.summarization.tables.target_tokens),
600                            focus: Some(config.summarization.tables.focus.clone()),
601                            preserve: vec![],
602                            backend_name: defaults.backend.clone(),
603                        };
604                        let content_hash = format!("sha256:{}", sha256_hex(table_text.as_bytes()));
605                        summarizer
606                            .compact(&content_hash, &table_text, &opts)
607                            .await
608                            .map(|r| {
609                                let fb =
610                                    r.fallback.map(|f| crate::extractor::tables::FallbackInfo {
611                                        from: f.from,
612                                        reason: f.reason.to_string(),
613                                    });
614                                if let Some(fb) = &fb {
615                                    tracing::debug!(
616                                        target: "rover::mcp",
617                                        from = %fb.from,
618                                        reason = %fb.reason,
619                                        "table summarizer fell back to extractive",
620                                    );
621                                }
622                                (r.summary_md, fb)
623                            })
624                            .map_err(|e| e.fallback_reason().to_string())
625                    })
626                        as std::pin::Pin<
627                            Box<
628                                dyn std::future::Future<
629                                        Output = Result<
630                                            (
631                                                String,
632                                                Option<crate::extractor::tables::FallbackInfo>,
633                                            ),
634                                            String,
635                                        >,
636                                    > + Send,
637                            >,
638                        >
639                }))
640            } else {
641                None
642            };
643        let (body_md, tables_transformed) = crate::extractor::tables::apply_with_summarizer(
644            &body_md,
645            &tables_mode_resolved,
646            &output_paths,
647            &url,
648            tables_hook.as_ref(),
649        )
650        .await
651        .map_err(McpError::Extractor)?;
652
653        let captioners_opt = if self.captioners.is_empty() {
654            None
655        } else {
656            Some(self.captioners.as_ref())
657        };
658        let images_result = crate::extractor::images::apply(
659            &body_md,
660            &images_mode_resolved,
661            &output_paths,
662            client,
663            captioners_opt,
664            &caption_filters,
665            Some(&self.db),
666            self.ssrf_level,
667        )
668        .await
669        .map_err(McpError::Extractor)?;
670        let body_md = images_result.markdown;
671
672        // Prompt-injection output guard: scan/act once here (after tables &
673        // images, before summarize). The wrapper is applied last (below).
674        let guard_assessment = self
675            .guard
676            .assess(url.as_str(), args.security.as_ref(), &body_md);
677        // When the body is returned directly (no summarize), apply the level
678        // action to it. When summarized, the returned body is the summary of
679        // HIGH-cleaned content (internal hardening) and is wrapped as-is.
680        let direct_body = if args.summarize.is_none() {
681            guard_assessment.acted_body.clone()
682        } else {
683            body_md.clone()
684        };
685        let body_md = direct_body;
686
687        // M7: optional inline `summarize` arg runs first against the
688        // post-pass body. If the agent provided this, the returned
689        // `markdown` is the summary.
690        let (body_md, summarize_meta): (String, Option<SummarizeOutcome>) = if let Some(inline) =
691            args.summarize.clone()
692        {
693            let defaults = crate::summarizer::DefaultsHint::from_config(&self.config.summarization);
694            let opts = self.summarizer.resolve_defaults(
695                inline.mode.map(Into::into),
696                inline.style.map(Into::into),
697                inline.target_tokens,
698                inline.focus,
699                inline.preserve.into_iter().map(Into::into).collect(),
700                inline.backend,
701                &defaults,
702            );
703            let (summary_md, fallback) = self.run_compact(&body_md, &opts).await?;
704            (
705                summary_md,
706                Some(SummarizeOutcome {
707                    summarized: true,
708                    fallback,
709                }),
710            )
711        } else {
712            (body_md, None)
713        };
714
715        // Recompute tokens against the (possibly summarized) body; `max_tokens`
716        // constrains what the agent will actually see.
717        let tokens = tokenizer::count(&body_md, family)?;
718
719        // M7: auto-summarize on `max_tokens` overflow. Single-shot: if the
720        // resulting summary is still over budget, return MaxTokensExceeded.
721        // If the agent already supplied an explicit `summarize` arg, don't
722        // override that choice — surface the error directly.
723        let (body_md, tokens, auto_meta): (String, usize, Option<SummarizeOutcome>) =
724            if let Some(max) = args.max_tokens {
725                if tokens <= max {
726                    (body_md, tokens, None)
727                } else if summarize_meta.is_some() {
728                    return Err(McpError::MaxTokensExceeded {
729                        actual: tokens,
730                        max,
731                        was_auto: false,
732                    });
733                } else if args.count_only {
734                    // count_only is a probe: tell the agent the real size
735                    // without auto-correction. Don't summarize — fall through
736                    // to the count_only short-circuit which reports the real
737                    // (over-budget) token count.
738                    (body_md, tokens, None)
739                } else {
740                    let defaults =
741                        crate::summarizer::DefaultsHint::from_config(&self.config.summarization);
742                    let opts = self.summarizer.resolve_defaults(
743                        None,
744                        None,
745                        Some(max),
746                        None,
747                        vec![],
748                        None,
749                        &defaults,
750                    );
751                    let (summary_md, fallback) = self.run_compact(&body_md, &opts).await?;
752                    let new_tokens = tokenizer::count(&summary_md, family)?;
753                    if new_tokens > max {
754                        return Err(McpError::MaxTokensExceeded {
755                            actual: new_tokens,
756                            max,
757                            was_auto: true,
758                        });
759                    }
760                    (
761                        summary_md,
762                        new_tokens,
763                        Some(SummarizeOutcome {
764                            summarized: true,
765                            fallback,
766                        }),
767                    )
768                }
769            } else {
770                (body_md, tokens, None)
771            };
772
773        // Build the optional SWR envelope before lowering `cache_status` to the
774        // unit-variant wire enum.
775        let revalidation = match &result.cache_status {
776            crate::fetcher::cached::CacheStatus::Stale {
777                revalidation_task_id: Some(id),
778            } => Some(crate::mcp::envelope::StaleRevalidation {
779                task_id: id.clone(),
780                monitor_command: format!("rover task {id} --monitor"),
781                poll_command: format!("rover task {id}"),
782                hint: "Optional. Revalidation runs in the background regardless.".into(),
783            }),
784            _ => None,
785        };
786
787        let cache_status: CacheStatus = result.cache_status.into();
788
789        if args.count_only {
790            // `result.page.content_hash` is already prefixed (`sha256:...`)
791            // by the `extract_fn` above; pass it through verbatim.
792            return Ok(FetchOutput::Count(CountResponse::Single(
793                CountSingleResponse {
794                    tokens,
795                    tokenizer: family.as_str().to_string(),
796                    source: CountSource::Url,
797                    url: Some(url.as_str().to_string()),
798                    content_hash: Some(result.page.content_hash.clone()),
799                    fetched_at: Some(
800                        jiff::Timestamp::from_second(result.page.fetched_at)
801                            .map(|t| t.to_string())
802                            .unwrap_or_default(),
803                    ),
804                    cache_status: Some(cache_status),
805                },
806            )));
807        }
808
809        let canonical = Url::parse(&result.page.canonical_url)
810            .map_err(|e| McpError::InvalidUrl(e.to_string()))?;
811        // Recover the metadata persisted in the cache row. See cli/fetch.rs
812        // for the rationale on the `raw_html_text_len` fallback used by the
813        // quality scorer.
814        let metadata: crate::extractor::ExtractedMetadata = result
815            .page
816            .metadata_json
817            .as_deref()
818            .and_then(|s| serde_json::from_str(s).ok())
819            .unwrap_or_default();
820        // Honor MetadataArg::Skip: hide all metadata from the response.
821        // (The cache row still carries metadata_json — only the wire output is blanked.)
822        let metadata = match args.metadata.as_ref() {
823            Some(MetadataArg::Skip) => crate::extractor::ExtractedMetadata::default(),
824            _ => metadata,
825        };
826        let quality = crate::extractor::quality::score(
827            &body_md,
828            body_md.chars().count().max(1),
829            !metadata.is_empty(),
830            result.page.title.is_some(),
831        );
832        let frontmatter = render_frontmatter(&PageMeta {
833            url: &url,
834            canonical_url: &canonical,
835            title: result.page.title.as_deref(),
836            fetched_at: jiff::Timestamp::now(),
837            body: &body_md,
838            tokens,
839            tokenizer_name: family.as_str(),
840            description: metadata.description.as_deref(),
841            author: metadata.author.as_deref(),
842            published: metadata.published.as_deref(),
843            modified: metadata.modified.as_deref(),
844            image: metadata.image.as_deref(),
845            og_type: metadata.og_type.as_deref(),
846            language: metadata.language.as_deref(),
847            schema_types: &metadata.schema_types,
848            // MCP reports summarization via the `FetchResponse.summarized`
849            // envelope field; the in-content frontmatter marker stays off.
850            extraction_quality: quality,
851            summarized: false,
852            headless_render: result.page.render_reason.as_deref(),
853            tables_transformed: &tables_transformed,
854            images_seen: images_result.images_seen,
855            images_downloaded: images_result.images_downloaded,
856            images_failed: images_result.images_failed,
857            images_processed: images_result.images_processed.clone(),
858            prompt_injection: Some(&guard_assessment.telemetry),
859        });
860
861        let summarized_flag = summarize_meta.as_ref().map(|o| o.summarized);
862        let auto_summarized_flag = auto_meta.as_ref().map(|o| o.summarized);
863        let summarizer_fallback = summarize_meta
864            .and_then(|o| o.fallback)
865            .or_else(|| auto_meta.and_then(|o| o.fallback));
866
867        Ok(FetchOutput::Full(FetchResponse {
868            content: self.guard.finish(
869                &guard_assessment,
870                &frontmatter,
871                &body_md,
872                args.summarize.is_none(),
873            ),
874            cache_status,
875            revalidation,
876            summarized: summarized_flag,
877            auto_summarized: auto_summarized_flag,
878            summarizer_fallback,
879        }))
880    }
881}
882
/// Wire-format tests for [`FetchArgs`]: JSON deserialization of each typed
/// argument, rejection of unknown fields, and the shape of the generated
/// JSON schema.
#[cfg(test)]
mod tests {
    use std::str::FromStr;

    use super::*;
    use crate::tokenizer::Tokenizer;

    /// Only `url` is supplied; the flags come back `false` and the optional
    /// arguments checked here come back `None`.
    #[test]
    fn fetch_args_deserialize_minimal() {
        let v: FetchArgs = serde_json::from_str(r#"{"url":"https://example.com"}"#).unwrap();
        assert_eq!(v.url, "https://example.com");
        assert!(!v.force_refresh);
        assert!(!v.count_only);
        assert!(v.tokenizer.is_none());
        assert!(v.max_tokens.is_none());
    }

    /// `headless` accepts the typed object form with `mode: "auto"`.
    #[test]
    fn fetch_args_headless_typed_mode_auto() {
        let v: FetchArgs = serde_json::from_str(
            r#"{
                "url":"https://example.com",
                "headless": { "mode": "auto" }
            }"#,
        )
        .unwrap();
        let h = v.headless.expect("headless parsed");
        assert!(matches!(h.mode, Some(HeadlessModeWire::Auto)));
    }

    /// `summarize` accepts the typed object form and maps each key onto the
    /// matching typed field.
    #[test]
    fn fetch_args_parse_typed_summarize() {
        let v: FetchArgs = serde_json::from_str(
            r#"{
                "url":"https://example.com",
                "summarize":{
                    "target_tokens":500,
                    "mode":"extractive",
                    "style":"bullet",
                    "preserve":["code","tables"]
                }
            }"#,
        )
        .unwrap();
        let s = v.summarize.expect("summarize parsed");
        assert_eq!(s.target_tokens, Some(500));
        assert!(matches!(
            s.mode,
            Some(crate::mcp::tools::summarize::SummarizeMode::Extractive)
        ));
        assert!(matches!(
            s.style,
            Some(crate::mcp::tools::summarize::SummarizeStyle::Bullet)
        ));
        assert_eq!(s.preserve.len(), 2);
    }

    /// Unknown keys nested inside `summarize` are rejected.
    #[test]
    fn fetch_args_reject_unknown_summarize_field() {
        let r: Result<FetchArgs, _> =
            serde_json::from_str(r#"{"url":"https://x/","summarize":{"bogus":1}}"#);
        assert!(r.is_err());
    }

    /// Unknown top-level keys are rejected (`deny_unknown_fields`).
    #[test]
    fn fetch_args_reject_unknown_fields() {
        let r: Result<FetchArgs, _> =
            serde_json::from_str(r#"{"url":"https://example.com","bogus":1}"#);
        assert!(r.is_err());
    }

    /// The per-call transport overrides round-trip from JSON.
    #[test]
    fn fetch_args_parse_user_agent_and_timeout_overrides() {
        let v: FetchArgs = serde_json::from_str(
            r#"{"url":"https://example.com","user_agent":"my-agent/2.0","timeout_secs":42}"#,
        )
        .unwrap();
        assert_eq!(v.user_agent.as_deref(), Some("my-agent/2.0"));
        assert_eq!(v.timeout_secs, Some(42));
    }

    /// When omitted, the transport overrides are `None`.
    #[test]
    fn fetch_args_default_transport_overrides_are_none() {
        let v: FetchArgs = serde_json::from_str(r#"{"url":"https://example.com"}"#).unwrap();
        assert!(v.user_agent.is_none());
        assert!(v.timeout_secs.is_none());
    }

    /// `tokenizer` travels as a plain string and parses into the enum.
    #[test]
    fn fetch_args_parse_tokenizer_string() {
        let v: FetchArgs =
            serde_json::from_str(r#"{"url":"https://example.com","tokenizer":"claude"}"#).unwrap();
        assert_eq!(v.tokenizer.as_deref(), Some("claude"));
        // And the string parses to the enum variant we expect.
        let t = Tokenizer::from_str(v.tokenizer.as_deref().unwrap()).unwrap();
        assert_eq!(t, Tokenizer::Claude);
    }

    /// Every documented argument must be a real property of the generated
    /// JSON schema.
    ///
    /// The check looks at the keys of the top-level `properties` object, not
    /// at the serialized schema text: the `FetchArgs` doc comment (emitted as
    /// the schema `description`) names the fields itself, so a substring
    /// search would succeed even if a property were missing.
    #[test]
    fn fetch_args_schema_contains_all_documented_fields() {
        let schema = schemars::schema_for!(FetchArgs);
        let json = serde_json::to_value(&schema).unwrap();
        let properties = json
            .get("properties")
            .and_then(serde_json::Value::as_object)
            .expect("FetchArgs schema exposes a top-level `properties` object");
        for field in [
            "url",
            "force_refresh",
            "user_agent",
            "timeout_secs",
            "count_only",
            "tokenizer",
            "max_tokens",
            "headless",
            "tables",
            "images",
            "metadata",
            "summarize",
        ] {
            assert!(
                properties.contains_key(field),
                "schema missing field: {field}"
            );
        }
    }

    /// `tables` in sample mode with the `head_tail` strategy carries both
    /// row counts through.
    #[test]
    fn typed_tables_sample_parses() {
        let v: FetchArgs = serde_json::from_str(
            r#"{"url":"https://x/","tables":{"mode":"sample","strategy":"head_tail","head":3,"tail":2}}"#,
        )
        .unwrap();
        match v.tables.unwrap() {
            TablesArg::Sample {
                strategy: SampleArg::HeadTail { head, tail },
            } => {
                assert_eq!(head, 3);
                assert_eq!(tail, 2);
            }
            // Surface what was actually parsed so a failure is diagnosable.
            other => panic!("wrong variant: {other:?}"),
        }
    }

    /// Unknown keys nested inside `tables` are rejected.
    #[test]
    fn typed_tables_rejects_unknown_field() {
        let r: Result<FetchArgs, _> =
            serde_json::from_str(r#"{"url":"https://x/","tables":{"mode":"embed","bogus":1}}"#);
        assert!(r.is_err());
    }

    /// `images` with `mode: "download"` parses to the unit-like variant.
    #[test]
    fn typed_images_download_parses() {
        let v: FetchArgs =
            serde_json::from_str(r#"{"url":"https://x/","images":{"mode":"download"}}"#).unwrap();
        assert!(matches!(v.images, Some(ImagesArg::Download)));
    }

    /// `images` caption mode leaves `captioner` unset when it is omitted.
    #[test]
    fn typed_images_caption_parses_without_captioner() {
        let v: FetchArgs =
            serde_json::from_str(r#"{"url":"https://x/","images":{"mode":"caption"}}"#).unwrap();
        assert!(matches!(
            v.images,
            Some(ImagesArg::Caption { captioner: None })
        ));
    }

    /// `images` caption mode carries an explicit `captioner` override.
    #[test]
    fn typed_images_caption_parses_with_captioner_override() {
        let v: FetchArgs = serde_json::from_str(
            r#"{"url":"https://x/","images":{"mode":"caption","captioner":"gpt4o"}}"#,
        )
        .unwrap();
        assert!(matches!(
            v.images,
            Some(ImagesArg::Caption { captioner: Some(ref s) }) if s == "gpt4o"
        ));
    }

    /// `metadata` accepts the bare string `"skip"`.
    #[test]
    fn typed_metadata_skip_parses() {
        let v: FetchArgs =
            serde_json::from_str(r#"{"url":"https://x/","metadata":"skip"}"#).unwrap();
        assert!(matches!(v.metadata, Some(MetadataArg::Skip)));
    }
}