Skip to main content

rover/mcp/tools/
get_metadata.rs

1//! MCP `get_metadata` tool — fetch metadata only (no markdown body).
2
3use schemars::JsonSchema;
4use serde::{Deserialize, Serialize};
5use url::Url;
6
7use crate::fetcher::cached::{ExtractResult, FetchOptions, fetch_with_cache, sha256_hex};
8use crate::mcp::envelope::MetadataResponse;
9use crate::mcp::error::McpError;
10use crate::mcp::handler::{RoverHandler, resolve_tokenizer};
11use crate::tokenizer;
12
// Arguments accepted by the MCP `get_metadata` tool.
//
// Plain `//` comments are deliberate here: `///` doc comments on a type that
// derives `JsonSchema` are emitted into the generated schema as descriptions,
// which would change the schema this tool publishes.
//
// `deny_unknown_fields` makes deserialization fail on any key that is not
// listed below.
#[derive(Debug, Clone, Default, Serialize, Deserialize, JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct GetMetadataArgs {
    // Address of the page, as a string. `get_metadata_inner` parses it with
    // `Url::parse` and reports a parse failure as `McpError::InvalidUrl`.
    pub url: String,
    // Forwarded unchanged to `FetchOptions::force_refresh`; `false` when the
    // key is omitted.
    #[serde(default)]
    pub force_refresh: bool,
    // Optional tokenizer name; handed to `resolve_tokenizer` together with the
    // handler's configuration.
    #[serde(default)]
    pub tokenizer: Option<String>,
    // Optional per-call security argument; handed to the guard's
    // `guard_metadata` call.
    #[serde(default)]
    pub security: Option<crate::guard::SecurityArg>,
}
24
25impl RoverHandler {
26    pub async fn get_metadata_inner(
27        &self,
28        args: GetMetadataArgs,
29    ) -> Result<MetadataResponse, McpError> {
30        let url = Url::parse(&args.url).map_err(|e| McpError::InvalidUrl(e.to_string()))?;
31        let family = resolve_tokenizer(args.tokenizer.as_deref(), &self.config)?;
32        tokenizer::ensure_loaded(family).await?;
33
34        let result = fetch_with_cache(
35            &self.db,
36            &self.client,
37            &self.pacer,
38            &self.config.rate_limit,
39            &self.config.robots,
40            &url,
41            &self.config.cache,
42            FetchOptions {
43                force_refresh: args.force_refresh,
44                ssrf_level: self.ssrf_level,
45                ssrf_project_root: self.ssrf_project_root.clone(),
46                har_recorder: self.har_recorder.clone(),
47                ignore_robots: false,
48                user_agent: self.config.fetch.user_agent.clone(),
49                #[cfg(feature = "headless")]
50                headless: None,
51                headless_mode: crate::fetcher::HeadlessMode::Off,
52                synchronous_revalidation: false,
53            },
54            |body, base| {
55                let extracted = crate::extractor::pipeline::extract(body, Some(base))
56                    .map_err(crate::fetcher::FetcherError::Extract)?;
57                let content_hash = format!("sha256:{}", sha256_hex(extracted.body_md.as_bytes()));
58                Ok(ExtractResult {
59                    title: extracted.title,
60                    body_md: extracted.body_md,
61                    content_hash,
62                    metadata: extracted.metadata,
63                })
64            },
65        )
66        .await?;
67
68        let metadata: crate::extractor::ExtractedMetadata = result
69            .page
70            .metadata_json
71            .as_deref()
72            .and_then(|s| serde_json::from_str(s).ok())
73            .unwrap_or_default();
74
75        let quality = crate::extractor::quality::score(
76            &result.page.extracted_md,
77            result.page.extracted_md.chars().count().max(1),
78            !metadata.is_empty(),
79            result.page.title.is_some(),
80        );
81
82        // Guard the prose metadata fields in place (no wrapper — structured
83        // response). Structured fields (URLs, dates, og_type, language,
84        // schema_types) are left untouched.
85        let mut title = metadata.title.clone();
86        let mut description = metadata.description.clone();
87        let mut author = metadata.author.clone();
88        let metadata_guard = {
89            let mut fields: Vec<&mut String> = Vec::new();
90            if let Some(s) = title.as_mut() {
91                fields.push(s);
92            }
93            if let Some(s) = description.as_mut() {
94                fields.push(s);
95            }
96            if let Some(s) = author.as_mut() {
97                fields.push(s);
98            }
99            self.guard
100                .guard_metadata(url.as_str(), args.security.as_ref(), &mut fields)
101        };
102
103        Ok(MetadataResponse {
104            title,
105            description,
106            author,
107            published: metadata.published.clone(),
108            modified: metadata.modified.clone(),
109            image: metadata.image.clone(),
110            og_type: metadata.og_type.clone(),
111            canonical: metadata.canonical.clone(),
112            language: metadata.language.clone(),
113            schema_types: metadata.schema_types.clone(),
114            extraction_quality: quality,
115            url: url.as_str().to_string(),
116            content_hash: result.page.content_hash.clone(),
117            fetched_at: jiff::Timestamp::from_second(result.page.fetched_at)
118                .map(|t| t.to_string())
119                .unwrap_or_default(),
120            cache_status: result.cache_status.into(),
121            prompt_injection: metadata_guard.telemetry,
122            security_notice: metadata_guard.notice,
123        })
124    }
125}
126
#[cfg(test)]
mod tests {
    use super::*;

    /// The serialized JSON schema must mention each core argument name.
    #[test]
    fn schema_contains_required_fields() {
        let rendered = serde_json::to_string(&schemars::schema_for!(GetMetadataArgs)).unwrap();
        for f in ["url", "force_refresh", "tokenizer"] {
            assert!(rendered.contains(f), "missing {f}");
        }
    }

    /// A key outside the declared argument set must fail deserialization.
    #[test]
    fn rejects_unknown_field() {
        let payload = r#"{"url":"https://x/","bogus":1}"#;
        let parsed = serde_json::from_str::<GetMetadataArgs>(payload);
        assert!(parsed.is_err());
    }
}