rover/mcp/tools/
get_metadata.rs1use schemars::JsonSchema;
4use serde::{Deserialize, Serialize};
5use url::Url;
6
7use crate::fetcher::cached::{ExtractResult, FetchOptions, fetch_with_cache, sha256_hex};
8use crate::mcp::envelope::MetadataResponse;
9use crate::mcp::error::McpError;
10use crate::mcp::handler::{RoverHandler, resolve_tokenizer};
11use crate::tokenizer;
12
13#[derive(Debug, Clone, Default, Serialize, Deserialize, JsonSchema)]
14#[serde(deny_unknown_fields)]
15pub struct GetMetadataArgs {
16 pub url: String,
17 #[serde(default)]
18 pub force_refresh: bool,
19 #[serde(default)]
20 pub tokenizer: Option<String>,
21 #[serde(default)]
22 pub security: Option<crate::guard::SecurityArg>,
23}
24
25impl RoverHandler {
26 pub async fn get_metadata_inner(
27 &self,
28 args: GetMetadataArgs,
29 ) -> Result<MetadataResponse, McpError> {
30 let url = Url::parse(&args.url).map_err(|e| McpError::InvalidUrl(e.to_string()))?;
31 let family = resolve_tokenizer(args.tokenizer.as_deref(), &self.config)?;
32 tokenizer::ensure_loaded(family).await?;
33
34 let result = fetch_with_cache(
35 &self.db,
36 &self.client,
37 &self.pacer,
38 &self.config.rate_limit,
39 &self.config.robots,
40 &url,
41 &self.config.cache,
42 FetchOptions {
43 force_refresh: args.force_refresh,
44 ssrf_level: self.ssrf_level,
45 ssrf_project_root: self.ssrf_project_root.clone(),
46 har_recorder: self.har_recorder.clone(),
47 ignore_robots: false,
48 user_agent: self.config.fetch.user_agent.clone(),
49 #[cfg(feature = "headless")]
50 headless: None,
51 headless_mode: crate::fetcher::HeadlessMode::Off,
52 synchronous_revalidation: false,
53 },
54 |body, base| {
55 let extracted = crate::extractor::pipeline::extract(body, Some(base))
56 .map_err(crate::fetcher::FetcherError::Extract)?;
57 let content_hash = format!("sha256:{}", sha256_hex(extracted.body_md.as_bytes()));
58 Ok(ExtractResult {
59 title: extracted.title,
60 body_md: extracted.body_md,
61 content_hash,
62 metadata: extracted.metadata,
63 })
64 },
65 )
66 .await?;
67
68 let metadata: crate::extractor::ExtractedMetadata = result
69 .page
70 .metadata_json
71 .as_deref()
72 .and_then(|s| serde_json::from_str(s).ok())
73 .unwrap_or_default();
74
75 let quality = crate::extractor::quality::score(
76 &result.page.extracted_md,
77 result.page.extracted_md.chars().count().max(1),
78 !metadata.is_empty(),
79 result.page.title.is_some(),
80 );
81
82 let mut title = metadata.title.clone();
86 let mut description = metadata.description.clone();
87 let mut author = metadata.author.clone();
88 let metadata_guard = {
89 let mut fields: Vec<&mut String> = Vec::new();
90 if let Some(s) = title.as_mut() {
91 fields.push(s);
92 }
93 if let Some(s) = description.as_mut() {
94 fields.push(s);
95 }
96 if let Some(s) = author.as_mut() {
97 fields.push(s);
98 }
99 self.guard
100 .guard_metadata(url.as_str(), args.security.as_ref(), &mut fields)
101 };
102
103 Ok(MetadataResponse {
104 title,
105 description,
106 author,
107 published: metadata.published.clone(),
108 modified: metadata.modified.clone(),
109 image: metadata.image.clone(),
110 og_type: metadata.og_type.clone(),
111 canonical: metadata.canonical.clone(),
112 language: metadata.language.clone(),
113 schema_types: metadata.schema_types.clone(),
114 extraction_quality: quality,
115 url: url.as_str().to_string(),
116 content_hash: result.page.content_hash.clone(),
117 fetched_at: jiff::Timestamp::from_second(result.page.fetched_at)
118 .map(|t| t.to_string())
119 .unwrap_or_default(),
120 cache_status: result.cache_status.into(),
121 prompt_injection: metadata_guard.telemetry,
122 security_notice: metadata_guard.notice,
123 })
124 }
125}
126
#[cfg(test)]
mod tests {
    use super::*;

    // Every argument declared on `GetMetadataArgs` (including `security`) must appear in the
    // generated JSON schema.
    #[test]
    fn schema_contains_required_fields() {
        let schema = schemars::schema_for!(GetMetadataArgs);
        let json = serde_json::to_string(&schema).unwrap();
        for f in ["url", "force_refresh", "tokenizer", "security"] {
            assert!(json.contains(f), "missing {f}");
        }
    }

    // `deny_unknown_fields` must reject keys that are not declared on the struct.
    #[test]
    fn rejects_unknown_field() {
        let r: Result<GetMetadataArgs, _> =
            serde_json::from_str(r#"{"url":"https://x/","bogus":1}"#);
        assert!(r.is_err());
    }

    // Fields marked `#[serde(default)]` may be omitted and take their default values.
    #[test]
    fn omitted_optional_fields_use_defaults() {
        let args: GetMetadataArgs = serde_json::from_str(r#"{"url":"https://x/"}"#).unwrap();
        assert_eq!(args.url, "https://x/");
        assert!(!args.force_refresh);
        assert!(args.tokenizer.is_none());
        assert!(args.security.is_none());
    }

    // `url` carries no serde default, so a payload without it must fail to deserialize.
    #[test]
    fn rejects_missing_url() {
        let r: Result<GetMetadataArgs, _> = serde_json::from_str("{}");
        assert!(r.is_err());
    }
}