rover-fetch 0.1.0

An MCP server for fetching and prepping web content for LLM agents.
Documentation
//! MCP `get_metadata` tool — fetch metadata only (no markdown body).

use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use url::Url;

use crate::fetcher::cached::{ExtractResult, FetchOptions, fetch_with_cache, sha256_hex};
use crate::mcp::envelope::MetadataResponse;
use crate::mcp::error::McpError;
use crate::mcp::handler::{RoverHandler, resolve_tokenizer};
use crate::tokenizer;

/// Arguments accepted by the `get_metadata` tool.
///
/// Deserialization rejects any key that is not listed here
/// (`deny_unknown_fields`). Every field except `url` may be omitted.
#[derive(Debug, Clone, Default, Serialize, Deserialize, JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct GetMetadataArgs {
    /// URL of the page whose metadata is requested. The handler parses it
    /// with `Url::parse` and fails with an invalid-URL error when it does
    /// not parse.
    pub url: String,
    /// Forwarded to the fetcher as `FetchOptions::force_refresh`.
    /// `false` when omitted.
    #[serde(default)]
    pub force_refresh: bool,
    /// Optional tokenizer name; the handler passes it to `resolve_tokenizer`
    /// together with the server configuration. `None` when omitted.
    #[serde(default)]
    pub tokenizer: Option<String>,
    /// Optional per-request security argument; the handler passes it to the
    /// guard when it screens the title, description and author fields.
    /// `None` when omitted.
    #[serde(default)]
    pub security: Option<crate::guard::SecurityArg>,
}

impl RoverHandler {
    /// Fetches `args.url` via `fetch_with_cache` and returns the page's
    /// metadata without the markdown body.
    ///
    /// Steps, in order:
    /// 1. Parse the URL, resolve the tokenizer family and make sure it is
    ///    loaded.
    /// 2. Call `fetch_with_cache`, supplying an extraction callback that runs
    ///    the extractor pipeline and hashes the resulting markdown.
    /// 3. Decode the stored metadata JSON, falling back to the default value
    ///    when it is absent or does not deserialize.
    /// 4. Score extraction quality and pass the prose fields (title,
    ///    description, author) through the guard.
    ///
    /// # Errors
    ///
    /// Returns [`McpError::InvalidUrl`] when `args.url` does not parse, and
    /// propagates (via `?`) any error from tokenizer resolution, tokenizer
    /// loading, or the cached fetch.
    pub async fn get_metadata_inner(
        &self,
        args: GetMetadataArgs,
    ) -> Result<MetadataResponse, McpError> {
        let url = Url::parse(&args.url).map_err(|e| McpError::InvalidUrl(e.to_string()))?;
        let family = resolve_tokenizer(args.tokenizer.as_deref(), &self.config)?;
        tokenizer::ensure_loaded(family).await?;

        let options = FetchOptions {
            force_refresh: args.force_refresh,
            ssrf_level: self.ssrf_level,
            ssrf_project_root: self.ssrf_project_root.clone(),
            har_recorder: self.har_recorder.clone(),
            ignore_robots: false,
            user_agent: self.config.fetch.user_agent.clone(),
            #[cfg(feature = "headless")]
            headless: None,
            headless_mode: crate::fetcher::HeadlessMode::Off,
            synchronous_revalidation: false,
        };

        let result = fetch_with_cache(
            &self.db,
            &self.client,
            &self.pacer,
            &self.config.rate_limit,
            &self.config.robots,
            &url,
            &self.config.cache,
            options,
            |body, base| {
                let page = crate::extractor::pipeline::extract(body, Some(base))
                    .map_err(crate::fetcher::FetcherError::Extract)?;
                // Hash first: `body_md` is moved into the result below.
                let digest = sha256_hex(page.body_md.as_bytes());
                Ok(ExtractResult {
                    title: page.title,
                    body_md: page.body_md,
                    content_hash: format!("sha256:{digest}"),
                    metadata: page.metadata,
                })
            },
        )
        .await?;

        // Best-effort decode: missing or undecodable JSON yields the default
        // (empty) metadata instead of failing the request.
        let metadata: crate::extractor::ExtractedMetadata = result
            .page
            .metadata_json
            .as_deref()
            .and_then(|s| serde_json::from_str(s).ok())
            .unwrap_or_default();

        let markdown = &result.page.extracted_md;
        let quality = crate::extractor::quality::score(
            markdown,
            // Clamp to at least 1 so the scorer never receives a zero length.
            markdown.chars().count().max(1),
            !metadata.is_empty(),
            result.page.title.is_some(),
        );

        // `metadata` is an owned local that is not used as a whole past this
        // point, so its fields are moved out rather than cloned.
        let mut title = metadata.title;
        let mut description = metadata.description;
        let mut author = metadata.author;

        // Guard the prose metadata fields in place (no wrapper — structured
        // response). Structured fields (URLs, dates, og_type, language,
        // schema_types) are left untouched.
        let metadata_guard = {
            // Only the fields that are present, in title/description/author order.
            let mut fields: Vec<&mut String> = title
                .iter_mut()
                .chain(description.iter_mut())
                .chain(author.iter_mut())
                .collect();
            self.guard
                .guard_metadata(url.as_str(), args.security.as_ref(), &mut fields)
        };

        Ok(MetadataResponse {
            title,
            description,
            author,
            published: metadata.published,
            modified: metadata.modified,
            image: metadata.image,
            og_type: metadata.og_type,
            canonical: metadata.canonical,
            language: metadata.language,
            schema_types: metadata.schema_types,
            extraction_quality: quality,
            url: url.as_str().to_string(),
            // NOTE(review): kept as a clone because the ownership shape of
            // `result.page` is not visible from this file.
            content_hash: result.page.content_hash.clone(),
            // An out-of-range timestamp degrades to an empty string rather
            // than failing the whole response.
            fetched_at: jiff::Timestamp::from_second(result.page.fetched_at)
                .map(|t| t.to_string())
                .unwrap_or_default(),
            cache_status: result.cache_status.into(),
            prompt_injection: metadata_guard.telemetry,
            security_notice: metadata_guard.notice,
        })
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Every argument field, including `security`, must show up in the
    /// generated JSON schema.
    #[test]
    fn schema_contains_required_fields() {
        let schema = schemars::schema_for!(GetMetadataArgs);
        let json = serde_json::to_string(&schema).unwrap();
        for f in ["url", "force_refresh", "tokenizer", "security"] {
            assert!(json.contains(f), "missing {f}");
        }
    }

    /// `deny_unknown_fields` must reject keys that are not part of the struct.
    #[test]
    fn rejects_unknown_field() {
        let r: Result<GetMetadataArgs, _> =
            serde_json::from_str(r#"{"url":"https://x/","bogus":1}"#);
        assert!(r.is_err());
    }

    /// A payload carrying only `url` deserializes, with the optional fields
    /// taking their `#[serde(default)]` values.
    #[test]
    fn optional_fields_default_when_omitted() {
        let args: GetMetadataArgs = serde_json::from_str(r#"{"url":"https://x/"}"#).unwrap();
        assert_eq!(args.url, "https://x/");
        assert!(!args.force_refresh);
        assert!(args.tokenizer.is_none());
        assert!(args.security.is_none());
    }
}