Skip to main content

essence/
mcp.rs

1//! MCP (Model Context Protocol) server implementation for Essence.
2//!
3//! Exposes Essence's web retrieval capabilities as MCP tools so that AI agents
4//! (Claude, etc.) can use scrape, map, crawl, and search functionality.
5
6use rmcp::{
7    ServerHandler,
8    handler::server::tool::ToolRouter,
9    model::*,
10    tool, tool_handler, tool_router,
11    handler::server::wrapper::Parameters,
12    ErrorData as McpError,
13};
14use schemars::JsonSchema;
15use serde::Deserialize;
16use tracing::{error, info};
17
18use crate::{
19    api::scrape::scrape_core_logic,
20    crawler::{crawl_website, mapper},
21    search::SearchProvider,
22    types::{
23        CrawlRequest, MapRequest, ScrapeRequest,
24    },
25};
26
27// ---------------------------------------------------------------------------
28// Parameter structs (derive JsonSchema for MCP tool input schema generation)
29// ---------------------------------------------------------------------------
30
31/// Parameters for the `scrape` tool.
32#[derive(Debug, Clone, Deserialize, JsonSchema)]
33pub struct ScrapeParams {
34    /// The URL to scrape.
35    pub url: String,
36
37    /// Output formats to return (e.g. "markdown", "html", "links").
38    /// Defaults to ["markdown"].
39    #[serde(default)]
40    pub formats: Option<Vec<String>>,
41
42    /// Rendering engine: "auto", "http", or "browser".
43    /// Defaults to "auto".
44    #[serde(default)]
45    pub engine: Option<String>,
46
47    /// Request timeout in milliseconds. Defaults to 30000.
48    #[serde(default)]
49    pub timeout_ms: Option<u64>,
50}
51
52/// Parameters for the `map` tool.
53#[derive(Debug, Clone, Deserialize, JsonSchema)]
54pub struct MapParams {
55    /// The URL to discover links from.
56    pub url: String,
57
58    /// Search query to filter discovered URLs.
59    #[serde(default)]
60    pub search: Option<String>,
61
62    /// Skip sitemap.xml discovery.
63    #[serde(default)]
64    pub ignore_sitemap: Option<bool>,
65
66    /// Include subdomains in discovery.
67    #[serde(default)]
68    pub include_subdomains: Option<bool>,
69
70    /// Maximum number of URLs to return. Defaults to 5000.
71    #[serde(default)]
72    pub limit: Option<u32>,
73}
74
75/// Parameters for the `crawl` tool.
76#[derive(Debug, Clone, Deserialize, JsonSchema)]
77pub struct CrawlParams {
78    /// The starting URL to crawl.
79    pub url: String,
80
81    /// Maximum crawl depth. Defaults to 2.
82    #[serde(default)]
83    pub max_depth: Option<u32>,
84
85    /// Maximum number of pages to crawl. Defaults to 100.
86    #[serde(default)]
87    pub limit: Option<u32>,
88
89    /// Glob patterns of paths to include.
90    #[serde(default)]
91    pub include_paths: Option<Vec<String>>,
92
93    /// Glob patterns of paths to exclude.
94    #[serde(default)]
95    pub exclude_paths: Option<Vec<String>>,
96
97    /// Allow following links back up the URL hierarchy.
98    #[serde(default)]
99    pub allow_backward_links: Option<bool>,
100
101    /// Allow following external links.
102    #[serde(default)]
103    pub allow_external_links: Option<bool>,
104}
105
106/// Parameters for the `search` tool.
107#[derive(Debug, Clone, Deserialize, JsonSchema)]
108pub struct SearchParams {
109    /// The search query.
110    pub query: String,
111
112    /// Number of results to return. Defaults to 10.
113    #[serde(default)]
114    pub limit: Option<u32>,
115
116    /// Whether to scrape the content of each result URL. Defaults to false.
117    #[serde(default)]
118    pub scrape_results: Option<bool>,
119}
120
121// ---------------------------------------------------------------------------
122// MCP Server
123// ---------------------------------------------------------------------------
124
125/// The Essence MCP server, exposing scrape/map/crawl/search as MCP tools.
126#[derive(Clone)]
127pub struct EssenceMcpServer {
128    tool_router: ToolRouter<Self>,
129}
130
131#[tool_router]
132impl EssenceMcpServer {
133    /// Create a new `EssenceMcpServer` with all tool routes registered.
134    #[allow(clippy::new_without_default)]
135    pub fn new() -> Self {
136        Self {
137            tool_router: Self::tool_router(),
138        }
139    }
140
141    /// Scrape a single web page and return its content as Markdown, HTML, or other formats.
142    ///
143    /// Uses an intelligent HTTP -> Browser fallback strategy for maximum reliability.
144    #[tool(description = "Scrape a single web page and return its content as Markdown (default), HTML, or other formats. Uses intelligent HTTP -> Browser fallback for reliability.")]
145    async fn scrape(
146        &self,
147        Parameters(params): Parameters<ScrapeParams>,
148    ) -> Result<CallToolResult, McpError> {
149        info!("MCP tool call: scrape url={}", params.url);
150
151        let request = ScrapeRequest {
152            url: params.url.clone(),
153            formats: params.formats.unwrap_or_else(|| vec!["markdown".to_string()]),
154            engine: params.engine.unwrap_or_else(|| "auto".to_string()),
155            timeout: params.timeout_ms.unwrap_or(30000),
156            ..ScrapeRequest::default()
157        };
158
159        match scrape_core_logic(&request).await {
160            Ok(response) => {
161                let json = serde_json::to_string_pretty(&response).map_err(|e| {
162                    McpError::internal_error(
163                        format!("Failed to serialize scrape response: {}", e),
164                        None,
165                    )
166                })?;
167                Ok(CallToolResult::success(vec![Content::text(json)]))
168            }
169            Err(e) => {
170                error!("MCP scrape error for {}: {}", params.url, e);
171                Ok(CallToolResult::error(vec![Content::text(format!(
172                    "Scrape failed: {}",
173                    e
174                ))]))
175            }
176        }
177    }
178
179    /// Discover URLs from a website via sitemaps and in-page link extraction.
180    #[tool(description = "Discover URLs from a website via sitemaps and in-page link extraction. Returns a list of discovered URLs.")]
181    async fn map(
182        &self,
183        Parameters(params): Parameters<MapParams>,
184    ) -> Result<CallToolResult, McpError> {
185        info!("MCP tool call: map url={}", params.url);
186
187        let map_request = MapRequest {
188            url: params.url.clone(),
189            search: params.search,
190            ignore_sitemap: params.ignore_sitemap,
191            include_subdomains: params.include_subdomains.or(Some(true)),
192            limit: params.limit.or(Some(5000)),
193        };
194
195        match mapper::discover_urls(&params.url, &map_request).await {
196            Ok(links) => {
197                let result = serde_json::json!({
198                    "success": true,
199                    "count": links.len(),
200                    "links": links
201                });
202                let json = serde_json::to_string_pretty(&result).map_err(|e| {
203                    McpError::internal_error(
204                        format!("Failed to serialize map response: {}", e),
205                        None,
206                    )
207                })?;
208                Ok(CallToolResult::success(vec![Content::text(json)]))
209            }
210            Err(e) => {
211                error!("MCP map error for {}: {}", params.url, e);
212                Ok(CallToolResult::error(vec![Content::text(format!(
213                    "Map failed: {}",
214                    e
215                ))]))
216            }
217        }
218    }
219
220    /// Crawl a website starting from a URL, following links up to a specified depth and page limit.
221    #[tool(description = "Crawl a website starting from a URL, following links up to a specified depth and page limit. Returns scraped content from all crawled pages.")]
222    async fn crawl(
223        &self,
224        Parameters(params): Parameters<CrawlParams>,
225    ) -> Result<CallToolResult, McpError> {
226        info!("MCP tool call: crawl url={}", params.url);
227
228        let crawl_request = CrawlRequest {
229            url: params.url.clone(),
230            max_depth: params.max_depth.unwrap_or(2),
231            limit: params.limit.unwrap_or(100),
232            include_paths: params.include_paths,
233            exclude_paths: params.exclude_paths,
234            allow_backward_links: params.allow_backward_links,
235            allow_external_links: params.allow_external_links,
236            ignore_sitemap: None,
237            detect_pagination: Some(true),
238            max_pagination_pages: Some(50),
239            use_parallel: None,
240        };
241
242        match crawl_website(&crawl_request).await {
243            Ok(documents) => {
244                let result = serde_json::json!({
245                    "success": true,
246                    "pages_crawled": documents.len(),
247                    "data": documents
248                });
249                let json = serde_json::to_string_pretty(&result).map_err(|e| {
250                    McpError::internal_error(
251                        format!("Failed to serialize crawl response: {}", e),
252                        None,
253                    )
254                })?;
255                Ok(CallToolResult::success(vec![Content::text(json)]))
256            }
257            Err(e) => {
258                error!("MCP crawl error for {}: {}", params.url, e);
259                Ok(CallToolResult::error(vec![Content::text(format!(
260                    "Crawl failed: {}",
261                    e
262                ))]))
263            }
264        }
265    }
266
267    /// Search the web using DuckDuckGo and optionally scrape each result page.
268    #[tool(description = "Search the web using DuckDuckGo and optionally scrape each result page for full content. Returns search results with titles, URLs, and snippets.")]
269    async fn search(
270        &self,
271        Parameters(params): Parameters<SearchParams>,
272    ) -> Result<CallToolResult, McpError> {
273        info!("MCP tool call: search query={}", params.query);
274
275        let provider = SearchProvider::new().map_err(|e| {
276            McpError::internal_error(
277                format!("Failed to create search provider: {}", e),
278                None,
279            )
280        })?;
281
282        let limit = params.limit.unwrap_or(10);
283
284        let mut results = provider
285            .search_duckduckgo(&params.query, limit)
286            .await
287            .map_err(|e| {
288                McpError::internal_error(
289                    format!("Search failed: {}", e),
290                    None,
291                )
292            })?;
293
294        // Optionally scrape each result
295        if params.scrape_results.unwrap_or(false) {
296            info!("Scraping {} search results", results.len());
297            for result in &mut results {
298                let scrape_req = ScrapeRequest {
299                    url: result.url.clone(),
300                    formats: vec!["markdown".to_string()],
301                    engine: "http".to_string(),
302                    timeout: 10000,
303                    only_main_content: true,
304                    ..ScrapeRequest::default()
305                };
306                match scrape_core_logic(&scrape_req).await {
307                    Ok(response) => {
308                        if let Some(data) = response.data {
309                            result.content = Some(data);
310                        }
311                    }
312                    Err(e) => {
313                        error!("Failed to scrape search result {}: {}", result.url, e);
314                    }
315                }
316            }
317        }
318
319        let response = serde_json::json!({
320            "success": true,
321            "count": results.len(),
322            "data": results
323        });
324        let json = serde_json::to_string_pretty(&response).map_err(|e| {
325            McpError::internal_error(
326                format!("Failed to serialize search response: {}", e),
327                None,
328            )
329        })?;
330        Ok(CallToolResult::success(vec![Content::text(json)]))
331    }
332}
333
334// ---------------------------------------------------------------------------
335// ServerHandler implementation
336// ---------------------------------------------------------------------------
337
338#[tool_handler]
339impl ServerHandler for EssenceMcpServer {
340    fn get_info(&self) -> ServerInfo {
341        ServerInfo {
342            server_info: Implementation {
343                name: "essence".to_string(),
344                title: Some("Essence Web Retrieval Engine".to_string()),
345                version: env!("CARGO_PKG_VERSION").to_string(),
346                description: Some(
347                    "Production-ready web retrieval engine with intelligent HTTP->Browser fallback, \
348                     providing LLM-ready Markdown outputs. Supports scraping, crawling, URL discovery, \
349                     and web search."
350                        .to_string(),
351                ),
352                icons: None,
353                website_url: None,
354            },
355            capabilities: ServerCapabilities::builder()
356                .enable_tools()
357                .build(),
358            instructions: Some(
359                "Essence is a web retrieval engine. Use the 'scrape' tool to fetch a single page, \
360                 'map' to discover URLs on a site, 'crawl' to traverse multiple pages, or 'search' \
361                 to find pages via DuckDuckGo web search. All tools return structured JSON with \
362                 Markdown content suitable for LLM consumption."
363                    .to_string(),
364            ),
365            ..Default::default()
366        }
367    }
368}