use rmcp::{
ServerHandler, ServiceExt,
handler::server::{router::tool::ToolRouter, wrapper::Parameters},
model::*,
tool, tool_handler, tool_router,
transport::stdio,
};
use crate::engine::CrawlEngineBuilder;
use crate::types::CrawlConfig;
/// Checks that `url` is non-empty and uses the `http://` or `https://` scheme.
///
/// # Errors
///
/// Returns an `invalid_params` [`rmcp::ErrorData`] when `url` is empty
/// (`"url is required"`) or when it starts with neither accepted prefix.
fn validate_url(url: &str) -> Result<(), rmcp::ErrorData> {
    if url.is_empty() {
        return Err(rmcp::ErrorData::invalid_params("url is required", None));
    }

    // The prefix comparison is case-sensitive, as `str::starts_with` is.
    let has_web_scheme = url.starts_with("http://") || url.starts_with("https://");
    if has_web_scheme {
        Ok(())
    } else {
        Err(rmcp::ErrorData::invalid_params(
            "url must start with http:// or https://",
            None,
        ))
    }
}
/// Normalizes the optional `format` argument of a tool call.
///
/// Returns `"json"` when `format` equals `json` ignoring ASCII case; every other
/// value, including `None`, yields `"markdown"`.
///
/// Both results are string literals, so the return type is `&'static str` and the
/// result may be kept after the borrow of `format` has ended.
fn parse_format(format: &Option<String>) -> &'static str {
    match format.as_deref() {
        Some(requested) if requested.eq_ignore_ascii_case("json") => "json",
        _ => "markdown",
    }
}
/// MCP server handler that exposes the crawlberg tools.
///
/// `Clone` is derived; it clones both fields.
#[derive(Clone)]
pub struct CrawlbergMcp {
    /// Router for this type's tool methods, obtained from `Self::tool_router()`.
    tool_router: ToolRouter<CrawlbergMcp>,
    /// Base crawl configuration; tool methods that build an engine start from a
    /// clone of it and apply per-request overrides.
    config: CrawlConfig,
}
#[tool_router]
impl CrawlbergMcp {
/// Creates a server handler backed by the default [`CrawlConfig`].
pub fn new() -> Self {
    let defaults = CrawlConfig::default();
    Self::with_config(defaults)
}
/// Creates a server handler that uses `config` as the base configuration for
/// tool calls.
pub fn with_config(config: CrawlConfig) -> Self {
    let tool_router = Self::tool_router();
    Self { tool_router, config }
}
/// Builds a crawl engine from `config`.
///
/// # Errors
///
/// Returns an `internal_error` [`rmcp::ErrorData`] carrying the builder's error
/// text when the engine cannot be built.
fn build_engine(&self, config: CrawlConfig) -> Result<crate::engine::CrawlEngine, rmcp::ErrorData> {
    let built = CrawlEngineBuilder::new().config(config).build();
    built.map_err(|build_err| {
        rmcp::ErrorData::internal_error(format!("Failed to build engine: {build_err}"), None)
    })
}
/// Scrapes a single URL and returns its content as one text item.
///
/// The URL is validated first. The engine is built from a clone of the handler's
/// base configuration. The result is rendered as JSON when `format` is `json`
/// (ASCII case-insensitive) and as markdown otherwise.
///
/// # Errors
///
/// Returns `invalid_params` for a rejected URL, the error from `build_engine` when
/// the engine cannot be built, or the scrape error mapped by `map_crawl_error`.
#[tool(
    description = "Scrape a single URL and extract content as markdown or JSON. Returns page content, metadata, links, and images.",
    annotations(
        title = "Scrape URL",
        read_only_hint = true,
        idempotent_hint = true,
        open_world_hint = true
    )
)]
async fn scrape(
    &self,
    Parameters(params): Parameters<super::params::ScrapeParams>,
) -> Result<CallToolResult, rmcp::ErrorData> {
    use super::errors::map_crawl_error;
    use super::format::{format_as_json, format_as_markdown};

    validate_url(&params.url)?;

    let config = self.config.clone();
    // Only compiled with the `browser` feature: an explicit `use_browser = true`
    // switches the browser mode to `Always` for this request.
    #[cfg(feature = "browser")]
    let config = {
        let mut config = config;
        if let Some(true) = params.use_browser {
            config.browser.mode = crate::types::BrowserMode::Always;
        }
        config
    };

    let engine = self.build_engine(config)?;
    let result = engine.scrape(&params.url).await.map_err(map_crawl_error)?;
    let response = if parse_format(&params.format) == "json" {
        format_as_json(&result)
    } else {
        format_as_markdown(&result)
    };
    Ok(CallToolResult::success(vec![Content::text(response)]))
}
/// Crawls a website starting at `url` and returns the crawl result as one text item.
///
/// `max_depth`, `max_pages` and `stay_on_domain` override the handler's base
/// configuration when supplied. Output is JSON when `format` is `json` (ASCII
/// case-insensitive) and markdown otherwise.
///
/// # Errors
///
/// Returns `invalid_params` when the URL is rejected, when `max_depth` exceeds 100,
/// or when `max_pages` is outside `1..=100000`. Otherwise returns the error from
/// `build_engine` or the crawl error mapped by `map_crawl_error`.
#[tool(
    description = "Crawl a website following links. Returns content for all discovered pages up to max_depth/max_pages.",
    annotations(title = "Crawl Website", read_only_hint = true, open_world_hint = true)
)]
async fn crawl(
    &self,
    Parameters(params): Parameters<super::params::CrawlParams>,
) -> Result<CallToolResult, rmcp::ErrorData> {
    use super::errors::map_crawl_error;
    use super::format::{format_crawl_as_json, format_crawl_as_markdown};

    validate_url(&params.url)?;

    // Reject out-of-range limits before any configuration or engine work.
    if let Some(depth) = params.max_depth
        && depth > 100
    {
        return Err(rmcp::ErrorData::invalid_params("max_depth must be <= 100", None));
    }
    if let Some(pages) = params.max_pages
        && (pages == 0 || pages > 100_000)
    {
        return Err(rmcp::ErrorData::invalid_params("max_pages must be 1..=100000", None));
    }

    // Per-request overrides on top of the handler's base configuration.
    let mut config = self.config.clone();
    if let Some(depth) = params.max_depth {
        config.max_depth = Some(depth);
    }
    if let Some(pages) = params.max_pages {
        config.max_pages = Some(pages);
    }
    if let Some(stay) = params.stay_on_domain {
        config.stay_on_domain = stay;
    }

    let engine = self.build_engine(config)?;
    let result = engine.crawl(&params.url).await.map_err(map_crawl_error)?;
    let response = if parse_format(&params.format) == "json" {
        format_crawl_as_json(&result)
    } else {
        format_crawl_as_markdown(&result)
    };
    Ok(CallToolResult::success(vec![Content::text(response)]))
}
#[tool(
description = "Discover all pages on a website via links and sitemaps. Returns a list of discovered URLs.",
annotations(
title = "Map Website",
read_only_hint = true,
idempotent_hint = true,
open_world_hint = true
)
)]
async fn map(
&self,
Parameters(params): Parameters<super::params::MapParams>,
) -> Result<CallToolResult, rmcp::ErrorData> {
use super::errors::map_crawl_error;
use super::format::format_map_result;
validate_url(¶ms.url)?;
let mut config = self.config.clone();
if let Some(limit) = params.limit {
config.map_limit = Some(limit);
}
if let Some(ref search) = params.search {
config.map_search = Some(search.clone());
}
if let Some(robots) = params.respect_robots_txt {
config.respect_robots_txt = robots;
}
let engine = self.build_engine(config)?;
let result = engine.map(¶ms.url).await.map_err(map_crawl_error)?;
let response = format_map_result(&result);
Ok(CallToolResult::success(vec![Content::text(response)]))
}
/// Scrapes several URLs and returns one combined text item.
///
/// Every URL is validated before any work starts. `concurrency`, when supplied,
/// overrides `max_concurrent` in the handler's base configuration. Each result is
/// appended to the response followed by a `---` separator; a failed URL is
/// reported inline under its own heading and does not fail the call.
///
/// # Errors
///
/// Returns `invalid_params` when `urls` is empty or any URL is rejected, or the
/// error from `build_engine` when the engine cannot be built.
#[tool(
    description = "Scrape multiple URLs concurrently. Returns results for all URLs.",
    annotations(title = "Batch Scrape", read_only_hint = true, open_world_hint = true)
)]
async fn batch_scrape(
    &self,
    Parameters(params): Parameters<super::params::BatchScrapeParams>,
) -> Result<CallToolResult, rmcp::ErrorData> {
    use super::format::{format_as_json, format_as_markdown};
    use std::fmt::Write as _;

    if params.urls.is_empty() {
        return Err(rmcp::ErrorData::invalid_params("urls array must not be empty", None));
    }
    for url in &params.urls {
        validate_url(url)?;
    }

    let mut config = self.config.clone();
    if let Some(concurrency) = params.concurrency {
        config.max_concurrent = Some(concurrency);
    }

    let engine = self.build_engine(config)?;
    let url_refs: Vec<&str> = params.urls.iter().map(|s| s.as_str()).collect();
    let results = engine.batch_scrape(&url_refs).await;

    let is_json = parse_format(&params.format) == "json";
    // NOTE(review): in JSON mode the per-URL documents are still joined with the
    // `---` separator and failures are reported as markdown, so the combined text
    // is presumably not a single JSON document. Confirm this is intended.
    let mut response = String::new();
    for (url, result) in &results {
        match result {
            Ok(scrape_result) => {
                if is_json {
                    response.push_str(&format_as_json(scrape_result));
                } else {
                    // Writing into a `String` cannot fail, so the result is discarded.
                    let _ = write!(response, "## {url}\n\n");
                    response.push_str(&format_as_markdown(scrape_result));
                }
                response.push_str("\n\n---\n\n");
            }
            Err(e) => {
                let _ = write!(response, "## {url}\n\n**Error:** {e}\n\n---\n\n");
            }
        }
    }
    Ok(CallToolResult::success(vec![Content::text(response)]))
}
/// Crawls several seed URLs and returns one combined text item.
///
/// Every URL and limit is validated before any work starts. `max_depth`,
/// `max_pages`, `stay_on_domain` and `concurrency` override the handler's base
/// configuration when supplied. Each result is appended to the response followed
/// by a `---` separator; a failed seed is reported inline under its own heading
/// and does not fail the call.
///
/// # Errors
///
/// Returns `invalid_params` when `urls` is empty, any URL is rejected, `max_depth`
/// exceeds 100, or `max_pages` is outside `1..=100000`. Otherwise returns the
/// error from `build_engine` when the engine cannot be built.
#[tool(
    description = "Crawl multiple seed URLs concurrently. Returns crawl results for all seeds.",
    annotations(title = "Batch Crawl", read_only_hint = true, open_world_hint = true)
)]
async fn batch_crawl(
    &self,
    Parameters(params): Parameters<super::params::BatchCrawlParams>,
) -> Result<CallToolResult, rmcp::ErrorData> {
    use super::format::{format_crawl_as_json, format_crawl_as_markdown};
    use std::fmt::Write as _;

    if params.urls.is_empty() {
        return Err(rmcp::ErrorData::invalid_params("urls array must not be empty", None));
    }
    for url in &params.urls {
        validate_url(url)?;
    }
    if let Some(depth) = params.max_depth
        && depth > 100
    {
        return Err(rmcp::ErrorData::invalid_params("max_depth must be <= 100", None));
    }
    if let Some(pages) = params.max_pages
        && (pages == 0 || pages > 100_000)
    {
        return Err(rmcp::ErrorData::invalid_params("max_pages must be 1..=100000", None));
    }

    // Per-request overrides on top of the handler's base configuration.
    let mut config = self.config.clone();
    if let Some(depth) = params.max_depth {
        config.max_depth = Some(depth);
    }
    if let Some(pages) = params.max_pages {
        config.max_pages = Some(pages);
    }
    if let Some(stay) = params.stay_on_domain {
        config.stay_on_domain = stay;
    }
    if let Some(concurrency) = params.concurrency {
        config.max_concurrent = Some(concurrency);
    }

    let engine = self.build_engine(config)?;
    let url_refs: Vec<&str> = params.urls.iter().map(|s| s.as_str()).collect();
    let results = engine.batch_crawl(&url_refs).await;

    let is_json = parse_format(&params.format) == "json";
    // NOTE(review): in JSON mode the per-seed documents are still joined with the
    // `---` separator and failures are reported as markdown, so the combined text
    // is presumably not a single JSON document. Confirm this is intended.
    let mut response = String::new();
    for (url, result) in &results {
        match result {
            Ok(crawl_result) => {
                if is_json {
                    response.push_str(&format_crawl_as_json(crawl_result));
                } else {
                    // Writing into a `String` cannot fail, so the result is discarded.
                    let _ = write!(response, "## {url}\n\n");
                    response.push_str(&format_crawl_as_markdown(crawl_result));
                }
                response.push_str("\n\n---\n\n");
            }
            Err(e) => {
                let _ = write!(response, "## {url}\n\n**Error:** {e}\n\n---\n\n");
            }
        }
    }
    Ok(CallToolResult::success(vec![Content::text(response)]))
}
/// Scrapes `url` with document downloading enabled and returns a JSON summary.
///
/// `download_documents` is forced on for this request, and `max_size`, when
/// supplied, overrides `document_max_size`. If the scrape produced a downloaded
/// document, its URL, MIME type, size, filename and content hash are returned.
/// Otherwise the response describes the fetched page and carries an explanatory
/// note.
///
/// # Errors
///
/// Returns `invalid_params` for a rejected URL, the error from `build_engine` when
/// the engine cannot be built, or the scrape error mapped by `map_crawl_error`.
#[tool(
    description = "Download a document from a URL. Returns metadata about the downloaded file.",
    annotations(title = "Download Document", read_only_hint = true, open_world_hint = true)
)]
async fn download(
    &self,
    Parameters(params): Parameters<super::params::DownloadParams>,
) -> Result<CallToolResult, rmcp::ErrorData> {
    use super::errors::map_crawl_error;

    validate_url(&params.url)?;

    let mut config = self.config.clone();
    config.download_documents = true;
    if let Some(max_size) = params.max_size {
        config.document_max_size = Some(max_size);
    }

    let engine = self.build_engine(config)?;
    let result = engine.scrape(&params.url).await.map_err(map_crawl_error)?;
    let response = if let Some(ref doc) = result.downloaded_document {
        serde_json::json!({
            "url": doc.url,
            "mime_type": doc.mime_type,
            "size": doc.size,
            "filename": doc.filename,
            "content_hash": doc.content_hash,
        })
        .to_string()
    } else {
        // No document came back: report what was fetched instead of failing.
        serde_json::json!({
            "url": params.url,
            "content_type": result.content_type,
            "status_code": result.status_code,
            "body_size": result.body_size,
            "note": "URL returned HTML content, not a downloadable document"
        })
        .to_string()
    };
    Ok(CallToolResult::success(vec![Content::text(response)]))
}
/// Runs a list of browser actions against `url` and returns the result as
/// pretty-printed JSON.
///
/// Each element of `actions` is deserialized into a `crate::PageAction` before the
/// engine is built, so a malformed payload is rejected before any engine work.
///
/// # Errors
///
/// Returns `invalid_params` for a rejected URL or an action that fails to
/// deserialize, the error from `build_engine`, the interaction error mapped by
/// `map_crawl_error`, or `internal_error` when the result cannot be serialized.
#[tool(
    description = "Execute browser actions on a page. Actions may mutate page or application state.",
    annotations(
        title = "Interact",
        read_only_hint = false,
        destructive_hint = true,
        open_world_hint = true
    )
)]
async fn interact(
    &self,
    Parameters(params): Parameters<super::params::InteractParams>,
) -> Result<CallToolResult, rmcp::ErrorData> {
    use super::errors::map_crawl_error;

    validate_url(&params.url)?;

    // Collecting into `Result` stops at the first action that fails to deserialize.
    let actions = params
        .actions
        .into_iter()
        .map(serde_json::from_value)
        .collect::<Result<Vec<crate::PageAction>, _>>()
        .map_err(|e| rmcp::ErrorData::invalid_params(format!("invalid action payload: {e}"), None))?;

    let engine = self.build_engine(self.config.clone())?;
    let result = engine.interact(&params.url, &actions).await.map_err(map_crawl_error)?;
    let response = serde_json::to_string_pretty(&result).map_err(|e| {
        rmcp::ErrorData::internal_error(format!("Failed to serialize interaction result: {e}"), None)
    })?;
    Ok(CallToolResult::success(vec![Content::text(response)]))
}
/// Passes `markdown` to `crate::citations::generate_citations` and returns the
/// result as pretty-printed JSON.
///
/// # Errors
///
/// Returns `internal_error` when the citation result cannot be serialized.
#[tool(
    description = "Convert markdown links into numbered citations with an appended reference list.",
    annotations(
        title = "Generate Citations",
        read_only_hint = true,
        idempotent_hint = true,
        open_world_hint = false
    )
)]
fn generate_citations(
    &self,
    Parameters(params): Parameters<super::params::GenerateCitationsParams>,
) -> Result<CallToolResult, rmcp::ErrorData> {
    let result = crate::citations::generate_citations(&params.markdown);
    let response = serde_json::to_string_pretty(&result)
        .map_err(|e| rmcp::ErrorData::internal_error(format!("Failed to serialize citations: {e}"), None))?;
    Ok(CallToolResult::success(vec![Content::text(response)]))
}
/// Reports the crate version recorded at compile time (`CARGO_PKG_VERSION`) as a
/// pretty-printed JSON object with a single `version` key.
///
/// This call always succeeds: if serialization fails, a hand-built JSON error
/// object is returned as the text instead.
#[tool(
    description = "Get the current crawlberg library version.",
    annotations(
        title = "Get Version",
        read_only_hint = true,
        idempotent_hint = true,
        open_world_hint = false
    )
)]
fn get_version(
    &self,
    Parameters(_): Parameters<super::params::EmptyParams>,
) -> Result<CallToolResult, rmcp::ErrorData> {
    let payload = serde_json::json!({
        "version": env!("CARGO_PKG_VERSION"),
    });
    let text = match serde_json::to_string_pretty(&payload) {
        Ok(pretty) => pretty,
        Err(e) => format!("{{\"error\": \"Failed to serialize version: {e}\"}}"),
    };
    Ok(CallToolResult::success(vec![Content::text(text)]))
}
}
#[tool_handler]
impl ServerHandler for CrawlbergMcp {
    /// Describes this server to MCP clients: the tools capability, the
    /// implementation metadata (name, version, title, description, website) and
    /// usage instructions.
    fn get_info(&self) -> ServerInfo {
        const DESCRIPTION: &str = "Web crawling and scraping library for extracting content from websites. \
            Supports single-page scraping, multi-page crawling, site mapping, and batch operations.";
        const INSTRUCTIONS: &str = "Scrape, crawl, and map websites. Use 'scrape' for single pages, 'crawl' for \
            following links across a site, 'map' for discovering all URLs, and 'batch_scrape'/\
            'batch_crawl' for processing multiple URLs concurrently. Use format: 'json' for \
            structured output or 'markdown' (default) for human-readable content.";

        // Only the tools capability is advertised; everything else stays at its default.
        let mut capabilities = ServerCapabilities::default();
        capabilities.tools = Some(ToolsCapability::default());

        let implementation = Implementation::new("crawlberg-mcp", env!("CARGO_PKG_VERSION"))
            .with_title("Crawlberg Web Crawling MCP Server")
            .with_description(DESCRIPTION)
            .with_website_url("https://github.com/xberg-io/crawlberg");

        InitializeResult::new(capabilities)
            .with_server_info(implementation)
            .with_instructions(INSTRUCTIONS)
    }
}
impl Default for CrawlbergMcp {
fn default() -> Self {
Self::new()
}
}
/// Starts the MCP server with the default [`CrawlConfig`].
///
/// # Errors
///
/// Propagates any error returned by [`start_mcp_server_with_config`].
pub async fn start_mcp_server() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
    let defaults = CrawlConfig::default();
    start_mcp_server_with_config(defaults).await
}
/// Serves a [`CrawlbergMcp`] handler built from `config` over the stdio transport,
/// then awaits `waiting()` on the running service.
///
/// # Errors
///
/// Returns the error from starting the service or from waiting on it.
pub async fn start_mcp_server_with_config(config: CrawlConfig) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
    let handler = CrawlbergMcp::with_config(config);
    let running = handler.serve(stdio()).await?;
    running.waiting().await?;
    Ok(())
}
/// Streamable-HTTP service type for [`CrawlbergMcp`], using rmcp's
/// `LocalSessionManager` as the session manager.
pub type CrawlbergHttpMcpService = rmcp::transport::streamable_http_server::StreamableHttpService<
    CrawlbergMcp,
    rmcp::transport::streamable_http_server::session::local::LocalSessionManager,
>;
pub fn streamable_http_service(config: CrawlConfig) -> CrawlbergHttpMcpService {
use rmcp::transport::streamable_http_server::{
StreamableHttpServerConfig, StreamableHttpService, session::local::LocalSessionManager,
};
StreamableHttpService::new(
move || Ok::<_, std::io::Error>(CrawlbergMcp::with_config(config.clone())),
std::sync::Arc::new(LocalSessionManager::default()),
StreamableHttpServerConfig::default(),
)
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashMap;

    /// Pins the annotation hints of every registered tool so that adding, removing
    /// or re-annotating a tool must be reflected in the table below.
    #[test]
    fn all_tools_declare_expected_annotation_hints() {
        let tools = CrawlbergMcp::new().tool_router.list_all();
        let by_name: HashMap<&str, &Tool> = tools.iter().map(|tool| (tool.name.as_ref(), tool)).collect();

        // Columns: (tool name, read_only_hint, destructive_hint, open_world_hint).
        let expected: &[(&str, Option<bool>, Option<bool>, Option<bool>)] = &[
            ("scrape", Some(true), None, Some(true)),
            ("crawl", Some(true), None, Some(true)),
            ("map", Some(true), None, Some(true)),
            ("batch_scrape", Some(true), None, Some(true)),
            ("batch_crawl", Some(true), None, Some(true)),
            ("download", Some(true), None, Some(true)),
            ("interact", Some(false), Some(true), Some(true)),
            ("generate_citations", Some(true), None, Some(false)),
            ("get_version", Some(true), None, Some(false)),
        ];

        assert_eq!(
            tools.len(),
            expected.len(),
            "tool count drifted from annotation expectations"
        );

        for &(name, read_only, destructive, open_world) in expected {
            let Some(tool) = by_name.get(name) else {
                panic!("tool `{name}` missing from router");
            };
            let Some(ann) = tool.annotations.as_ref() else {
                panic!("tool `{name}` has no annotations");
            };

            assert_eq!(ann.read_only_hint, read_only, "read_only_hint mismatch for `{name}`");
            assert_eq!(
                ann.destructive_hint, destructive,
                "destructive_hint mismatch for `{name}`"
            );
            assert_eq!(
                ann.open_world_hint, open_world,
                "open_world_hint mismatch for `{name}`"
            );
            assert!(ann.title.is_some(), "tool `{name}` is missing a title annotation");
        }
    }
}