ferridriver-mcp 0.4.0

ferridriver MCP server library -- browser automation via Model Context Protocol
Documentation
use crate::params::{EvaluateParams, ScreenshotParams_, SearchPageParams, SnapshotParams};
use crate::server::{McpServer, sess};
use base64::Engine;
use ferridriver::options::ScreenshotOptions;
use rmcp::{
  ErrorData,
  handler::server::wrapper::Parameters,
  model::{CallToolResult, Content},
  tool, tool_router,
};

#[tool_router(router = content_router, vis = "pub")]
impl McpServer {
  #[tool(
    name = "snapshot",
    description = "PRIMARY grounding tool — call this FIRST before deciding on any selectors or actions. \
    Returns the page as an accessibility tree: every interactable role/name, visible text, and \
    [ref=eN] handles. Cheap, fast, token-efficient, and deterministic — much better than screenshot \
    for picking what to click/fill. Supports depth limiting and incremental tracking (shows only \
    what changed since the last snapshot). Re-snapshot after any navigate/click/fill/run_script; \
    refs are invalidated by DOM mutations."
  )]
  async fn snapshot(&self, Parameters(p): Parameters<SnapshotParams>) -> Result<CallToolResult, ErrorData> {
    let s = sess(p.session.as_opt());
    let _guard = self.session_guard(s).await;
    let page = Box::pin(self.page(s)).await?;
    let opts = ferridriver::snapshot::SnapshotOptions {
      depth: p.depth,
      track: p.track,
    };
    match page.snapshot_for_ai(opts).await {
      Ok(result) => {
        if let Some(handle) = self.state.ref_map_handle(s).await {
          handle.store(std::sync::Arc::new(result.ref_map));
        } else {
          let state = self.state.read().await;
          state.set_ref_map(s, result.ref_map);
        }
        let mut text = result.full;
        if let Some(inc) = result.incremental {
          text.push_str("\n### Changes since last snapshot\n");
          text.push_str(&inc);
        }
        Ok(CallToolResult::success(vec![Content::text(text)]))
      },
      Err(e) => Ok(CallToolResult::success(vec![Content::text(format!(
        "[snapshot error: {e}]"
      ))])),
    }
  }

  /// Captures a screenshot and returns it as base64-encoded image content.
  ///
  /// With `selector` set, only the matching element is captured via
  /// `screenshot_element`; otherwise the page is captured with the requested
  /// `format`, `quality` and `full_page` options.
  ///
  /// The MIME type attached to the image is taken from the encoded bytes themselves
  /// (PNG, JPEG and WebP signatures are recognised). The element path is not given
  /// the requested format, so labelling by `format` alone could mislabel its output.
  /// When the signature is not recognised, the label falls back to the one implied
  /// by `format` (`image/png` when absent or unknown).
  ///
  /// # Errors
  ///
  /// Returns `ErrorData` when the page cannot be obtained or the capture fails.
  #[tool(
    name = "screenshot",
    description = "Capture the page (or a single element via `selector`, or the full scrollable page \
    via `full_page`) as a base64-encoded image. USE SPARINGLY — it is much more token-expensive \
    than `snapshot`. Reach for it only when the a11y tree is ambiguous (icons without labels, \
    canvas, complex layout), or when the caller explicitly needs visual verification."
  )]
  async fn screenshot(&self, Parameters(p): Parameters<ScreenshotParams_>) -> Result<CallToolResult, ErrorData> {
    // Identifies PNG, JPEG and WebP payloads by their leading signature bytes.
    fn sniff_image_mime(bytes: &[u8]) -> Option<&'static str> {
      if bytes.starts_with(b"\x89PNG\r\n\x1a\n") {
        Some("image/png")
      } else if bytes.starts_with(&[0xFF, 0xD8, 0xFF]) {
        Some("image/jpeg")
      } else if bytes.starts_with(b"RIFF") && bytes.get(8..12) == Some(&b"WEBP"[..]) {
        // RIFF container: 4-byte tag, 4-byte size, then the "WEBP" form type.
        Some("image/webp")
      } else {
        None
      }
    }

    let s = sess(p.session.as_opt());
    // Bound to a named local so the guard lives until the method returns.
    let _guard = self.session_guard(s).await;
    let page = Box::pin(self.page(s)).await?;

    // Label implied by the request; used only when the bytes are not recognised.
    let requested_mime = match p.format.as_deref() {
      Some("jpeg" | "jpg") => "image/jpeg",
      Some("webp") => "image/webp",
      _ => "image/png",
    };

    let bytes = if let Some(sel) = &p.selector {
      page.screenshot_element(sel).await.map_err(Self::err)?
    } else {
      let opts = ScreenshotOptions {
        format: p.format.clone(),
        quality: p.quality,
        full_page: p.full_page,
        ..Default::default()
      };
      page.screenshot(opts).await.map_err(Self::err)?
    };

    // Trust the payload over the request so the declared type always matches the data.
    let mime = sniff_image_mime(&bytes).unwrap_or(requested_mime);
    let b64 = base64::engine::general_purpose::STANDARD.encode(&bytes);
    Ok(CallToolResult::success(vec![Content::image(b64, mime)]))
  }

  /// Evaluates `p.expression` on the session's page and returns the result as text.
  ///
  /// When the evaluation result has a JSON-like form, it is pretty-printed (falling
  /// back to its plain `to_string` form if pretty-printing fails); otherwise the
  /// lossy string form of the result is returned.
  ///
  /// # Errors
  ///
  /// Returns `ErrorData` when the page cannot be obtained or the evaluation fails.
  #[tool(
    name = "evaluate",
    description = "Evaluate a single JavaScript expression IN the page (DOM context) and return its \
    JSON-serialized value. Use for quick one-liners: `document.title`, \
    `document.querySelectorAll('.row').length`, feature-detection. For multi-step imperative \
    logic — loops, conditionals, try/catch, chained navigations — use `run_script` instead. \
    The expression runs with the page's globals (`document`, `window`, `fetch`, etc.)."
  )]
  async fn evaluate(&self, Parameters(p): Parameters<EvaluateParams>) -> Result<CallToolResult, ErrorData> {
    let session = sess(p.session.as_opt());
    // Bound to a named local so the guard lives until the method returns.
    let _guard = self.session_guard(session).await;
    let page = Box::pin(self.page(session)).await?;

    // The expression is evaluated with a default (empty) argument.
    // NOTE(review): meaning of the trailing `None` is not visible here — confirm.
    let no_args = ferridriver::protocol::SerializedArgument::default();
    let outcome = page
      .evaluate(p.expression.as_str(), no_args, None)
      .await
      .map_err(Self::err)?;

    let rendered = match outcome.to_json_like() {
      Some(json) => serde_json::to_string_pretty(&json).unwrap_or_else(|_| json.to_string()),
      None => outcome.as_string_lossy(),
    };
    Ok(CallToolResult::success(vec![Content::text(rendered)]))
  }

  #[tool(
    name = "search_page",
    description = "Grep the page's rendered text for a pattern (literal or regex), returning matches \
    with surrounding context. Fast and token-cheap; use to locate content without re-reading the \
    whole snapshot. Supports `regex`, `case_sensitive`, and `selector` for scoped search."
  )]
  async fn search_page(&self, Parameters(p): Parameters<SearchPageParams>) -> Result<CallToolResult, ErrorData> {
    let s = sess(p.session.as_opt());
    let _guard = self.session_guard(s).await;
    let page = Box::pin(self.page(s)).await?;
    let opts = ferridriver::actions::SearchOptions {
      pattern: p.pattern.clone(),
      regex: p.regex.unwrap_or(false),
      case_sensitive: p.case_sensitive.unwrap_or(false),
      context_chars: p.context_chars.unwrap_or(150),
      css_scope: p.selector.clone(),
      max_results: p.max_results.unwrap_or(25),
    };
    let result = ferridriver::actions::search_page(page.inner(), &opts)
      .await
      .map_err(Self::err)?;
    Ok(CallToolResult::success(vec![Content::text(
      ferridriver::actions::format_search_results(&result, &p.pattern),
    )]))
  }
}