servo-fetch-cli 0.8.0

A browser engine in a binary — fetch, render, and extract web content powered by Servo.
//! Crawl tool helper.

use std::time::Duration;

use rmcp::ErrorData;

use super::common::paginate;

const MAX_MCP_CRAWL_PAGES: usize = 500;

pub(crate) struct CrawlToolOptions<'a> {
    pub url: &'a str,
    pub limit: usize,
    pub max_depth: usize,
    pub json: bool,
    pub selector: Option<&'a str>,
    pub max_len: usize,
    pub timeout: u64,
    pub settle_ms: u64,
    pub include_glob: Option<&'a [String]>,
    pub exclude_glob: Option<&'a [String]>,
}

pub(crate) async fn crawl_pages(opts: CrawlToolOptions<'_>) -> Result<Vec<(String, String)>, ErrorData> {
    let limit = opts.limit.min(MAX_MCP_CRAWL_PAGES);

    let mut builder = servo_fetch::CrawlOptions::new(opts.url)
        .limit(limit)
        .max_depth(opts.max_depth)
        .timeout(Duration::from_secs(opts.timeout))
        .settle(Duration::from_millis(opts.settle_ms))
        .json(opts.json);
    if let Some(selector) = opts.selector {
        builder = builder.selector(selector);
    }

    if let Some(globs) = opts.include_glob.filter(|g| !g.is_empty()) {
        let refs: Vec<&str> = globs.iter().map(String::as_str).collect();
        builder = builder.include(&refs);
    }
    if let Some(globs) = opts.exclude_glob.filter(|g| !g.is_empty()) {
        let refs: Vec<&str> = globs.iter().map(String::as_str).collect();
        builder = builder.exclude(&refs);
    }

    let mut results = Vec::new();
    servo_fetch::crawl_each(builder, |r| {
        let text = match &r.outcome {
            Ok(page) => paginate(&servo_fetch::sanitize::sanitize(&page.content), 0, opts.max_len),
            Err(e) => format!("[error] {e}"),
        };
        results.push((r.url.clone(), text));
    })
    .map_err(|e| ErrorData::invalid_params(format!("{e:#}"), None))?;

    Ok(results)
}