basemind 0.2.1

Full AI context layer over MCP — tree-sitter code-map, document RAG (PDF/Office/HTML/email + OCR + reranker), shared agent memory, on-demand web crawl, git history + blame + per-symbol diff. 300+ languages, 8 coding-agent harnesses, content-addressed Fjall + LanceDB.
//! Build the shared `kreuzcrawl` engine handle from a `CrawlConfig`.
//!
//! The engine holds the reqwest client, robots.txt cache, and (when configured)
//! dispatch policy. It is cheap to clone (`Arc`-backed) and is created once
//! per `BasemindServer` boot.

use std::time::Duration;

use anyhow::{Context, Result};
use kreuzcrawl::{CrawlConfig as KcCrawlConfig, CrawlEngineHandle, create_engine};

use crate::config::CrawlConfig;

/// Translate basemind's `CrawlConfig` into kreuzcrawl's runtime config and
/// instantiate the engine.
///
/// The page, depth, body-size, robots.txt and user-agent settings are copied
/// from `cfg`; concurrency and the per-request timeout are fixed here, and
/// every other kreuzcrawl option keeps its `Default` value.
///
/// A warning is logged when `respect_robots_txt` is disabled.
///
/// # Errors
///
/// Returns an error when:
/// - the user-supplied user-agent is empty or consists only of whitespace,
/// - `max_pages`, `max_depth` or `max_body_size` does not fit in `usize`,
/// - `create_engine` rejects the translated config.
///
/// All of these indicate a configuration bug rather than a transient network
/// issue.
pub fn build_engine(cfg: &CrawlConfig) -> Result<CrawlEngineHandle> {
    // Enforce the documented contract up front: a blank user-agent must be
    // rejected here instead of being forwarded to the engine. `Context` on an
    // `Option` converts the `None` produced for a blank value into an error.
    (!cfg.user_agent.trim().is_empty())
        .then_some(())
        .context("crawl.user_agent must not be empty")?;

    // The config stores these limits in a type that needs a checked
    // conversion to `usize`; fail loudly instead of truncating.
    let max_pages = usize::try_from(cfg.max_pages).context("max_pages exceeds usize")?;
    let max_depth = usize::try_from(cfg.max_depth).context("max_depth exceeds usize")?;
    let max_body_size =
        usize::try_from(cfg.max_body_size).context("max_body_size exceeds usize")?;

    if !cfg.respect_robots_txt {
        tracing::warn!(
            "crawl.respect_robots_txt is disabled — basemind will fetch URLs that robots.txt forbids"
        );
    }

    let kc_cfg = KcCrawlConfig {
        max_pages: Some(max_pages),
        max_depth: Some(max_depth),
        respect_robots_txt: cfg.respect_robots_txt,
        user_agent: Some(cfg.user_agent.clone()),
        max_body_size: Some(max_body_size),
        // Reasonable defaults for an in-process MCP server: cap concurrency low
        // so a single agent call doesn't saturate a destination host, and keep
        // the per-request timeout tight enough that a stuck fetch can't hang
        // the MCP loop.
        max_concurrent: Some(4),
        request_timeout: Duration::from_secs(30),
        ..Default::default()
    };

    create_engine(Some(kc_cfg)).context("create kreuzcrawl engine")
}