// crw-cli 0.15.2
//
// crw — Unified CLI for web scraping, crawling, search, and serving
//! MCP subcommand — start the Model Context Protocol server.
//!
//! Supports two modes:
//! - **Embedded (default)** — Self-contained scraping engine. No external server needed.
//! - **Proxy** — Forwards tool calls to a remote CRW server over HTTP.

use crate::teardown::CmdError;
use clap::Args;
use crw_core::mcp::{
    JsonRpcRequest, JsonRpcResponse, ProtocolResult, handle_protocol_method, tool_result_response,
};
use serde_json::{Value, json};
use tokio::io::{AsyncBufReadExt, AsyncWriteExt, BufReader};
use tracing_subscriber::EnvFilter;

#[cfg(feature = "mcp-embedded")]
use crw_renderer::browser;

/// Server name passed to `handle_protocol_method` and printed in the startup log.
const SERVER_NAME: &str = "crw-mcp";
/// Server version, taken from this crate's `CARGO_PKG_VERSION` at compile time.
const SERVER_VERSION: &str = env!("CARGO_PKG_VERSION");

// Command-line arguments for the MCP subcommand.
//
// `api_url` selects the mode: when a URL is resolved the server runs as a
// proxy, otherwise `run` falls back to embedded mode. Every field can also be
// supplied through the environment variable named in its `env = "..."`
// attribute.
//
// NOTE: the `///` comments on the fields are consumed by clap's derive macro
// as user-facing help text, so they are part of the CLI output rather than
// ordinary documentation.
#[derive(Args)]
pub struct McpArgs {
    /// Remote CRW server URL. Enables proxy mode.
    /// Without this flag, runs in embedded mode (self-contained).
    #[arg(long, env = "CRW_API_URL")]
    pub api_url: Option<String>,

    /// API key for remote server authentication.
    #[arg(long, env = "CRW_API_KEY")]
    pub api_key: Option<String>,

    /// Config file path (embedded mode only, overrides config.local.toml).
    #[arg(long, env = "CRW_CONFIG")]
    pub config: Option<String>,
}

// --- Backend ---

/// Where MCP tool calls are executed.
enum Backend {
    /// Forward each tool call over HTTP to a remote CRW server.
    Proxy {
        /// HTTP client reused for every forwarded request.
        client: reqwest::Client,
        /// Base URL that the `/v1/...` endpoint paths are appended to.
        base_url: String,
        /// Optional key, sent as an `authorization: Bearer ...` header.
        api_key: Option<String>,
    },
    /// Execute tool calls in-process against a locally built application
    /// state. Only compiled when the `mcp-embedded` feature is enabled.
    #[cfg(feature = "mcp-embedded")]
    Embedded { state: crw_server::state::AppState },
}

impl Backend {
    async fn call_tool(&self, tool_name: &str, args: Value) -> Result<Value, String> {
        match self {
            Backend::Proxy {
                client,
                base_url,
                api_key,
            } => proxy_call_tool(client, base_url, api_key, tool_name, args).await,
            #[cfg(feature = "mcp-embedded")]
            Backend::Embedded { state } => {
                crw_server::routes::mcp::call_tool(state, tool_name, args).await
            }
        }
    }

    fn is_proxy(&self) -> bool {
        matches!(self, Backend::Proxy { .. })
    }

    async fn handle_request(&self, req: JsonRpcRequest) -> Option<JsonRpcResponse> {
        match handle_protocol_method(SERVER_NAME, SERVER_VERSION, &req, self.is_proxy()) {
            ProtocolResult::Response(resp) => return Some(resp),
            ProtocolResult::Notification => return None,
            ProtocolResult::NotHandled => {}
        }

        match req.method.as_str() {
            "tools/call" => {
                let id = req.id.unwrap_or(Value::Null);
                let tool_name = req
                    .params
                    .get("name")
                    .and_then(|v| v.as_str())
                    .unwrap_or("");
                let arguments = req.params.get("arguments").cloned().unwrap_or(json!({}));

                let result = self.call_tool(tool_name, arguments).await;
                Some(tool_result_response(id, tool_name, result))
            }

            _ => {
                if let Some(id) = req.id {
                    Some(JsonRpcResponse::error(
                        id,
                        -32601,
                        format!("method not found: {}", req.method),
                    ))
                } else {
                    None
                }
            }
        }
    }
}

// --- Proxy mode HTTP dispatch ---

// Per-request timeouts used by `proxy_call_tool`, one per forwarded endpoint.

/// Timeout for `crw_scrape` (`POST /v1/scrape`).
const TIMEOUT_SCRAPE: std::time::Duration = std::time::Duration::from_secs(120);
/// Timeout for `crw_crawl` (`POST /v1/crawl`).
const TIMEOUT_CRAWL_KICKOFF: std::time::Duration = std::time::Duration::from_secs(120);
/// Timeout for `crw_check_crawl_status` (`GET /v1/crawl/{id}`).
const TIMEOUT_CRAWL_STATUS: std::time::Duration = std::time::Duration::from_secs(30);
/// Timeout for `crw_map` (`POST /v1/map`).
const TIMEOUT_MAP: std::time::Duration = std::time::Duration::from_secs(180);
/// Timeout for `crw_search` (`POST /v1/search`).
const TIMEOUT_SEARCH: std::time::Duration = std::time::Duration::from_secs(120);

async fn proxy_call_tool(
    client: &reqwest::Client,
    base_url: &str,
    api_key: &Option<String>,
    tool_name: &str,
    args: Value,
) -> Result<Value, String> {
    let mut headers = reqwest::header::HeaderMap::new();
    headers.insert("content-type", "application/json".parse().unwrap());
    if let Some(key) = api_key {
        headers.insert(
            "authorization",
            format!("Bearer {key}")
                .parse()
                .map_err(|e| format!("invalid api key: {e}"))?,
        );
    }

    match tool_name {
        "crw_scrape" => {
            let resp = client
                .post(format!("{base_url}/v1/scrape"))
                .headers(headers)
                .timeout(TIMEOUT_SCRAPE)
                .json(&args)
                .send()
                .await
                .map_err(|e| format!("HTTP request failed: {e}"))?;
            parse_response(resp).await
        }
        "crw_crawl" => {
            let resp = client
                .post(format!("{base_url}/v1/crawl"))
                .headers(headers)
                .timeout(TIMEOUT_CRAWL_KICKOFF)
                .json(&args)
                .send()
                .await
                .map_err(|e| format!("HTTP request failed: {e}"))?;
            parse_response(resp).await
        }
        "crw_check_crawl_status" => {
            let id = args
                .get("id")
                .and_then(|v| v.as_str())
                .ok_or("missing required parameter: id")?;
            let resp = client
                .get(format!("{base_url}/v1/crawl/{id}"))
                .headers(headers)
                .timeout(TIMEOUT_CRAWL_STATUS)
                .send()
                .await
                .map_err(|e| format!("HTTP request failed: {e}"))?;
            parse_response(resp).await
        }
        "crw_map" => {
            let resp = client
                .post(format!("{base_url}/v1/map"))
                .headers(headers)
                .timeout(TIMEOUT_MAP)
                .json(&args)
                .send()
                .await
                .map_err(|e| format!("HTTP request failed: {e}"))?;
            parse_response(resp).await
        }
        "crw_search" => {
            let resp = client
                .post(format!("{base_url}/v1/search"))
                .headers(headers)
                .timeout(TIMEOUT_SEARCH)
                .json(&args)
                .send()
                .await
                .map_err(|e| format!("HTTP request failed: {e}"))?;
            parse_response(resp).await
        }
        _ => Err(format!("unknown tool: {tool_name}")),
    }
}

async fn parse_response(resp: reqwest::Response) -> Result<Value, String> {
    let status = resp.status();
    let body = resp
        .text()
        .await
        .map_err(|e| format!("failed to read response: {e}"))?;

    if !status.is_success() {
        return Err(format!("API error ({}): {}", status, truncate(&body, 500)));
    }

    serde_json::from_str(&body).map_err(|e| format!("invalid JSON response: {e}"))
}

/// Returns the longest prefix of `s` that is at most `max` bytes long and ends
/// on a UTF-8 character boundary.
///
/// `s` is returned unchanged when it already fits. The cut never splits a
/// multi-byte character, so the result may be shorter than `max` bytes.
fn truncate(s: &str, max: usize) -> &str {
    if s.len() <= max {
        return s;
    }

    // Walk back to the nearest boundary by hand instead of calling
    // `str::floor_char_boundary`, so the helper does not require a recent
    // toolchain. Index 0 is always a boundary, so the loop terminates.
    let mut end = max;
    while !s.is_char_boundary(end) {
        end -= 1;
    }
    &s[..end]
}

// --- Main ---

pub async fn run(args: McpArgs) -> Result<(), CmdError> {
    // Log to stderr so stdout stays clean for MCP protocol
    tracing_subscriber::fmt()
        .with_writer(std::io::stderr)
        .with_env_filter(
            EnvFilter::try_from_default_env().unwrap_or_else(|_| "crw=info".parse().unwrap()),
        )
        .init();

    // Resolve api_url / api_key with the standard precedence chain:
    //   1. CLI flag / env (already merged by clap)
    //   2. `client.api_url` / `client.api_key` in ~/.config/crw/config.toml
    //   3. None — falls through to embedded mode below
    let (resolved_api_url, resolved_api_key) =
        resolve_client_credentials(args.api_url, args.api_key);

    let backend = if let Some(api_url) = resolved_api_url {
        tracing::info!("Starting {SERVER_NAME} v{SERVER_VERSION} (proxy mode)");
        tracing::info!("API URL: {api_url}");

        let client = reqwest::Client::builder()
            .redirect(crw_core::url_safety::safe_redirect_policy())
            .connect_timeout(std::time::Duration::from_secs(10))
            .build()
            .expect("reqwest client build failed");

        Backend::Proxy {
            client,
            base_url: api_url,
            api_key: resolved_api_key,
        }
    } else {
        #[cfg(feature = "mcp-embedded")]
        {
            tracing::info!("Starting {SERVER_NAME} v{SERVER_VERSION} (embedded mode)");

            if let Some(ref config_path) = args.config {
                unsafe { std::env::set_var("CRW_CONFIG", config_path) };
            }

            let mut config = crw_core::config::AppConfig::load().unwrap_or_else(|e| {
                tracing::warn!("Failed to load config, using defaults: {e}");
                crw_core::config::AppConfig {
                    server: Default::default(),
                    renderer: Default::default(),
                    crawler: Default::default(),
                    extraction: Default::default(),
                    auth: Default::default(),
                    request: Default::default(),
                    search: Default::default(),
                    map: Default::default(),
                    document: Default::default(),
                    client: Default::default(),
                }
            });

            let user_configured_renderer = std::env::var("CRW_RENDERER__LIGHTPANDA__WS_URL")
                .is_ok()
                || std::env::var("CRW_RENDERER__CHROME__WS_URL").is_ok()
                || std::env::var("CRW_RENDERER__PLAYWRIGHT__WS_URL").is_ok();

            let _browser_guards = if !user_configured_renderer {
                let browsers = browser::spawn_all_headless().await;
                if browsers.is_empty() {
                    tracing::info!(
                        "No browser found — JS rendering disabled. \
                         Install LightPanda or Chrome for full SPA support."
                    );
                }
                let mut guards = Vec::new();
                for (guard, ws_url, kind) in browsers {
                    match kind {
                        browser::RendererKind::LightPanda => {
                            config.renderer.lightpanda =
                                Some(crw_core::config::CdpEndpoint { ws_url });
                        }
                        browser::RendererKind::Chrome => {
                            config.renderer.chrome = Some(crw_core::config::CdpEndpoint { ws_url });
                        }
                    }
                    guards.push(guard);
                }
                guards
            } else {
                tracing::info!("CDP renderer already configured — skipping auto-spawn");
                Vec::new()
            };

            let state = match crw_server::state::AppState::new(config) {
                Ok(s) => s,
                Err(e) => {
                    tracing::error!("Failed to build application state: {e}");
                    return Err(CmdError::code_only(1));
                }
            };

            let backend = Backend::Embedded { state };
            run_stdio_loop(backend).await;
            drop(_browser_guards);
            return Ok(());
        }

        #[cfg(not(feature = "mcp-embedded"))]
        {
            tracing::error!(
                "Embedded mode not available (compiled without 'mcp-embedded' feature). \
                 Use --api-url to connect to a remote CRW server."
            );
            return Err(CmdError::code_only(1));
        }
    };

    run_stdio_loop(backend).await;
    Ok(())
}

async fn run_stdio_loop(backend: Backend) {
    let mut stdout = tokio::io::stdout();
    let stdin = tokio::io::stdin();
    let mut reader = BufReader::new(stdin);
    let mut line = String::new();

    loop {
        line.clear();
        match reader.read_line(&mut line).await {
            Ok(0) => break,
            Ok(_) => {}
            Err(e) => {
                tracing::error!("stdin read error: {e}");
                break;
            }
        }

        let trimmed = line.trim();
        if trimmed.is_empty() {
            continue;
        }

        tracing::debug!("← {trimmed}");

        let req: JsonRpcRequest = match serde_json::from_str(trimmed) {
            Ok(r) => r,
            Err(e) => {
                let err = JsonRpcResponse::error(Value::Null, -32700, format!("parse error: {e}"));
                let out = serde_json::to_string(&err).unwrap();
                tracing::debug!("→ {out}");
                let _ = stdout.write_all(out.as_bytes()).await;
                let _ = stdout.write_all(b"\n").await;
                let _ = stdout.flush().await;
                continue;
            }
        };

        if let Some(resp) = backend.handle_request(req).await {
            let out = serde_json::to_string(&resp).unwrap();
            tracing::debug!("→ {out}");
            let _ = stdout.write_all(out.as_bytes()).await;
            let _ = stdout.write_all(b"\n").await;
            let _ = stdout.flush().await;
        }
    }
}

/// Picks the API URL and key that decide whether proxy mode is used.
///
/// Precedence:
/// 1. A URL from the CLI / environment (already merged by clap) wins and is
///    returned together with the CLI key; the config file is not read.
/// 2. Otherwise the URL comes from `client.api_url` in
///    `~/.config/crw/config.toml`, and the key is the CLI key if present,
///    else `client.api_key` from the same file.
/// 3. If the config cannot be loaded, no URL is returned and the CLI key is
///    passed through unchanged.
///
/// The config fallback is what lets `crw setup --cloud` -> fresh shell ->
/// `crw mcp` start in proxy mode without a manual `source ~/.zshrc`.
fn resolve_client_credentials(
    cli_url: Option<String>,
    cli_key: Option<String>,
) -> (Option<String>, Option<String>) {
    if let Some(url) = cli_url {
        return (Some(url), cli_key);
    }

    let Ok(cfg) = crw_core::config::AppConfig::load() else {
        return (None, cli_key);
    };

    let client = cfg.client;
    (client.api_url, cli_key.or(client.api_key))
}