// tidev 0.2.0 - Docs.rs

use anyhow::{Context, Result, bail};
use pulldown_cmark::{Event, Options as MarkdownOptions, Parser as MarkdownParser, Tag, TagEnd};
use reqwest::header::{ACCEPT, ACCEPT_LANGUAGE, CONTENT_TYPE, HeaderMap, HeaderValue, USER_AGENT};
use reqwest::{Client, StatusCode};
use serde::Deserialize;
use serde_json::{Value, json};
use std::time::Duration;
use tokio::time::timeout;
use url::Url;

use crate::session::ToolCall;
use crate::tooling::tools::{WebFetchArgs as WebFetchToolArgs, WebSearchArgs as WebSearchToolArgs};
use crate::tooling::{ToolDefinition, ToolPermission};

// Default endpoint for search requests; `WEBTOOLS_EXA_URL` overrides it at runtime.
const EXA_URL: &str = "https://mcp.exa.ai/mcp";
// Upper bound on a single web search round trip.
const SEARCH_TIMEOUT: Duration = Duration::from_secs(25);
// Fetch timeout used when the caller does not supply one.
const FETCH_DEFAULT_TIMEOUT: Duration = Duration::from_secs(30);
// Largest fetch timeout a caller may request; bigger values are clamped to this.
const FETCH_MAX_TIMEOUT: Duration = Duration::from_secs(120);
// Maximum accepted size of a fetched response body (5 MiB).
const MAX_RESPONSE_BYTES: usize = 5 * 1024 * 1024;

/// Returns the tool definitions exposed by this module: `websearch` followed by `webfetch`.
pub fn definitions() -> Vec<ToolDefinition> {
    let search = ToolDefinition::new::<WebSearchToolArgs>(
        "websearch",
        "Search the web using Exa and return a concise text summary.",
        ToolPermission::Search,
    );
    let fetch = ToolDefinition::new::<WebFetchToolArgs>(
        "webfetch",
        "Fetch a web page as text, markdown, or HTML.",
        ToolPermission::Read,
    );

    Vec::from([search, fetch])
}

pub fn execute_tool_call(
    _workspace_root: &std::path::Path,
    call: &ToolCall,
    _max_output_bytes: usize,
) -> Result<String> {
    let arguments: Value = serde_json::from_str(&call.arguments)
        .with_context(|| format!("failed to parse arguments for tool '{}'", call.name))?;

    match crate::tooling::canonical_tool_name(&call.name) {
        Some("websearch") => {
            let args = serde_json::from_value::<SearchArgs>(arguments)
                .with_context(|| format!("failed to decode arguments for tool '{}'", call.name))?;
            run_webtools(async { WebToolsClient::new()?.search(args).await })
        }
        Some("webfetch") => {
            let args = serde_json::from_value::<FetchArgs>(arguments)
                .with_context(|| format!("failed to decode arguments for tool '{}'", call.name))?;
            run_webtools(async { WebToolsClient::new()?.fetch(args).await })
        }
        Some(other) => bail!("unsupported web tool '{}'", other),
        None => bail!("unknown tool '{}'", call.name),
    }
}

/// HTTP client used by the web search and web fetch tools.
struct WebToolsClient {
    /// Underlying reqwest client that sends every outgoing request.
    http: Client,
    /// Endpoint that search requests are POSTed to.
    exa_url: String,
}

impl WebToolsClient {
    /// Builds a client with the `tidev-webtools/0.1` user agent.
    ///
    /// The search endpoint is read from the `WEBTOOLS_EXA_URL` environment
    /// variable and falls back to [`EXA_URL`] when the variable is unset or
    /// unreadable.
    ///
    /// # Errors
    ///
    /// Fails when the underlying HTTP client cannot be constructed.
    fn new() -> Result<Self> {
        let http = Client::builder()
            .user_agent("tidev-webtools/0.1")
            .build()
            .context("failed to construct web tools HTTP client")?;

        let exa_url = std::env::var("WEBTOOLS_EXA_URL").unwrap_or_else(|_| EXA_URL.to_string());

        Ok(Self { http, exa_url })
    }

    /// Sends a `web_search_exa` JSON-RPC tool call to the search endpoint and
    /// returns the text of the first result found in the response.
    ///
    /// The query is trimmed before use. Search types other than `"fast"` and
    /// `"deep"` are sent as `"auto"`, and eight results are requested unless
    /// `num_results` is given. When the response carries no result text a
    /// fixed "no results" message is returned instead.
    ///
    /// # Errors
    ///
    /// Fails when the query is empty, the request cannot be sent, the server
    /// answers with a non-success status, the exchange exceeds
    /// [`SEARCH_TIMEOUT`], or the response payload cannot be parsed.
    async fn search(&self, args: SearchArgs) -> Result<String> {
        let query = args.query.trim();
        if query.is_empty() {
            bail!("query cannot be empty");
        }

        let search_type = match args.search_type.as_deref() {
            Some("fast") => "fast",
            Some("deep") => "deep",
            _ => "auto",
        };

        let payload = json!({
            "jsonrpc": "2.0",
            "id": 1,
            "method": "tools/call",
            "params": {
                "name": "web_search_exa",
                "arguments": {
                    "query": query,
                    "type": search_type,
                    "numResults": args.num_results.unwrap_or(8),
                    "livecrawl": "fallback",
                    "contextMaxCharacters": null,
                }
            }
        });

        // The timeout wraps both sending the request and reading the body.
        let body = timeout(SEARCH_TIMEOUT, async {
            let response = self
                .http
                .post(&self.exa_url)
                .header(ACCEPT, "application/json, text/event-stream")
                .json(&payload)
                .send()
                .await
                .context("failed to send web search request")?;

            if !response.status().is_success() {
                bail!(
                    "web search request failed with status {}",
                    response.status()
                );
            }

            response
                .text()
                .await
                .context("failed to read web search response")
        })
        .await
        .context("web search request timed out")??;

        let text = parse_exa_sse(&body)?.unwrap_or_else(|| {
            "No search results found. Please try a different query.".to_string()
        });

        Ok(text)
    }

    /// Downloads `args.url` and returns its content in the requested format.
    ///
    /// Formats other than `"text"` and `"html"` are treated as markdown. HTML
    /// responses are converted to markdown or plain text as requested; other
    /// content types are returned unchanged. Images (except SVG) are
    /// downloaded but only acknowledged with a short message.
    ///
    /// The timeout (seconds, default [`FETCH_DEFAULT_TIMEOUT`], capped at
    /// [`FETCH_MAX_TIMEOUT`]) covers the whole download, body included.
    ///
    /// # Errors
    ///
    /// Fails when the URL is invalid or not http(s), the request fails or
    /// returns a non-success status, the download times out, or the body
    /// exceeds [`MAX_RESPONSE_BYTES`].
    async fn fetch(&self, args: FetchArgs) -> Result<String> {
        let url = validate_url(&args.url)?;
        let format = match args.format.as_deref() {
            Some("text") => WebFetchFormat::Text,
            Some("html") => WebFetchFormat::Html,
            _ => WebFetchFormat::Markdown,
        };

        let timeout_secs = args
            .timeout
            .unwrap_or(FETCH_DEFAULT_TIMEOUT.as_secs())
            .min(FETCH_MAX_TIMEOUT.as_secs());
        let duration = Duration::from_secs(timeout_secs);
        let headers = fetch_headers(format);

        // One deadline for headers and body, so a stalled body cannot hang the tool.
        let (mime, bytes) = timeout(duration, self.download(&url, headers))
            .await
            .context("fetch request timed out")??;

        if is_image_mime(&mime) {
            return Ok(format!("Image fetched successfully ({})", mime));
        }

        let body = String::from_utf8_lossy(&bytes).into_owned();
        let output = match format {
            WebFetchFormat::Html => body,
            WebFetchFormat::Markdown => {
                if mime.contains("html") {
                    html2md::rewrite_html(&body, false)
                } else {
                    body
                }
            }
            WebFetchFormat::Text => {
                if mime.contains("html") {
                    markdown_to_text(&html2md::rewrite_html(&body, false))
                } else {
                    body
                }
            }
        };

        Ok(output)
    }

    /// Requests `url` and reads its body, returning the lower-cased media type
    /// (without parameters, `text/plain` when absent) together with the bytes.
    ///
    /// The body is read chunk by chunk so the size limit is enforced while
    /// downloading rather than after the whole body has been buffered.
    async fn download(&self, url: &Url, headers: HeaderMap) -> Result<(String, Vec<u8>)> {
        let mut response = self.fetch_response(url, headers).await?;

        let mime = response
            .headers()
            .get(CONTENT_TYPE)
            .and_then(|value| value.to_str().ok())
            .and_then(|value| value.split(';').next())
            .map(str::trim)
            .filter(|value| !value.is_empty())
            .unwrap_or("text/plain")
            .to_ascii_lowercase();

        // Cheap early rejection when the server declares the size up front.
        if let Some(length) = response.content_length()
            && length > MAX_RESPONSE_BYTES as u64
        {
            bail!("response too large (exceeds 5MB limit)");
        }

        let mut body = Vec::new();
        while let Some(chunk) = response
            .chunk()
            .await
            .context("failed to read response body")?
        {
            if body.len() + chunk.len() > MAX_RESPONSE_BYTES {
                bail!("response too large (exceeds 5MB limit)");
            }
            body.extend_from_slice(&chunk);
        }

        Ok((mime, body))
    }

    /// Sends a GET request for `url` and returns the response once its status
    /// is a success.
    ///
    /// A 403 response marked `cf-mitigated: challenge` is retried once with
    /// the user agent replaced by `opencode`; the retried response goes
    /// through the same status check as a first-try response.
    ///
    /// # Errors
    ///
    /// Fails when either request cannot be sent or the final response has a
    /// non-success status.
    async fn fetch_response(&self, url: &Url, headers: HeaderMap) -> Result<reqwest::Response> {
        let mut response = self
            .http
            .get(url.clone())
            .headers(headers.clone())
            .send()
            .await
            .context("failed to send fetch request")?;

        let challenged = response.status() == StatusCode::FORBIDDEN
            && response
                .headers()
                .get("cf-mitigated")
                .and_then(|value| value.to_str().ok())
                == Some("challenge");

        if challenged {
            let mut retry = headers;
            retry.insert(USER_AGENT, HeaderValue::from_static("opencode"));
            response = self
                .http
                .get(url.clone())
                .headers(retry)
                .send()
                .await
                .context("failed to retry fetch request")?;
        }

        if !response.status().is_success() {
            bail!("fetch request failed with status {}", response.status());
        }

        Ok(response)
    }
}

/// Drives `future` to completion on a freshly built current-thread Tokio
/// runtime and returns its result.
///
/// # Errors
///
/// Fails when the runtime cannot be built, or with whatever error the future
/// itself produces.
fn run_webtools<T>(future: impl std::future::Future<Output = Result<T>>) -> Result<T> {
    tokio::runtime::Builder::new_current_thread()
        .enable_all()
        .build()
        .context("failed to construct webtools runtime")
        .and_then(|runtime| runtime.block_on(future))
}

/// Arguments decoded from a `websearch` tool call.
#[derive(Clone, Debug, Deserialize)]
struct SearchArgs {
    /// Search query; it is trimmed before use and must not be empty.
    query: String,
    /// Number of results to request; 8 is used when absent.
    num_results: Option<i64>,
    /// `"fast"` or `"deep"`; any other value (or none) is sent as `"auto"`.
    search_type: Option<String>,
}

/// Arguments decoded from a `webfetch` tool call.
#[derive(Clone, Debug, Deserialize)]
struct FetchArgs {
    /// Address to fetch; only `http` and `https` URLs are accepted.
    url: String,
    /// `"text"` or `"html"`; any other value (or none) selects markdown.
    format: Option<String>,
    /// Timeout in seconds; defaults to `FETCH_DEFAULT_TIMEOUT` and is capped at
    /// `FETCH_MAX_TIMEOUT`.
    timeout: Option<u64>,
}

/// Output representation requested for a fetched page.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum WebFetchFormat {
    /// Plain text; HTML responses are converted to markdown and then stripped of markup.
    Text,
    /// Markdown; HTML responses are converted, other content is returned as-is.
    Markdown,
    /// The response body exactly as received.
    Html,
}

/// Extracts the first result text from a server-sent-events response body.
///
/// Every non-empty `data:` line is parsed as JSON, in order. The first payload
/// that has a string at `result.content[0].text` wins. `Ok(None)` means no
/// payload carried such a text.
///
/// # Errors
///
/// Fails when a `data:` payload reached before a match is not valid JSON.
fn parse_exa_sse(body: &str) -> Result<Option<String>> {
    /// Looks up `result.content[0].text` in a decoded event.
    fn first_result_text(event: &serde_json::Value) -> Option<&str> {
        event
            .get("result")?
            .get("content")?
            .as_array()?
            .first()?
            .get("text")?
            .as_str()
    }

    let payloads = body
        .lines()
        .filter_map(|line| line.strip_prefix("data:"))
        .map(str::trim)
        .filter(|payload| !payload.is_empty());

    for payload in payloads {
        let event: serde_json::Value =
            serde_json::from_str(payload).context("failed to parse Exa SSE payload")?;

        if let Some(text) = first_result_text(&event) {
            return Ok(Some(text.to_owned()));
        }
    }

    Ok(None)
}

/// Builds the request headers for a page fetch: a desktop-browser user agent,
/// an English `Accept-Language`, and an `Accept` header whose preference order
/// depends on the requested output format.
fn fetch_headers(format: WebFetchFormat) -> HeaderMap {
    const BROWSER_USER_AGENT: &str = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36";
    const PREFERRED_LANGUAGES: &str = "en-US,en;q=0.9";

    let accept: &'static str = match format {
        WebFetchFormat::Markdown => {
            "text/markdown;q=1.0, text/x-markdown;q=0.9, text/plain;q=0.8, text/html;q=0.7, */*;q=0.1"
        }
        WebFetchFormat::Text => "text/plain;q=1.0, text/markdown;q=0.9, text/html;q=0.8, */*;q=0.1",
        WebFetchFormat::Html => {
            "text/html;q=1.0, application/xhtml+xml;q=0.9, text/plain;q=0.8, text/markdown;q=0.7, */*;q=0.1"
        }
    };

    let mut headers = HeaderMap::new();
    for (name, value) in [
        (USER_AGENT, BROWSER_USER_AGENT),
        (ACCEPT_LANGUAGE, PREFERRED_LANGUAGES),
        (ACCEPT, accept),
    ] {
        headers.insert(name, HeaderValue::from_static(value));
    }
    headers
}

/// Parses `value` as a URL and accepts it only when its scheme is `http` or `https`.
///
/// # Errors
///
/// Fails when `value` is not a parseable URL or uses any other scheme.
fn validate_url(value: &str) -> Result<Url> {
    let parsed = Url::parse(value).with_context(|| format!("invalid URL '{value}'"))?;

    if !matches!(parsed.scheme(), "http" | "https") {
        bail!("URL must start with http:// or https://");
    }

    Ok(parsed)
}

/// Reports whether `mime` names an image type other than SVG.
///
/// The comparison is exact, so callers are expected to pass a lower-cased
/// media type without parameters.
fn is_image_mime(mime: &str) -> bool {
    mime.strip_prefix("image/")
        .is_some_and(|subtype| subtype != "svg+xml")
}

/// Renders markdown as plain text by keeping only its textual content.
///
/// Strikethrough and table extensions are enabled for parsing. Block-level
/// elements and line breaks end the current output line, code block content is
/// copied verbatim, and other text is appended segment by segment. The result
/// is passed through `normalize_plain_text` before being returned.
fn markdown_to_text(markdown: &str) -> String {
    /// Ends the current line unless the buffer already sits at a line start.
    fn break_line(buffer: &mut String) {
        if !buffer.ends_with('\n') {
            buffer.push('\n');
        }
    }

    let options = MarkdownOptions::ENABLE_STRIKETHROUGH | MarkdownOptions::ENABLE_TABLES;
    let mut plain = String::new();
    let mut inside_code = false;

    for event in MarkdownParser::new_ext(markdown, options) {
        match event {
            Event::Start(Tag::CodeBlock(_)) => {
                // No leading newline when the code block opens the document.
                if !plain.is_empty() {
                    break_line(&mut plain);
                }
                inside_code = true;
            }
            Event::End(TagEnd::CodeBlock) => {
                break_line(&mut plain);
                inside_code = false;
            }
            Event::Start(tag) => {
                if is_block_tag(&tag) && !plain.is_empty() {
                    break_line(&mut plain);
                }
            }
            Event::End(tag_end) => {
                if is_block_tag_end(&tag_end) {
                    break_line(&mut plain);
                }
            }
            Event::Text(text)
            | Event::Code(text)
            | Event::Html(text)
            | Event::InlineHtml(text)
            | Event::InlineMath(text)
            | Event::DisplayMath(text) => {
                append_text_segment(&mut plain, &text, inside_code);
            }
            Event::SoftBreak | Event::HardBreak => break_line(&mut plain),
            _ => {}
        }
    }

    normalize_plain_text(plain)
}

/// Appends one parser text segment to `output`.
///
/// Inside a code block the segment is copied verbatim. Elsewhere the segment
/// is trimmed, and a single space is emitted only where the segment itself had
/// leading or trailing whitespace. Adjacent segments that the parser split in
/// the middle of a word or before punctuation (for example `snake` + `_case`,
/// or `world` + `!`) are therefore joined without a spurious space.
///
/// A whitespace-only segment contributes at most one separating space.
fn append_text_segment(output: &mut String, text: &str, in_code_block: bool) {
    if in_code_block {
        output.push_str(text);
        return;
    }

    // True when the output currently ends in a non-whitespace character, i.e.
    // a separating space would actually be visible.
    let ends_in_text = matches!(output.chars().last(), Some(last) if !last.is_whitespace());
    let trimmed = text.trim();

    if trimmed.is_empty() {
        // Whitespace between two inline elements still separates their words.
        if !text.is_empty() && ends_in_text {
            output.push(' ');
        }
        return;
    }

    if ends_in_text && text.starts_with(char::is_whitespace) {
        output.push(' ');
    }

    output.push_str(trimmed);

    // Keep the word boundary for whatever segment comes next.
    if text.ends_with(char::is_whitespace) {
        output.push(' ');
    }
}

/// Tidies plain text line by line.
///
/// Each line is stripped of leading and trailing whitespace, lines that are
/// blank after stripping are dropped, and the remaining lines are joined with
/// single newlines. The result has no leading or trailing whitespace.
fn normalize_plain_text(text: String) -> String {
    let mut normalized = String::with_capacity(text.len());

    let kept_lines = text
        .lines()
        .map(str::trim)
        .filter(|line| !line.is_empty());

    for line in kept_lines {
        if !normalized.is_empty() {
            normalized.push('\n');
        }
        normalized.push_str(line);
    }

    normalized
}

/// Reports whether an opening tag is one of the block-level elements that
/// should begin on a fresh output line.
fn is_block_tag(tag: &Tag<'_>) -> bool {
    let is_leaf_block = matches!(
        tag,
        Tag::Paragraph
            | Tag::Heading { .. }
            | Tag::CodeBlock(_)
            | Tag::HtmlBlock
            | Tag::MetadataBlock(_)
    );
    let is_container = matches!(
        tag,
        Tag::BlockQuote(_) | Tag::List(_) | Tag::Item | Tag::FootnoteDefinition(_)
    );
    let is_definition_list_part = matches!(
        tag,
        Tag::DefinitionList | Tag::DefinitionListTitle | Tag::DefinitionListDefinition
    );
    let is_table_part = matches!(
        tag,
        Tag::Table(_) | Tag::TableHead | Tag::TableRow | Tag::TableCell
    );

    is_leaf_block || is_container || is_definition_list_part || is_table_part
}

/// Reports whether a closing tag ends one of the block-level elements that
/// should be followed by a line break.
///
/// `TagEnd::CodeBlock` is not in this set.
fn is_block_tag_end(tag: &TagEnd) -> bool {
    let is_leaf_block = matches!(
        tag,
        TagEnd::Paragraph | TagEnd::Heading(_) | TagEnd::HtmlBlock | TagEnd::MetadataBlock(_)
    );
    let is_container = matches!(
        tag,
        TagEnd::BlockQuote(_) | TagEnd::List(_) | TagEnd::Item | TagEnd::FootnoteDefinition
    );
    let is_definition_list_part = matches!(
        tag,
        TagEnd::DefinitionList | TagEnd::DefinitionListTitle | TagEnd::DefinitionListDefinition
    );
    let is_table_part = matches!(
        tag,
        TagEnd::Table | TagEnd::TableHead | TagEnd::TableRow | TagEnd::TableCell
    );

    is_leaf_block || is_container || is_definition_list_part || is_table_part
}