Skip to main content

hematite/tools/
research.rs

1use lazy_static::lazy_static;
2use reqwest::header::USER_AGENT;
3use serde_json::Value;
4use std::sync::Mutex;
5use std::time::Duration;
6use std::time::Instant;
7
8lazy_static! {
9    /// Time of the most recent proxy search call; `perform_search` keeps calls at least 3 seconds apart to prevent local IP blocking.
10    static ref LAST_SEARCH_CALL: Mutex<Option<Instant>> = Mutex::new(None);
11}
12
13/// tool: research_web
14///
15/// Perform a zero-cost technical search using SearXNG (if configured) or DuckDuckGo Lite.
16/// Returns snippets and titles from technical search results.
17pub async fn execute_search(args: &Value, searx_url: Option<String>) -> Result<String, String> {
18    let query = args
19        .get("query")
20        .and_then(|v| v.as_str())
21        .ok_or_else(|| "Missing required argument: 'query'".to_string())?;
22
23    // 1. First Attempt: Original Query
24    let results = perform_search(query, searx_url.as_deref()).await?;
25    if !results.is_empty() && !results.contains("No search results found") {
26        return Ok(results);
27    }
28
29    // 2. Fallback: Simplified Query if needed
30    let tier2 = query
31        .replace("2024", "")
32        .replace("2025", "")
33        .replace("2026", "")
34        .replace("crate", "")
35        .split_whitespace()
36        .collect::<Vec<_>>()
37        .join(" ");
38
39    if tier2 != query {
40        let second_results = perform_search(&tier2, searx_url.as_deref()).await?;
41        if !second_results.is_empty() && !second_results.contains("No search results found") {
42            return Ok(second_results);
43        }
44    }
45
46    Ok(
47        "No search results found. All web content was safely sanitized. Try a broader search term."
48            .to_string(),
49    )
50}
51
/// Proactively strip JSON-like structures and tool-call patterns from web content.
/// This prevents 'Prompt Injection' where a website tries to trick the agent into running commands.
///
/// Rewrites performed, in a single pass over `text`:
/// * `{` and `[` become `" ("`; `}` and `]` become `") "`.
/// * `"` becomes `'`.
/// * `<script`, in any ASCII letter case, becomes `[BLOCKED SCRIPT]`.
///
/// All other characters are copied through unchanged.
fn sanitize_web_content(text: &str) -> String {
    const SCRIPT_TAG: &[u8] = b"script";
    const SCRIPT_PLACEHOLDER: &str = "[BLOCKED SCRIPT]";

    let mut cleaned = String::with_capacity(text.len() + SCRIPT_PLACEHOLDER.len());
    let mut chars = text.char_indices();

    // `while let` rather than `for`: the iterator is advanced manually when a tag is skipped.
    while let Some((idx, ch)) = chars.next() {
        match ch {
            '{' | '[' => cleaned.push_str(" ("),
            '}' | ']' => cleaned.push_str(") "),
            '"' => cleaned.push('\''),
            '<' => {
                // '<' is one byte, so `idx + 1` is always a valid offset into the byte slice.
                let tail = &text.as_bytes()[idx + 1..];
                let opens_script = tail
                    .get(..SCRIPT_TAG.len())
                    .map_or(false, |head| head.eq_ignore_ascii_case(SCRIPT_TAG));

                if opens_script {
                    cleaned.push_str(SCRIPT_PLACEHOLDER);
                    // The matched bytes are ASCII, so skipping that many chars consumes
                    // exactly the tag name.
                    chars.nth(SCRIPT_TAG.len() - 1);
                } else {
                    cleaned.push('<');
                }
            }
            other => cleaned.push(other),
        }
    }

    cleaned
}
62
63async fn perform_search(query: &str, searx_url: Option<&str>) -> Result<String, String> {
64    // 1. Try Local SearXNG if configured OR auto-detect on default port (8080)
65    let effective_url = searx_url.unwrap_or("http://localhost:8080");
66
67    match perform_searx_search(query, effective_url).await {
68        Ok(results) if !results.is_empty() => return Ok(results),
69        _ => {
70            // Silently fall back to Jina if SearXNG is unreachable or empty.
71            // Note: perform_searx_search has its own timeout to prevent blocking.
72        }
73    }
74
75    // 2. Respect Rate Limits (even for proxy, to be a good citizen)
76    let sleep_duration = {
77        if let Ok(last_call) = LAST_SEARCH_CALL.lock() {
78            last_call.and_then(|instant| {
79                let elapsed = instant.elapsed();
80                if elapsed < Duration::from_secs(3) {
81                    Some(Duration::from_secs(3) - elapsed)
82                } else {
83                    None
84                }
85            })
86        } else {
87            None
88        }
89    };
90    if let Some(duration) = sleep_duration {
91        tokio::time::sleep(duration).await;
92    }
93    if let Ok(mut last_call) = LAST_SEARCH_CALL.lock() {
94        *last_call = Some(Instant::now());
95    }
96
97    // 3. Construct Jina Search Proxy URL
98    // s.jina.ai converts search results into clean markdown for agents.
99    let encoded = percent_encoding::utf8_percent_encode(query, percent_encoding::NON_ALPHANUMERIC);
100    let search_url = format!("https://s.jina.ai/{}", encoded);
101
102    let client = reqwest::Client::builder()
103        .timeout(Duration::from_secs(20))
104        .build()
105        .map_err(|e| format!("Failed to build client: {e}"))?;
106
107    let mut request = client.get(&search_url)
108        .header(USER_AGENT, "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36");
109
110    // 3.5 Optional: Inject Jina API Key if available in environment
111    if let Ok(key) = std::env::var("JINA_API_KEY") {
112        request = request.header("Authorization", format!("Bearer {}", key));
113    }
114
115    let response = request
116        .send()
117        .await
118        .map_err(|e| format!("Failed to connect to search proxy: {e}"))?;
119
120    let markdown = response
121        .text()
122        .await
123        .map_err(|e| format!("Failed to read search response: {e}"))?;
124
125    // 4. Safety First: Detect HTML/Captcha leaks and sanitize content.
126    if markdown.trim().starts_with("<!doctype html") || markdown.contains("<html") {
127        return Err("Search proxy returned raw HTML (possibly a rate limit or captcha). Falling back to internal reasoning.".into());
128    }
129
130    Ok(format!(
131        "[Source: Jina Search Proxy]\n\n{}",
132        sanitize_web_content(&markdown)
133    ))
134}
135
136async fn perform_searx_search(query: &str, base_url: &str) -> Result<String, String> {
137    let client = reqwest::Client::builder()
138        .timeout(Duration::from_secs(15))
139        .build()
140        .map_err(|e| format!("Failed to build SearXNG client: {e}"))?;
141
142    // Base URL should not have trailing slash for consistency
143    let base = base_url.trim_end_matches('/');
144    let search_url = format!(
145        "{}/search?q={}&format=json",
146        base,
147        urlencoding::encode(query)
148    );
149
150    let response = client
151        .get(&search_url)
152        .header(USER_AGENT, "Hematite-CLI/0.6.0")
153        .send()
154        .await
155        .map_err(|e| format!("SearXNG connection failed: {e}"))?;
156
157    if !response.status().is_success() {
158        return Err(format!("SearXNG returned error: {}", response.status()));
159    }
160
161    let json: Value = response
162        .json()
163        .await
164        .map_err(|e| format!("Failed to parse SearXNG JSON: {e}"))?;
165
166    let mut output = String::new();
167    output.push_str("[Source: SearXNG (Local/Auto-Detected)]\n\n");
168    output.push_str(&format!("# Search results for: {}\n\n", query));
169
170    if let Some(results) = json.get("results").and_then(|r| r.as_array()) {
171        for (i, res) in results.iter().take(10).enumerate() {
172            let title = res
173                .get("title")
174                .and_then(|v| v.as_str())
175                .unwrap_or("No Title");
176            let url = res.get("url").and_then(|v| v.as_str()).unwrap_or("#");
177            let content = res.get("content").and_then(|v| v.as_str()).unwrap_or("");
178
179            output.push_str(&format!(
180                "### {}. [{}]({})\n{}\n\n",
181                i + 1,
182                title,
183                url,
184                sanitize_web_content(content)
185            ));
186        }
187    }
188
189    if output.len() < 50 {
190        return Ok(String::new());
191    }
192
193    Ok(output)
194}
195
196/// tool: fetch_docs
197///
198/// Fetch any URL and convert it into clean, agent-ready Markdown using the Jina Reader proxy.
199/// This prevents local IP blocking and ensures structured context for documentation.
200pub async fn execute_fetch(args: &Value) -> Result<String, String> {
201    let url = args
202        .get("url")
203        .and_then(|v| v.as_str())
204        .ok_or_else(|| "Missing required argument: 'url'".to_string())?;
205
206    // Prefix with Jina Reader - it handles the rendering and markdown conversion for us.
207    let proxy_url = format!("https://r.jina.ai/{}", url);
208
209    let client = reqwest::Client::builder()
210        .timeout(Duration::from_secs(25))
211        .build()
212        .map_err(|e| format!("Failed to build client: {e}"))?;
213
214    let mut request = client.get(&proxy_url)
215        .header(USER_AGENT, "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36");
216
217    // 2.5 Optional: Inject Jina API Key if available in environment
218    if let Ok(key) = std::env::var("JINA_API_KEY") {
219        request = request.header("Authorization", format!("Bearer {}", key));
220    }
221
222    let response = request
223        .send()
224        .await
225        .map_err(|e| format!("Failed to connect to documentation proxy: {e}"))?;
226
227    let markdown = response
228        .text()
229        .await
230        .map_err(|e| format!("Failed to read documentation body: {e}"))?;
231
232    Ok(markdown)
233}