Skip to main content

hematite/tools/
research.rs

1use lazy_static::lazy_static;
2use reqwest::header::USER_AGENT;
3use serde_json::Value;
4use std::sync::Mutex;
5use std::time::Duration;
6use std::time::Instant;
7
lazy_static! {
    /// Timestamp of the most recent request sent to the public search proxy.
    ///
    /// `perform_search` consults and updates this value before falling back to the proxy so
    /// that consecutive proxy requests are spaced at least 3 seconds apart (the interval the
    /// code enforces), which is intended to prevent local IP blocking. It stays `None` until
    /// the first proxy request has been made.
    static ref LAST_SEARCH_CALL: Mutex<Option<Instant>> = Mutex::new(None);
}
12
13/// tool: research_web
14///
15/// Perform a zero-cost technical search using SearXNG (if configured) or DuckDuckGo Lite.
16/// Returns snippets and titles from technical search results.
17pub async fn execute_search(args: &Value, searx_url: Option<String>) -> Result<String, String> {
18    let query = args
19        .get("query")
20        .and_then(|v| v.as_str())
21        .ok_or_else(|| "Missing required argument: 'query'".to_string())?;
22
23    // 1. First Attempt: Original Query
24    let results = perform_search(query, searx_url.as_deref()).await?;
25    if !results.is_empty() && !results.contains("No search results found") {
26        return Ok(results);
27    }
28
29    // 2. Fallback: Simplified Query if needed
30    let tier2 = query
31        .replace("2024", "")
32        .replace("2025", "")
33        .replace("2026", "")
34        .replace("crate", "")
35        .split_whitespace()
36        .collect::<Vec<_>>()
37        .join(" ");
38
39    if tier2 != query {
40        let second_results = perform_search(&tier2, searx_url.as_deref()).await?;
41        if !second_results.is_empty() && !second_results.contains("No search results found") {
42            return Ok(second_results);
43        }
44    }
45
46    Ok(
47        "No search results found. All web content was safely sanitized. Try a broader search term."
48            .to_string(),
49    )
50}
51
/// Replaces every ASCII-case-insensitive occurrence of `needle` in `haystack`.
///
/// `needle` must be pure ASCII. Because ASCII bytes never appear inside a multi-byte UTF-8
/// sequence, every match starts and ends on a character boundary, which keeps the string
/// slicing below safe for arbitrary Unicode input.
fn replace_ascii_case_insensitive(haystack: &str, needle: &str, replacement: &str) -> String {
    debug_assert!(needle.is_ascii(), "needle must be ASCII");

    let hay = haystack.as_bytes();
    let pat = needle.as_bytes();
    if pat.is_empty() || hay.len() < pat.len() {
        return haystack.to_string();
    }

    let mut out = String::with_capacity(haystack.len());
    let mut copied_to = 0;
    let mut i = 0;
    while i + pat.len() <= hay.len() {
        if hay[i..i + pat.len()].eq_ignore_ascii_case(pat) {
            out.push_str(&haystack[copied_to..i]);
            out.push_str(replacement);
            i += pat.len();
            copied_to = i;
        } else {
            i += 1;
        }
    }
    out.push_str(&haystack[copied_to..]);
    out
}

/// Proactively strip JSON-like structures and tool-call patterns from web content.
/// This prevents 'Prompt Injection' where a website tries to trick the agent into running commands.
///
/// Curly braces become parentheses and double quotes become single quotes, so embedded JSON
/// no longer parses as a tool call. `<script`, `<iframe` and `javascript:` are neutered in
/// any letter case (HTML tag names and URL schemes are case-insensitive, so `<SCRIPT` must
/// be blocked just like `<script`). Markdown link syntax is left intact.
fn sanitize_web_content(text: &str) -> String {
    let neutered = text.replace("{", " (").replace("}", ") ").replace("\"", "'");

    let neutered = replace_ascii_case_insensitive(&neutered, "<script", "[BLOCKED SCRIPT]");
    let neutered = replace_ascii_case_insensitive(&neutered, "<iframe", "[BLOCKED IFRAME]");
    replace_ascii_case_insensitive(&neutered, "javascript:", "blocked-js:")
}
63
64async fn perform_search(query: &str, searx_url: Option<&str>) -> Result<String, String> {
65    // 1. Try Local SearXNG if configured OR auto-detect on default port (8080)
66    let effective_url = searx_url.unwrap_or("http://localhost:8080");
67
68    match perform_searx_search(query, effective_url).await {
69        Ok(results) if !results.is_empty() => return Ok(results),
70        _ => {
71            // Silently fall back to Jina if SearXNG is unreachable or empty.
72            // Note: perform_searx_search has its own timeout to prevent blocking.
73        }
74    }
75
76    // 2. Respect Rate Limits (even for proxy, to be a good citizen)
77    let sleep_duration = {
78        if let Ok(last_call) = LAST_SEARCH_CALL.lock() {
79            last_call.and_then(|instant| {
80                let elapsed = instant.elapsed();
81                if elapsed < Duration::from_secs(3) {
82                    Some(Duration::from_secs(3) - elapsed)
83                } else {
84                    None
85                }
86            })
87        } else {
88            None
89        }
90    };
91    if let Some(duration) = sleep_duration {
92        tokio::time::sleep(duration).await;
93    }
94    if let Ok(mut last_call) = LAST_SEARCH_CALL.lock() {
95        *last_call = Some(Instant::now());
96    }
97
98    // 3. Construct Jina Search Proxy URL
99    // s.jina.ai converts search results into clean markdown for agents.
100    let encoded = percent_encoding::utf8_percent_encode(query, percent_encoding::NON_ALPHANUMERIC);
101    let search_url = format!("https://s.jina.ai/{}", encoded);
102
103    let client = reqwest::Client::builder()
104        .timeout(Duration::from_secs(20))
105        .build()
106        .map_err(|e| format!("Failed to build client: {e}"))?;
107
108    let mut request = client.get(&search_url)
109        .header(USER_AGENT, "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36");
110
111    // 3.5 Optional: Inject Jina API Key if available in environment
112    if let Ok(key) = std::env::var("JINA_API_KEY") {
113        request = request.header("Authorization", format!("Bearer {}", key));
114    }
115
116    let response = request
117        .send()
118        .await
119        .map_err(|e| format!("Failed to connect to search proxy: {e}"))?;
120
121    let markdown = response
122        .text()
123        .await
124        .map_err(|e| format!("Failed to read search response: {e}"))?;
125
126    // 4. Safety First: Detect HTML/Captcha leaks and sanitize content.
127    if markdown.trim().starts_with("<!doctype html") || markdown.contains("<html") {
128        return Err("Search proxy returned raw HTML (possibly a rate limit or captcha). Falling back to internal reasoning.".into());
129    }
130
131    Ok(format!(
132        "[Source: Jina Search Proxy]\n\n{}",
133        sanitize_web_content(&markdown)
134    ))
135}
136
137async fn perform_searx_search(query: &str, base_url: &str) -> Result<String, String> {
138    let client = reqwest::Client::builder()
139        .timeout(Duration::from_secs(5))
140        .build()
141        .map_err(|e| format!("Failed to build SearXNG client: {e}"))?;
142
143    // Base URL should not have trailing slash for consistency
144    let base = base_url.trim_end_matches('/');
145    let search_url = format!(
146        "{}/search?q={}&format=json",
147        base,
148        urlencoding::encode(query)
149    );
150
151    let response = client
152        .get(&search_url)
153        .header(USER_AGENT, "Hematite-CLI/0.6.0")
154        .send()
155        .await
156        .map_err(|e| format!("SearXNG connection failed: {e}"))?;
157
158    if !response.status().is_success() {
159        return Err(format!("SearXNG returned error: {}", response.status()));
160    }
161
162    let json: Value = response
163        .json()
164        .await
165        .map_err(|e| format!("Failed to parse SearXNG JSON: {e}"))?;
166
167    let mut output = String::new();
168    output.push_str("[Source: SearXNG (Local/Auto-Detected)]\n\n");
169    output.push_str(&format!("# Search results for: {}\n\n", query));
170
171    if let Some(results) = json.get("results").and_then(|r| r.as_array()) {
172        for (i, res) in results.iter().take(10).enumerate() {
173            let title = res
174                .get("title")
175                .and_then(|v| v.as_str())
176                .unwrap_or("No Title");
177            let url = res.get("url").and_then(|v| v.as_str()).unwrap_or("#");
178            let content = res.get("content").and_then(|v| v.as_str()).unwrap_or("");
179
180            output.push_str(&format!(
181                "### {}. [{}]({})\n{}\n\n",
182                i + 1,
183                title,
184                url,
185                sanitize_web_content(content)
186            ));
187        }
188    }
189
190    if output.len() < 50 {
191        return Ok(String::new());
192    }
193
194    Ok(output)
195}
196
#[cfg(test)]
mod tests {
    use super::sanitize_web_content;

    /// JSON-like tool calls and iframes are neutered while markdown links survive.
    #[test]
    fn sanitize_web_content_blocks_script_patterns_without_breaking_markdown_links() {
        let input = r#"Use {"tool":"shell"} and [Rust](https://www.rust-lang.org) <iframe src="x"></iframe>"#;
        let sanitized = sanitize_web_content(input);

        assert!(sanitized.contains("('tool':'shell')"));
        assert!(sanitized.contains("[Rust](https://www.rust-lang.org)"));
        assert!(sanitized.contains("[BLOCKED IFRAME]"));
        assert!(!sanitized.contains("<iframe"));
        assert!(!sanitized.contains('{'));
        assert!(!sanitized.contains('"'));
    }

    /// Script tags and `javascript:` URLs, which the test name above promises to cover,
    /// are replaced by their blocked markers.
    #[test]
    fn sanitize_web_content_neuters_script_tags_and_javascript_urls() {
        let input = "<script>alert(1)</script> [x](javascript:run())";
        let sanitized = sanitize_web_content(input);

        assert!(sanitized.contains("[BLOCKED SCRIPT]"));
        assert!(!sanitized.contains("<script"));
        assert!(sanitized.contains("blocked-js:run()"));
        assert!(!sanitized.contains("javascript:"));
    }
}
211
212/// tool: fetch_docs
213///
214/// Fetch any URL and convert it into clean, agent-ready Markdown using the Jina Reader proxy.
215/// This prevents local IP blocking and ensures structured context for documentation.
216pub async fn execute_fetch(args: &Value) -> Result<String, String> {
217    let url = args
218        .get("url")
219        .and_then(|v| v.as_str())
220        .ok_or_else(|| "Missing required argument: 'url'".to_string())?;
221
222    // Prefix with Jina Reader - it handles the rendering and markdown conversion for us.
223    let proxy_url = format!("https://r.jina.ai/{}", url);
224
225    let client = reqwest::Client::builder()
226        .timeout(Duration::from_secs(25))
227        .build()
228        .map_err(|e| format!("Failed to build client: {e}"))?;
229
230    let mut request = client.get(&proxy_url)
231        .header(USER_AGENT, "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36");
232
233    // 2.5 Optional: Inject Jina API Key if available in environment
234    if let Ok(key) = std::env::var("JINA_API_KEY") {
235        request = request.header("Authorization", format!("Bearer {}", key));
236    }
237
238    let response = request
239        .send()
240        .await
241        .map_err(|e| format!("Failed to connect to documentation proxy: {e}"))?;
242
243    let markdown = response
244        .text()
245        .await
246        .map_err(|e| format!("Failed to read documentation body: {e}"))?;
247
248    Ok(markdown)
249}