Skip to main content

hematite/tools/
research.rs

1use lazy_static::lazy_static;
2use reqwest::header::USER_AGENT;
3use serde_json::Value;
4use std::fmt::Write as _;
5use std::sync::Mutex;
6use std::time::Duration;
7use std::time::Instant;
8
9lazy_static! {
10    /// Rate limit: 2 seconds between search calls to prevent local IP blocking.
11    static ref LAST_SEARCH_CALL: Mutex<Option<Instant>> = Mutex::new(None);
12}
13
14/// tool: research_web
15///
16/// Perform a zero-cost technical search using SearXNG (if configured) or DuckDuckGo Lite.
17/// Returns snippets and titles from technical search results.
18pub async fn execute_search(args: &Value, searx_url: Option<String>) -> Result<String, String> {
19    let query = args
20        .get("query")
21        .and_then(|v| v.as_str())
22        .ok_or_else(|| "Missing required argument: 'query'".to_string())?;
23
24    // 1. First Attempt: Original Query
25    let results = perform_search(query, searx_url.as_deref()).await?;
26    if !results.is_empty() && !results.contains("No search results found") {
27        return Ok(results);
28    }
29
30    // 2. Fallback: Simplified Query if needed
31    let tier2 = query
32        .replace("2024", "")
33        .replace("2025", "")
34        .replace("2026", "")
35        .replace("crate", "")
36        .split_whitespace()
37        .fold(String::new(), |mut s, w| {
38            if !s.is_empty() {
39                s.push(' ');
40            }
41            s.push_str(w);
42            s
43        });
44
45    if tier2 != query {
46        let second_results = perform_search(&tier2, searx_url.as_deref()).await?;
47        if !second_results.is_empty() && !second_results.contains("No search results found") {
48            return Ok(second_results);
49        }
50    }
51
52    Ok(
53        "No search results found. All web content was safely sanitized. Try a broader search term."
54            .to_string(),
55    )
56}
57
/// Proactively strip JSON-like structures and tool-call patterns from web content.
/// This prevents 'Prompt Injection' where a website tries to trick the agent into running commands.
///
/// Transformations, applied in this order:
/// * `{` becomes ` (` and `}` becomes `) `
/// * `"` becomes `'`
/// * `<script`, `<iframe` and `javascript:` are replaced by inert placeholders; these
///   three markers are matched ASCII case-insensitively, so `<SCRIPT` or `JavaScript:`
///   are neutralised as well.
///
/// Square brackets and parentheses are left alone, so markdown links survive.
fn sanitize_web_content(text: &str) -> String {
    // (marker, replacement) pairs; markers must be non-empty ASCII.
    const BLOCKED_MARKERS: [(&str, &str); 3] = [
        ("<script", "[BLOCKED SCRIPT]"),
        ("<iframe", "[BLOCKED IFRAME]"),
        ("javascript:", "blocked-js:"),
    ];

    // Preserve markdown link syntax, but neuter common prompt/tool-call markers.
    let mut cleaned = text.replace('{', " (").replace('}', ") ").replace('"', "'");
    for &(marker, replacement) in BLOCKED_MARKERS.iter() {
        cleaned = replace_ascii_case_insensitive(&cleaned, marker, replacement);
    }
    cleaned
}

/// Replace every ASCII case-insensitive occurrence of `needle` in `haystack` with
/// `replacement`, scanning left to right without overlapping matches.
///
/// `needle` is expected to be ASCII; an empty or non-ASCII needle leaves the input
/// unchanged.
fn replace_ascii_case_insensitive(haystack: &str, needle: &str, replacement: &str) -> String {
    if needle.is_empty() || !needle.is_ascii() {
        return haystack.to_string();
    }

    let hay = haystack.as_bytes();
    let pat = needle.as_bytes();
    let mut out = String::with_capacity(haystack.len());
    // Start of the text that has been scanned but not yet copied to `out`.
    let mut pending_from = 0;
    let mut i = 0;

    while i + pat.len() <= hay.len() {
        if hay[i..i + pat.len()].eq_ignore_ascii_case(pat) {
            // A match starts and ends on ASCII bytes, so both slice bounds fall on
            // UTF-8 character boundaries.
            out.push_str(&haystack[pending_from..i]);
            out.push_str(replacement);
            i += pat.len();
            pending_from = i;
        } else {
            i += 1;
        }
    }

    out.push_str(&haystack[pending_from..]);
    out
}
69
70async fn perform_search(query: &str, searx_url: Option<&str>) -> Result<String, String> {
71    // 1. Try Local SearXNG if configured OR auto-detect on default port (8080)
72    let effective_url = searx_url.unwrap_or("http://localhost:8080");
73
74    match perform_searx_search(query, effective_url).await {
75        Ok(results) if !results.is_empty() => return Ok(results),
76        _ => {
77            // Silently fall back to Jina if SearXNG is unreachable or empty.
78            // Note: perform_searx_search has its own timeout to prevent blocking.
79        }
80    }
81
82    // 2. Respect Rate Limits (even for proxy, to be a good citizen)
83    let sleep_duration = {
84        if let Ok(last_call) = LAST_SEARCH_CALL.lock() {
85            last_call.and_then(|instant| {
86                let elapsed = instant.elapsed();
87                if elapsed < Duration::from_secs(3) {
88                    Some(Duration::from_secs(3) - elapsed)
89                } else {
90                    None
91                }
92            })
93        } else {
94            None
95        }
96    };
97    if let Some(duration) = sleep_duration {
98        tokio::time::sleep(duration).await;
99    }
100    if let Ok(mut last_call) = LAST_SEARCH_CALL.lock() {
101        *last_call = Some(Instant::now());
102    }
103
104    // 3. Construct Jina Search Proxy URL
105    // s.jina.ai converts search results into clean markdown for agents.
106    let encoded = percent_encoding::utf8_percent_encode(query, percent_encoding::NON_ALPHANUMERIC);
107    let search_url = format!("https://s.jina.ai/{}", encoded);
108
109    let client = reqwest::Client::builder()
110        .timeout(Duration::from_secs(20))
111        .build()
112        .map_err(|e| format!("Failed to build client: {e}"))?;
113
114    let mut request = client.get(&search_url)
115        .header(USER_AGENT, "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36");
116
117    // 3.5 Optional: Inject Jina API Key if available in environment
118    if let Ok(key) = std::env::var("JINA_API_KEY") {
119        request = request.header("Authorization", format!("Bearer {}", key));
120    }
121
122    let response = request
123        .send()
124        .await
125        .map_err(|e| format!("Failed to connect to search proxy: {e}"))?;
126
127    let markdown = response
128        .text()
129        .await
130        .map_err(|e| format!("Failed to read search response: {e}"))?;
131
132    // 4. Safety First: Detect HTML/Captcha leaks and sanitize content.
133    if markdown.trim().starts_with("<!doctype html") || markdown.contains("<html") {
134        return Err("Search proxy returned raw HTML (possibly a rate limit or captcha). Falling back to internal reasoning.".into());
135    }
136
137    Ok(format!(
138        "[Source: Jina Search Proxy]\n\n{}",
139        sanitize_web_content(&markdown)
140    ))
141}
142
143async fn perform_searx_search(query: &str, base_url: &str) -> Result<String, String> {
144    let client = reqwest::Client::builder()
145        .timeout(Duration::from_secs(5))
146        .build()
147        .map_err(|e| format!("Failed to build SearXNG client: {e}"))?;
148
149    // Base URL should not have trailing slash for consistency
150    let base = base_url.trim_end_matches('/');
151    let search_url = format!(
152        "{}/search?q={}&format=json",
153        base,
154        urlencoding::encode(query)
155    );
156
157    let response = client
158        .get(&search_url)
159        .header(USER_AGENT, "Hematite-CLI/0.6.0")
160        .send()
161        .await
162        .map_err(|e| format!("SearXNG connection failed: {e}"))?;
163
164    if !response.status().is_success() {
165        return Err(format!("SearXNG returned error: {}", response.status()));
166    }
167
168    let json: Value = response
169        .json()
170        .await
171        .map_err(|e| format!("Failed to parse SearXNG JSON: {e}"))?;
172
173    let mut output = String::with_capacity(query.len() + 4096);
174    output.push_str("[Source: SearXNG (Local/Auto-Detected)]\n\n");
175    let _ = write!(output, "# Search results for: {}\n\n", query);
176
177    if let Some(results) = json.get("results").and_then(|r| r.as_array()) {
178        for (i, res) in results.iter().take(10).enumerate() {
179            let title = res
180                .get("title")
181                .and_then(|v| v.as_str())
182                .unwrap_or("No Title");
183            let url = res.get("url").and_then(|v| v.as_str()).unwrap_or("#");
184            let content = res.get("content").and_then(|v| v.as_str()).unwrap_or("");
185
186            let _ = write!(
187                output,
188                "### {}. [{}]({})\n{}\n\n",
189                i + 1,
190                title,
191                url,
192                sanitize_web_content(content)
193            );
194        }
195    }
196
197    if output.len() < 50 {
198        return Ok(String::new());
199    }
200
201    Ok(output)
202}
203
204/// tool: fetch_docs
205///
206/// Fetch any URL and convert it into clean, agent-ready Markdown using the Jina Reader proxy.
207/// This prevents local IP blocking and ensures structured context for documentation.
208pub async fn execute_fetch(args: &Value) -> Result<String, String> {
209    let url = args
210        .get("url")
211        .and_then(|v| v.as_str())
212        .ok_or_else(|| "Missing required argument: 'url'".to_string())?;
213
214    // Prefix with Jina Reader - it handles the rendering and markdown conversion for us.
215    let proxy_url = format!("https://r.jina.ai/{}", url);
216
217    let client = reqwest::Client::builder()
218        .timeout(Duration::from_secs(25))
219        .build()
220        .map_err(|e| format!("Failed to build client: {e}"))?;
221
222    let mut request = client.get(&proxy_url)
223        .header(USER_AGENT, "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36");
224
225    // 2.5 Optional: Inject Jina API Key if available in environment
226    if let Ok(key) = std::env::var("JINA_API_KEY") {
227        request = request.header("Authorization", format!("Bearer {}", key));
228    }
229
230    let response = request
231        .send()
232        .await
233        .map_err(|e| format!("Failed to connect to documentation proxy: {e}"))?;
234
235    let markdown = response
236        .text()
237        .await
238        .map_err(|e| format!("Failed to read documentation body: {e}"))?;
239
240    Ok(markdown)
241}
242
#[cfg(test)]
mod tests {
    use super::sanitize_web_content;

    /// Braces and double quotes are neutralised, an `<iframe` tag is replaced by its
    /// placeholder, and a markdown link passes through untouched.
    #[test]
    fn sanitize_web_content_blocks_script_patterns_without_breaking_markdown_links() {
        let raw = r#"Use {"tool":"shell"} and [Rust](https://www.rust-lang.org) <iframe src="x"></iframe>"#;
        let cleaned = sanitize_web_content(raw);

        let expected_fragments = [
            "('tool':'shell')",
            "[Rust](https://www.rust-lang.org)",
            "[BLOCKED IFRAME]",
        ];
        for fragment in expected_fragments.iter() {
            assert!(cleaned.contains(*fragment));
        }
    }
}
256}