Skip to main content

atomcode_core/tool/
web_search.rs

1use anyhow::Result;
2use async_trait::async_trait;
3use serde::Deserialize;
4use serde_json::json;
5use tokio::process::Command;
6
7use super::{ApprovalRequirement, Tool, ToolContext, ToolDef, ToolResult};
8
/// Round a byte offset up to the closest UTF-8 character boundary of `s`.
///
/// Returns `index` itself when it already sits on a boundary, otherwise the
/// first boundary after it. Offsets at or beyond the end of the string clamp
/// to `s.len()`, so the result is always safe to use as a slice bound.
fn ceil_char_boundary(s: &str, index: usize) -> usize {
    // An empty range (index >= len) falls straight through to `s.len()`.
    (index..s.len())
        .find(|&offset| s.is_char_boundary(offset))
        .unwrap_or(s.len())
}
21
/// Round a byte offset down to the closest UTF-8 character boundary of `s`.
///
/// Returns `index` itself when it already sits on a boundary, otherwise the
/// last boundary before it. Offsets at or beyond the end of the string clamp
/// to `s.len()`.
fn floor_char_boundary(s: &str, index: usize) -> usize {
    let len = s.len();
    if index >= len {
        return len;
    }
    // Offset 0 is always a boundary, so the search cannot come up empty.
    (0..=index)
        .rev()
        .find(|&offset| s.is_char_boundary(offset))
        .unwrap_or(0)
}
33
/// Stateless tool that performs a web search and reports titles, URLs and
/// snippets. All behaviour lives in its `Tool` implementation.
pub struct WebSearchTool;
35
36#[derive(Deserialize)]
37struct WebSearchArgs {
38    query: String,
39    #[serde(default = "default_max")]
40    max_results: usize,
41}
42
/// Serde default for `max_results` when the caller does not specify one.
fn default_max() -> usize {
    const DEFAULT_MAX_RESULTS: usize = 8;
    DEFAULT_MAX_RESULTS
}
46
47#[async_trait]
48impl Tool for WebSearchTool {
49    fn definition(&self) -> ToolDef {
50        ToolDef {
51            name: "web_search",
52            description: "Search the web for information. Returns titles, URLs, and snippets.\n\
53                Use when you need to find documentation, look up APIs, research libraries, \
54                or find information not available locally.\n\
55                Examples:\n\
56                - {\"query\": \"openclaw github\"}\n\
57                - {\"query\": \"tailwindcss v4 installation guide\"}\n\
58                - {\"query\": \"rust reqwest POST example\"}"
59                .to_string(),
60            parameters: json!({
61                "type": "object",
62                "properties": {
63                    "query": { "type": "string", "description": "Search query" },
64                    "max_results": { "type": "integer", "description": "Max results (default 8)" }
65                },
66                "required": ["query"]
67            }),
68        }
69    }
70
71    fn approval(&self, _args: &str) -> ApprovalRequirement {
72        ApprovalRequirement::AutoApprove
73    }
74
75    async fn execute(&self, args: &str, _ctx: &ToolContext) -> Result<ToolResult> {
76        let parsed: WebSearchArgs = serde_json::from_str(args)?;
77        let max = parsed.max_results.min(20);
78
79        // Use curl for the HTTP request — reqwest gets blocked by DuckDuckGo's
80        // TLS fingerprint detection, but curl works reliably.
81        let query_encoded = parsed.query.replace(' ', "+");
82        let curl_bin = if cfg!(target_os = "windows") {
83            "curl.exe"
84        } else {
85            "curl"
86        };
87        let mut cmd = Command::new(curl_bin);
88        cmd.args(&[
89            "-s", "-X", "POST",
90            "https://html.duckduckgo.com/html/",
91            "-d", &format!("q={}", query_encoded),
92            "-A", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko)",
93            "--max-time", "15",
94            "-L", // follow redirects
95        ]);
96
97        // On Windows, prevent the spawned curl.exe from creating a visible console window.
98        crate::process_utils::suppress_console_window(&mut cmd);
99
100        let output = cmd.output().await;
101
102        let html = match output {
103            Ok(o) => String::from_utf8_lossy(&o.stdout).to_string(),
104            Err(e) => {
105                return Ok(ToolResult {
106                    call_id: String::new(),
107                    output: format!("Search failed: {}", e),
108                    success: false,
109                });
110            }
111        };
112
113        if html.is_empty() {
114            return Ok(ToolResult {
115                call_id: String::new(),
116                output: format!("Search returned empty response for '{}'", parsed.query),
117                success: false,
118            });
119        }
120
121        let results = parse_ddg_results(&html, max);
122
123        if results.is_empty() {
124            return Ok(ToolResult {
125                call_id: String::new(),
126                output: format!(
127                    "No results found for '{}' ({} bytes received)",
128                    parsed.query,
129                    html.len()
130                ),
131                success: false,
132            });
133        }
134
135        let mut out = format!("Search results for \"{}\":\n\n", parsed.query);
136        for (i, r) in results.iter().enumerate() {
137            out.push_str(&format!(
138                "{}. {}\n   {}\n   {}\n\n",
139                i + 1,
140                r.title,
141                r.url,
142                r.snippet
143            ));
144        }
145
146        Ok(ToolResult {
147            call_id: String::new(),
148            output: out,
149            success: true,
150        })
151    }
152}
153
/// One hit extracted from the search results page.
struct SearchResult {
    /// Link text with markup removed.
    title: String,
    /// Destination URL of the hit.
    url: String,
    /// Descriptive text shown under the link; may be empty.
    snippet: String,
}
159
160/// Parse DuckDuckGo HTML search results page.
161/// Actual structure: <a rel="nofollow" class="result__a" href="URL">title</a>
162///                   <a class="result__snippet" href="URL">snippet</a>
163fn parse_ddg_results(html: &str, max: usize) -> Vec<SearchResult> {
164    let mut results = Vec::new();
165
166    let mut pos = 0;
167    while results.len() < max {
168        // Find result link marker
169        let link_marker = "class=\"result__a\"";
170        let safe_pos = ceil_char_boundary(html, pos);
171        let marker_pos = match html[safe_pos..].find(link_marker) {
172            Some(p) => safe_pos + p,
173            None => break,
174        };
175        let after_marker = ceil_char_boundary(html, marker_pos + link_marker.len());
176
177        // Find the opening '<a' of this tag (search backwards from marker)
178        let tag_start = html[..marker_pos].rfind('<').unwrap_or(marker_pos);
179        // The entire <a ...>title</a> region
180        let tag_end = html[after_marker..]
181            .find("</a>")
182            .map(|p| after_marker + p)
183            .unwrap_or(after_marker);
184
185        let safe_tag_end_plus4 = ceil_char_boundary(html, tag_end + 4);
186        let tag_region = &html[tag_start..safe_tag_end_plus4]; // include </a>
187
188        // Extract href from the tag — search the entire <a ...> tag for href="..."
189        let url = if let Some(hp) = tag_region.find("href=\"") {
190            let hs = hp + 6;
191            let he = tag_region[hs..].find('"').map(|e| hs + e).unwrap_or(hs);
192            extract_ddg_url(&tag_region[hs..he])
193        } else {
194            pos = safe_tag_end_plus4;
195            continue;
196        };
197
198        // Extract title — text content between > (after all attributes) and </a>
199        let content_start = html[after_marker..tag_end]
200            .find('>')
201            .map(|p| after_marker + p + 1)
202            .unwrap_or(after_marker);
203        let safe_content_start = ceil_char_boundary(html, content_start);
204        let safe_tag_end = floor_char_boundary(html, tag_end);
205        let title = if safe_content_start <= safe_tag_end {
206            strip_html_tags(&html[safe_content_start..safe_tag_end])
207        } else {
208            String::new()
209        };
210
211        // Extract snippet: class="result__snippet" — search within next 2000 chars
212        let snippet_marker = "class=\"result__snippet\"";
213        let search_end = ceil_char_boundary(html, (tag_end + 2000).min(html.len()));
214        let safe_tag_end2 = ceil_char_boundary(html, tag_end);
215        let snippet = if let Some(sp) = html[safe_tag_end2..search_end].find(snippet_marker) {
216            let snippet_pos = safe_tag_end2 + sp;
217            let s_start = ceil_char_boundary(
218                html,
219                html[snippet_pos..]
220                    .find('>')
221                    .map(|p| snippet_pos + p + 1)
222                    .unwrap_or(snippet_pos),
223            );
224            let s_end = floor_char_boundary(
225                html,
226                html[s_start..]
227                    .find("</a>")
228                    .map(|p| s_start + p)
229                    .unwrap_or(s_start),
230            );
231            if s_start <= s_end {
232                strip_html_tags(&html[s_start..s_end])
233            } else {
234                String::new()
235            }
236        } else {
237            String::new()
238        };
239
240        if !title.trim().is_empty() && !url.is_empty() && url.starts_with("http") {
241            results.push(SearchResult {
242                title: title.trim().to_string(),
243                url,
244                snippet: snippet.trim().to_string(),
245            });
246        }
247
248        pos = ceil_char_boundary(html, tag_end + 4);
249    }
250
251    results
252}
253
254/// Extract actual URL from DuckDuckGo redirect URL.
255fn extract_ddg_url(raw: &str) -> String {
256    // DDG format: //duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com&...
257    if let Some(uddg_pos) = raw.find("uddg=") {
258        let start = uddg_pos + 5;
259        let end = raw[start..]
260            .find('&')
261            .map(|e| start + e)
262            .unwrap_or(raw.len());
263        let encoded = &raw[start..end];
264        url_decode(encoded)
265    } else if raw.starts_with("http") {
266        raw.to_string()
267    } else if raw.starts_with("//") {
268        format!("https:{}", raw)
269    } else {
270        raw.to_string()
271    }
272}
273
/// Percent-decode a URL query value.
///
/// * `%XX` (exactly two hex digits) becomes the byte `0xXX`.
/// * `+` becomes a space.
/// * A `%` that is not followed by two hex digits is kept literally.
///
/// Decoded bytes are collected first and then interpreted as UTF-8, so
/// multi-byte sequences such as `%E4%B8%AD` decode to the intended character
/// rather than to one Latin-1 character per byte. Byte sequences that are not
/// valid UTF-8 are replaced with U+FFFD.
fn url_decode(s: &str) -> String {
    // Value of one ASCII hex digit, or `None` for any other byte.
    fn hex_value(byte: u8) -> Option<u8> {
        match byte {
            b'0'..=b'9' => Some(byte - b'0'),
            b'a'..=b'f' => Some(byte - b'a' + 10),
            b'A'..=b'F' => Some(byte - b'A' + 10),
            _ => None,
        }
    }

    let input = s.as_bytes();
    let mut decoded = Vec::with_capacity(input.len());
    let mut i = 0;

    while i < input.len() {
        match input[i] {
            b'%' => {
                let high = input.get(i + 1).copied().and_then(hex_value);
                let low = input.get(i + 2).copied().and_then(hex_value);
                match (high, low) {
                    (Some(high), Some(low)) => {
                        decoded.push((high << 4) | low);
                        i += 3;
                    }
                    // Malformed or truncated escape: keep the '%' and carry on.
                    _ => {
                        decoded.push(b'%');
                        i += 1;
                    }
                }
            }
            b'+' => {
                decoded.push(b' ');
                i += 1;
            }
            other => {
                decoded.push(other);
                i += 1;
            }
        }
    }

    String::from_utf8_lossy(&decoded).into_owned()
}
295
/// Remove HTML tags from `s` and decode a small set of character entities.
///
/// Everything between a `<` and the following `>` is dropped (both brackets
/// included). The entities `&lt;`, `&gt;`, `&quot;`, `&#x27;`, `&#39;`,
/// `&nbsp;` and `&amp;` are then replaced by the characters they stand for.
fn strip_html_tags(s: &str) -> String {
    // `&amp;` must come last: decoding it first would turn an escaped entity
    // such as `&amp;lt;` into `&lt;` and then wrongly decode it again to `<`.
    const ENTITIES: [(&str, &str); 7] = [
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&quot;", "\""),
        ("&#x27;", "'"),
        ("&#39;", "'"),
        ("&nbsp;", " "),
        ("&amp;", "&"),
    ];

    let mut text = String::with_capacity(s.len());
    let mut in_tag = false;
    for c in s.chars() {
        match c {
            '<' => in_tag = true,
            '>' => in_tag = false,
            _ if !in_tag => text.push(c),
            _ => {}
        }
    }

    for (entity, replacement) in ENTITIES.iter() {
        // Skip the allocation when the entity does not occur at all.
        if text.contains(entity) {
            text = text.replace(entity, replacement);
        }
    }
    text
}
317
#[cfg(test)]
mod tests {
    use super::*;

    /// Two complete result entries yield two hits with title, URL and snippet.
    #[test]
    fn test_parse_ddg_results() {
        let page = r#"
        <h2 class="result__title">
            <a rel="nofollow" class="result__a" href="https://github.com/openclaw">openclaw · GitHub</a>
        </h2>
        <a class="result__snippet" href="https://github.com/openclaw">Your personal AI assistant. openclaw has 23 repos.</a>
        <h2 class="result__title">
            <a rel="nofollow" class="result__a" href="https://openclaw.ai/">OpenClaw — Personal AI</a>
        </h2>
        <a class="result__snippet" href="https://openclaw.ai/">The AI that does things.</a>
        "#;

        let hits = parse_ddg_results(page, 10);
        assert_eq!(hits.len(), 2);

        let (first, second) = (&hits[0], &hits[1]);
        assert_eq!(first.title, "openclaw · GitHub");
        assert_eq!(first.url, "https://github.com/openclaw");
        assert!(first.snippet.contains("23 repos"));
        assert_eq!(second.title, "OpenClaw — Personal AI");
        assert_eq!(second.url, "https://openclaw.ai/");
    }

    /// A page without any result markers produces no hits.
    #[test]
    fn test_parse_ddg_empty() {
        let hits = parse_ddg_results("<html><body>no results</body></html>", 10);
        assert!(hits.is_empty());
    }

    /// Tags are removed and basic entities are decoded.
    #[test]
    fn test_strip_html_tags() {
        let cases = [("hello <b>world</b>", "hello world"), ("&amp; &lt;", "& <")];
        for (input, expected) in cases.iter() {
            assert_eq!(strip_html_tags(input), *expected);
        }
    }
}
354}