// snm_brightdata_client/tools/search.rs

// src/tools/search.rs - Safe version with timeouts and limits
2use crate::tool::{Tool, ToolResult, McpContent};
3use crate::error::BrightDataError;
4use async_trait::async_trait;
5use serde_json::{json, Value};
6use reqwest::Client;
7use std::time::Duration;
8use scraper::{Html, Selector};
9use log::{info, error, debug, warn};
10
/// Largest HTML body, in bytes, that is handed to the HTML parser (1 MB).
const MAX_RESPONSE_SIZE: usize = 1_000_000;
/// Timeout, in seconds, used for a search request.
const REQUEST_TIMEOUT: u64 = 30;
/// Maximum number of search results included in a tool response.
const MAX_RESULTS: usize = 5;
14
/// Stateless web-search tool backed by the BrightData API.
pub struct SearchEngine;
16
17#[async_trait]
18impl Tool for SearchEngine {
19    fn name(&self) -> &str {
20        "search_web"
21    }
22
23    fn description(&self) -> &str {
24        "Search the web using BrightData SERP proxy and extract results"
25    }
26
27    fn input_schema(&self) -> Value {
28        json!({
29            "type": "object",
30            "properties": {
31                "query": {
32                    "type": "string",
33                    "description": "Search query"
34                },
35                "engine": {
36                    "type": "string",
37                    "enum": ["google", "bing", "yandex", "duckduckgo"],
38                    "description": "Search engine to use",
39                    "default": "google"
40                }
41            },
42            "required": ["query"]
43        })
44    }
45
46    async fn execute(&self, parameters: Value) -> Result<ToolResult, BrightDataError> {
47        let query = parameters
48            .get("query")
49            .and_then(|v| v.as_str())
50            .ok_or_else(|| BrightDataError::ToolError("Missing 'query' parameter".into()))?;
51
52        let engine = parameters
53            .get("engine")
54            .and_then(|v| v.as_str())
55            .unwrap_or("google");
56
57        info!("🔍 Starting web search for query: '{}'", query);
58        
59        // Add timeout wrapper
60        let search_future = self.search_with_brightdata(query, engine);
61        let timeout_duration = Duration::from_secs(REQUEST_TIMEOUT);
62        
63        let result = match tokio::time::timeout(timeout_duration, search_future).await {
64            Ok(result) => result?,
65            Err(_) => {
66                error!("⏱️ Search request timed out after {} seconds", REQUEST_TIMEOUT);
67                return Err(BrightDataError::ToolError("Search request timed out".into()));
68            }
69        };
70        
71        // Try structured JSON first
72        if let Some(organic_results) = result.get("organic").and_then(|v| v.as_array()) {
73            if !organic_results.is_empty() {
74                return self.format_structured_results(query, organic_results, &result);
75            }
76        }
77        
78        // Fallback to HTML parsing with size check
79        if let Some(html_content) = result.as_object()
80            .and_then(|obj| obj.get("body"))
81            .and_then(|body| body.as_str()) {
82            
83            // Check response size
84            if html_content.len() > MAX_RESPONSE_SIZE {
85                warn!("⚠️ Response too large: {} bytes, truncating", html_content.len());
86                let truncated = &html_content[..MAX_RESPONSE_SIZE];
87                return self.parse_html_results(query, truncated);
88            }
89            
90            info!("📄 Parsing HTML content ({} bytes)", html_content.len());
91            return self.parse_html_results(query, html_content);
92        }
93
94        error!("❌ No valid search results found for query: '{}'", query);
95        Err(BrightDataError::ToolError("No valid search results found".into()))
96    }
97}
98
99impl SearchEngine {
100    async fn search_with_brightdata(&self, query: &str, engine: &str) -> Result<Value, BrightDataError> {
101        let api_token = std::env::var("BRIGHTDATA_API_TOKEN")
102            .or_else(|_| std::env::var("API_TOKEN"))
103            .map_err(|_| BrightDataError::ToolError("Missing BRIGHTDATA_API_TOKEN".into()))?;
104
105        let base_url = "https://api.brightdata.com";
106        let search_url = self.build_search_url(engine, query);
107        let zone = std::env::var("BRIGHTDATA_SERP_ZONE")
108            .unwrap_or_else(|_| "serp_api2".to_string());
109
110        let payload = json!({
111            "url": search_url,
112            "zone": zone,
113            "format": "raw"  // Use raw format to avoid JSON parsing issues
114        });
115
116        info!("🌐 Making BrightData request to: {}", search_url);
117        debug!("📦 Payload: {}", payload);
118
119        let client = Client::builder()
120            .timeout(Duration::from_secs(REQUEST_TIMEOUT))
121            .build()
122            .map_err(|e| BrightDataError::ToolError(e.to_string()))?;
123
124        let response = client
125            .post(&format!("{}/request", base_url))
126            .header("Authorization", format!("Bearer {}", api_token))
127            .header("Content-Type", "application/json")
128            .json(&payload)
129            .send()
130            .await
131            .map_err(|e| BrightDataError::ToolError(format!("Search request failed: {}", e)))?;
132
133        let status = response.status();
134        info!("📡 BrightData response status: {}", status);
135        
136        if !status.is_success() {
137            let err_text = response.text().await.unwrap_or_default();
138            error!("❌ BrightData API error {}: {}", status, err_text);
139            return Err(BrightDataError::ToolError(format!(
140                "BrightData error {}: {}",
141                status, err_text
142            )));
143        }
144
145        // Get response text with size limit
146        let response_text = response.text().await
147            .map_err(|e| BrightDataError::ToolError(format!("Failed to read response: {}", e)))?;
148
149        info!("✅ Received response, length: {} bytes", response_text.len());
150
151        // Return as JSON wrapper for HTML content
152        Ok(json!({
153            "body": response_text,
154            "format": "html",
155            "success": true
156        }))
157    }
158
159    fn format_structured_results(&self, query: &str, organic_results: &[Value], full_result: &Value) -> Result<ToolResult, BrightDataError> {
160        let mut formatted_results = Vec::new();
161        
162        // Limit results to prevent huge responses
163        for (i, result) in organic_results.iter().take(MAX_RESULTS).enumerate() {
164            let title = result.get("title").and_then(|t| t.as_str()).unwrap_or("No title");
165            let link = result.get("link").and_then(|l| l.as_str()).unwrap_or("");
166            let description = result.get("description").and_then(|d| d.as_str()).unwrap_or("");
167            
168            formatted_results.push(format!(
169                "{}. **{}**\n   {}\n   Link: {}\n", 
170                i + 1, title, description, link
171            ));
172        }
173
174        let content_text = format!("🔍 **Search Results for '{}'**\n\n{}", query, formatted_results.join("\n"));
175        let mcp_content = vec![McpContent::text(content_text)];
176        
177        info!("✅ Returning {} structured search results", organic_results.len().min(MAX_RESULTS));
178        Ok(ToolResult::success_with_raw(mcp_content, full_result.clone()))
179    }
180
181    fn parse_html_results(&self, query: &str, html_content: &str) -> Result<ToolResult, BrightDataError> {
182        info!("🔧 Starting HTML parsing for {} bytes", html_content.len());
183        
184        let document = Html::parse_document(html_content);
185        let mut results = Vec::new();
186        
187        // Simple, safe selector that won't cause infinite loops
188        if let Ok(selector) = Selector::parse("a[href*='http']") {
189            let mut count = 0;
190            for element in document.select(&selector) {
191                // Hard limit to prevent infinite loops
192                count += 1;
193                if count > 20 {
194                    warn!("⚠️ Reached maximum link extraction limit (20)");
195                    break;
196                }
197                
198                if let Some(href) = element.value().attr("href") {
199                    let text = element.text().collect::<String>().trim().to_string();
200                    
201                    // Basic filtering
202                    if text.len() > 5 && text.len() < 200 && 
203                       !text.to_lowercase().contains("sign in") &&
204                       !href.contains("accounts.google.com") {
205                        results.push((text, href.to_string()));
206                        
207                        // Stop when we have enough results
208                        if results.len() >= MAX_RESULTS {
209                            break;
210                        }
211                    }
212                }
213            }
214        }
215
216        if results.is_empty() {
217            return Err(BrightDataError::ToolError("No search results found in HTML".into()));
218        }
219
220        // Format results with limit
221        let formatted_results: Vec<String> = results.iter().take(MAX_RESULTS).enumerate().map(|(i, (title, url))| {
222            format!("{}. **{}**\n   Link: {}\n", i + 1, title, url)
223        }).collect();
224
225        let content_text = format!("🔍 **Search Results for '{}'**\n\n{}", 
226                                 query, formatted_results.join("\n"));
227
228        let mcp_content = vec![McpContent::text(content_text)];
229        info!("✅ Extracted {} results from HTML", results.len());
230
231        let raw_result = json!({
232            "query": query,
233            "results": results.iter().take(MAX_RESULTS).map(|(title, url)| json!({
234                "title": title,
235                "url": url
236            })).collect::<Vec<_>>(),
237            "source": "html_parsed"
238        });
239
240        Ok(ToolResult::success_with_raw(mcp_content, raw_result))
241    }
242
243    fn build_search_url(&self, engine: &str, query: &str) -> String {
244        let encoded_query = urlencoding::encode(query);
245        match engine {
246            "bing" => format!("https://www.bing.com/search?q={}", encoded_query),
247            "yandex" => format!("https://yandex.com/search/?text={}", encoded_query),
248            "duckduckgo" => format!("https://duckduckgo.com/?q={}", encoded_query),
249            _ => format!("https://www.google.com/search?q={}", encoded_query),
250        }
251    }
252}