// snm_brightdata_client/tools/search.rs
use crate::tool::{Tool, ToolResult, McpContent};
3use crate::error::BrightDataError;
4use async_trait::async_trait;
5use serde_json::{json, Value};
6use reqwest::Client;
7use std::time::Duration;
8use scraper::{Html, Selector};
9use log::{info, error, debug, warn};
10
/// Largest HTML body, in bytes, that is handed to the HTML parser.
/// Bodies longer than this are truncated before parsing.
const MAX_RESPONSE_SIZE: usize = 1_000_000;

/// Timeout, in seconds, applied both to the HTTP client and to the search as a whole.
const REQUEST_TIMEOUT: u64 = 30;

/// Maximum number of search results included in a single tool response.
const MAX_RESULTS: usize = 5;

/// Stateless web-search tool backed by the BrightData request API.
pub struct SearchEngine;
16
17#[async_trait]
18impl Tool for SearchEngine {
19 fn name(&self) -> &str {
20 "search_web"
21 }
22
23 fn description(&self) -> &str {
24 "Search the web using BrightData SERP proxy and extract results"
25 }
26
27 fn input_schema(&self) -> Value {
28 json!({
29 "type": "object",
30 "properties": {
31 "query": {
32 "type": "string",
33 "description": "Search query"
34 },
35 "engine": {
36 "type": "string",
37 "enum": ["google", "bing", "yandex", "duckduckgo"],
38 "description": "Search engine to use",
39 "default": "google"
40 }
41 },
42 "required": ["query"]
43 })
44 }
45
46 async fn execute(&self, parameters: Value) -> Result<ToolResult, BrightDataError> {
47 let query = parameters
48 .get("query")
49 .and_then(|v| v.as_str())
50 .ok_or_else(|| BrightDataError::ToolError("Missing 'query' parameter".into()))?;
51
52 let engine = parameters
53 .get("engine")
54 .and_then(|v| v.as_str())
55 .unwrap_or("google");
56
57 info!("🔍 Starting web search for query: '{}'", query);
58
59 let search_future = self.search_with_brightdata(query, engine);
61 let timeout_duration = Duration::from_secs(REQUEST_TIMEOUT);
62
63 let result = match tokio::time::timeout(timeout_duration, search_future).await {
64 Ok(result) => result?,
65 Err(_) => {
66 error!("⏱️ Search request timed out after {} seconds", REQUEST_TIMEOUT);
67 return Err(BrightDataError::ToolError("Search request timed out".into()));
68 }
69 };
70
71 if let Some(organic_results) = result.get("organic").and_then(|v| v.as_array()) {
73 if !organic_results.is_empty() {
74 return self.format_structured_results(query, organic_results, &result);
75 }
76 }
77
78 if let Some(html_content) = result.as_object()
80 .and_then(|obj| obj.get("body"))
81 .and_then(|body| body.as_str()) {
82
83 if html_content.len() > MAX_RESPONSE_SIZE {
85 warn!("⚠️ Response too large: {} bytes, truncating", html_content.len());
86 let truncated = &html_content[..MAX_RESPONSE_SIZE];
87 return self.parse_html_results(query, truncated);
88 }
89
90 info!("📄 Parsing HTML content ({} bytes)", html_content.len());
91 return self.parse_html_results(query, html_content);
92 }
93
94 error!("❌ No valid search results found for query: '{}'", query);
95 Err(BrightDataError::ToolError("No valid search results found".into()))
96 }
97}
98
99impl SearchEngine {
100 async fn search_with_brightdata(&self, query: &str, engine: &str) -> Result<Value, BrightDataError> {
101 let api_token = std::env::var("BRIGHTDATA_API_TOKEN")
102 .or_else(|_| std::env::var("API_TOKEN"))
103 .map_err(|_| BrightDataError::ToolError("Missing BRIGHTDATA_API_TOKEN".into()))?;
104
105 let base_url = "https://api.brightdata.com";
106 let search_url = self.build_search_url(engine, query);
107 let zone = std::env::var("BRIGHTDATA_SERP_ZONE")
108 .unwrap_or_else(|_| "serp_api2".to_string());
109
110 let payload = json!({
111 "url": search_url,
112 "zone": zone,
113 "format": "raw" });
115
116 info!("🌐 Making BrightData request to: {}", search_url);
117 debug!("📦 Payload: {}", payload);
118
119 let client = Client::builder()
120 .timeout(Duration::from_secs(REQUEST_TIMEOUT))
121 .build()
122 .map_err(|e| BrightDataError::ToolError(e.to_string()))?;
123
124 let response = client
125 .post(&format!("{}/request", base_url))
126 .header("Authorization", format!("Bearer {}", api_token))
127 .header("Content-Type", "application/json")
128 .json(&payload)
129 .send()
130 .await
131 .map_err(|e| BrightDataError::ToolError(format!("Search request failed: {}", e)))?;
132
133 let status = response.status();
134 info!("📡 BrightData response status: {}", status);
135
136 if !status.is_success() {
137 let err_text = response.text().await.unwrap_or_default();
138 error!("❌ BrightData API error {}: {}", status, err_text);
139 return Err(BrightDataError::ToolError(format!(
140 "BrightData error {}: {}",
141 status, err_text
142 )));
143 }
144
145 let response_text = response.text().await
147 .map_err(|e| BrightDataError::ToolError(format!("Failed to read response: {}", e)))?;
148
149 info!("✅ Received response, length: {} bytes", response_text.len());
150
151 Ok(json!({
153 "body": response_text,
154 "format": "html",
155 "success": true
156 }))
157 }
158
159 fn format_structured_results(&self, query: &str, organic_results: &[Value], full_result: &Value) -> Result<ToolResult, BrightDataError> {
160 let mut formatted_results = Vec::new();
161
162 for (i, result) in organic_results.iter().take(MAX_RESULTS).enumerate() {
164 let title = result.get("title").and_then(|t| t.as_str()).unwrap_or("No title");
165 let link = result.get("link").and_then(|l| l.as_str()).unwrap_or("");
166 let description = result.get("description").and_then(|d| d.as_str()).unwrap_or("");
167
168 formatted_results.push(format!(
169 "{}. **{}**\n {}\n Link: {}\n",
170 i + 1, title, description, link
171 ));
172 }
173
174 let content_text = format!("🔍 **Search Results for '{}'**\n\n{}", query, formatted_results.join("\n"));
175 let mcp_content = vec![McpContent::text(content_text)];
176
177 info!("✅ Returning {} structured search results", organic_results.len().min(MAX_RESULTS));
178 Ok(ToolResult::success_with_raw(mcp_content, full_result.clone()))
179 }
180
181 fn parse_html_results(&self, query: &str, html_content: &str) -> Result<ToolResult, BrightDataError> {
182 info!("🔧 Starting HTML parsing for {} bytes", html_content.len());
183
184 let document = Html::parse_document(html_content);
185 let mut results = Vec::new();
186
187 if let Ok(selector) = Selector::parse("a[href*='http']") {
189 let mut count = 0;
190 for element in document.select(&selector) {
191 count += 1;
193 if count > 20 {
194 warn!("⚠️ Reached maximum link extraction limit (20)");
195 break;
196 }
197
198 if let Some(href) = element.value().attr("href") {
199 let text = element.text().collect::<String>().trim().to_string();
200
201 if text.len() > 5 && text.len() < 200 &&
203 !text.to_lowercase().contains("sign in") &&
204 !href.contains("accounts.google.com") {
205 results.push((text, href.to_string()));
206
207 if results.len() >= MAX_RESULTS {
209 break;
210 }
211 }
212 }
213 }
214 }
215
216 if results.is_empty() {
217 return Err(BrightDataError::ToolError("No search results found in HTML".into()));
218 }
219
220 let formatted_results: Vec<String> = results.iter().take(MAX_RESULTS).enumerate().map(|(i, (title, url))| {
222 format!("{}. **{}**\n Link: {}\n", i + 1, title, url)
223 }).collect();
224
225 let content_text = format!("🔍 **Search Results for '{}'**\n\n{}",
226 query, formatted_results.join("\n"));
227
228 let mcp_content = vec![McpContent::text(content_text)];
229 info!("✅ Extracted {} results from HTML", results.len());
230
231 let raw_result = json!({
232 "query": query,
233 "results": results.iter().take(MAX_RESULTS).map(|(title, url)| json!({
234 "title": title,
235 "url": url
236 })).collect::<Vec<_>>(),
237 "source": "html_parsed"
238 });
239
240 Ok(ToolResult::success_with_raw(mcp_content, raw_result))
241 }
242
243 fn build_search_url(&self, engine: &str, query: &str) -> String {
244 let encoded_query = urlencoding::encode(query);
245 match engine {
246 "bing" => format!("https://www.bing.com/search?q={}", encoded_query),
247 "yandex" => format!("https://yandex.com/search/?text={}", encoded_query),
248 "duckduckgo" => format!("https://duckduckgo.com/?q={}", encoded_query),
249 _ => format!("https://www.google.com/search?q={}", encoded_query),
250 }
251 }
252}