1use crate::tool::{Tool, ToolResult, McpContent};
3use crate::error::BrightDataError;
4use crate::extras::logger::JSON_LOGGER;
5use crate::filters::{ResponseFilter, ResponseStrategy, ResponseType};
6use async_trait::async_trait;
7use serde_json::{json, Value};
8use reqwest::Client;
9use std::time::Duration;
10use std::collections::HashMap;
11use log::{info, warn};
12
/// Stateless tool that exposes web search under the tool name `search_web`.
///
/// Requests are sent through the BrightData API (see the `Tool` implementation
/// and the inherent helper methods on this type).
pub struct SearchEngine;
14
15#[async_trait]
16impl Tool for SearchEngine {
17 fn name(&self) -> &str {
18 "search_web"
19 }
20
21 fn description(&self) -> &str {
22 "Search the web using various search engines via BrightData SERP API with pagination, advanced parameters, and intelligent filtering"
23 }
24
25 fn input_schema(&self) -> Value {
26 json!({
27 "type": "object",
28 "properties": {
29 "query": {
30 "type": "string",
31 "description": "Search query"
32 },
33 "engine": {
34 "type": "string",
35 "enum": ["google", "bing", "yandex", "duckduckgo"],
36 "description": "Search engine to use",
37 "default": "google"
38 },
39 "page": {
40 "type": "integer",
41 "description": "Page number for pagination (1-based)",
42 "minimum": 1,
43 "maximum": 5,
44 "default": 1
45 },
46 "num_results": {
47 "type": "integer",
48 "description": "Number of results per page (5-50)",
49 "minimum": 5,
50 "maximum": 50,
51 "default": 20
52 },
53 "country": {
54 "type": "string",
55 "description": "Country code for localized results (e.g., 'us', 'in', 'uk', 'ca')",
56 "default": "us"
57 },
58 "language": {
59 "type": "string",
60 "description": "Language code for results (e.g., 'en', 'hi', 'es', 'fr')",
61 "default": "en"
62 },
63 "safe_search": {
64 "type": "string",
65 "enum": ["off", "moderate", "strict"],
66 "description": "Safe search filter level",
67 "default": "moderate"
68 },
69 "time_filter": {
70 "type": "string",
71 "enum": ["any", "hour", "day", "week", "month", "year"],
72 "description": "Time-based filter for results",
73 "default": "any"
74 },
75 "search_type": {
76 "type": "string",
77 "enum": ["web", "images", "videos", "news", "shopping"],
78 "description": "Type of search results",
79 "default": "web"
80 },
81 "use_serp_api": {
82 "type": "boolean",
83 "description": "Use BrightData SERP API for structured results (recommended)",
84 "default": true
85 }
86 },
87 "required": ["query"]
88 })
89 }
90
91 async fn execute_internal(&self, parameters: Value) -> Result<ToolResult, BrightDataError> {
97 let query = parameters
98 .get("query")
99 .and_then(|v| v.as_str())
100 .ok_or_else(|| BrightDataError::ToolError("Missing 'query' parameter".into()))?;
101
102 let engine = parameters
103 .get("engine")
104 .and_then(|v| v.as_str())
105 .unwrap_or("google");
106
107 let page = parameters
108 .get("page")
109 .and_then(|v| v.as_i64())
110 .unwrap_or(1) as u32;
111
112 let num_results = parameters
113 .get("num_results")
114 .and_then(|v| v.as_i64())
115 .unwrap_or(20) as u32;
116
117 let country = parameters
118 .get("country")
119 .and_then(|v| v.as_str())
120 .unwrap_or("us");
121
122 let language = parameters
123 .get("language")
124 .and_then(|v| v.as_str())
125 .unwrap_or("en");
126
127 let safe_search = parameters
128 .get("safe_search")
129 .and_then(|v| v.as_str())
130 .unwrap_or("moderate");
131
132 let time_filter = parameters
133 .get("time_filter")
134 .and_then(|v| v.as_str())
135 .unwrap_or("any");
136
137 let search_type = parameters
138 .get("search_type")
139 .and_then(|v| v.as_str())
140 .unwrap_or("web");
141
142 let use_serp_api = parameters
143 .get("use_serp_api")
144 .and_then(|v| v.as_bool())
145 .unwrap_or(true);
146
147 let query_priority = ResponseStrategy::classify_query_priority(query);
149 let recommended_tokens = ResponseStrategy::get_recommended_token_allocation(query);
150
151 let execution_id = self.generate_execution_id();
155
156 info!("🔍 Search query: '{}' (engine: {}, priority: {:?}, tokens: {})",
157 query, engine, query_priority, recommended_tokens);
158
159 let result = if use_serp_api {
160 self.search_with_brightdata_serp_api_with_priority(
161 query, engine, page, num_results, country, language,
162 safe_search, time_filter, search_type, query_priority,
163 recommended_tokens, &execution_id
164 ).await?
165 } else {
166 self.search_with_brightdata_with_priority(query, engine, query_priority, recommended_tokens, &execution_id).await?
168 };
169
170 let content = result.get("content").and_then(|c| c.as_str()).unwrap_or("");
171 let source_used = if use_serp_api { "Enhanced SERP" } else { "Legacy" };
172
173 let tool_result = if std::env::var("TRUNCATE_FILTER")
175 .map(|v| v.to_lowercase() == "true")
176 .unwrap_or(false) {
177
178 ResponseStrategy::create_financial_response(
179 "search", query, "web", source_used, content, result.clone()
180 )
181 } else {
182 let content_text = if use_serp_api {
184 result.get("formatted_content").and_then(|c| c.as_str()).unwrap_or(content)
185 } else {
186 content
187 };
188
189 let mcp_content = if use_serp_api {
190 vec![McpContent::text(format!(
191 "🔍 **Enhanced Search Results for '{}'**\n\nEngine: {} | Page: {} | Results: {} | Country: {} | Language: {} | Priority: {:?} | Tokens: {}\nSearch Type: {} | Safe Search: {} | Time Filter: {}\nExecution ID: {}\n\n{}",
192 query, engine, page, num_results, country, language, query_priority, recommended_tokens, search_type, safe_search, time_filter, execution_id, content_text
193 ))]
194 } else {
195 vec![McpContent::text(format!(
196 "🔍 **Search Results for '{}'**\n\nEngine: {} | Priority: {:?} | Tokens: {}\nExecution ID: {}\n\n{}",
197 query, engine, query_priority, recommended_tokens, execution_id, content_text
198 ))]
199 };
200 ToolResult::success_with_raw(mcp_content, result)
201 };
202
203 if std::env::var("TRUNCATE_FILTER")
205 .map(|v| v.to_lowercase() == "true")
206 .unwrap_or(false) {
207 Ok(ResponseStrategy::apply_size_limits(tool_result))
208 } else {
209 Ok(tool_result)
210 }
211 }
212}
213
214impl SearchEngine {
215 fn generate_execution_id(&self) -> String {
216 format!("search_{}", chrono::Utc::now().format("%Y%m%d_%H%M%S%.3f"))
217 }
218
219 async fn handle_brightdata_response_with_priority(
221 &self,
222 raw_content: String,
223 query: &str,
224 engine: &str,
225 priority: crate::filters::strategy::QueryPriority,
226 token_budget: usize,
227 execution_id: &str,
228 ) -> Result<Value, BrightDataError> {
229
230 if !std::env::var("TRUNCATE_FILTER")
232 .map(|v| v.to_lowercase() == "true")
233 .unwrap_or(false) {
234 return Ok(json!({
236 "content": raw_content,
237 "formatted_content": self.format_search_results(&raw_content, query, 1, 20, "web"),
238 "query": query,
239 "engine": engine,
240 "priority": format!("{:?}", priority),
241 "token_budget": token_budget,
242 "execution_id": execution_id,
243 "success": true,
244 "api_type": "no_filter"
245 }));
246 }
247
248 let response_type = ResponseStrategy::determine_response_type(&raw_content, query);
250
251 match response_type {
253 ResponseType::Skip => {
254 return Err(BrightDataError::ToolError("Skipping low quality search source".into()));
256 }
257
258 ResponseType::Emergency => {
259 let max_tokens = std::cmp::min(token_budget / 4, 15);
261 let emergency_content = ResponseFilter::extract_high_value_financial_data(
262 &raw_content,
263 max_tokens
264 );
265
266 return Ok(json!({
267 "content": emergency_content,
268 "formatted_content": emergency_content,
269 "response_type": "emergency",
270 "query": query,
271 "engine": engine,
272 "priority": format!("{:?}", priority),
273 "token_budget": token_budget,
274 "execution_id": execution_id,
275 "success": true,
276 "api_type": "emergency_serp"
277 }));
278 }
279
280 ResponseType::KeyMetrics => {
281 let max_tokens = std::cmp::min(token_budget / 3, 40);
283 let metrics_content = ResponseFilter::extract_high_value_financial_data(
284 &raw_content,
285 max_tokens
286 );
287
288 return Ok(json!({
289 "content": metrics_content,
290 "formatted_content": metrics_content,
291 "response_type": "key_metrics",
292 "query": query,
293 "engine": engine,
294 "priority": format!("{:?}", priority),
295 "token_budget": token_budget,
296 "execution_id": execution_id,
297 "success": true,
298 "api_type": "metrics_serp"
299 }));
300 }
301
302 ResponseType::Summary => {
303 let max_chars = std::cmp::min(token_budget * 4 / 2, 200); let summary_content = ResponseFilter::smart_truncate_preserving_financial_data(
306 &raw_content,
307 max_chars
308 );
309
310 let formatted_content = self.format_search_results_with_priority(&summary_content, query, 1, 1, "web", priority);
311
312 return Ok(json!({
313 "content": summary_content,
314 "formatted_content": formatted_content,
315 "response_type": "summary",
316 "query": query,
317 "engine": engine,
318 "priority": format!("{:?}", priority),
319 "token_budget": token_budget,
320 "execution_id": execution_id,
321 "success": true,
322 "api_type": "summary_serp"
323 }));
324 }
325
326 ResponseType::Filtered => {
327 let filtered_content = ResponseFilter::filter_financial_content(&raw_content);
329 let max_chars = std::cmp::min(token_budget * 4 / 2, 400);
330 let truncated_content = ResponseFilter::truncate_content(&filtered_content, max_chars);
331
332 let formatted_content = self.format_search_results_with_priority(&truncated_content, query, 1, 10, "web", priority);
333
334 return Ok(json!({
335 "content": truncated_content,
336 "formatted_content": formatted_content,
337 "response_type": "filtered",
338 "query": query,
339 "engine": engine,
340 "priority": format!("{:?}", priority),
341 "token_budget": token_budget,
342 "execution_id": execution_id,
343 "success": true,
344 "api_type": "filtered_serp"
345 }));
346 }
347
348 _ => {
349 let max_tokens = std::cmp::min(token_budget / 4, 20);
351 let minimal_content = ResponseFilter::extract_high_value_financial_data(&raw_content, max_tokens);
352 return Ok(json!({
353 "content": minimal_content,
354 "formatted_content": minimal_content,
355 "response_type": "fallback",
356 "query": query,
357 "engine": engine,
358 "priority": format!("{:?}", priority),
359 "token_budget": token_budget,
360 "execution_id": execution_id,
361 "success": true,
362 "api_type": "fallback_serp"
363 }));
364 }
365 }
366 }
367
368 async fn search_with_brightdata_serp_api_with_priority(
370 &self,
371 query: &str,
372 engine: &str,
373 page: u32,
374 num_results: u32,
375 country: &str,
376 language: &str,
377 safe_search: &str,
378 time_filter: &str,
379 search_type: &str,
380 priority: crate::filters::strategy::QueryPriority,
381 token_budget: usize,
382 execution_id: &str,
383 ) -> Result<Value, BrightDataError> {
384 let api_token = std::env::var("BRIGHTDATA_API_TOKEN")
385 .or_else(|_| std::env::var("API_TOKEN"))
386 .map_err(|_| BrightDataError::ToolError("Missing BRIGHTDATA_API_TOKEN".into()))?;
387
388 let base_url = std::env::var("BRIGHTDATA_BASE_URL")
389 .unwrap_or_else(|_| "https://api.brightdata.com".to_string());
390
391 let zone = std::env::var("BRIGHTDATA_SERP_ZONE")
392 .unwrap_or_else(|_| "serp_api2".to_string());
393
394 let effective_num_results = match priority {
396 crate::filters::strategy::QueryPriority::Critical => num_results,
397 crate::filters::strategy::QueryPriority::High => std::cmp::min(num_results, 30),
398 crate::filters::strategy::QueryPriority::Medium => std::cmp::min(num_results, 20),
399 crate::filters::strategy::QueryPriority::Low => std::cmp::min(num_results, 10),
400 };
401
402 let mut query_params = HashMap::new();
404 query_params.insert("q".to_string(), query.to_string());
405
406 if page > 1 {
408 let start = (page - 1) * effective_num_results;
409 query_params.insert("start".to_string(), start.to_string());
410 }
411 query_params.insert("num".to_string(), effective_num_results.to_string());
412
413 query_params.insert("gl".to_string(), country.to_string()); query_params.insert("hl".to_string(), language.to_string()); let safe_value = match safe_search {
419 "off" => "off",
420 "strict" => "strict",
421 _ => "moderate" };
423 query_params.insert("safe".to_string(), safe_value.to_string());
424
425 if time_filter != "any" && !matches!(priority, crate::filters::strategy::QueryPriority::Low) {
427 let tbs_value = match time_filter {
428 "hour" => "qdr:h",
429 "day" => "qdr:d",
430 "week" => "qdr:w",
431 "month" => "qdr:m",
432 "year" => "qdr:y",
433 _ => ""
434 };
435 if !tbs_value.is_empty() {
436 query_params.insert("tbs".to_string(), tbs_value.to_string());
437 }
438 }
439
440 if search_type != "web" && !matches!(priority, crate::filters::strategy::QueryPriority::Low) {
442 let tbm_value = match search_type {
443 "images" => "isch",
444 "videos" => "vid",
445 "news" => "nws",
446 "shopping" => "shop",
447 _ => ""
448 };
449 if !tbm_value.is_empty() {
450 query_params.insert("tbm".to_string(), tbm_value.to_string());
451 }
452 }
453
454 info!("🔍 Priority {} enhanced SERP API search: {} (engine: {}, page: {}, results: {}, country: {}) using zone: {} (execution: {})",
455 format!("{:?}", priority), query, engine, page, effective_num_results, country, zone, execution_id);
456
457 let mut search_url = self.get_base_search_url(engine);
459 let query_string = query_params.iter()
460 .map(|(k, v)| format!("{}={}", k, urlencoding::encode(v)))
461 .collect::<Vec<_>>()
462 .join("&");
463
464 if !query_string.is_empty() {
465 search_url = format!("{}?{}", search_url, query_string);
466 }
467
468 let mut payload = json!({
470 "url": search_url,
471 "zone": zone,
472 "format": "raw",
473 "render": true, "data_format": "markdown"
475 });
476
477 let client = Client::builder()
480 .timeout(Duration::from_secs(90))
481 .build()
482 .map_err(|e| BrightDataError::ToolError(e.to_string()))?;
483
484 let response = client
485 .post(&format!("{}/request", base_url))
486 .header("Authorization", format!("Bearer {}", api_token))
487 .header("Content-Type", "application/json")
488 .json(&payload)
489 .send()
490 .await
491 .map_err(|e| BrightDataError::ToolError(format!("Enhanced search request failed: {}", e)))?;
492
493 let status = response.status().as_u16();
494 let response_headers: HashMap<String, String> = response
495 .headers()
496 .iter()
497 .map(|(k, v)| (k.to_string(), v.to_str().unwrap_or("").to_string()))
498 .collect();
499
500 if let Err(e) = JSON_LOGGER.log_brightdata_request(
502 execution_id,
503 &zone,
504 &format!("Enhanced SERP: {} ({})", query, engine),
505 payload.clone(),
506 status,
507 response_headers,
508 "markdown"
509 ).await {
510 warn!("Failed to log BrightData request: {}", e);
511 }
512
513 if !response.status().is_success() {
514 let error_text = response.text().await.unwrap_or_default();
515 return Err(BrightDataError::ToolError(format!(
516 "BrightData SERP API error {}: {}",
517 status, error_text
518 )));
519 }
520
521 let raw_content = response.text().await
522 .map_err(|e| BrightDataError::ToolError(format!("Failed to read SERP response: {}", e)))?;
523
524 self.handle_brightdata_response_with_priority(raw_content, query, engine, priority, token_budget, execution_id).await
526 }
527
528 async fn search_with_brightdata_with_priority(
530 &self,
531 query: &str,
532 engine: &str,
533 priority: crate::filters::strategy::QueryPriority,
534 token_budget: usize,
535 execution_id: &str
536 ) -> Result<Value, BrightDataError> {
537 let api_token = std::env::var("BRIGHTDATA_API_TOKEN")
538 .or_else(|_| std::env::var("API_TOKEN"))
539 .map_err(|_| BrightDataError::ToolError("Missing BRIGHTDATA_API_TOKEN".into()))?;
540
541 let base_url = std::env::var("BRIGHTDATA_BASE_URL")
542 .unwrap_or_else(|_| "https://api.brightdata.com".to_string());
543
544 let search_url = self.build_search_url(engine, query);
545 let zone = std::env::var("BRIGHTDATA_SERP_ZONE")
546 .unwrap_or_else(|_| "serp_api2".to_string());
547
548 info!("🔍 Priority {} search URL: {} using zone: {} (execution: {})",
549 format!("{:?}", priority), search_url, zone, execution_id);
550
551 let mut payload = json!({
552 "url": search_url,
553 "zone": zone,
554 "format": "raw",
555 "data_format": "markdown"
556 });
557
558 if std::env::var("TRUNCATE_FILTER")
560 .map(|v| v.to_lowercase() == "true")
561 .unwrap_or(false) {
562
563 payload["processing_priority"] = json!(format!("{:?}", priority));
564 payload["token_budget"] = json!(token_budget);
565 }
566
567 let client = Client::builder()
568 .timeout(Duration::from_secs(90))
569 .build()
570 .map_err(|e| BrightDataError::ToolError(e.to_string()))?;
571
572 let response = client
573 .post(&format!("{}/request", base_url))
574 .header("Authorization", format!("Bearer {}", api_token))
575 .header("Content-Type", "application/json")
576 .json(&payload)
577 .send()
578 .await
579 .map_err(|e| BrightDataError::ToolError(format!("Search request failed: {}", e)))?;
580
581 let status = response.status().as_u16();
582 let response_headers: HashMap<String, String> = response
583 .headers()
584 .iter()
585 .map(|(k, v)| (k.to_string(), v.to_str().unwrap_or("").to_string()))
586 .collect();
587
588 if let Err(e) = JSON_LOGGER.log_brightdata_request(
590 execution_id,
591 &zone,
592 &search_url,
593 payload.clone(),
594 status,
595 response_headers,
596 "markdown"
597 ).await {
598 warn!("Failed to log BrightData request: {}", e);
599 }
600
601 if !response.status().is_success() {
602 let error_text = response.text().await.unwrap_or_default();
603 return Err(BrightDataError::ToolError(format!(
604 "BrightData API error {}: {}",
605 status, error_text
606 )));
607 }
608
609 let raw_content = response.text().await
610 .map_err(|e| BrightDataError::ToolError(format!("Failed to read response: {}", e)))?;
611
612 self.handle_brightdata_response_with_priority(raw_content, query, engine, priority, token_budget, execution_id).await
614 }
615
616 fn get_base_search_url(&self, engine: &str) -> String {
617 match engine {
618 "bing" => "https://www.bing.com/search".to_string(),
619 "yandex" => "https://yandex.com/search/".to_string(),
620 "duckduckgo" => "https://duckduckgo.com/".to_string(),
621 _ => "https://www.google.com/search".to_string(),
622 }
623 }
624
625 fn build_search_url(&self, engine: &str, query: &str) -> String {
626 let encoded_query = urlencoding::encode(query);
627 match engine {
628 "bing" => format!("https://www.bing.com/search?q={}", encoded_query),
629 "yandex" => format!("https://yandex.com/search/?text={}", encoded_query),
630 "duckduckgo" => format!("https://duckduckgo.com/?q={}", encoded_query),
631 _ => format!("https://www.google.com/search?q={}", encoded_query),
632 }
633 }
634
635 fn format_search_results_with_priority(
637 &self,
638 content: &str,
639 query: &str,
640 page: u32,
641 num_results: u32,
642 search_type: &str,
643 priority: crate::filters::strategy::QueryPriority
644 ) -> String {
645 if std::env::var("TRUNCATE_FILTER")
647 .map(|v| v.to_lowercase() == "true")
648 .unwrap_or(false) {
649
650 return format!("🔍 {}: {}",
652 ResponseStrategy::ultra_abbreviate_query(query),
653 content
654 );
655 }
656
657 self.format_search_results(content, query, page, num_results, search_type)
659 }
660
661 fn format_search_results(&self, content: &str, query: &str, page: u32, num_results: u32, search_type: &str) -> String {
662 if std::env::var("TRUNCATE_FILTER")
664 .map(|v| v.to_lowercase() == "true")
665 .unwrap_or(false) {
666
667 return format!("🔍 {}: {}",
669 ResponseStrategy::ultra_abbreviate_query(query),
670 content
671 );
672 }
673
674 let mut formatted = String::new();
676
677 formatted.push_str(&format!("# Search Results for: {}\n\n", query));
679 formatted.push_str(&format!("**Page**: {} | **Results per page**: {} | **Type**: {}\n\n", page, num_results, search_type));
680
681 if let Ok(json_data) = serde_json::from_str::<Value>(content) {
683 if let Some(results) = json_data.get("organic_results").and_then(|r| r.as_array()) {
685 formatted.push_str("## Organic Results\n\n");
686 for (i, result) in results.iter().take(num_results as usize).enumerate() {
687 let title = result.get("title").and_then(|t| t.as_str()).unwrap_or("No title");
688 let link = result.get("link").and_then(|l| l.as_str()).unwrap_or("");
689 let snippet = result.get("snippet").and_then(|s| s.as_str()).unwrap_or("");
690
691 formatted.push_str(&format!("### {}. {}\n", i + 1, title));
692 if !link.is_empty() {
693 formatted.push_str(&format!("**URL**: {}\n", link));
694 }
695 if !snippet.is_empty() {
696 formatted.push_str(&format!("**Snippet**: {}\n", snippet));
697 }
698 formatted.push_str("\n");
699 }
700 } else {
701 formatted.push_str("## Structured Results\n\n");
703 formatted.push_str("```json\n");
704 formatted.push_str(&serde_json::to_string_pretty(&json_data).unwrap_or_else(|_| content.to_string()));
705 formatted.push_str("\n```\n");
706 }
707 } else {
708 formatted.push_str("## Search Results\n\n");
710 formatted.push_str(content);
711 }
712
713 if page > 1 || num_results < 100 {
715 formatted.push_str(&format!("\n---\n*Page {} of search results*\n", page));
716 if page > 1 {
717 formatted.push_str("💡 *To get more results, use page parameter*\n");
718 }
719 }
720
721 formatted
722 }
723}