snm_brightdata_client/tools/
scrape.rs1use crate::tool::{Tool, ToolResult, McpContent};
3use crate::error::BrightDataError;
4use crate::extras::logger::JSON_LOGGER;
5use crate::filters::{ResponseFilter, ResponseStrategy};
6use async_trait::async_trait;
7use reqwest::Client;
8use serde_json::{json, Value};
9use std::env;
10use std::time::Duration;
11use std::collections::HashMap;
12use log::info;
13
/// Stateless scraping tool backed by BrightData's Web Unlocker `/request`
/// endpoint; all configuration is read from environment variables at call
/// time (BRIGHTDATA_API_TOKEN, BRIGHTDATA_BASE_URL, WEB_UNLOCKER_ZONE,
/// DEDUCT_DATA).
pub struct Scraper;
15
16#[async_trait]
17impl Tool for Scraper {
18 fn name(&self) -> &str {
19 "scrape_website"
20 }
21
22 fn description(&self) -> &str {
23 "Scrape a webpage using BrightData - supports API, Web Unlocker, and Residential Proxy"
24 }
25
26 fn input_schema(&self) -> Value {
27 json!({
28 "type": "object",
29 "properties": {
30 "url": {
31 "type": "string",
32 "description": "The URL to scrape"
33 },
34 "method": {
35 "type": "string",
36 "enum": ["api", "web_unlocker_proxy", "residential_proxy", "auto"],
37 "description": "Method: 'api' for REST API, 'web_unlocker_proxy' for Web Unlocker proxy, 'residential_proxy' for standard proxy, 'auto' to detect best available",
38 "default": "auto"
39 },
40 "format": {
41 "type": "string",
42 "enum": ["raw", "markdown", "screenshot"],
43 "description": "Output format - raw (HTML), markdown, or screenshot (Web Unlocker only)",
44 "default": "markdown"
45 },
46 "country": {
47 "type": "string",
48 "description": "Country code for geo-targeting (e.g., 'us', 'in', 'uk')",
49 "default": ""
50 },
51 "city": {
52 "type": "string",
53 "description": "City for geo-targeting (Web Unlocker only)",
54 "default": ""
55 },
56 "zipcode": {
57 "type": "string",
58 "description": "Zipcode for precise geo-targeting (Web Unlocker only)",
59 "default": ""
60 },
61 "mobile": {
62 "type": "boolean",
63 "description": "Use mobile user agent",
64 "default": false
65 },
66 "wait_for": {
67 "type": "string",
68 "description": "CSS selector or text to wait for (Web Unlocker only)",
69 "default": ""
70 },
71 "custom_headers": {
72 "type": "object",
73 "description": "Custom headers to send",
74 "additionalProperties": true,
75 "default": {}
76 },
77 "disable_captcha_solving": {
78 "type": "boolean",
79 "description": "Disable automatic CAPTCHA solving (Web Unlocker only)",
80 "default": false
81 }
82 },
83 "required": ["url"]
84 })
85 }
86
87 async fn execute_internal(&self, parameters: Value) -> Result<ToolResult, BrightDataError> {
88 let url = parameters
89 .get("url")
90 .and_then(|v| v.as_str())
91 .ok_or_else(|| BrightDataError::ToolError("Missing 'url' parameter".into()))?;
92
93 let data_type = parameters
94 .get("data_type")
95 .and_then(|v| v.as_str())
96 .unwrap_or("auto");
97
98 let extraction_format = parameters
99 .get("extraction_format")
100 .and_then(|v| v.as_str())
101 .unwrap_or("structured");
102
103 let clean_content = parameters
104 .get("clean_content")
105 .and_then(|v| v.as_bool())
106 .unwrap_or(true);
107
108 let schema = parameters.get("schema").cloned();
109
110 let execution_id = self.generate_execution_id();
111
112 match self.scrape_with_brightdata(url, data_type, extraction_format, clean_content, schema, &execution_id).await {
113 Ok(result) => {
114 let content = result.get("content").and_then(|c| c.as_str()).unwrap_or("");
115
116 let formatted_response = self.create_formatted_scrape_response(
118 url, data_type, extraction_format, content, &execution_id
119 );
120
121 let tool_result = ToolResult::success_with_raw(
122 vec![McpContent::text(formatted_response)],
123 result
124 );
125
126 if self.is_data_reduction_enabled() {
128 Ok(ResponseStrategy::apply_size_limits(tool_result))
129 } else {
130 Ok(tool_result)
131 }
132 }
133 Err(_e) => {
134 let empty_response = json!({
136 "url": url,
137 "data_type": data_type,
138 "status": "no_data",
139 "reason": "brightdata_error",
140 "execution_id": execution_id
141 });
142
143 Ok(ToolResult::success_with_raw(
144 vec![McpContent::text("📊 **No Data Available**\n\nPlease try again with a different URL or check if the website is accessible.".to_string())],
145 empty_response
146 ))
147 }
148 }
149 }
150}
151
152impl Scraper {
153 fn is_data_reduction_enabled(&self) -> bool {
155 std::env::var("DEDUCT_DATA")
156 .unwrap_or_else(|_| "false".to_string())
157 .to_lowercase() == "true"
158 }
159
160 fn create_formatted_scrape_response(
162 &self,
163 url: &str,
164 data_type: &str,
165 extraction_format: &str,
166 content: &str,
167 execution_id: &str
168 ) -> String {
169 if !self.is_data_reduction_enabled() {
171 return format!(
172 "📊 **Data Extraction from: {}**\n\n## Full Content\n{}\n\n*Data Type: {} | Format: {} • Execution: {}*",
173 url,
174 content,
175 data_type,
176 extraction_format,
177 execution_id
178 );
179 }
180
181 format!(
184 "📊 **Data Extraction from: {}**\n\n## Content (TODO: Add Filtering)\n{}\n\n*Data Type: {} | Format: {} • Execution: {}*",
185 url,
186 content,
187 data_type,
188 extraction_format,
189 execution_id
190 )
191 }
192
193 fn generate_execution_id(&self) -> String {
194 format!("scrape_{}", chrono::Utc::now().format("%Y%m%d_%H%M%S%.3f"))
195 }
196
197 async fn scrape_with_brightdata(
199 &self,
200 url: &str,
201 data_type: &str,
202 extraction_format: &str,
203 clean_content: bool,
204 schema: Option<Value>,
205 execution_id: &str,
206 ) -> Result<Value, BrightDataError> {
207 let api_token = env::var("BRIGHTDATA_API_TOKEN")
208 .or_else(|_| env::var("API_TOKEN"))
209 .map_err(|_| BrightDataError::ToolError("Missing BRIGHTDATA_API_TOKEN".into()))?;
210
211 let base_url = env::var("BRIGHTDATA_BASE_URL")
212 .unwrap_or_else(|_| "https://api.brightdata.com".to_string());
213
214 let zone = env::var("WEB_UNLOCKER_ZONE").unwrap_or_else(|_| "web_unlocker".to_string());
216
217 info!("📊 Extracting from {} using WEB_UNLOCKER_ZONE: {} (execution: {})",
218 url, zone, execution_id);
219
220 let mut payload = json!({
222 "url": url,
223 "zone": zone,
224 "format": "json",
225 "data_format": "markdown" });
227
228 if let Some(schema_obj) = schema {
230 payload["extraction_schema"] = schema_obj;
231 }
232
233 let client = Client::builder()
234 .timeout(Duration::from_secs(120))
235 .build()
236 .map_err(|e| BrightDataError::ToolError(e.to_string()))?;
237
238 let response = client
239 .post(&format!("{}/request", base_url))
240 .header("Authorization", format!("Bearer {}", api_token))
241 .header("Content-Type", "application/json")
242 .json(&payload)
243 .send()
244 .await
245 .map_err(|e| BrightDataError::ToolError(format!("BrightData extraction request failed: {}", e)))?;
246
247 let status = response.status().as_u16();
248 let response_headers: HashMap<String, String> = response
249 .headers()
250 .iter()
251 .map(|(k, v)| (k.to_string(), v.to_str().unwrap_or("").to_string()))
252 .collect();
253
254 if let Err(e) = JSON_LOGGER.log_brightdata_request(
256 execution_id,
257 &zone,
258 url,
259 payload.clone(),
260 status,
261 response_headers,
262 extraction_format
263 ).await {
264 log::warn!("Failed to log BrightData extraction request: {}", e);
265 }
266
267 if !response.status().is_success() {
268 let error_text = response.text().await.unwrap_or_default();
269 return Err(BrightDataError::ToolError(format!(
270 "BrightData extraction error {}: {}",
271 status, error_text
272 )));
273 }
274
275 let raw_content = response.text().await
276 .map_err(|e| BrightDataError::ToolError(e.to_string()))?;
277
278 println!("################################################################################################################");
280 println!("BRIGHTDATA RAW RESPONSE FROM: {}", url);
281 println!("ZONE: {}", zone);
282 println!("EXECUTION: {}", execution_id);
283 println!("DATA TYPE: {}", data_type);
284 println!("EXTRACTION FORMAT: {}", extraction_format);
285 println!("CONTENT LENGTH: {} bytes", raw_content.len());
286 println!("################################################################################################################");
287 println!("{}", raw_content);
288 println!("################################################################################################################");
289 println!("END OF BRIGHTDATA RESPONSE");
290 println!("################################################################################################################");
291
292 if self.is_data_reduction_enabled() {
294 if ResponseFilter::is_error_page(&raw_content) {
295 return Err(BrightDataError::ToolError("Extraction returned error page".into()));
296 } else if ResponseStrategy::should_try_next_source(&raw_content) {
297 return Err(BrightDataError::ToolError("Content quality too low".into()));
298 }
299 }
300
301 println!("--------------------------------------------------------------------------");
303 println!("SENDING TO ANTHROPIC FROM EXTRACT TOOL:");
304 println!("URL: {}", url);
305 println!("DATA TYPE: {}", data_type);
306 println!("EXTRACTION FORMAT: {}", extraction_format);
307 println!("DATA REDUCTION ENABLED: {}", self.is_data_reduction_enabled());
308 println!("CONTENT LENGTH: {} bytes", raw_content.len());
309 println!("--------------------------------------------------------------------------");
310 println!("{}", raw_content);
311 println!("--------------------------------------------------------------------------");
312 println!("END OF CONTENT SENT TO ANTHROPIC");
313 println!("--------------------------------------------------------------------------");
314
315 Ok(json!({
317 "content": raw_content,
318 "metadata": {
319 "url": url,
320 "zone": zone,
321 "execution_id": execution_id,
322 "data_type": data_type,
323 "extraction_format": extraction_format,
324 "clean_content": clean_content,
325 "data_format": "markdown",
326 "data_reduction_enabled": self.is_data_reduction_enabled()
327 },
328 "success": true
329 }))
330 }
331}