snm_brightdata_client/tools/
scrape.rs

1// src/tools/scrape.rs
2use crate::tool::{Tool, ToolResult, McpContent};
3use crate::error::BrightDataError;
4use crate::extras::logger::JSON_LOGGER;
5use crate::filters::{ResponseFilter, ResponseStrategy};
6use async_trait::async_trait;
7use reqwest::Client;
8use serde_json::{json, Value};
9use std::env;
10use std::time::Duration;
11use std::collections::HashMap;
12use log::info;
13
/// Zero-sized marker type for the webpage-scraping tool; all behavior lives
/// in the `Tool` impl and the inherent `impl Scraper` block below.
pub struct Scraper;
15
16#[async_trait]
17impl Tool for Scraper {
18    fn name(&self) -> &str {
19        "scrape_website"
20    }
21
22    fn description(&self) -> &str {
23        "Scrape a webpage using BrightData - supports API, Web Unlocker, and Residential Proxy"
24    }
25
26    fn input_schema(&self) -> Value {
27        json!({
28            "type": "object",
29            "properties": {
30                "url": {
31                    "type": "string",
32                    "description": "The URL to scrape"
33                },
34                "method": {
35                    "type": "string",
36                    "enum": ["api", "web_unlocker_proxy", "residential_proxy", "auto"],
37                    "description": "Method: 'api' for REST API, 'web_unlocker_proxy' for Web Unlocker proxy, 'residential_proxy' for standard proxy, 'auto' to detect best available",
38                    "default": "auto"
39                },
40                "format": {
41                    "type": "string",
42                    "enum": ["raw", "markdown", "screenshot"],
43                    "description": "Output format - raw (HTML), markdown, or screenshot (Web Unlocker only)",
44                    "default": "markdown"
45                },
46                "country": {
47                    "type": "string",
48                    "description": "Country code for geo-targeting (e.g., 'us', 'in', 'uk')",
49                    "default": ""
50                },
51                "city": {
52                    "type": "string",
53                    "description": "City for geo-targeting (Web Unlocker only)",
54                    "default": ""
55                },
56                "zipcode": {
57                    "type": "string",
58                    "description": "Zipcode for precise geo-targeting (Web Unlocker only)",
59                    "default": ""
60                },
61                "mobile": {
62                    "type": "boolean",
63                    "description": "Use mobile user agent",
64                    "default": false
65                },
66                "wait_for": {
67                    "type": "string",
68                    "description": "CSS selector or text to wait for (Web Unlocker only)",
69                    "default": ""
70                },
71                "custom_headers": {
72                    "type": "object",
73                    "description": "Custom headers to send",
74                    "additionalProperties": true,
75                    "default": {}
76                },
77                "disable_captcha_solving": {
78                    "type": "boolean",
79                    "description": "Disable automatic CAPTCHA solving (Web Unlocker only)",
80                    "default": false
81                }
82            },
83            "required": ["url"]
84        })
85    }
86
87    async fn execute_internal(&self, parameters: Value) -> Result<ToolResult, BrightDataError> {
88        let url = parameters
89            .get("url")
90            .and_then(|v| v.as_str())
91            .ok_or_else(|| BrightDataError::ToolError("Missing 'url' parameter".into()))?;
92
93        let data_type = parameters
94            .get("data_type")
95            .and_then(|v| v.as_str())
96            .unwrap_or("auto");
97
98        let extraction_format = parameters
99            .get("extraction_format")
100            .and_then(|v| v.as_str())
101            .unwrap_or("structured");
102
103        let clean_content = parameters
104            .get("clean_content")
105            .and_then(|v| v.as_bool())
106            .unwrap_or(true);
107
108        let schema = parameters.get("schema").cloned();
109
110        let execution_id = self.generate_execution_id();
111        
112        match self.scrape_with_brightdata(url, data_type, extraction_format, clean_content, schema, &execution_id).await {
113            Ok(result) => {
114                let content = result.get("content").and_then(|c| c.as_str()).unwrap_or("");
115                
116                // Create formatted response based on DEDUCT_DATA setting
117                let formatted_response = self.create_formatted_scrape_response(
118                    url, data_type, extraction_format, content, &execution_id
119                );
120                
121                let tool_result = ToolResult::success_with_raw(
122                    vec![McpContent::text(formatted_response)], 
123                    result
124                );
125                
126                // Apply filtering only if DEDUCT_DATA=true
127                if self.is_data_reduction_enabled() {
128                    Ok(ResponseStrategy::apply_size_limits(tool_result))
129                } else {
130                    Ok(tool_result)
131                }
132            }
133            Err(_e) => {
134                // Return empty data for BrightData errors - Anthropic will retry
135                let empty_response = json!({
136                    "url": url,
137                    "data_type": data_type,
138                    "status": "no_data",
139                    "reason": "brightdata_error",
140                    "execution_id": execution_id
141                });
142                
143                Ok(ToolResult::success_with_raw(
144                    vec![McpContent::text("📊 **No Data Available**\n\nPlease try again with a different URL or check if the website is accessible.".to_string())],
145                    empty_response
146                ))
147            }
148        }
149    }
150}
151
152impl Scraper {
153    /// Check if data reduction is enabled via DEDUCT_DATA environment variable only
154    fn is_data_reduction_enabled(&self) -> bool {
155        std::env::var("DEDUCT_DATA")
156            .unwrap_or_else(|_| "false".to_string())
157            .to_lowercase() == "true"
158    }
159
160    /// Create formatted response with DEDUCT_DATA control
161    fn create_formatted_scrape_response(
162        &self,
163        url: &str,
164        data_type: &str,
165        extraction_format: &str,
166        content: &str,
167        execution_id: &str
168    ) -> String {
169        // If DEDUCT_DATA=false, return full content with basic formatting
170        if !self.is_data_reduction_enabled() {
171            return format!(
172                "📊 **Data Extraction from: {}**\n\n## Full Content\n{}\n\n*Data Type: {} | Format: {} • Execution: {}*",
173                url, 
174                content,
175                data_type, 
176                extraction_format,
177                execution_id
178            );
179        }
180
181        // TODO: Add filtered data extraction logic when DEDUCT_DATA=true
182        // For now, return full content formatted
183        format!(
184            "📊 **Data Extraction from: {}**\n\n## Content (TODO: Add Filtering)\n{}\n\n*Data Type: {} | Format: {} • Execution: {}*",
185            url, 
186            content,
187            data_type, 
188            extraction_format,
189            execution_id
190        )
191    }
192
193    fn generate_execution_id(&self) -> String {
194        format!("scrape_{}", chrono::Utc::now().format("%Y%m%d_%H%M%S%.3f"))
195    }
196
197    /// Extract data with BrightData using only WEB_UNLOCKER_ZONE
198    async fn scrape_with_brightdata(
199        &self,
200        url: &str,
201        data_type: &str,
202        extraction_format: &str,
203        clean_content: bool,
204        schema: Option<Value>,
205        execution_id: &str,
206    ) -> Result<Value, BrightDataError> {
207        let api_token = env::var("BRIGHTDATA_API_TOKEN")
208            .or_else(|_| env::var("API_TOKEN"))
209            .map_err(|_| BrightDataError::ToolError("Missing BRIGHTDATA_API_TOKEN".into()))?;
210
211        let base_url = env::var("BRIGHTDATA_BASE_URL")
212            .unwrap_or_else(|_| "https://api.brightdata.com".to_string());
213
214        // Always use WEB_UNLOCKER_ZONE
215        let zone = env::var("WEB_UNLOCKER_ZONE").unwrap_or_else(|_| "web_unlocker".to_string());
216
217        info!("📊 Extracting from {} using WEB_UNLOCKER_ZONE: {} (execution: {})", 
218              url, zone, execution_id);
219
220        // Build payload with mandatory markdown format
221        let mut payload = json!({
222            "url": url,
223            "zone": zone,
224            "format": "json",
225            "data_format": "markdown"  // MANDATORY: Always use markdown format
226        });
227
228        // Add optional schema if provided
229        if let Some(schema_obj) = schema {
230            payload["extraction_schema"] = schema_obj;
231        }
232
233        let client = Client::builder()
234            .timeout(Duration::from_secs(120))
235            .build()
236            .map_err(|e| BrightDataError::ToolError(e.to_string()))?;
237
238        let response = client
239            .post(&format!("{}/request", base_url))
240            .header("Authorization", format!("Bearer {}", api_token))
241            .header("Content-Type", "application/json")
242            .json(&payload)
243            .send()
244            .await
245            .map_err(|e| BrightDataError::ToolError(format!("BrightData extraction request failed: {}", e)))?;
246
247        let status = response.status().as_u16();
248        let response_headers: HashMap<String, String> = response
249            .headers()
250            .iter()
251            .map(|(k, v)| (k.to_string(), v.to_str().unwrap_or("").to_string()))
252            .collect();
253
254        // Log BrightData request
255        if let Err(e) = JSON_LOGGER.log_brightdata_request(
256            execution_id,
257            &zone,
258            url,
259            payload.clone(),
260            status,
261            response_headers,
262            extraction_format
263        ).await {
264            log::warn!("Failed to log BrightData extraction request: {}", e);
265        }
266
267        if !response.status().is_success() {
268            let error_text = response.text().await.unwrap_or_default();
269            return Err(BrightDataError::ToolError(format!(
270                "BrightData extraction error {}: {}",
271                status, error_text
272            )));
273        }
274
275        let raw_content = response.text().await
276            .map_err(|e| BrightDataError::ToolError(e.to_string()))?;
277
278        // Print what came from BrightData
279        println!("################################################################################################################");
280        println!("BRIGHTDATA RAW RESPONSE FROM: {}", url);
281        println!("ZONE: {}", zone);
282        println!("EXECUTION: {}", execution_id);
283        println!("DATA TYPE: {}", data_type);
284        println!("EXTRACTION FORMAT: {}", extraction_format);
285        println!("CONTENT LENGTH: {} bytes", raw_content.len());
286        println!("################################################################################################################");
287        println!("{}", raw_content);
288        println!("################################################################################################################");
289        println!("END OF BRIGHTDATA RESPONSE");
290        println!("################################################################################################################");
291
292        // Apply filters only if DEDUCT_DATA=true
293        if self.is_data_reduction_enabled() {
294            if ResponseFilter::is_error_page(&raw_content) {
295                return Err(BrightDataError::ToolError("Extraction returned error page".into()));
296            } else if ResponseStrategy::should_try_next_source(&raw_content) {
297                return Err(BrightDataError::ToolError("Content quality too low".into()));
298            }
299        }
300
301        // Print what will be sent to Anthropic
302        println!("--------------------------------------------------------------------------");
303        println!("SENDING TO ANTHROPIC FROM EXTRACT TOOL:");
304        println!("URL: {}", url);
305        println!("DATA TYPE: {}", data_type);
306        println!("EXTRACTION FORMAT: {}", extraction_format);
307        println!("DATA REDUCTION ENABLED: {}", self.is_data_reduction_enabled());
308        println!("CONTENT LENGTH: {} bytes", raw_content.len());
309        println!("--------------------------------------------------------------------------");
310        println!("{}", raw_content);
311        println!("--------------------------------------------------------------------------");
312        println!("END OF CONTENT SENT TO ANTHROPIC");
313        println!("--------------------------------------------------------------------------");
314
315        // Return raw content directly without processing
316        Ok(json!({
317            "content": raw_content,
318            "metadata": {
319                "url": url,
320                "zone": zone,
321                "execution_id": execution_id,
322                "data_type": data_type,
323                "extraction_format": extraction_format,
324                "clean_content": clean_content,
325                "data_format": "markdown",
326                "data_reduction_enabled": self.is_data_reduction_enabled()
327            },
328            "success": true
329        }))
330    }
331}