// snm_brightdata_client/tools/extract.rs

// src/tools/extract.rs
use crate::tool::{Tool, ToolResult, McpContent};
use crate::error::BrightDataError;
use crate::extras::logger::JSON_LOGGER;
use crate::filters::{ResponseFilter, ResponseStrategy};
use async_trait::async_trait;
use reqwest::Client;
use serde_json::{json, Value};
use std::env;
use std::time::Duration;
use std::collections::HashMap;
use log::info;

/// Stateless tool that fetches a page through BrightData and returns its content.
///
/// The type carries no fields; all configuration is read from environment
/// variables at call time.
pub struct Extractor;

16#[async_trait]
17impl Tool for Extractor {
18    fn name(&self) -> &str {
19        "extract_data"
20    }
21
22    fn description(&self) -> &str {
23        "Extract structured data from a webpage using BrightData with WEB_UNLOCKER_ZONE"
24    }
25
26    fn input_schema(&self) -> Value {
27        json!({
28            "type": "object",
29            "properties": {
30                "url": {
31                    "type": "string",
32                    "description": "The URL to extract data from"
33                },
34                "data_type": {
35                    "type": "string",
36                    "enum": ["auto", "financial", "ecommerce", "social", "news", "general"],
37                    "default": "auto",
38                    "description": "Type of data to extract for optimized processing"
39                },
40                "extraction_format": {
41                    "type": "string",
42                    "enum": ["markdown", "json", "structured", "raw"],
43                    "default": "structured",
44                    "description": "Format for extracted data"
45                },
46                "clean_content": {
47                    "type": "boolean",
48                    "default": true,
49                    "description": "Remove navigation, ads, and boilerplate content"
50                },
51                "schema": {
52                    "type": "object",
53                    "description": "Optional schema to guide extraction",
54                    "additionalProperties": true
55                }
56            },
57            "required": ["url"]
58        })
59    }
60
61    async fn execute_internal(&self, parameters: Value) -> Result<ToolResult, BrightDataError> {
62        let url = parameters
63            .get("url")
64            .and_then(|v| v.as_str())
65            .ok_or_else(|| BrightDataError::ToolError("Missing 'url' parameter".into()))?;
66
67        let data_type = parameters
68            .get("data_type")
69            .and_then(|v| v.as_str())
70            .unwrap_or("auto");
71
72        let extraction_format = parameters
73            .get("extraction_format")
74            .and_then(|v| v.as_str())
75            .unwrap_or("structured");
76
77        let clean_content = parameters
78            .get("clean_content")
79            .and_then(|v| v.as_bool())
80            .unwrap_or(true);
81
82        let schema = parameters.get("schema").cloned();
83
84        let execution_id = self.generate_execution_id();
85        
86        match self.extract_with_brightdata(url, data_type, extraction_format, clean_content, schema, &execution_id).await {
87            Ok(result) => {
88                let content = result.get("content").and_then(|c| c.as_str()).unwrap_or("");
89                
90                // Create formatted response based on DEDUCT_DATA setting
91                let formatted_response = self.create_formatted_extract_response(
92                    url, data_type, extraction_format, content, &execution_id
93                );
94                
95                let tool_result = ToolResult::success_with_raw(
96                    vec![McpContent::text(formatted_response)], 
97                    result
98                );
99                
100                // Apply filtering only if DEDUCT_DATA=true
101                if self.is_data_reduction_enabled() {
102                    Ok(ResponseStrategy::apply_size_limits(tool_result))
103                } else {
104                    Ok(tool_result)
105                }
106            }
107            Err(_e) => {
108                // Return empty data for BrightData errors - Anthropic will retry
109                let empty_response = json!({
110                    "url": url,
111                    "data_type": data_type,
112                    "status": "no_data",
113                    "reason": "brightdata_error",
114                    "execution_id": execution_id
115                });
116                
117                Ok(ToolResult::success_with_raw(
118                    vec![McpContent::text("📊 **No Data Available**\n\nPlease try again with a different URL or check if the website is accessible.".to_string())],
119                    empty_response
120                ))
121            }
122        }
123    }
124}
125
126impl Extractor {
127    /// Check if data reduction is enabled via DEDUCT_DATA environment variable only
128    fn is_data_reduction_enabled(&self) -> bool {
129        std::env::var("DEDUCT_DATA")
130            .unwrap_or_else(|_| "false".to_string())
131            .to_lowercase() == "true"
132    }
133
134    /// Create formatted response with DEDUCT_DATA control
135    fn create_formatted_extract_response(
136        &self,
137        url: &str,
138        data_type: &str,
139        extraction_format: &str,
140        content: &str,
141        execution_id: &str
142    ) -> String {
143        // If DEDUCT_DATA=false, return full content with basic formatting
144        if !self.is_data_reduction_enabled() {
145            return format!(
146                "📊 **Data Extraction from: {}**\n\n## Full Content\n{}\n\n*Data Type: {} | Format: {} • Execution: {}*",
147                url, 
148                content,
149                data_type, 
150                extraction_format,
151                execution_id
152            );
153        }
154
155        // TODO: Add filtered data extraction logic when DEDUCT_DATA=true
156        // For now, return full content formatted
157        format!(
158            "📊 **Data Extraction from: {}**\n\n## Content (TODO: Add Filtering)\n{}\n\n*Data Type: {} | Format: {} • Execution: {}*",
159            url, 
160            content,
161            data_type, 
162            extraction_format,
163            execution_id
164        )
165    }
166
167    fn generate_execution_id(&self) -> String {
168        format!("extract_{}", chrono::Utc::now().format("%Y%m%d_%H%M%S%.3f"))
169    }
170
171    /// Extract data with BrightData using only WEB_UNLOCKER_ZONE
172    async fn extract_with_brightdata(
173        &self,
174        url: &str,
175        data_type: &str,
176        extraction_format: &str,
177        clean_content: bool,
178        schema: Option<Value>,
179        execution_id: &str,
180    ) -> Result<Value, BrightDataError> {
181        let api_token = env::var("BRIGHTDATA_API_TOKEN")
182            .or_else(|_| env::var("API_TOKEN"))
183            .map_err(|_| BrightDataError::ToolError("Missing BRIGHTDATA_API_TOKEN".into()))?;
184
185        let base_url = env::var("BRIGHTDATA_BASE_URL")
186            .unwrap_or_else(|_| "https://api.brightdata.com".to_string());
187
188        // Always use WEB_UNLOCKER_ZONE
189        let zone = env::var("WEB_UNLOCKER_ZONE").unwrap_or_else(|_| "web_unlocker".to_string());
190
191        info!("📊 Extracting from {} using WEB_UNLOCKER_ZONE: {} (execution: {})", 
192              url, zone, execution_id);
193
194        // Build payload with mandatory markdown format
195        let mut payload = json!({
196            "url": url,
197            "zone": zone,
198            "format": "json",
199            "data_format": "markdown"  // MANDATORY: Always use markdown format
200        });
201
202        // Add optional schema if provided
203        if let Some(schema_obj) = schema {
204            payload["extraction_schema"] = schema_obj;
205        }
206
207        let client = Client::builder()
208            .timeout(Duration::from_secs(120))
209            .build()
210            .map_err(|e| BrightDataError::ToolError(e.to_string()))?;
211
212        let response = client
213            .post(&format!("{}/request", base_url))
214            .header("Authorization", format!("Bearer {}", api_token))
215            .header("Content-Type", "application/json")
216            .json(&payload)
217            .send()
218            .await
219            .map_err(|e| BrightDataError::ToolError(format!("BrightData extraction request failed: {}", e)))?;
220
221        let status = response.status().as_u16();
222        let response_headers: HashMap<String, String> = response
223            .headers()
224            .iter()
225            .map(|(k, v)| (k.to_string(), v.to_str().unwrap_or("").to_string()))
226            .collect();
227
228        // Log BrightData request
229        if let Err(e) = JSON_LOGGER.log_brightdata_request(
230            execution_id,
231            &zone,
232            url,
233            payload.clone(),
234            status,
235            response_headers,
236            extraction_format
237        ).await {
238            log::warn!("Failed to log BrightData extraction request: {}", e);
239        }
240
241        if !response.status().is_success() {
242            let error_text = response.text().await.unwrap_or_default();
243            return Err(BrightDataError::ToolError(format!(
244                "BrightData extraction error {}: {}",
245                status, error_text
246            )));
247        }
248
249        let raw_content = response.text().await
250            .map_err(|e| BrightDataError::ToolError(e.to_string()))?;
251
252        // Print what came from BrightData
253        println!("################################################################################################################");
254        println!("BRIGHTDATA RAW RESPONSE FROM: {}", url);
255        println!("ZONE: {}", zone);
256        println!("EXECUTION: {}", execution_id);
257        println!("DATA TYPE: {}", data_type);
258        println!("EXTRACTION FORMAT: {}", extraction_format);
259        println!("CONTENT LENGTH: {} bytes", raw_content.len());
260        println!("################################################################################################################");
261        println!("{}", raw_content);
262        println!("################################################################################################################");
263        println!("END OF BRIGHTDATA RESPONSE");
264        println!("################################################################################################################");
265
266        // Apply filters only if DEDUCT_DATA=true
267        if self.is_data_reduction_enabled() {
268            if ResponseFilter::is_error_page(&raw_content) {
269                return Err(BrightDataError::ToolError("Extraction returned error page".into()));
270            } else if ResponseStrategy::should_try_next_source(&raw_content) {
271                return Err(BrightDataError::ToolError("Content quality too low".into()));
272            }
273        }
274
275        // Print what will be sent to Anthropic
276        println!("--------------------------------------------------------------------------");
277        println!("SENDING TO ANTHROPIC FROM EXTRACT TOOL:");
278        println!("URL: {}", url);
279        println!("DATA TYPE: {}", data_type);
280        println!("EXTRACTION FORMAT: {}", extraction_format);
281        println!("DATA REDUCTION ENABLED: {}", self.is_data_reduction_enabled());
282        println!("CONTENT LENGTH: {} bytes", raw_content.len());
283        println!("--------------------------------------------------------------------------");
284        println!("{}", raw_content);
285        println!("--------------------------------------------------------------------------");
286        println!("END OF CONTENT SENT TO ANTHROPIC");
287        println!("--------------------------------------------------------------------------");
288
289        // Return raw content directly without processing
290        Ok(json!({
291            "content": raw_content,
292            "metadata": {
293                "url": url,
294                "zone": zone,
295                "execution_id": execution_id,
296                "data_type": data_type,
297                "extraction_format": extraction_format,
298                "clean_content": clean_content,
299                "data_format": "markdown",
300                "data_reduction_enabled": self.is_data_reduction_enabled()
301            },
302            "success": true
303        }))
304    }
305}