// snm_brightdata_client/tools/extract.rs
use crate::tool::{Tool, ToolResult, McpContent};
use crate::error::BrightDataError;
use crate::extras::logger::JSON_LOGGER;
use crate::filters::{ResponseFilter, ResponseStrategy};
use async_trait::async_trait;
use reqwest::Client;
use serde_json::{json, Value};
use std::env;
use std::time::Duration;
use std::collections::HashMap;
use log::info;
13
/// Stateless tool that fetches a web page through BrightData and returns its
/// content as an extraction result.
///
/// The type carries no data; all configuration is read from environment
/// variables at call time.
#[derive(Debug, Clone, Copy, Default)]
pub struct Extractor;
15
16#[async_trait]
17impl Tool for Extractor {
18 fn name(&self) -> &str {
19 "extract_data"
20 }
21
22 fn description(&self) -> &str {
23 "Extract structured data from a webpage using BrightData with WEB_UNLOCKER_ZONE"
24 }
25
26 fn input_schema(&self) -> Value {
27 json!({
28 "type": "object",
29 "properties": {
30 "url": {
31 "type": "string",
32 "description": "The URL to extract data from"
33 },
34 "data_type": {
35 "type": "string",
36 "enum": ["auto", "financial", "ecommerce", "social", "news", "general"],
37 "default": "auto",
38 "description": "Type of data to extract for optimized processing"
39 },
40 "extraction_format": {
41 "type": "string",
42 "enum": ["markdown", "json", "structured", "raw"],
43 "default": "structured",
44 "description": "Format for extracted data"
45 },
46 "clean_content": {
47 "type": "boolean",
48 "default": true,
49 "description": "Remove navigation, ads, and boilerplate content"
50 },
51 "schema": {
52 "type": "object",
53 "description": "Optional schema to guide extraction",
54 "additionalProperties": true
55 }
56 },
57 "required": ["url"]
58 })
59 }
60
61 async fn execute_internal(&self, parameters: Value) -> Result<ToolResult, BrightDataError> {
62 let url = parameters
63 .get("url")
64 .and_then(|v| v.as_str())
65 .ok_or_else(|| BrightDataError::ToolError("Missing 'url' parameter".into()))?;
66
67 let data_type = parameters
68 .get("data_type")
69 .and_then(|v| v.as_str())
70 .unwrap_or("auto");
71
72 let extraction_format = parameters
73 .get("extraction_format")
74 .and_then(|v| v.as_str())
75 .unwrap_or("structured");
76
77 let clean_content = parameters
78 .get("clean_content")
79 .and_then(|v| v.as_bool())
80 .unwrap_or(true);
81
82 let schema = parameters.get("schema").cloned();
83
84 let execution_id = self.generate_execution_id();
85
86 match self.extract_with_brightdata(url, data_type, extraction_format, clean_content, schema, &execution_id).await {
87 Ok(result) => {
88 let content = result.get("content").and_then(|c| c.as_str()).unwrap_or("");
89
90 let formatted_response = self.create_formatted_extract_response(
92 url, data_type, extraction_format, content, &execution_id
93 );
94
95 let tool_result = ToolResult::success_with_raw(
96 vec![McpContent::text(formatted_response)],
97 result
98 );
99
100 if self.is_data_reduction_enabled() {
102 Ok(ResponseStrategy::apply_size_limits(tool_result))
103 } else {
104 Ok(tool_result)
105 }
106 }
107 Err(_e) => {
108 let empty_response = json!({
110 "url": url,
111 "data_type": data_type,
112 "status": "no_data",
113 "reason": "brightdata_error",
114 "execution_id": execution_id
115 });
116
117 Ok(ToolResult::success_with_raw(
118 vec![McpContent::text("📊 **No Data Available**\n\nPlease try again with a different URL or check if the website is accessible.".to_string())],
119 empty_response
120 ))
121 }
122 }
123 }
124}
125
126impl Extractor {
127 fn is_data_reduction_enabled(&self) -> bool {
129 std::env::var("DEDUCT_DATA")
130 .unwrap_or_else(|_| "false".to_string())
131 .to_lowercase() == "true"
132 }
133
134 fn create_formatted_extract_response(
136 &self,
137 url: &str,
138 data_type: &str,
139 extraction_format: &str,
140 content: &str,
141 execution_id: &str
142 ) -> String {
143 if !self.is_data_reduction_enabled() {
145 return format!(
146 "📊 **Data Extraction from: {}**\n\n## Full Content\n{}\n\n*Data Type: {} | Format: {} • Execution: {}*",
147 url,
148 content,
149 data_type,
150 extraction_format,
151 execution_id
152 );
153 }
154
155 format!(
158 "📊 **Data Extraction from: {}**\n\n## Content (TODO: Add Filtering)\n{}\n\n*Data Type: {} | Format: {} • Execution: {}*",
159 url,
160 content,
161 data_type,
162 extraction_format,
163 execution_id
164 )
165 }
166
167 fn generate_execution_id(&self) -> String {
168 format!("extract_{}", chrono::Utc::now().format("%Y%m%d_%H%M%S%.3f"))
169 }
170
171 async fn extract_with_brightdata(
173 &self,
174 url: &str,
175 data_type: &str,
176 extraction_format: &str,
177 clean_content: bool,
178 schema: Option<Value>,
179 execution_id: &str,
180 ) -> Result<Value, BrightDataError> {
181 let api_token = env::var("BRIGHTDATA_API_TOKEN")
182 .or_else(|_| env::var("API_TOKEN"))
183 .map_err(|_| BrightDataError::ToolError("Missing BRIGHTDATA_API_TOKEN".into()))?;
184
185 let base_url = env::var("BRIGHTDATA_BASE_URL")
186 .unwrap_or_else(|_| "https://api.brightdata.com".to_string());
187
188 let zone = env::var("WEB_UNLOCKER_ZONE").unwrap_or_else(|_| "web_unlocker".to_string());
190
191 info!("📊 Extracting from {} using WEB_UNLOCKER_ZONE: {} (execution: {})",
192 url, zone, execution_id);
193
194 let mut payload = json!({
196 "url": url,
197 "zone": zone,
198 "format": "json",
199 "data_format": "markdown" });
201
202 if let Some(schema_obj) = schema {
204 payload["extraction_schema"] = schema_obj;
205 }
206
207 let client = Client::builder()
208 .timeout(Duration::from_secs(120))
209 .build()
210 .map_err(|e| BrightDataError::ToolError(e.to_string()))?;
211
212 let response = client
213 .post(&format!("{}/request", base_url))
214 .header("Authorization", format!("Bearer {}", api_token))
215 .header("Content-Type", "application/json")
216 .json(&payload)
217 .send()
218 .await
219 .map_err(|e| BrightDataError::ToolError(format!("BrightData extraction request failed: {}", e)))?;
220
221 let status = response.status().as_u16();
222 let response_headers: HashMap<String, String> = response
223 .headers()
224 .iter()
225 .map(|(k, v)| (k.to_string(), v.to_str().unwrap_or("").to_string()))
226 .collect();
227
228 if let Err(e) = JSON_LOGGER.log_brightdata_request(
230 execution_id,
231 &zone,
232 url,
233 payload.clone(),
234 status,
235 response_headers,
236 extraction_format
237 ).await {
238 log::warn!("Failed to log BrightData extraction request: {}", e);
239 }
240
241 if !response.status().is_success() {
242 let error_text = response.text().await.unwrap_or_default();
243 return Err(BrightDataError::ToolError(format!(
244 "BrightData extraction error {}: {}",
245 status, error_text
246 )));
247 }
248
249 let raw_content = response.text().await
250 .map_err(|e| BrightDataError::ToolError(e.to_string()))?;
251
252 println!("################################################################################################################");
254 println!("BRIGHTDATA RAW RESPONSE FROM: {}", url);
255 println!("ZONE: {}", zone);
256 println!("EXECUTION: {}", execution_id);
257 println!("DATA TYPE: {}", data_type);
258 println!("EXTRACTION FORMAT: {}", extraction_format);
259 println!("CONTENT LENGTH: {} bytes", raw_content.len());
260 println!("################################################################################################################");
261 println!("{}", raw_content);
262 println!("################################################################################################################");
263 println!("END OF BRIGHTDATA RESPONSE");
264 println!("################################################################################################################");
265
266 if self.is_data_reduction_enabled() {
268 if ResponseFilter::is_error_page(&raw_content) {
269 return Err(BrightDataError::ToolError("Extraction returned error page".into()));
270 } else if ResponseStrategy::should_try_next_source(&raw_content) {
271 return Err(BrightDataError::ToolError("Content quality too low".into()));
272 }
273 }
274
275 println!("--------------------------------------------------------------------------");
277 println!("SENDING TO ANTHROPIC FROM EXTRACT TOOL:");
278 println!("URL: {}", url);
279 println!("DATA TYPE: {}", data_type);
280 println!("EXTRACTION FORMAT: {}", extraction_format);
281 println!("DATA REDUCTION ENABLED: {}", self.is_data_reduction_enabled());
282 println!("CONTENT LENGTH: {} bytes", raw_content.len());
283 println!("--------------------------------------------------------------------------");
284 println!("{}", raw_content);
285 println!("--------------------------------------------------------------------------");
286 println!("END OF CONTENT SENT TO ANTHROPIC");
287 println!("--------------------------------------------------------------------------");
288
289 Ok(json!({
291 "content": raw_content,
292 "metadata": {
293 "url": url,
294 "zone": zone,
295 "execution_id": execution_id,
296 "data_type": data_type,
297 "extraction_format": extraction_format,
298 "clean_content": clean_content,
299 "data_format": "markdown",
300 "data_reduction_enabled": self.is_data_reduction_enabled()
301 },
302 "success": true
303 }))
304 }
305}