snm_brightdata_client/tools/
scrape.rs

1// src/tools/scrape.rs
2use crate::tool::{Tool, ToolResult, McpContent};
3use crate::error::BrightDataError;
4use async_trait::async_trait;
5use serde_json::{Value, json};
6use reqwest::Client;
7use std::time::Duration;
8
/// MCP tool that scrapes a single web page through the BrightData Web Unlocker API.
///
/// The type is a stateless unit struct: everything it needs at run time
/// (API token, base URL, zone) is read from environment variables when a
/// scrape is performed, so values are cheap to create, copy and share.
#[derive(Debug, Clone, Copy, Default)]
pub struct ScrapeMarkdown;
10
11#[async_trait]
12impl Tool for ScrapeMarkdown {
13    fn name(&self) -> &str {
14        "scrape_website"
15    }
16
17    fn description(&self) -> &str {
18        "Scrape a webpage using BrightData Web Unlocker"
19    }
20
21    fn input_schema(&self) -> Value {
22        json!({
23            "type": "object",
24            "properties": {
25                "url": {
26                    "type": "string",
27                    "description": "The URL to scrape"
28                },
29                "format": {
30                    "type": "string",
31                    "enum": ["raw", "markdown"],
32                    "description": "Output format",
33                    "default": "raw"
34                }
35            },
36            "required": ["url"]
37        })
38    }
39
40    async fn execute(&self, parameters: Value) -> Result<ToolResult, BrightDataError> {
41        let url = parameters
42            .get("url")
43            .and_then(|v| v.as_str())
44            .ok_or_else(|| BrightDataError::ToolError("Missing 'url' parameter".into()))?;
45
46        let format = parameters
47            .get("format")
48            .and_then(|v| v.as_str())
49            .unwrap_or("raw");
50
51        let result = self.scrape_with_brightdata(url, format).await?;
52        
53        let content_text = result.get("content").and_then(|c| c.as_str()).unwrap_or("No content");
54        let mcp_content = vec![McpContent::text(format!(
55            "🌐 **Scraped from {}**\n\n{}",
56            url,
57            content_text
58        ))];
59
60        Ok(ToolResult::success_with_raw(mcp_content, result))
61    }
62}
63
64impl ScrapeMarkdown {
65    async fn scrape_with_brightdata(&self, url: &str, format: &str) -> Result<Value, BrightDataError> {
66        let api_token = std::env::var("BRIGHTDATA_API_TOKEN")
67            .or_else(|_| std::env::var("API_TOKEN"))
68            .map_err(|_| BrightDataError::ToolError("Missing BRIGHTDATA_API_TOKEN".into()))?;
69
70        let base_url = std::env::var("BRIGHTDATA_BASE_URL")
71            .unwrap_or_else(|_| "https://api.brightdata.com".to_string());
72
73        let zone = std::env::var("WEB_UNLOCKER_ZONE")
74            .unwrap_or_else(|_| "default".to_string());
75
76        // Valid BrightData parameters only
77        let mut payload = json!({
78            "url": url,
79            "zone": zone,
80            "format": "raw"  // Always use "raw" format
81        });
82
83        // Add markdown conversion if requested
84        if format == "markdown" {
85            payload["data_format"] = json!("markdown");
86        }
87
88        let client = Client::builder()
89            .timeout(Duration::from_secs(120))
90            .build()
91            .map_err(|e| BrightDataError::ToolError(e.to_string()))?;
92
93        let response = client
94            .post(&format!("{}/request", base_url))
95            .header("Authorization", format!("Bearer {}", api_token))
96            .header("Content-Type", "application/json")
97            .json(&payload)
98            .send()
99            .await
100            .map_err(|e| BrightDataError::ToolError(format!("Request failed: {}", e)))?;
101
102        let status = response.status();
103        if !status.is_success() {
104            let error_text = response.text().await.unwrap_or_default();
105            return Err(BrightDataError::ToolError(format!(
106                "BrightData API error {}: {}",
107                status, error_text
108            )));
109        }
110
111        let content = response.text().await
112            .map_err(|e| BrightDataError::ToolError(e.to_string()))?;
113
114        Ok(json!({
115            "content": content,
116            "url": url,
117            "format": format,
118            "success": true
119        }))
120    }
121}