mcp_tools/servers/
web_tools.rs

1//! Web Tools MCP Server
2//!
3//! Provides web scraping and HTTP request capabilities via MCP protocol including:
4//! - HTTP GET/POST/PUT/DELETE requests
5//! - Web page content extraction
6//! - HTML parsing and element selection
7//! - JSON API interactions
8//! - URL validation and analysis
9
10use async_trait::async_trait;
11use reqwest::{Client, Method, Response};
12use serde::{Deserialize, Serialize};
13use std::collections::HashMap;
14use std::time::Duration;
15use tracing::{debug, info, warn};
16use url::Url;
17
18use crate::common::{
19    BaseServer, McpContent, McpServerBase, McpTool, McpToolRequest, McpToolResponse,
20    ServerCapabilities, ServerConfig,
21};
22use crate::{McpToolsError, Result};
23
24/// Web Tools MCP Server
25pub struct WebToolsServer {
26    base: BaseServer,
27    client: Client,
28}
29
30/// HTTP request configuration
31#[derive(Debug, Clone, Serialize, Deserialize)]
32pub struct HttpRequest {
33    pub url: String,
34    pub method: String,
35    pub headers: HashMap<String, String>,
36    pub body: Option<String>,
37    pub timeout: Option<u64>,
38    pub follow_redirects: bool,
39}
40
41/// HTTP response data
42#[derive(Debug, Clone, Serialize, Deserialize)]
43pub struct HttpResponse {
44    pub status: u16,
45    pub status_text: String,
46    pub headers: HashMap<String, String>,
47    pub body: String,
48    pub url: String,
49    pub content_type: Option<String>,
50    pub content_length: Option<u64>,
51}
52
53/// Web page analysis result
54#[derive(Debug, Clone, Serialize, Deserialize)]
55pub struct WebPageAnalysis {
56    pub url: String,
57    pub title: Option<String>,
58    pub description: Option<String>,
59    pub keywords: Vec<String>,
60    pub links: Vec<String>,
61    pub images: Vec<String>,
62    pub forms: Vec<FormInfo>,
63    pub meta_tags: HashMap<String, String>,
64    pub word_count: u32,
65    pub load_time: u64,
66}
67
68/// Form information
69#[derive(Debug, Clone, Serialize, Deserialize)]
70pub struct FormInfo {
71    pub action: Option<String>,
72    pub method: String,
73    pub fields: Vec<FormField>,
74}
75
76/// Form field information
77#[derive(Debug, Clone, Serialize, Deserialize)]
78pub struct FormField {
79    pub name: Option<String>,
80    pub field_type: String,
81    pub required: bool,
82    pub placeholder: Option<String>,
83}
84
85/// URL analysis result
86#[derive(Debug, Clone, Serialize, Deserialize)]
87pub struct UrlAnalysis {
88    pub url: String,
89    pub is_valid: bool,
90    pub scheme: Option<String>,
91    pub host: Option<String>,
92    pub port: Option<u16>,
93    pub path: String,
94    pub query: Option<String>,
95    pub fragment: Option<String>,
96    pub domain_info: DomainInfo,
97}
98
99/// Domain information
100#[derive(Debug, Clone, Serialize, Deserialize)]
101pub struct DomainInfo {
102    pub domain: String,
103    pub subdomain: Option<String>,
104    pub tld: Option<String>,
105    pub is_ip: bool,
106}
107
108impl WebToolsServer {
109    pub async fn new(config: ServerConfig) -> Result<Self> {
110        let base = BaseServer::new(config).await?;
111
112        // Create HTTP client with reasonable defaults
113        let client = Client::builder()
114            .timeout(Duration::from_secs(30))
115            .user_agent("MCP-Tools/1.0")
116            .build()
117            .map_err(|e| McpToolsError::Server(format!("Failed to create HTTP client: {}", e)))?;
118
119        Ok(Self { base, client })
120    }
121
122    /// Perform HTTP request
123    async fn http_request(&self, request: HttpRequest) -> Result<HttpResponse> {
124        debug!("Making HTTP request to: {}", request.url);
125
126        // Validate URL
127        let url = Url::parse(&request.url)
128            .map_err(|e| McpToolsError::Server(format!("Invalid URL: {}", e)))?;
129
130        // Parse method
131        let method = match request.method.to_uppercase().as_str() {
132            "GET" => Method::GET,
133            "POST" => Method::POST,
134            "PUT" => Method::PUT,
135            "DELETE" => Method::DELETE,
136            "HEAD" => Method::HEAD,
137            "PATCH" => Method::PATCH,
138            _ => {
139                return Err(McpToolsError::Server(format!(
140                    "Unsupported HTTP method: {}",
141                    request.method
142                )))
143            }
144        };
145
146        // Build request
147        let mut req_builder = self.client.request(method, url);
148
149        // Add headers
150        for (key, value) in request.headers {
151            req_builder = req_builder.header(&key, &value);
152        }
153
154        // Add body if provided
155        if let Some(body) = request.body {
156            req_builder = req_builder.body(body);
157        }
158
159        // Set timeout if provided
160        if let Some(timeout_secs) = request.timeout {
161            req_builder = req_builder.timeout(Duration::from_secs(timeout_secs));
162        }
163
164        // Execute request
165        let start_time = std::time::Instant::now();
166        let response = req_builder
167            .send()
168            .await
169            .map_err(|e| McpToolsError::Server(format!("HTTP request failed: {}", e)))?;
170
171        // Extract response data
172        let status = response.status().as_u16();
173        let status_text = response
174            .status()
175            .canonical_reason()
176            .unwrap_or("Unknown")
177            .to_string();
178        let final_url = response.url().to_string();
179
180        // Extract headers
181        let mut headers = HashMap::new();
182        for (key, value) in response.headers() {
183            if let Ok(value_str) = value.to_str() {
184                headers.insert(key.to_string(), value_str.to_string());
185            }
186        }
187
188        let content_type = response
189            .headers()
190            .get("content-type")
191            .and_then(|v| v.to_str().ok())
192            .map(|s| s.to_string());
193
194        let content_length = response.content_length();
195
196        // Get response body
197        let body = response
198            .text()
199            .await
200            .map_err(|e| McpToolsError::Server(format!("Failed to read response body: {}", e)))?;
201
202        Ok(HttpResponse {
203            status,
204            status_text,
205            headers,
206            body,
207            url: final_url,
208            content_type,
209            content_length,
210        })
211    }
212
213    /// Analyze web page content
214    async fn analyze_webpage(&self, url: &str) -> Result<WebPageAnalysis> {
215        debug!("Analyzing webpage: {}", url);
216
217        let request = HttpRequest {
218            url: url.to_string(),
219            method: "GET".to_string(),
220            headers: HashMap::new(),
221            body: None,
222            timeout: Some(30),
223            follow_redirects: true,
224        };
225
226        let start_time = std::time::Instant::now();
227        let response = self.http_request(request).await?;
228        let load_time = start_time.elapsed().as_millis() as u64;
229
230        // Basic HTML parsing (simplified - would use a proper HTML parser in production)
231        let html = &response.body;
232
233        // Extract title
234        let title = self.extract_html_tag(html, "title");
235
236        // Extract meta description
237        let description = self.extract_meta_content(html, "description");
238
239        // Extract meta keywords
240        let keywords_str = self
241            .extract_meta_content(html, "keywords")
242            .unwrap_or_default();
243        let keywords: Vec<String> = keywords_str
244            .split(',')
245            .map(|s| s.trim().to_string())
246            .filter(|s| !s.is_empty())
247            .collect();
248
249        // Extract links (simplified)
250        let links = self.extract_links(html);
251
252        // Extract images (simplified)
253        let images = self.extract_images(html);
254
255        // Extract forms (simplified)
256        let forms = self.extract_forms(html);
257
258        // Extract meta tags
259        let meta_tags = self.extract_meta_tags(html);
260
261        // Count words (simplified)
262        let word_count = html
263            .split_whitespace()
264            .filter(|word| !word.starts_with('<'))
265            .count() as u32;
266
267        Ok(WebPageAnalysis {
268            url: response.url,
269            title,
270            description,
271            keywords,
272            links,
273            images,
274            forms,
275            meta_tags,
276            word_count,
277            load_time,
278        })
279    }
280
281    /// Analyze URL structure
282    async fn analyze_url(&self, url_str: &str) -> Result<UrlAnalysis> {
283        debug!("Analyzing URL: {}", url_str);
284
285        match Url::parse(url_str) {
286            Ok(url) => {
287                let domain = url.host_str().unwrap_or("").to_string();
288                let domain_parts: Vec<&str> = domain.split('.').collect();
289
290                let (subdomain, tld) = if domain_parts.len() > 2 {
291                    (
292                        Some(domain_parts[0].to_string()),
293                        Some(domain_parts.last().unwrap().to_string()),
294                    )
295                } else {
296                    (None, domain_parts.last().map(|s| s.to_string()))
297                };
298
299                let is_ip = domain.parse::<std::net::IpAddr>().is_ok();
300
301                Ok(UrlAnalysis {
302                    url: url_str.to_string(),
303                    is_valid: true,
304                    scheme: Some(url.scheme().to_string()),
305                    host: url.host_str().map(|s| s.to_string()),
306                    port: url.port(),
307                    path: url.path().to_string(),
308                    query: url.query().map(|s| s.to_string()),
309                    fragment: url.fragment().map(|s| s.to_string()),
310                    domain_info: DomainInfo {
311                        domain,
312                        subdomain,
313                        tld,
314                        is_ip,
315                    },
316                })
317            }
318            Err(_) => Ok(UrlAnalysis {
319                url: url_str.to_string(),
320                is_valid: false,
321                scheme: None,
322                host: None,
323                port: None,
324                path: String::new(),
325                query: None,
326                fragment: None,
327                domain_info: DomainInfo {
328                    domain: String::new(),
329                    subdomain: None,
330                    tld: None,
331                    is_ip: false,
332                },
333            }),
334        }
335    }
336
337    // Helper methods for HTML parsing (simplified implementations)
338    fn extract_html_tag(&self, html: &str, tag: &str) -> Option<String> {
339        let start_tag = format!("<{}>", tag);
340        let end_tag = format!("</{}>", tag);
341
342        if let Some(start) = html.find(&start_tag) {
343            if let Some(end) = html[start..].find(&end_tag) {
344                let content = &html[start + start_tag.len()..start + end];
345                return Some(content.trim().to_string());
346            }
347        }
348        None
349    }
350
351    fn extract_meta_content(&self, html: &str, name: &str) -> Option<String> {
352        let pattern = format!(r#"<meta[^>]*name="{}"[^>]*content="([^"]*)"#, name);
353        // Simplified regex-like extraction (would use proper regex in production)
354        if let Some(start) = html.find(&format!(r#"name="{}""#, name)) {
355            if let Some(content_start) = html[start..].find(r#"content=""#) {
356                let content_pos = start + content_start + 9; // length of 'content="'
357                if let Some(content_end) = html[content_pos..].find('"') {
358                    return Some(html[content_pos..content_pos + content_end].to_string());
359                }
360            }
361        }
362        None
363    }
364
365    fn extract_links(&self, html: &str) -> Vec<String> {
366        let mut links = Vec::new();
367        let mut pos = 0;
368
369        while let Some(href_pos) = html[pos..].find("href=\"") {
370            let start = pos + href_pos + 6; // length of 'href="'
371            if let Some(end_pos) = html[start..].find('"') {
372                let link = html[start..start + end_pos].to_string();
373                if !link.is_empty() && !link.starts_with('#') {
374                    links.push(link);
375                }
376                pos = start + end_pos;
377            } else {
378                break;
379            }
380        }
381
382        links
383    }
384
385    fn extract_images(&self, html: &str) -> Vec<String> {
386        let mut images = Vec::new();
387        let mut pos = 0;
388
389        while let Some(src_pos) = html[pos..].find("src=\"") {
390            let start = pos + src_pos + 5; // length of 'src="'
391            if let Some(end_pos) = html[start..].find('"') {
392                let image = html[start..start + end_pos].to_string();
393                if !image.is_empty() {
394                    images.push(image);
395                }
396                pos = start + end_pos;
397            } else {
398                break;
399            }
400        }
401
402        images
403    }
404
405    fn extract_forms(&self, _html: &str) -> Vec<FormInfo> {
406        // Simplified implementation - would need proper HTML parsing
407        Vec::new()
408    }
409
410    fn extract_meta_tags(&self, _html: &str) -> HashMap<String, String> {
411        // Simplified implementation - would need proper HTML parsing
412        HashMap::new()
413    }
414}
415
416#[async_trait]
417impl McpServerBase for WebToolsServer {
418    async fn get_capabilities(&self) -> Result<ServerCapabilities> {
419        let mut capabilities = self.base.get_capabilities().await?;
420
421        // Add Web Tools-specific tools
422        let web_tools = vec![
423            McpTool {
424                name: "http_request".to_string(),
425                description: "Make HTTP requests (GET, POST, PUT, DELETE) to web endpoints"
426                    .to_string(),
427                input_schema: serde_json::json!({
428                    "type": "object",
429                    "properties": {
430                        "url": {
431                            "type": "string",
432                            "description": "Target URL for the HTTP request"
433                        },
434                        "method": {
435                            "type": "string",
436                            "description": "HTTP method (GET, POST, PUT, DELETE, HEAD, PATCH)",
437                            "enum": ["GET", "POST", "PUT", "DELETE", "HEAD", "PATCH"],
438                            "default": "GET"
439                        },
440                        "headers": {
441                            "type": "object",
442                            "description": "HTTP headers as key-value pairs",
443                            "additionalProperties": {"type": "string"}
444                        },
445                        "body": {
446                            "type": "string",
447                            "description": "Request body (for POST, PUT, PATCH methods)"
448                        },
449                        "timeout": {
450                            "type": "integer",
451                            "description": "Request timeout in seconds (default: 30)",
452                            "minimum": 1,
453                            "maximum": 300
454                        },
455                        "follow_redirects": {
456                            "type": "boolean",
457                            "description": "Whether to follow HTTP redirects (default: true)"
458                        }
459                    },
460                    "required": ["url"]
461                }),
462                category: "web".to_string(),
463                requires_permission: true,
464                permissions: vec!["network.http".to_string()],
465            },
466            McpTool {
467                name: "analyze_webpage".to_string(),
468                description:
469                    "Analyze a web page and extract metadata, links, images, and other information"
470                        .to_string(),
471                input_schema: serde_json::json!({
472                    "type": "object",
473                    "properties": {
474                        "url": {
475                            "type": "string",
476                            "description": "URL of the web page to analyze"
477                        }
478                    },
479                    "required": ["url"]
480                }),
481                category: "web".to_string(),
482                requires_permission: true,
483                permissions: vec!["network.http".to_string()],
484            },
485            McpTool {
486                name: "analyze_url".to_string(),
487                description:
488                    "Analyze URL structure and extract components (scheme, host, path, query, etc.)"
489                        .to_string(),
490                input_schema: serde_json::json!({
491                    "type": "object",
492                    "properties": {
493                        "url": {
494                            "type": "string",
495                            "description": "URL to analyze"
496                        }
497                    },
498                    "required": ["url"]
499                }),
500                category: "web".to_string(),
501                requires_permission: false,
502                permissions: vec![],
503            },
504            McpTool {
505                name: "fetch_content".to_string(),
506                description: "Fetch content from a URL with automatic content type detection"
507                    .to_string(),
508                input_schema: serde_json::json!({
509                    "type": "object",
510                    "properties": {
511                        "url": {
512                            "type": "string",
513                            "description": "URL to fetch content from"
514                        },
515                        "headers": {
516                            "type": "object",
517                            "description": "Additional HTTP headers",
518                            "additionalProperties": {"type": "string"}
519                        },
520                        "timeout": {
521                            "type": "integer",
522                            "description": "Request timeout in seconds (default: 30)"
523                        }
524                    },
525                    "required": ["url"]
526                }),
527                category: "web".to_string(),
528                requires_permission: true,
529                permissions: vec!["network.http".to_string()],
530            },
531        ];
532
533        capabilities.tools = web_tools;
534        Ok(capabilities)
535    }
536
537    async fn handle_tool_request(&self, request: McpToolRequest) -> Result<McpToolResponse> {
538        info!("Handling Web Tools request: {}", request.tool);
539
540        match request.tool.as_str() {
541            "http_request" => {
542                debug!("Making HTTP request");
543
544                let url = request
545                    .arguments
546                    .get("url")
547                    .and_then(|v| v.as_str())
548                    .ok_or_else(|| McpToolsError::Server("Missing 'url' parameter".to_string()))?;
549
550                let method = request
551                    .arguments
552                    .get("method")
553                    .and_then(|v| v.as_str())
554                    .unwrap_or("GET");
555
556                let headers = request
557                    .arguments
558                    .get("headers")
559                    .and_then(|v| v.as_object())
560                    .map(|obj| {
561                        obj.iter()
562                            .filter_map(|(k, v)| v.as_str().map(|s| (k.clone(), s.to_string())))
563                            .collect()
564                    })
565                    .unwrap_or_default();
566
567                let body = request
568                    .arguments
569                    .get("body")
570                    .and_then(|v| v.as_str())
571                    .map(|s| s.to_string());
572
573                let timeout = request.arguments.get("timeout").and_then(|v| v.as_u64());
574
575                let follow_redirects = request
576                    .arguments
577                    .get("follow_redirects")
578                    .and_then(|v| v.as_bool())
579                    .unwrap_or(true);
580
581                let http_request = HttpRequest {
582                    url: url.to_string(),
583                    method: method.to_string(),
584                    headers,
585                    body,
586                    timeout,
587                    follow_redirects,
588                };
589
590                let response = self.http_request(http_request).await?;
591
592                let content_text = format!(
593                    "HTTP Request Complete\n\
594                    Status: {} {}\n\
595                    URL: {}\n\
596                    Content-Type: {}\n\
597                    Content-Length: {} bytes",
598                    response.status,
599                    response.status_text,
600                    response.url,
601                    response.content_type.as_deref().unwrap_or("unknown"),
602                    response
603                        .content_length
604                        .unwrap_or(response.body.len() as u64)
605                );
606
607                let mut metadata = HashMap::new();
608                metadata.insert("http_response".to_string(), serde_json::to_value(response)?);
609
610                Ok(McpToolResponse {
611                    id: request.id,
612                    content: vec![McpContent::text(content_text)],
613                    is_error: false,
614                    error: None,
615                    metadata,
616                })
617            }
618            "analyze_webpage" => {
619                debug!("Analyzing webpage");
620
621                let url = request
622                    .arguments
623                    .get("url")
624                    .and_then(|v| v.as_str())
625                    .ok_or_else(|| McpToolsError::Server("Missing 'url' parameter".to_string()))?;
626
627                let analysis = self.analyze_webpage(url).await?;
628
629                let content_text = format!(
630                    "Web Page Analysis Complete\n\
631                    URL: {}\n\
632                    Title: {}\n\
633                    Description: {}\n\
634                    Links Found: {}\n\
635                    Images Found: {}\n\
636                    Word Count: {}\n\
637                    Load Time: {}ms",
638                    analysis.url,
639                    analysis.title.as_deref().unwrap_or("None"),
640                    analysis.description.as_deref().unwrap_or("None"),
641                    analysis.links.len(),
642                    analysis.images.len(),
643                    analysis.word_count,
644                    analysis.load_time
645                );
646
647                let mut metadata = HashMap::new();
648                metadata.insert(
649                    "webpage_analysis".to_string(),
650                    serde_json::to_value(analysis)?,
651                );
652
653                Ok(McpToolResponse {
654                    id: request.id,
655                    content: vec![McpContent::text(content_text)],
656                    is_error: false,
657                    error: None,
658                    metadata,
659                })
660            }
661            "analyze_url" => {
662                debug!("Analyzing URL structure");
663
664                let url = request
665                    .arguments
666                    .get("url")
667                    .and_then(|v| v.as_str())
668                    .ok_or_else(|| McpToolsError::Server("Missing 'url' parameter".to_string()))?;
669
670                let analysis = self.analyze_url(url).await?;
671
672                let content_text = format!(
673                    "URL Analysis Complete\n\
674                    URL: {}\n\
675                    Valid: {}\n\
676                    Scheme: {}\n\
677                    Host: {}\n\
678                    Port: {}\n\
679                    Path: {}\n\
680                    Domain: {}",
681                    analysis.url,
682                    analysis.is_valid,
683                    analysis.scheme.as_deref().unwrap_or("None"),
684                    analysis.host.as_deref().unwrap_or("None"),
685                    analysis
686                        .port
687                        .map(|p| p.to_string())
688                        .as_deref()
689                        .unwrap_or("None"),
690                    analysis.path,
691                    analysis.domain_info.domain
692                );
693
694                let mut metadata = HashMap::new();
695                metadata.insert("url_analysis".to_string(), serde_json::to_value(analysis)?);
696
697                Ok(McpToolResponse {
698                    id: request.id,
699                    content: vec![McpContent::text(content_text)],
700                    is_error: false,
701                    error: None,
702                    metadata,
703                })
704            }
705            "fetch_content" => {
706                debug!("Fetching content from URL");
707
708                let url = request
709                    .arguments
710                    .get("url")
711                    .and_then(|v| v.as_str())
712                    .ok_or_else(|| McpToolsError::Server("Missing 'url' parameter".to_string()))?;
713
714                let headers = request
715                    .arguments
716                    .get("headers")
717                    .and_then(|v| v.as_object())
718                    .map(|obj| {
719                        obj.iter()
720                            .filter_map(|(k, v)| v.as_str().map(|s| (k.clone(), s.to_string())))
721                            .collect()
722                    })
723                    .unwrap_or_default();
724
725                let timeout = request.arguments.get("timeout").and_then(|v| v.as_u64());
726
727                let http_request = HttpRequest {
728                    url: url.to_string(),
729                    method: "GET".to_string(),
730                    headers,
731                    body: None,
732                    timeout,
733                    follow_redirects: true,
734                };
735
736                let response = self.http_request(http_request).await?;
737
738                let content_text = format!(
739                    "Content Fetched Successfully\n\
740                    URL: {}\n\
741                    Status: {}\n\
742                    Content-Type: {}\n\
743                    Size: {} bytes\n\n{}",
744                    response.url,
745                    response.status,
746                    response.content_type.as_deref().unwrap_or("unknown"),
747                    response.body.len(),
748                    if response.body.len() > 1000 {
749                        format!("{}...", &response.body[..1000])
750                    } else {
751                        response.body.clone()
752                    }
753                );
754
755                let mut metadata = HashMap::new();
756                metadata.insert(
757                    "fetched_content".to_string(),
758                    serde_json::to_value(response)?,
759                );
760
761                Ok(McpToolResponse {
762                    id: request.id,
763                    content: vec![McpContent::text(content_text)],
764                    is_error: false,
765                    error: None,
766                    metadata,
767                })
768            }
769            _ => {
770                warn!("Unknown Web Tools request: {}", request.tool);
771                Err(McpToolsError::Server(format!(
772                    "Unknown Web Tools request: {}",
773                    request.tool
774                )))
775            }
776        }
777    }
778
779    async fn get_stats(&self) -> Result<crate::common::ServerStats> {
780        self.base.get_stats().await
781    }
782
783    async fn initialize(&mut self) -> Result<()> {
784        info!("Initializing Web Tools MCP Server");
785        Ok(())
786    }
787
788    async fn shutdown(&mut self) -> Result<()> {
789        info!("Shutting down Web Tools MCP Server");
790        Ok(())
791    }
792}