1use super::traits::Tool;
7use crate::config::constants::tools;
8use crate::tools::error_helpers::with_path_context;
9use anyhow::{Context, Result, anyhow, bail};
10use async_trait::async_trait;
11use hashbrown::HashSet;
12use reqwest::header::{ACCEPT, HeaderMap, HeaderValue, USER_AGENT};
13use serde::Deserialize;
14use serde_json::{Value, json};
15use std::fs;
16use std::net::IpAddr;
17use std::path::Path;
18use url::Url;
19
20pub mod domains;
21pub use domains::{BUILTIN_BLOCKED_DOMAINS, BUILTIN_BLOCKED_PATTERNS, MALICIOUS_PATTERNS};
22
23const DEFAULT_TIMEOUT_SECS: u64 = 30;
24const MAX_CONTENT_SIZE: usize = 500_000; #[derive(Debug, Deserialize)]
27struct WebFetchArgs {
28 url: String,
29 prompt: String,
30 #[serde(default)]
31 max_bytes: Option<usize>,
32 #[serde(default)]
33 timeout_secs: Option<u64>,
34}
35
36#[derive(Clone)]
38pub struct WebFetchTool {
39 pub mode: String,
41 pub blocked_domains: HashSet<String>,
43 pub blocked_patterns: Vec<String>,
45 pub allowed_domains: HashSet<String>,
47 pub strict_https_only: bool,
49}
50
51impl WebFetchTool {
52 pub fn new() -> Self {
53 Self {
54 mode: "restricted".to_string(),
55 blocked_domains: HashSet::new(),
56 blocked_patterns: Vec::new(),
57 allowed_domains: HashSet::new(),
58 strict_https_only: true,
59 }
60 }
61
62 pub fn with_config(
64 mode: String,
65 blocked_domains: Vec<String>,
66 blocked_patterns: Vec<String>,
67 allowed_domains: Vec<String>,
68 strict_https_only: bool,
69 ) -> Self {
70 Self {
71 mode,
72 blocked_domains: blocked_domains.into_iter().collect(),
73 blocked_patterns,
74 allowed_domains: allowed_domains.into_iter().collect(),
75 strict_https_only,
76 }
77 }
78
79 async fn fetch_url_content(
80 &self,
81 url: &str,
82 max_bytes: usize,
83 timeout_secs: u64,
84 ) -> Result<String> {
85 self.validate_url(url)?;
87
88 let default_headers = Self::default_headers();
89
90 let client = reqwest::Client::builder()
91 .default_headers(default_headers)
92 .timeout(std::time::Duration::from_secs(timeout_secs))
93 .build()?;
94
95 let response = client.get(url).send().await?;
96
97 if !response.status().is_success() {
98 return Err(anyhow!(
99 "HTTP request failed with status: {}",
100 response.status()
101 ));
102 }
103
104 let content_type = response
105 .headers()
106 .get("content-type")
107 .and_then(|h| h.to_str().ok())
108 .unwrap_or("")
109 .to_string();
110
111 self.validate_content_type(&content_type)?;
113
114 let bytes = response.bytes().await?;
116 if bytes.len() > max_bytes {
117 return Err(anyhow!(
118 "Response body size {} bytes exceeds maximum allowed size of {} bytes",
119 bytes.len(),
120 max_bytes
121 ));
122 }
123
124 String::from_utf8(bytes.to_vec()).context("Response body is not valid UTF-8")
125 }
126
127 fn validate_url(&self, url: &str) -> Result<()> {
128 if self.strict_https_only && !url.starts_with("https://") {
130 return Err(anyhow!("Only HTTPS URLs are allowed for security"));
131 }
132
133 let domain = extract_domain(url)
136 .map_err(|e| anyhow!("Failed to parse URL for security validation: {e}"))?;
137
138 if is_private_host(&domain) {
140 return Err(anyhow!("Access to local/private networks is blocked"));
141 }
142
143 let domain_lower = domain.to_ascii_lowercase();
145 if domain_lower.ends_with(".local") || domain_lower.ends_with(".internal") {
146 return Err(anyhow!("Access to local/private networks is blocked"));
147 }
148
149 let url_lower = url.to_lowercase();
150
151 match self.mode.as_str() {
153 "whitelist" => self.validate_whitelist_mode(&url_lower)?,
154 "restricted" => self.validate_restricted_mode(&url_lower)?,
155 _ => return Err(anyhow!("Unknown web_fetch security mode: {}", self.mode)),
156 }
157
158 Ok(())
159 }
160
161 fn validate_whitelist_mode(&self, url: &str) -> Result<()> {
162 let domain = extract_domain(url)?;
164
165 if self.allowed_domains.is_empty() {
166 return Err(anyhow!(
167 "Whitelist mode enabled but no domains are whitelisted. Configure allowed_domains in web_fetch settings."
168 ));
169 }
170
171 for allowed in &self.allowed_domains {
173 if domain_matches_allowed(&domain, allowed) {
174 return Ok(());
175 }
176 }
177
178 Err(anyhow!(
179 "Domain '{}' is not in the whitelist. Only explicitly allowed domains are permitted in whitelist mode.",
180 domain
181 ))
182 }
183
184 fn validate_restricted_mode(&self, url: &str) -> Result<()> {
185 let url_lower = url.to_lowercase();
187
188 let domain = extract_domain(url)?;
190 for allowed in &self.allowed_domains {
191 if domain_matches_allowed(&domain, allowed) {
192 return Ok(());
193 }
194 }
195
196 self.validate_url_safety(&url_lower)?;
198
199 Ok(())
200 }
201
202 fn validate_url_safety(&self, url: &str) -> Result<()> {
203 let mut all_blocked_domains = BUILTIN_BLOCKED_DOMAINS.to_vec();
205 all_blocked_domains.extend(self.blocked_domains.iter().map(|s| s.as_str()));
206
207 let mut all_blocked_patterns = BUILTIN_BLOCKED_PATTERNS.to_vec();
209 all_blocked_patterns.extend(self.blocked_patterns.iter().map(|s| s.as_str()));
210
211 for domain in &all_blocked_domains {
213 if url.contains(domain) {
214 return Err(anyhow!(
215 "Access to sensitive domain '{}' is blocked for privacy and security reasons",
216 domain
217 ));
218 }
219 }
220
221 for pattern in &all_blocked_patterns {
223 if url.contains(pattern) {
224 return Err(anyhow!(
225 "URL contains sensitive pattern '{}'. Fetching URLs with credentials or sensitive data is blocked",
226 pattern
227 ));
228 }
229 }
230
231 self.check_malicious_indicators(url)?;
233
234 Ok(())
235 }
236
237 fn check_malicious_indicators(&self, url: &str) -> Result<()> {
238 for pattern in MALICIOUS_PATTERNS {
239 if url.contains(pattern) {
240 return Err(anyhow!(
241 "URL contains potentially malicious pattern. Access blocked for safety"
242 ));
243 }
244 }
245
246 Ok(())
247 }
248
249 fn expand_home_path(path: &str) -> String {
251 if path.starts_with("~/")
252 && let Ok(home) = std::env::var("HOME")
253 {
254 return path.replace("~/", &format!("{}/", home));
255 }
256 path.to_string()
257 }
258
259 #[expect(dead_code)]
261 async fn load_dynamic_blocklist(&self, path: &str) -> Result<(Vec<String>, Vec<String>)> {
262 let expanded_path = Self::expand_home_path(path);
263 if !Path::new(&expanded_path).exists() {
264 return Ok((Vec::new(), Vec::new()));
265 }
266
267 let content = with_path_context(
268 fs::read_to_string(&expanded_path),
269 "read blocklist from",
270 path,
271 )?;
272
273 #[derive(Deserialize)]
274 struct BlocklistFile {
275 blocked_domains: Option<Vec<String>>,
276 blocked_patterns: Option<Vec<String>>,
277 }
278
279 let blocklist: BlocklistFile = with_path_context(
280 serde_json::from_str(&content),
281 "parse blocklist JSON from",
282 path,
283 )?;
284
285 Ok((
286 blocklist.blocked_domains.unwrap_or_default(),
287 blocklist.blocked_patterns.unwrap_or_default(),
288 ))
289 }
290
291 #[expect(dead_code)]
293 async fn load_dynamic_whitelist(&self, path: &str) -> Result<Vec<String>> {
294 let expanded_path = Self::expand_home_path(path);
295 if !Path::new(&expanded_path).exists() {
296 return Ok(Vec::new());
297 }
298
299 let content = with_path_context(
300 fs::read_to_string(&expanded_path),
301 "read whitelist from",
302 path,
303 )?;
304
305 #[derive(Deserialize)]
306 struct WhitelistFile {
307 allowed_domains: Option<Vec<String>>,
308 }
309
310 let whitelist: WhitelistFile = with_path_context(
311 serde_json::from_str(&content),
312 "parse whitelist JSON from",
313 path,
314 )?;
315
316 Ok(whitelist.allowed_domains.unwrap_or_default())
317 }
318
319 fn validate_content_type(&self, content_type: &str) -> Result<()> {
320 if content_type.is_empty() {
321 return Ok(());
322 }
323
324 let allowed_types = [
325 "text/html",
326 "text/plain",
327 "text/markdown",
328 "application/json",
329 "application/xml",
330 "text/xml",
331 "application/javascript",
332 "text/css",
333 "text/javascript",
334 "application/xhtml+xml",
335 ];
336
337 let content_type_lower = content_type.to_lowercase();
338 if allowed_types
339 .iter()
340 .any(|&t| content_type_lower.contains(t))
341 {
342 Ok(())
343 } else {
344 Err(anyhow!(
345 "Content type '{}' is not supported. Only text-based content types are allowed.",
346 content_type
347 ))
348 }
349 }
350
351 async fn run(&self, raw_args: Value) -> Result<Value> {
352 let args: WebFetchArgs = serde_json::from_value(raw_args)
353 .context("Invalid arguments for web_fetch tool. Provide 'url' and 'prompt'.")?;
354
355 let max_bytes = args.max_bytes.unwrap_or(MAX_CONTENT_SIZE);
356 let timeout_secs = args.timeout_secs.unwrap_or(DEFAULT_TIMEOUT_SECS);
357
358 let content = match self
360 .fetch_url_content(&args.url, max_bytes, timeout_secs)
361 .await
362 {
363 Ok(content) => content,
364 Err(e) => {
365 return Ok(json!({
367 "error": format!("web_fetch: failed to fetch URL '{}': {}", args.url, e),
368 "url": args.url,
369 "max_bytes": max_bytes,
370 "timeout_secs": timeout_secs
371 }));
372 }
373 };
374
375 let content_length = content.len();
376
377 if content_length == 0 {
378 return Ok(json!({
379 "error": format!(
380 "web_fetch: no content fetched from '{}'. The URL may be unreachable, returned empty content, or used an unsupported content-type.",
381 args.url
382 ),
383 "url": args.url
384 }));
385 }
386
387 let preview_limit = 8000;
389 let (preview, truncated, overflow_info) = if content_length > preview_limit {
390 let truncated_content =
391 vtcode_commons::formatting::truncate_byte_budget(&content, preview_limit, "...");
392 let overflow = format!("[+{} more characters]", content_length - preview_limit);
393 (truncated_content, true, Some(overflow))
394 } else {
395 (content.clone(), false, None)
396 };
397
398 let mut response = json!({
404 "url": args.url,
405 "prompt": args.prompt,
406 "content": content,
407 "preview": preview,
408 "content_length": content_length,
409 "truncated": truncated,
410 "next_action_hint": "Analyze `content` using `prompt` and answer the user in natural language based on the fetched page."
411 });
412
413 if let Some(overflow) = overflow_info {
415 response["overflow"] = json!(overflow);
416 }
417
418 Ok(response)
419 }
420}
421
422impl WebFetchTool {
423 fn default_headers() -> HeaderMap {
427 let mut headers = HeaderMap::new();
428 headers.insert(ACCEPT, HeaderValue::from_static("text/markdown, */*"));
429 headers.insert(
430 USER_AGENT,
431 HeaderValue::from_static("VT Code/1.0 (compatible; web-fetch tool)"),
432 );
433 headers
434 }
435}
436
437fn extract_domain(url: &str) -> Result<String> {
444 let parsed = Url::parse(url).with_context(|| format!("Failed to parse URL: {url}"))?;
445 let host = parsed
446 .host_str()
447 .ok_or_else(|| anyhow!("URL has no host: {url}"))?;
448 if host.is_empty() {
449 bail!("URL has empty host: {url}");
450 }
451 Ok(host.to_string())
452}
453
/// Reports whether `host` names a loopback, private, link-local, or unspecified address,
/// or one of the `localhost` names.
///
/// `host` may be an IPv4 literal, an IPv6 literal with or without the square brackets
/// that URL hosts carry (`[::1]`), or a DNS name, optionally with a trailing dot.
/// IPv4-mapped IPv6 addresses (`::ffff:a.b.c.d`) are judged by their embedded IPv4 address.
fn is_private_host(host: &str) -> bool {
    // "localhost." is the same host as "localhost".
    let host = host.trim_end_matches('.');

    // URL host strings wrap IPv6 literals in brackets, which `IpAddr` cannot parse.
    let literal = host
        .strip_prefix('[')
        .and_then(|inner| inner.strip_suffix(']'))
        .unwrap_or(host);

    if let Ok(ip) = literal.parse::<IpAddr>() {
        return match ip {
            IpAddr::V4(v4) => is_private_ipv4(v4),
            IpAddr::V6(v6) => match v6.to_ipv4_mapped() {
                Some(mapped) => is_private_ipv4(mapped),
                None => {
                    let first = v6.segments()[0];
                    v6.is_loopback()
                        || v6.is_unspecified()
                        // fc00::/7 unique local addresses
                        || (first & 0xfe00) == 0xfc00
                        // fe80::/10 link-local addresses
                        || (first & 0xffc0) == 0xfe80
                }
            },
        };
    }

    host.eq_ignore_ascii_case("localhost") || host.eq_ignore_ascii_case("localhost.localdomain")
}

/// IPv4 part of [`is_private_host`]: loopback (127/8), private (10/8, 172.16/12,
/// 192.168/16), link-local (169.254/16), and the 0/8 block.
fn is_private_ipv4(v4: std::net::Ipv4Addr) -> bool {
    v4.is_loopback() || v4.is_private() || v4.is_link_local() || v4.octets()[0] == 0
}
494
/// Reports whether `domain` equals `allowed` or is a subdomain of it.
///
/// The comparison ignores ASCII case, trailing dots on `domain`, and leading or trailing
/// dots on `allowed`.
fn domain_matches_allowed(domain: &str, allowed: &str) -> bool {
    let host = domain.trim_end_matches('.').to_ascii_lowercase();
    let entry = allowed.trim_matches('.').to_ascii_lowercase();

    match host.strip_suffix(entry.as_str()) {
        // Nothing left in front of the entry: the two names are identical.
        Some("") => true,
        // Something is left: it only counts as a subdomain when it ends at a label boundary.
        Some(prefix) => prefix.ends_with('.'),
        None => false,
    }
}
505
506impl Default for WebFetchTool {
507 fn default() -> Self {
508 Self::new()
509 }
510}
511
512#[async_trait]
513impl Tool for WebFetchTool {
514 async fn execute(&self, mut args: Value) -> Result<Value> {
515 if let Some(obj) = args.as_object_mut() {
524 let has_url = obj.get("url").is_some_and(Value::is_string);
525 let has_prompt = obj.get("prompt").is_some_and(Value::is_string);
526
527 if has_url && !has_prompt {
528 obj.insert(
529 "prompt".to_string(),
530 json!("Briefly summarize what this page is and what it represents. Focus on the owner/profile, primary purpose, and any notable repositories or projects."),
531 );
532 }
533 }
534
535 self.run(args).await
536 }
537
538 fn name(&self) -> &str {
539 tools::WEB_FETCH
540 }
541
542 fn description(&self) -> &str {
543 "Fetches content from a specified URL and returns an analyzed summary. Accepts: { url: string, prompt?: string, max_bytes?: number, timeout_secs?: number }. If 'prompt' is omitted, VT Code uses a safe default summary prompt so that simple 'fetch https://…' requests are handled by this built-in tool instead of delegating to external MCP tools."
544 }
545}
546
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::{Value, json};

    /// Prompt shared by every test that supplies one.
    const PROMPT: &str = "Extract the main content";

    async fn execute_json(tool: &WebFetchTool, args: Value) -> Value {
        tool.execute(args)
            .await
            .expect("web_fetch should return structured JSON output")
    }

    fn error_text(result: &Value) -> Option<&str> {
        result.get("error").and_then(Value::as_str)
    }

    /// Runs `tool` on `url` with the shared prompt and returns the reported error, if any.
    async fn fetch_error(tool: &WebFetchTool, url: &str) -> Option<String> {
        let result = execute_json(tool, json!({ "url": url, "prompt": PROMPT })).await;
        error_text(&result).map(str::to_owned)
    }

    /// Like `fetch_error`, with a missing error mapped to the empty string.
    async fn fetch_error_or_empty(tool: &WebFetchTool, url: &str) -> String {
        fetch_error(tool, url).await.unwrap_or_default()
    }

    /// Builds an HTTPS-only tool from string slices.
    fn https_tool(
        mode: &str,
        blocked_domains: &[&str],
        blocked_patterns: &[&str],
        allowed_domains: &[&str],
    ) -> WebFetchTool {
        let owned = |items: &[&str]| items.iter().map(|s| s.to_string()).collect::<Vec<_>>();
        WebFetchTool::with_config(
            mode.to_string(),
            owned(blocked_domains),
            owned(blocked_patterns),
            owned(allowed_domains),
            true,
        )
    }

    #[tokio::test]
    async fn rejects_non_https_urls() {
        let error = fetch_error_or_empty(&WebFetchTool::new(), "http://example.com").await;
        assert!(error.contains("Only HTTPS URLs are allowed"));
    }

    #[tokio::test]
    async fn allows_http_when_https_disabled() {
        let tool = WebFetchTool::with_config(
            "restricted".to_string(),
            Vec::new(),
            Vec::new(),
            Vec::new(),
            false,
        );
        if let Some(error) = fetch_error(&tool, "http://example.com").await {
            assert!(!error.contains("Only HTTPS URLs are allowed"));
        }
    }

    #[tokio::test]
    async fn rejects_localhost_urls() {
        let error = fetch_error_or_empty(&WebFetchTool::new(), "https://localhost:8080").await;
        assert!(error.contains("local/private networks"));
    }

    #[tokio::test]
    async fn requires_both_url_and_prompt() {
        // No prompt is supplied here: the tool fills in its default prompt, so the call
        // still reaches URL validation and fails on the HTTPS requirement.
        let tool = WebFetchTool::new();
        let result = execute_json(&tool, json!({ "url": "http://example.com" })).await;
        let error = error_text(&result).unwrap_or("");
        assert!(error.contains("Only HTTPS URLs are allowed"));
    }

    #[tokio::test]
    async fn rejects_sensitive_banking_domains() {
        let error = fetch_error_or_empty(&WebFetchTool::new(), "https://paypal.com/login").await;
        assert!(error.contains("blocked for privacy and security reasons"));
    }

    #[tokio::test]
    async fn rejects_sensitive_auth_domains() {
        let error =
            fetch_error_or_empty(&WebFetchTool::new(), "https://accounts.google.com").await;
        assert!(error.contains("blocked for privacy and security reasons"));
    }

    #[tokio::test]
    async fn rejects_urls_with_credentials() {
        let error = fetch_error_or_empty(
            &WebFetchTool::new(),
            "https://example.com?password=secret123",
        )
        .await;
        assert!(error.contains("sensitive pattern"));
    }

    #[tokio::test]
    async fn rejects_urls_with_api_keys() {
        let error = fetch_error_or_empty(
            &WebFetchTool::new(),
            "https://api.example.com?api_key=sk_live_123456",
        )
        .await;
        assert!(error.contains("sensitive pattern"));
    }

    #[tokio::test]
    async fn rejects_urls_with_tokens() {
        let error =
            fetch_error_or_empty(&WebFetchTool::new(), "https://example.com?token=xyz123").await;
        assert!(error.contains("sensitive pattern"));
    }

    #[tokio::test]
    async fn rejects_malicious_url_patterns() {
        let error = fetch_error_or_empty(
            &WebFetchTool::new(),
            "https://example.com/malware.exe\"",
        )
        .await;
        assert!(error.contains("potentially malicious pattern"));
    }

    #[tokio::test]
    async fn rejects_typosquatting_domains() {
        let error = fetch_error_or_empty(&WebFetchTool::new(), "https://g00gle.com").await;
        assert!(error.contains("potentially malicious pattern"));
    }

    #[tokio::test]
    async fn rejects_url_shorteners() {
        let error = fetch_error_or_empty(&WebFetchTool::new(), "https://bit.ly/xyz123").await;
        assert!(error.contains("potentially malicious pattern"));
    }

    #[tokio::test]
    async fn whitelist_mode_requires_allowed_domains() {
        let tool = https_tool("whitelist", &[], &[], &[]);
        let error = fetch_error_or_empty(&tool, "https://example.com").await;
        assert!(error.contains("whitelist") || error.contains("whitelisted"));
    }

    #[tokio::test]
    async fn whitelist_mode_allows_whitelisted_domains() {
        let tool = https_tool("whitelist", &[], &[], &["example.com"]);
        if let Some(error) = fetch_error(&tool, "https://example.com/path").await {
            assert!(!error.contains("not in the whitelist"));
        }
    }

    #[tokio::test]
    async fn whitelist_mode_rejects_non_whitelisted_domains() {
        let tool = https_tool("whitelist", &[], &[], &["allowed.com"]);
        let error = fetch_error_or_empty(&tool, "https://notallowed.com").await;
        assert!(error.contains("not in the whitelist"));
    }

    #[tokio::test]
    async fn restricted_mode_allows_exemptions() {
        let tool = https_tool("restricted", &[], &[], &["paypal.com"]);
        if let Some(error) = fetch_error(&tool, "https://paypal.com/login").await {
            assert!(!error.contains("blocked for privacy"));
        }
    }

    #[tokio::test]
    async fn custom_blocked_domains_work() {
        let tool = https_tool("restricted", &["custom-blocked.com"], &[], &[]);
        let error = fetch_error_or_empty(&tool, "https://custom-blocked.com/page").await;
        assert!(error.contains("blocked for privacy and security reasons"));
    }

    #[tokio::test]
    async fn custom_blocked_patterns_work() {
        let tool = https_tool("restricted", &[], &["custom_secret="], &[]);
        let error =
            fetch_error_or_empty(&tool, "https://example.com?custom_secret=abc123").await;
        assert!(error.contains("sensitive pattern"));
    }

    #[test]
    fn default_headers_contain_text_markdown_accept() {
        let headers = WebFetchTool::default_headers();
        assert!(headers.contains_key(ACCEPT));
        let accept = headers.get(ACCEPT).unwrap().to_str().unwrap();
        assert!(accept.contains("text/markdown"));
    }
}