1use std::net::{IpAddr, SocketAddr};
20use std::sync::Arc;
21use std::sync::atomic::{AtomicU64, Ordering};
22use std::time::{Duration, Instant};
23
24use schemars::JsonSchema;
25use serde::Deserialize;
26use url::Url;
27
28use zeph_common::ToolName;
29
30use zeph_sanitizer::IpiFilter;
31
32use crate::audit::{AuditEntry, AuditLogger, AuditResult, EgressEvent, chrono_now};
33use crate::config::{EgressConfig, ScrapeConfig};
34use crate::executor::{
35 ClaimSource, ToolCall, ToolError, ToolExecutor, ToolOutput, deserialize_params,
36};
37use crate::net::is_private_ip;
38
39fn redact_url_for_log(url: &str) -> String {
43 let Ok(mut parsed) = Url::parse(url) else {
44 return url.to_owned();
45 };
46 let _ = parsed.set_username("");
48 let _ = parsed.set_password(None);
49 let sensitive = [
51 "token", "key", "secret", "password", "auth", "sig", "api_key", "apikey",
52 ];
53 let filtered: Vec<(String, String)> = parsed
54 .query_pairs()
55 .filter(|(k, _)| {
56 let lower = k.to_lowercase();
57 !sensitive.iter().any(|s| lower.contains(s))
58 })
59 .map(|(k, v)| (k.into_owned(), v.into_owned()))
60 .collect();
61 if filtered.is_empty() {
62 parsed.set_query(None);
63 } else {
64 let q: String = filtered
65 .iter()
66 .map(|(k, v)| format!("{k}={v}"))
67 .collect::<Vec<_>>()
68 .join("&");
69 parsed.set_query(Some(&q));
70 }
71 parsed.to_string()
72}
73
// Parameters deserialized for the `fetch` tool call.
// NOTE: plain `//` comments are used on this type because it derives
// `JsonSchema`, and schemars copies `///` docs into the generated schema.
#[derive(Debug, Deserialize, JsonSchema)]
struct FetchParams {
    // URL to fetch; `handle_fetch` runs it through `validate_url` before any request is made.
    url: String,
}
79
// One scrape request, deserialized either from a fenced `scrape` block or from
// the parameters of a `web_scrape` tool call.
// NOTE: plain `//` comments are used on this type because it derives
// `JsonSchema`, and schemars copies `///` docs into the generated schema.
#[derive(Debug, Deserialize, JsonSchema)]
struct ScrapeInstruction {
    // Page URL; `scrape_instruction` runs it through `validate_url` before fetching.
    url: String,
    // Selector handed to `parse_and_extract` to pick elements from the fetched HTML.
    select: String,
    // Extraction mode string interpreted by `ExtractMode::parse`
    // ("text", "html" or "attr:<name>"); defaults to "text" when omitted.
    #[serde(default = "default_extract")]
    extract: String,
    // Maximum number of matched elements to consider; `scrape_instruction` uses 10 when absent.
    limit: Option<usize>,
}
92
/// Serde default for `ScrapeInstruction::extract`: plain-text extraction.
fn default_extract() -> String {
    String::from("text")
}
96
/// What to pull out of each element matched by a scrape selector.
#[derive(Debug)]
enum ExtractMode {
    /// The element's text content.
    Text,
    /// The element's inner HTML.
    Html,
    /// The value of the named attribute.
    Attr(String),
}

impl ExtractMode {
    /// Interprets the `extract` string of a scrape instruction.
    ///
    /// `"html"` selects [`ExtractMode::Html`]; anything starting with `"attr:"`
    /// selects [`ExtractMode::Attr`] with the remainder as the attribute name
    /// (which may be empty). Every other input, including `"text"`, falls back
    /// to [`ExtractMode::Text`].
    fn parse(s: &str) -> Self {
        if let Some(attribute) = s.strip_prefix("attr:") {
            return Self::Attr(attribute.to_owned());
        }
        if s == "html" { Self::Html } else { Self::Text }
    }
}
116
/// Tool executor that serves the `web_scrape` and `fetch` tools.
///
/// Requests are restricted to HTTPS, checked against the domain allow/deny
/// lists, resolved and screened for private addresses before being sent, and
/// the returned content is passed through the IPI filter.
#[derive(Debug)]
pub struct WebScrapeExecutor {
    /// Request timeout applied to every HTTP client built by `build_client`.
    timeout: Duration,
    /// Largest response body, in bytes, that `fetch_html` will accept.
    max_body_bytes: usize,
    /// Allowlist patterns passed to `check_domain_policy`; an empty list disables the allowlist.
    allowed_domains: Vec<String>,
    /// Denylist patterns passed to `check_domain_policy`.
    denied_domains: Vec<String>,
    /// Optional sink for audit entries and egress events.
    audit_logger: Option<Arc<AuditLogger>>,
    /// Switches controlling whether and which egress events are emitted.
    egress_config: EgressConfig,
    /// Optional channel on which egress events are additionally published (non-blocking `try_send`).
    egress_tx: Option<tokio::sync::mpsc::Sender<EgressEvent>>,
    /// Counter incremented whenever an egress event is dropped because the channel is full.
    egress_dropped: Arc<AtomicU64>,
    /// Filter applied to fetched/extracted content in `apply_ipi_filter`.
    ipi_filter: IpiFilter,
}
171
172impl WebScrapeExecutor {
173 #[must_use]
177 pub fn new(config: &ScrapeConfig) -> Self {
178 Self {
179 timeout: Duration::from_secs(config.timeout),
180 max_body_bytes: config.max_body_bytes,
181 allowed_domains: config.allowed_domains.clone(),
182 denied_domains: config.denied_domains.clone(),
183 audit_logger: None,
184 egress_config: EgressConfig::default(),
185 egress_tx: None,
186 egress_dropped: Arc::new(AtomicU64::new(0)),
187 ipi_filter: IpiFilter::new(config.ipi_filter_threshold),
188 }
189 }
190
191 #[must_use]
193 pub fn with_audit(mut self, logger: Arc<AuditLogger>) -> Self {
194 self.audit_logger = Some(logger);
195 self
196 }
197
198 #[must_use]
200 pub fn with_egress_config(mut self, config: EgressConfig) -> Self {
201 self.egress_config = config;
202 self
203 }
204
205 #[must_use]
210 pub fn with_egress_tx(
211 mut self,
212 tx: tokio::sync::mpsc::Sender<EgressEvent>,
213 dropped: Arc<AtomicU64>,
214 ) -> Self {
215 self.egress_tx = Some(tx);
216 self.egress_dropped = dropped;
217 self
218 }
219
220 #[must_use]
222 pub fn egress_dropped(&self) -> Arc<AtomicU64> {
223 Arc::clone(&self.egress_dropped)
224 }
225
226 fn build_client(&self, host: &str, addrs: &[SocketAddr]) -> reqwest::Client {
227 let mut builder = reqwest::Client::builder()
228 .timeout(self.timeout)
229 .redirect(reqwest::redirect::Policy::none());
230 builder = builder.resolve_to_addrs(host, addrs);
231 builder.build().unwrap_or_default()
232 }
233}
234
235impl ToolExecutor for WebScrapeExecutor {
236 fn tool_definitions(&self) -> Vec<crate::registry::ToolDef> {
237 use crate::registry::{InvocationHint, ToolDef};
238 vec![
239 ToolDef {
240 id: "web_scrape".into(),
241 description: "Extract structured data from a web page using CSS selectors.\n\nONLY call this tool when the user has explicitly provided a URL in their message, or when a prior tool call returned a URL to retrieve. NEVER construct, guess, or infer a URL from entity names, brand knowledge, or domain patterns.\n\nParameters: url (string, required) - HTTPS URL; select (string, required) - CSS selector; extract (string, optional) - \"text\", \"html\", or \"attr:<name>\"; limit (integer, optional) - max results\nReturns: extracted text/HTML/attribute values, one per line\nErrors: InvalidParams if URL is not HTTPS or selector is empty; Timeout after configured seconds; connection/DNS failures".into(),
242 schema: schemars::schema_for!(ScrapeInstruction),
243 invocation: InvocationHint::FencedBlock("scrape"),
244 output_schema: None,
245 },
246 ToolDef {
247 id: "fetch".into(),
248 description: "Fetch a URL and return the response body as plain text.\n\nONLY call this tool when the user has explicitly provided a URL in their message, or when a prior tool call returned a URL to retrieve. NEVER construct, guess, or infer a URL from entity names, brand knowledge, or domain patterns. If no URL is present in the conversation, do not call this tool.\n\nParameters: url (string, required) - HTTPS URL to fetch\nReturns: response body as UTF-8 text, truncated if exceeding max body size\nErrors: InvalidParams if URL is not HTTPS; Timeout; SSRF-blocked private IPs; connection failures".into(),
249 schema: schemars::schema_for!(FetchParams),
250 invocation: InvocationHint::ToolCall,
251 output_schema: None,
252 },
253 ]
254 }
255
256 async fn execute(&self, response: &str) -> Result<Option<ToolOutput>, ToolError> {
257 let blocks = extract_scrape_blocks(response);
258 if blocks.is_empty() {
259 return Ok(None);
260 }
261
262 let mut outputs = Vec::with_capacity(blocks.len());
263 #[allow(clippy::cast_possible_truncation)]
264 let blocks_executed = blocks.len() as u32;
265
266 for block in &blocks {
267 let instruction: ScrapeInstruction = serde_json::from_str(block).map_err(|e| {
268 ToolError::Execution(std::io::Error::new(
269 std::io::ErrorKind::InvalidData,
270 e.to_string(),
271 ))
272 })?;
273 let correlation_id = EgressEvent::new_correlation_id();
274 let start = Instant::now();
275 let scrape_result = self
276 .scrape_instruction(&instruction, &correlation_id, None, None)
277 .await;
278 #[allow(clippy::cast_possible_truncation)]
279 let duration_ms = start.elapsed().as_millis() as u64;
280 match scrape_result {
281 Ok(output) => {
282 self.log_audit(
283 "web_scrape",
284 &redact_url_for_log(&instruction.url),
285 AuditResult::Success,
286 duration_ms,
287 None,
288 None,
289 None,
290 Some(correlation_id),
291 )
292 .await;
293 outputs.push(output);
294 }
295 Err(e) => {
296 let audit_result = tool_error_to_audit_result(&e);
297 self.log_audit(
298 "web_scrape",
299 &redact_url_for_log(&instruction.url),
300 audit_result,
301 duration_ms,
302 Some(&e),
303 None,
304 None,
305 Some(correlation_id),
306 )
307 .await;
308 return Err(e);
309 }
310 }
311 }
312
313 Ok(Some(ToolOutput {
314 tool_name: ToolName::new("web-scrape"),
315 summary: outputs.join("\n\n"),
316 blocks_executed,
317 filter_stats: None,
318 diff: None,
319 streamed: false,
320 terminal_id: None,
321 locations: None,
322 raw_response: None,
323 claim_source: Some(ClaimSource::WebScrape),
324 }))
325 }
326
327 #[cfg_attr(
328 feature = "profiling",
329 tracing::instrument(name = "tools.scrape.fetch", skip_all)
330 )]
331 async fn execute_tool_call(&self, call: &ToolCall) -> Result<Option<ToolOutput>, ToolError> {
332 match call.tool_id.as_str() {
333 "web_scrape" => {
334 let instruction: ScrapeInstruction = deserialize_params(&call.params)?;
335 let correlation_id = EgressEvent::new_correlation_id();
336 let start = Instant::now();
337 let result = self
338 .scrape_instruction(
339 &instruction,
340 &correlation_id,
341 call.caller_id.clone(),
342 call.skill_name.clone(),
343 )
344 .await;
345 #[allow(clippy::cast_possible_truncation)]
346 let duration_ms = start.elapsed().as_millis() as u64;
347 self.run_with_audit(
348 "web_scrape",
349 "web-scrape",
350 &redact_url_for_log(&instruction.url),
351 call.caller_id.clone(),
352 call.skill_name.clone(),
353 correlation_id,
354 duration_ms,
355 result,
356 )
357 .await
358 }
359 "fetch" => {
360 let p: FetchParams = deserialize_params(&call.params)?;
361 let correlation_id = EgressEvent::new_correlation_id();
362 let start = Instant::now();
363 let result = self
364 .handle_fetch(
365 &p,
366 &correlation_id,
367 call.caller_id.clone(),
368 call.skill_name.clone(),
369 )
370 .await;
371 #[allow(clippy::cast_possible_truncation)]
372 let duration_ms = start.elapsed().as_millis() as u64;
373 self.run_with_audit(
374 "fetch",
375 "fetch",
376 &redact_url_for_log(&p.url),
377 call.caller_id.clone(),
378 call.skill_name.clone(),
379 correlation_id,
380 duration_ms,
381 result,
382 )
383 .await
384 }
385 _ => Ok(None),
386 }
387 }
388
389 fn is_tool_retryable(&self, tool_id: &str) -> bool {
390 matches!(tool_id, "web_scrape" | "fetch")
391 }
392}
393
394fn tool_error_to_audit_result(e: &ToolError) -> AuditResult {
395 match e {
396 ToolError::Blocked { command } => AuditResult::Blocked {
397 reason: command.clone(),
398 },
399 ToolError::Timeout { .. } => AuditResult::Timeout,
400 _ => AuditResult::Error {
401 message: e.to_string(),
402 },
403 }
404}
405
406impl WebScrapeExecutor {
407 #[allow(clippy::too_many_arguments)]
408 async fn run_with_audit(
409 &self,
410 audit_tool_name: &str,
411 public_tool_name: &str,
412 audit_command: &str,
413 caller_id: Option<String>,
414 skill_name: Option<Vec<String>>,
415 correlation_id: String,
416 duration_ms: u64,
417 result: Result<String, ToolError>,
418 ) -> Result<Option<ToolOutput>, ToolError> {
419 match result {
420 Ok(output) => {
421 self.log_audit(
422 audit_tool_name,
423 audit_command,
424 AuditResult::Success,
425 duration_ms,
426 None,
427 caller_id,
428 skill_name,
429 Some(correlation_id),
430 )
431 .await;
432 Ok(Some(ToolOutput {
433 tool_name: ToolName::new(public_tool_name),
434 summary: output,
435 blocks_executed: 1,
436 filter_stats: None,
437 diff: None,
438 streamed: false,
439 terminal_id: None,
440 locations: None,
441 raw_response: None,
442 claim_source: Some(ClaimSource::WebScrape),
443 }))
444 }
445 Err(e) => {
446 let audit_result = tool_error_to_audit_result(&e);
447 self.log_audit(
448 audit_tool_name,
449 audit_command,
450 audit_result,
451 duration_ms,
452 Some(&e),
453 caller_id,
454 skill_name,
455 Some(correlation_id),
456 )
457 .await;
458 Err(e)
459 }
460 }
461 }
462
463 #[allow(clippy::too_many_arguments)] async fn log_audit(
465 &self,
466 tool: &str,
467 command: &str,
468 result: AuditResult,
469 duration_ms: u64,
470 error: Option<&ToolError>,
471 caller_id: Option<String>,
472 skill_name: Option<Vec<String>>,
473 correlation_id: Option<String>,
474 ) {
475 if let Some(ref logger) = self.audit_logger {
476 let (error_category, error_domain, error_phase) =
477 error.map_or((None, None, None), |e| {
478 let cat = e.category();
479 (
480 Some(cat.label().to_owned()),
481 Some(cat.domain().label().to_owned()),
482 Some(cat.phase().label().to_owned()),
483 )
484 });
485 let entry = AuditEntry {
486 timestamp: chrono_now(),
487 tool: tool.into(),
488 command: command.into(),
489 result,
490 duration_ms,
491 error_category,
492 error_domain,
493 error_phase,
494 claim_source: Some(ClaimSource::WebScrape),
495 mcp_server_id: None,
496 injection_flagged: false,
497 embedding_anomalous: false,
498 cross_boundary_mcp_to_acp: false,
499 adversarial_policy_decision: None,
500 exit_code: None,
501 truncated: false,
502 caller_id,
503 skill_name,
504 policy_match: None,
505 correlation_id,
506 vigil_risk: None,
507 execution_env: None,
508 resolved_cwd: None,
509 scope_at_definition: None,
510 scope_at_dispatch: None,
511 };
512 logger.log(&entry).await;
513 }
514 }
515
516 fn send_egress_event(&self, event: EgressEvent) {
517 if let Some(ref tx) = self.egress_tx {
518 match tx.try_send(event) {
519 Ok(()) => {}
520 Err(tokio::sync::mpsc::error::TrySendError::Full(_)) => {
521 self.egress_dropped.fetch_add(1, Ordering::Relaxed);
522 }
523 Err(tokio::sync::mpsc::error::TrySendError::Closed(_)) => {
524 tracing::debug!("egress channel closed; executor continuing without telemetry");
525 }
526 }
527 }
528 }
529
530 async fn log_egress_event(&self, event: &EgressEvent) {
531 if let Some(ref logger) = self.audit_logger {
532 logger.log_egress(event).await;
533 }
534 self.send_egress_event(event.clone());
535 }
536
537 async fn handle_fetch(
538 &self,
539 params: &FetchParams,
540 correlation_id: &str,
541 caller_id: Option<String>,
542 skill_name: Option<Vec<String>>,
543 ) -> Result<String, ToolError> {
544 let parsed = validate_url(¶ms.url);
545 let host_str = parsed
546 .as_ref()
547 .map(|u| u.host_str().unwrap_or("").to_owned())
548 .unwrap_or_default();
549
550 if let Err(ref _e) = parsed {
551 if self.egress_config.enabled && self.egress_config.log_blocked {
552 let event = Self::make_blocked_event(
553 "fetch",
554 ¶ms.url,
555 &host_str,
556 correlation_id,
557 caller_id.clone(),
558 skill_name.clone(),
559 "scheme",
560 );
561 self.log_egress_event(&event).await;
562 }
563 return Err(parsed.unwrap_err());
564 }
565 let parsed = parsed.unwrap();
566
567 if let Err(e) = check_domain_policy(
568 parsed.host_str().unwrap_or(""),
569 &self.allowed_domains,
570 &self.denied_domains,
571 ) {
572 if self.egress_config.enabled && self.egress_config.log_blocked {
573 let event = Self::make_blocked_event(
574 "fetch",
575 ¶ms.url,
576 parsed.host_str().unwrap_or(""),
577 correlation_id,
578 caller_id.clone(),
579 skill_name.clone(),
580 "blocklist",
581 );
582 self.log_egress_event(&event).await;
583 }
584 return Err(e);
585 }
586
587 let (host, addrs) = match resolve_and_validate(&parsed).await {
588 Ok(v) => v,
589 Err(e) => {
590 if self.egress_config.enabled && self.egress_config.log_blocked {
591 let event = Self::make_blocked_event(
592 "fetch",
593 ¶ms.url,
594 parsed.host_str().unwrap_or(""),
595 correlation_id,
596 caller_id.clone(),
597 skill_name.clone(),
598 "ssrf",
599 );
600 self.log_egress_event(&event).await;
601 }
602 return Err(e);
603 }
604 };
605
606 let body = self
607 .fetch_html(
608 ¶ms.url,
609 &host,
610 &addrs,
611 "fetch",
612 correlation_id,
613 caller_id,
614 skill_name,
615 )
616 .await?;
617 self.apply_ipi_filter(&body, ¶ms.url).await
618 }
619
620 async fn scrape_instruction(
621 &self,
622 instruction: &ScrapeInstruction,
623 correlation_id: &str,
624 caller_id: Option<String>,
625 skill_name: Option<Vec<String>>,
626 ) -> Result<String, ToolError> {
627 let parsed = validate_url(&instruction.url);
628 let host_str = parsed
629 .as_ref()
630 .map(|u| u.host_str().unwrap_or("").to_owned())
631 .unwrap_or_default();
632
633 if let Err(ref _e) = parsed {
634 if self.egress_config.enabled && self.egress_config.log_blocked {
635 let event = Self::make_blocked_event(
636 "web_scrape",
637 &instruction.url,
638 &host_str,
639 correlation_id,
640 caller_id.clone(),
641 skill_name.clone(),
642 "scheme",
643 );
644 self.log_egress_event(&event).await;
645 }
646 return Err(parsed.unwrap_err());
647 }
648 let parsed = parsed.unwrap();
649
650 if let Err(e) = check_domain_policy(
651 parsed.host_str().unwrap_or(""),
652 &self.allowed_domains,
653 &self.denied_domains,
654 ) {
655 if self.egress_config.enabled && self.egress_config.log_blocked {
656 let event = Self::make_blocked_event(
657 "web_scrape",
658 &instruction.url,
659 parsed.host_str().unwrap_or(""),
660 correlation_id,
661 caller_id.clone(),
662 skill_name.clone(),
663 "blocklist",
664 );
665 self.log_egress_event(&event).await;
666 }
667 return Err(e);
668 }
669
670 let (host, addrs) = match resolve_and_validate(&parsed).await {
671 Ok(v) => v,
672 Err(e) => {
673 if self.egress_config.enabled && self.egress_config.log_blocked {
674 let event = Self::make_blocked_event(
675 "web_scrape",
676 &instruction.url,
677 parsed.host_str().unwrap_or(""),
678 correlation_id,
679 caller_id.clone(),
680 skill_name.clone(),
681 "ssrf",
682 );
683 self.log_egress_event(&event).await;
684 }
685 return Err(e);
686 }
687 };
688
689 let html = self
690 .fetch_html(
691 &instruction.url,
692 &host,
693 &addrs,
694 "web_scrape",
695 correlation_id,
696 caller_id,
697 skill_name,
698 )
699 .await?;
700 let selector = instruction.select.clone();
701 let extract = ExtractMode::parse(&instruction.extract);
702 let limit = instruction.limit.unwrap_or(10);
703 let extracted = tokio::task::spawn_blocking(move || {
704 parse_and_extract(&html, &selector, &extract, limit)
705 })
706 .await
707 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))??;
708 self.apply_ipi_filter(&extracted, &instruction.url).await
710 }
711
712 fn make_blocked_event(
713 tool: &str,
714 url: &str,
715 host: &str,
716 correlation_id: &str,
717 caller_id: Option<String>,
718 skill_name: Option<Vec<String>>,
719 block_reason: &'static str,
720 ) -> EgressEvent {
721 EgressEvent {
722 timestamp: chrono_now(),
723 kind: "egress",
724 correlation_id: correlation_id.to_owned(),
725 tool: tool.into(),
726 url: redact_url_for_log(url),
727 host: host.to_owned(),
728 method: "GET".to_owned(),
729 status: None,
730 duration_ms: 0,
731 response_bytes: 0,
732 blocked: true,
733 block_reason: Some(block_reason),
734 caller_id,
735 skill_name,
736 hop: 0,
737 }
738 }
739
740 #[allow(clippy::too_many_lines, clippy::too_many_arguments)]
751 async fn fetch_html(
752 &self,
753 url: &str,
754 host: &str,
755 addrs: &[SocketAddr],
756 tool: &str,
757 correlation_id: &str,
758 caller_id: Option<String>,
759 skill_name: Option<Vec<String>>,
760 ) -> Result<String, ToolError> {
761 const MAX_REDIRECTS: usize = 3;
762
763 let mut current_url = url.to_owned();
764 let mut current_host = host.to_owned();
765 let mut current_addrs = addrs.to_vec();
766
767 for hop in 0..=MAX_REDIRECTS {
768 let hop_start = Instant::now();
769 let client = self.build_client(¤t_host, ¤t_addrs);
771 let resp = client
772 .get(¤t_url)
773 .send()
774 .await
775 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())));
776
777 let resp = match resp {
778 Ok(r) => r,
779 Err(e) => {
780 if self.egress_config.enabled {
781 #[allow(clippy::cast_possible_truncation)]
782 let duration_ms = hop_start.elapsed().as_millis() as u64;
783 let event = EgressEvent {
784 timestamp: chrono_now(),
785 kind: "egress",
786 correlation_id: correlation_id.to_owned(),
787 tool: tool.into(),
788 url: redact_url_for_log(¤t_url),
789 host: current_host.clone(),
790 method: "GET".to_owned(),
791 status: None,
792 duration_ms,
793 response_bytes: 0,
794 blocked: false,
795 block_reason: None,
796 caller_id: caller_id.clone(),
797 skill_name: skill_name.clone(),
798 #[allow(clippy::cast_possible_truncation)]
799 hop: hop as u8,
800 };
801 self.log_egress_event(&event).await;
802 }
803 return Err(e);
804 }
805 };
806
807 let status = resp.status();
808
809 if status.is_redirection() {
810 if hop == MAX_REDIRECTS {
811 return Err(ToolError::Execution(std::io::Error::other(
812 "too many redirects",
813 )));
814 }
815
816 let location = resp
817 .headers()
818 .get(reqwest::header::LOCATION)
819 .and_then(|v| v.to_str().ok())
820 .ok_or_else(|| {
821 ToolError::Execution(std::io::Error::other("redirect with no Location"))
822 })?;
823
824 let base = Url::parse(¤t_url)
826 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
827 let next_url = base
828 .join(location)
829 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
830
831 let validated = validate_url(next_url.as_str());
832 if let Err(ref _e) = validated {
833 if self.egress_config.enabled && self.egress_config.log_blocked {
834 #[allow(clippy::cast_possible_truncation)]
835 let duration_ms = hop_start.elapsed().as_millis() as u64;
836 let next_host = next_url.host_str().unwrap_or("").to_owned();
837 let event = EgressEvent {
838 timestamp: chrono_now(),
839 kind: "egress",
840 correlation_id: correlation_id.to_owned(),
841 tool: tool.into(),
842 url: redact_url_for_log(next_url.as_str()),
843 host: next_host,
844 method: "GET".to_owned(),
845 status: None,
846 duration_ms,
847 response_bytes: 0,
848 blocked: true,
849 block_reason: Some("ssrf"),
850 caller_id: caller_id.clone(),
851 skill_name: skill_name.clone(),
852 #[allow(clippy::cast_possible_truncation)]
853 hop: (hop + 1) as u8,
854 };
855 self.log_egress_event(&event).await;
856 }
857 return Err(validated.unwrap_err());
858 }
859 let validated = validated.unwrap();
860 let resolve_result = resolve_and_validate(&validated).await;
861 if let Err(ref _e) = resolve_result {
862 if self.egress_config.enabled && self.egress_config.log_blocked {
863 #[allow(clippy::cast_possible_truncation)]
864 let duration_ms = hop_start.elapsed().as_millis() as u64;
865 let next_host = next_url.host_str().unwrap_or("").to_owned();
866 let event = EgressEvent {
867 timestamp: chrono_now(),
868 kind: "egress",
869 correlation_id: correlation_id.to_owned(),
870 tool: tool.into(),
871 url: redact_url_for_log(next_url.as_str()),
872 host: next_host,
873 method: "GET".to_owned(),
874 status: None,
875 duration_ms,
876 response_bytes: 0,
877 blocked: true,
878 block_reason: Some("ssrf"),
879 caller_id: caller_id.clone(),
880 skill_name: skill_name.clone(),
881 #[allow(clippy::cast_possible_truncation)]
882 hop: (hop + 1) as u8,
883 };
884 self.log_egress_event(&event).await;
885 }
886 return Err(resolve_result.unwrap_err());
887 }
888 let (next_host, next_addrs) = resolve_result.unwrap();
889
890 current_url = next_url.to_string();
891 current_host = next_host;
892 current_addrs = next_addrs;
893 continue;
894 }
895
896 if !status.is_success() {
897 if self.egress_config.enabled {
898 #[allow(clippy::cast_possible_truncation)]
899 let duration_ms = hop_start.elapsed().as_millis() as u64;
900 let event = EgressEvent {
901 timestamp: chrono_now(),
902 kind: "egress",
903 correlation_id: correlation_id.to_owned(),
904 tool: tool.into(),
905 url: redact_url_for_log(¤t_url),
906 host: current_host.clone(),
907 method: "GET".to_owned(),
908 status: Some(status.as_u16()),
909 duration_ms,
910 response_bytes: 0,
911 blocked: false,
912 block_reason: None,
913 caller_id: caller_id.clone(),
914 skill_name: skill_name.clone(),
915 #[allow(clippy::cast_possible_truncation)]
916 hop: hop as u8,
917 };
918 self.log_egress_event(&event).await;
919 }
920 return Err(ToolError::Http {
921 status: status.as_u16(),
922 message: status.canonical_reason().unwrap_or("unknown").to_owned(),
923 });
924 }
925
926 let bytes = resp
927 .bytes()
928 .await
929 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
930
931 if bytes.len() > self.max_body_bytes {
932 if self.egress_config.enabled {
933 #[allow(clippy::cast_possible_truncation)]
934 let duration_ms = hop_start.elapsed().as_millis() as u64;
935 let event = EgressEvent {
936 timestamp: chrono_now(),
937 kind: "egress",
938 correlation_id: correlation_id.to_owned(),
939 tool: tool.into(),
940 url: redact_url_for_log(¤t_url),
941 host: current_host.clone(),
942 method: "GET".to_owned(),
943 status: Some(status.as_u16()),
944 duration_ms,
945 response_bytes: bytes.len(),
946 blocked: false,
947 block_reason: None,
948 caller_id: caller_id.clone(),
949 skill_name: skill_name.clone(),
950 #[allow(clippy::cast_possible_truncation)]
951 hop: hop as u8,
952 };
953 self.log_egress_event(&event).await;
954 }
955 return Err(ToolError::Execution(std::io::Error::other(format!(
956 "response too large: {} bytes (max: {})",
957 bytes.len(),
958 self.max_body_bytes,
959 ))));
960 }
961
962 if self.egress_config.enabled {
964 #[allow(clippy::cast_possible_truncation)]
965 let duration_ms = hop_start.elapsed().as_millis() as u64;
966 let response_bytes = if self.egress_config.log_response_bytes {
967 bytes.len()
968 } else {
969 0
970 };
971 let event = EgressEvent {
972 timestamp: chrono_now(),
973 kind: "egress",
974 correlation_id: correlation_id.to_owned(),
975 tool: tool.into(),
976 url: redact_url_for_log(¤t_url),
977 host: current_host.clone(),
978 method: "GET".to_owned(),
979 status: Some(status.as_u16()),
980 duration_ms,
981 response_bytes,
982 blocked: false,
983 block_reason: None,
984 caller_id: caller_id.clone(),
985 skill_name: skill_name.clone(),
986 #[allow(clippy::cast_possible_truncation)]
987 hop: hop as u8,
988 };
989 self.log_egress_event(&event).await;
990 }
991
992 return String::from_utf8(bytes.to_vec())
993 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())));
994 }
995
996 Err(ToolError::Execution(std::io::Error::other(
997 "too many redirects",
998 )))
999 }
1000
1001 #[tracing::instrument(name = "tools.scrape.apply_ipi_filter", skip(self, body), fields(body_len = body.len()))]
1011 async fn apply_ipi_filter(&self, body: &str, url: &str) -> Result<String, ToolError> {
1012 let verdict = self
1013 .ipi_filter
1014 .filter_async(body.to_owned())
1015 .await
1016 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
1017 if !verdict.patterns_found.is_empty() {
1018 tracing::warn!(
1019 url = url,
1020 score = verdict.score,
1021 patterns = ?verdict.patterns_found,
1022 "IPI patterns detected in fetched content"
1023 );
1024 }
1025 if verdict.sanitized == body {
1027 Ok(verdict.sanitized)
1028 } else {
1029 Ok(format!(
1030 "[IPI WARNING: score={:.2}, patterns={}] {}",
1031 verdict.score,
1032 verdict.patterns_found.join(", "),
1033 verdict.sanitized,
1034 ))
1035 }
1036 }
1037}
1038
1039fn extract_scrape_blocks(text: &str) -> Vec<&str> {
1040 crate::executor::extract_fenced_blocks(text, "scrape")
1041}
1042
1043fn check_domain_policy(
1055 host: &str,
1056 allowed_domains: &[String],
1057 denied_domains: &[String],
1058) -> Result<(), ToolError> {
1059 if denied_domains.iter().any(|p| domain_matches(p, host)) {
1060 return Err(ToolError::Blocked {
1061 command: format!("domain blocked by denylist: {host}"),
1062 });
1063 }
1064 if !allowed_domains.is_empty() {
1065 let is_ip = host.parse::<std::net::IpAddr>().is_ok()
1067 || (host.starts_with('[') && host.ends_with(']'));
1068 if is_ip {
1069 return Err(ToolError::Blocked {
1070 command: format!(
1071 "bare IP address not allowed when domain allowlist is active: {host}"
1072 ),
1073 });
1074 }
1075 if !allowed_domains.iter().any(|p| domain_matches(p, host)) {
1076 return Err(ToolError::Blocked {
1077 command: format!("domain not in allowlist: {host}"),
1078 });
1079 }
1080 }
1081 Ok(())
1082}
1083
1084use crate::domain_match::domain_matches;
1086
1087fn validate_url(raw: &str) -> Result<Url, ToolError> {
1088 let parsed = Url::parse(raw).map_err(|_| ToolError::Blocked {
1089 command: format!("invalid URL: {raw}"),
1090 })?;
1091
1092 if parsed.scheme() != "https" {
1093 return Err(ToolError::Blocked {
1094 command: format!("scheme not allowed: {}", parsed.scheme()),
1095 });
1096 }
1097
1098 if let Some(host) = parsed.host()
1099 && is_private_host(&host)
1100 {
1101 return Err(ToolError::Blocked {
1102 command: format!(
1103 "private/local host blocked: {}",
1104 parsed.host_str().unwrap_or("")
1105 ),
1106 });
1107 }
1108
1109 Ok(parsed)
1110}
1111
1112fn is_private_host(host: &url::Host<&str>) -> bool {
1113 match host {
1114 url::Host::Domain(d) => {
1115 #[allow(clippy::case_sensitive_file_extension_comparisons)]
1118 {
1119 *d == "localhost"
1120 || d.ends_with(".localhost")
1121 || d.ends_with(".internal")
1122 || d.ends_with(".local")
1123 }
1124 }
1125 url::Host::Ipv4(v4) => is_private_ip(IpAddr::V4(*v4)),
1126 url::Host::Ipv6(v6) => is_private_ip(IpAddr::V6(*v6)),
1127 }
1128}
1129
1130#[tracing::instrument(name = "tools.scrape.dns.resolve", skip(url), fields(host = url.host_str().unwrap_or("")))]
1136async fn resolve_and_validate(url: &Url) -> Result<(String, Vec<SocketAddr>), ToolError> {
1137 let Some(host) = url.host_str() else {
1138 return Ok((String::new(), vec![]));
1139 };
1140 let port = url.port_or_known_default().unwrap_or(443);
1141 let addrs: Vec<SocketAddr> = tokio::time::timeout(
1142 Duration::from_secs(10),
1143 tokio::net::lookup_host(format!("{host}:{port}")),
1144 )
1145 .await
1146 .map_err(|_| ToolError::Timeout { timeout_secs: 10 })?
1147 .map_err(|e| ToolError::Blocked {
1148 command: format!("DNS resolution failed: {e}"),
1149 })?
1150 .collect();
1151 for addr in &addrs {
1152 if is_private_ip(addr.ip()) {
1153 return Err(ToolError::Blocked {
1154 command: format!("SSRF protection: private IP {} for host {host}", addr.ip()),
1155 });
1156 }
1157 }
1158 Ok((host.to_owned(), addrs))
1159}
1160
1161fn parse_and_extract(
1162 html: &str,
1163 selector: &str,
1164 extract: &ExtractMode,
1165 limit: usize,
1166) -> Result<String, ToolError> {
1167 let soup = scrape_core::Soup::parse(html);
1168
1169 let tags = soup.find_all(selector).map_err(|e| {
1170 ToolError::Execution(std::io::Error::new(
1171 std::io::ErrorKind::InvalidData,
1172 format!("invalid selector: {e}"),
1173 ))
1174 })?;
1175
1176 let mut results = Vec::new();
1177
1178 for tag in tags.into_iter().take(limit) {
1179 let value = match extract {
1180 ExtractMode::Text => tag.text(),
1181 ExtractMode::Html => tag.inner_html(),
1182 ExtractMode::Attr(name) => tag.get(name).unwrap_or_default().to_owned(),
1183 };
1184 if !value.trim().is_empty() {
1185 results.push(value.trim().to_owned());
1186 }
1187 }
1188
1189 if results.is_empty() {
1190 Ok(format!("No results for selector: {selector}"))
1191 } else {
1192 Ok(results.join("\n"))
1193 }
1194}
1195
1196#[cfg(test)]
1197mod tests {
1198 use super::*;
1199
1200 #[test]
1203 fn extract_single_block() {
1204 let text =
1205 "Here:\n```scrape\n{\"url\":\"https://example.com\",\"select\":\"h1\"}\n```\nDone.";
1206 let blocks = extract_scrape_blocks(text);
1207 assert_eq!(blocks.len(), 1);
1208 assert!(blocks[0].contains("example.com"));
1209 }
1210
1211 #[test]
1212 fn extract_multiple_blocks() {
1213 let text = "```scrape\n{\"url\":\"https://a.com\",\"select\":\"h1\"}\n```\ntext\n```scrape\n{\"url\":\"https://b.com\",\"select\":\"p\"}\n```";
1214 let blocks = extract_scrape_blocks(text);
1215 assert_eq!(blocks.len(), 2);
1216 }
1217
1218 #[test]
1219 fn no_blocks_returns_empty() {
1220 let blocks = extract_scrape_blocks("plain text, no code blocks");
1221 assert!(blocks.is_empty());
1222 }
1223
1224 #[test]
1225 fn unclosed_block_ignored() {
1226 let blocks = extract_scrape_blocks("```scrape\n{\"url\":\"https://x.com\"}");
1227 assert!(blocks.is_empty());
1228 }
1229
1230 #[test]
1231 fn non_scrape_block_ignored() {
1232 let text =
1233 "```bash\necho hi\n```\n```scrape\n{\"url\":\"https://x.com\",\"select\":\"h1\"}\n```";
1234 let blocks = extract_scrape_blocks(text);
1235 assert_eq!(blocks.len(), 1);
1236 assert!(blocks[0].contains("x.com"));
1237 }
1238
1239 #[test]
1240 fn multiline_json_block() {
1241 let text =
1242 "```scrape\n{\n \"url\": \"https://example.com\",\n \"select\": \"h1\"\n}\n```";
1243 let blocks = extract_scrape_blocks(text);
1244 assert_eq!(blocks.len(), 1);
1245 let instr: ScrapeInstruction = serde_json::from_str(blocks[0]).unwrap();
1246 assert_eq!(instr.url, "https://example.com");
1247 }
1248
1249 #[test]
1252 fn parse_valid_instruction() {
1253 let json = r#"{"url":"https://example.com","select":"h1","extract":"text","limit":5}"#;
1254 let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
1255 assert_eq!(instr.url, "https://example.com");
1256 assert_eq!(instr.select, "h1");
1257 assert_eq!(instr.extract, "text");
1258 assert_eq!(instr.limit, Some(5));
1259 }
1260
1261 #[test]
1262 fn parse_minimal_instruction() {
1263 let json = r#"{"url":"https://example.com","select":"p"}"#;
1264 let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
1265 assert_eq!(instr.extract, "text");
1266 assert!(instr.limit.is_none());
1267 }
1268
1269 #[test]
1270 fn parse_attr_extract() {
1271 let json = r#"{"url":"https://example.com","select":"a","extract":"attr:href"}"#;
1272 let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
1273 assert_eq!(instr.extract, "attr:href");
1274 }
1275
1276 #[test]
1277 fn parse_invalid_json_errors() {
1278 let result = serde_json::from_str::<ScrapeInstruction>("not json");
1279 assert!(result.is_err());
1280 }
1281
1282 #[test]
1285 fn extract_mode_text() {
1286 assert!(matches!(ExtractMode::parse("text"), ExtractMode::Text));
1287 }
1288
1289 #[test]
1290 fn extract_mode_html() {
1291 assert!(matches!(ExtractMode::parse("html"), ExtractMode::Html));
1292 }
1293
1294 #[test]
1295 fn extract_mode_attr() {
1296 let mode = ExtractMode::parse("attr:href");
1297 assert!(matches!(mode, ExtractMode::Attr(ref s) if s == "href"));
1298 }
1299
1300 #[test]
1301 fn extract_mode_unknown_defaults_to_text() {
1302 assert!(matches!(ExtractMode::parse("unknown"), ExtractMode::Text));
1303 }
1304
1305 #[test]
1308 fn valid_https_url() {
1309 assert!(validate_url("https://example.com").is_ok());
1310 }
1311
1312 #[test]
1313 fn http_rejected() {
1314 let err = validate_url("http://example.com").unwrap_err();
1315 assert!(matches!(err, ToolError::Blocked { .. }));
1316 }
1317
1318 #[test]
1319 fn ftp_rejected() {
1320 let err = validate_url("ftp://files.example.com").unwrap_err();
1321 assert!(matches!(err, ToolError::Blocked { .. }));
1322 }
1323
1324 #[test]
1325 fn file_rejected() {
1326 let err = validate_url("file:///etc/passwd").unwrap_err();
1327 assert!(matches!(err, ToolError::Blocked { .. }));
1328 }
1329
1330 #[test]
1331 fn invalid_url_rejected() {
1332 let err = validate_url("not a url").unwrap_err();
1333 assert!(matches!(err, ToolError::Blocked { .. }));
1334 }
1335
1336 #[test]
1337 fn localhost_blocked() {
1338 let err = validate_url("https://localhost/path").unwrap_err();
1339 assert!(matches!(err, ToolError::Blocked { .. }));
1340 }
1341
1342 #[test]
1343 fn loopback_ip_blocked() {
1344 let err = validate_url("https://127.0.0.1/path").unwrap_err();
1345 assert!(matches!(err, ToolError::Blocked { .. }));
1346 }
1347
1348 #[test]
1349 fn private_10_blocked() {
1350 let err = validate_url("https://10.0.0.1/api").unwrap_err();
1351 assert!(matches!(err, ToolError::Blocked { .. }));
1352 }
1353
1354 #[test]
1355 fn private_172_blocked() {
1356 let err = validate_url("https://172.16.0.1/api").unwrap_err();
1357 assert!(matches!(err, ToolError::Blocked { .. }));
1358 }
1359
1360 #[test]
1361 fn private_192_blocked() {
1362 let err = validate_url("https://192.168.1.1/api").unwrap_err();
1363 assert!(matches!(err, ToolError::Blocked { .. }));
1364 }
1365
1366 #[test]
1367 fn ipv6_loopback_blocked() {
1368 let err = validate_url("https://[::1]/path").unwrap_err();
1369 assert!(matches!(err, ToolError::Blocked { .. }));
1370 }
1371
1372 #[test]
1373 fn public_ip_allowed() {
1374 assert!(validate_url("https://93.184.216.34/page").is_ok());
1375 }
1376
1377 #[test]
1380 fn extract_text_from_html() {
1381 let html = "<html><body><h1>Hello World</h1><p>Content</p></body></html>";
1382 let result = parse_and_extract(html, "h1", &ExtractMode::Text, 10).unwrap();
1383 assert_eq!(result, "Hello World");
1384 }
1385
1386 #[test]
1387 fn extract_multiple_elements() {
1388 let html = "<ul><li>A</li><li>B</li><li>C</li></ul>";
1389 let result = parse_and_extract(html, "li", &ExtractMode::Text, 10).unwrap();
1390 assert_eq!(result, "A\nB\nC");
1391 }
1392
1393 #[test]
1394 fn extract_with_limit() {
1395 let html = "<ul><li>A</li><li>B</li><li>C</li></ul>";
1396 let result = parse_and_extract(html, "li", &ExtractMode::Text, 2).unwrap();
1397 assert_eq!(result, "A\nB");
1398 }
1399
1400 #[test]
1401 fn extract_attr_href() {
1402 let html = r#"<a href="https://example.com">Link</a>"#;
1403 let result =
1404 parse_and_extract(html, "a", &ExtractMode::Attr("href".to_owned()), 10).unwrap();
1405 assert_eq!(result, "https://example.com");
1406 }
1407
1408 #[test]
1409 fn extract_inner_html() {
1410 let html = "<div><span>inner</span></div>";
1411 let result = parse_and_extract(html, "div", &ExtractMode::Html, 10).unwrap();
1412 assert!(result.contains("<span>inner</span>"));
1413 }
1414
1415 #[test]
1416 fn no_matches_returns_message() {
1417 let html = "<html><body><p>text</p></body></html>";
1418 let result = parse_and_extract(html, "h1", &ExtractMode::Text, 10).unwrap();
1419 assert!(result.starts_with("No results for selector:"));
1420 }
1421
1422 #[test]
1423 fn empty_text_skipped() {
1424 let html = "<ul><li> </li><li>A</li></ul>";
1425 let result = parse_and_extract(html, "li", &ExtractMode::Text, 10).unwrap();
1426 assert_eq!(result, "A");
1427 }
1428
1429 #[test]
1430 fn invalid_selector_errors() {
1431 let html = "<html><body></body></html>";
1432 let result = parse_and_extract(html, "[[[invalid", &ExtractMode::Text, 10);
1433 assert!(result.is_err());
1434 }
1435
1436 #[test]
1437 fn empty_html_returns_no_results() {
1438 let result = parse_and_extract("", "h1", &ExtractMode::Text, 10).unwrap();
1439 assert!(result.starts_with("No results for selector:"));
1440 }
1441
1442 #[test]
1443 fn nested_selector() {
1444 let html = "<div><span>inner</span></div><span>outer</span>";
1445 let result = parse_and_extract(html, "div > span", &ExtractMode::Text, 10).unwrap();
1446 assert_eq!(result, "inner");
1447 }
1448
1449 #[test]
1450 fn attr_missing_returns_empty() {
1451 let html = r"<a>No href</a>";
1452 let result =
1453 parse_and_extract(html, "a", &ExtractMode::Attr("href".to_owned()), 10).unwrap();
1454 assert!(result.starts_with("No results for selector:"));
1455 }
1456
1457 #[test]
1458 fn extract_html_mode() {
1459 let html = "<div><b>bold</b> text</div>";
1460 let result = parse_and_extract(html, "div", &ExtractMode::Html, 10).unwrap();
1461 assert!(result.contains("<b>bold</b>"));
1462 }
1463
1464 #[test]
1465 fn limit_zero_returns_no_results() {
1466 let html = "<ul><li>A</li><li>B</li></ul>";
1467 let result = parse_and_extract(html, "li", &ExtractMode::Text, 0).unwrap();
1468 assert!(result.starts_with("No results for selector:"));
1469 }
1470
1471 #[test]
1474 fn url_with_port_allowed() {
1475 assert!(validate_url("https://example.com:8443/path").is_ok());
1476 }
1477
1478 #[test]
1479 fn link_local_ip_blocked() {
1480 let err = validate_url("https://169.254.1.1/path").unwrap_err();
1481 assert!(matches!(err, ToolError::Blocked { .. }));
1482 }
1483
1484 #[test]
1485 fn url_no_scheme_rejected() {
1486 let err = validate_url("example.com/path").unwrap_err();
1487 assert!(matches!(err, ToolError::Blocked { .. }));
1488 }
1489
1490 #[test]
1491 fn unspecified_ipv4_blocked() {
1492 let err = validate_url("https://0.0.0.0/path").unwrap_err();
1493 assert!(matches!(err, ToolError::Blocked { .. }));
1494 }
1495
1496 #[test]
1497 fn broadcast_ipv4_blocked() {
1498 let err = validate_url("https://255.255.255.255/path").unwrap_err();
1499 assert!(matches!(err, ToolError::Blocked { .. }));
1500 }
1501
1502 #[test]
1503 fn ipv6_link_local_blocked() {
1504 let err = validate_url("https://[fe80::1]/path").unwrap_err();
1505 assert!(matches!(err, ToolError::Blocked { .. }));
1506 }
1507
1508 #[test]
1509 fn ipv6_unique_local_blocked() {
1510 let err = validate_url("https://[fd12::1]/path").unwrap_err();
1511 assert!(matches!(err, ToolError::Blocked { .. }));
1512 }
1513
1514 #[test]
1515 fn ipv4_mapped_ipv6_loopback_blocked() {
1516 let err = validate_url("https://[::ffff:127.0.0.1]/path").unwrap_err();
1517 assert!(matches!(err, ToolError::Blocked { .. }));
1518 }
1519
1520 #[test]
1521 fn ipv4_mapped_ipv6_private_blocked() {
1522 let err = validate_url("https://[::ffff:10.0.0.1]/path").unwrap_err();
1523 assert!(matches!(err, ToolError::Blocked { .. }));
1524 }
1525
1526 #[tokio::test]
1529 async fn executor_no_blocks_returns_none() {
1530 let config = ScrapeConfig::default();
1531 let executor = WebScrapeExecutor::new(&config);
1532 let result = executor.execute("plain text").await;
1533 assert!(result.unwrap().is_none());
1534 }
1535
1536 #[tokio::test]
1537 async fn executor_invalid_json_errors() {
1538 let config = ScrapeConfig::default();
1539 let executor = WebScrapeExecutor::new(&config);
1540 let response = "```scrape\nnot json\n```";
1541 let result = executor.execute(response).await;
1542 assert!(matches!(result, Err(ToolError::Execution(_))));
1543 }
1544
1545 #[tokio::test]
1546 async fn executor_blocked_url_errors() {
1547 let config = ScrapeConfig::default();
1548 let executor = WebScrapeExecutor::new(&config);
1549 let response = "```scrape\n{\"url\":\"http://example.com\",\"select\":\"h1\"}\n```";
1550 let result = executor.execute(response).await;
1551 assert!(matches!(result, Err(ToolError::Blocked { .. })));
1552 }
1553
1554 #[tokio::test]
1555 async fn executor_private_ip_blocked() {
1556 let config = ScrapeConfig::default();
1557 let executor = WebScrapeExecutor::new(&config);
1558 let response = "```scrape\n{\"url\":\"https://192.168.1.1/api\",\"select\":\"h1\"}\n```";
1559 let result = executor.execute(response).await;
1560 assert!(matches!(result, Err(ToolError::Blocked { .. })));
1561 }
1562
1563 #[tokio::test]
1564 async fn executor_unreachable_host_returns_error() {
1565 let config = ScrapeConfig {
1566 timeout: 1,
1567 max_body_bytes: 1_048_576,
1568 ..Default::default()
1569 };
1570 let executor = WebScrapeExecutor::new(&config);
1571 let response = "```scrape\n{\"url\":\"https://192.0.2.1:1/page\",\"select\":\"h1\"}\n```";
1572 let result = executor.execute(response).await;
1573 assert!(matches!(result, Err(ToolError::Execution(_))));
1574 }
1575
1576 #[tokio::test]
1577 async fn executor_localhost_url_blocked() {
1578 let config = ScrapeConfig::default();
1579 let executor = WebScrapeExecutor::new(&config);
1580 let response = "```scrape\n{\"url\":\"https://localhost:9999/api\",\"select\":\"h1\"}\n```";
1581 let result = executor.execute(response).await;
1582 assert!(matches!(result, Err(ToolError::Blocked { .. })));
1583 }
1584
1585 #[tokio::test]
1586 async fn executor_empty_text_returns_none() {
1587 let config = ScrapeConfig::default();
1588 let executor = WebScrapeExecutor::new(&config);
1589 let result = executor.execute("").await;
1590 assert!(result.unwrap().is_none());
1591 }
1592
1593 #[tokio::test]
1594 async fn executor_multiple_blocks_first_blocked() {
1595 let config = ScrapeConfig::default();
1596 let executor = WebScrapeExecutor::new(&config);
1597 let response = "```scrape\n{\"url\":\"http://evil.com\",\"select\":\"h1\"}\n```\n\
1598 ```scrape\n{\"url\":\"https://ok.com\",\"select\":\"h1\"}\n```";
1599 let result = executor.execute(response).await;
1600 assert!(result.is_err());
1601 }
1602
1603 #[test]
1604 fn validate_url_empty_string() {
1605 let err = validate_url("").unwrap_err();
1606 assert!(matches!(err, ToolError::Blocked { .. }));
1607 }
1608
1609 #[test]
1610 fn validate_url_javascript_scheme_blocked() {
1611 let err = validate_url("javascript:alert(1)").unwrap_err();
1612 assert!(matches!(err, ToolError::Blocked { .. }));
1613 }
1614
1615 #[test]
1616 fn validate_url_data_scheme_blocked() {
1617 let err = validate_url("data:text/html,<h1>hi</h1>").unwrap_err();
1618 assert!(matches!(err, ToolError::Blocked { .. }));
1619 }
1620
1621 #[test]
1622 fn is_private_host_public_domain_is_false() {
1623 let host: url::Host<&str> = url::Host::Domain("example.com");
1624 assert!(!is_private_host(&host));
1625 }
1626
1627 #[test]
1628 fn is_private_host_localhost_is_true() {
1629 let host: url::Host<&str> = url::Host::Domain("localhost");
1630 assert!(is_private_host(&host));
1631 }
1632
1633 #[test]
1634 fn is_private_host_ipv6_unspecified_is_true() {
1635 let host = url::Host::Ipv6(std::net::Ipv6Addr::UNSPECIFIED);
1636 assert!(is_private_host(&host));
1637 }
1638
1639 #[test]
1640 fn is_private_host_public_ipv6_is_false() {
1641 let host = url::Host::Ipv6("2001:db8::1".parse().unwrap());
1642 assert!(!is_private_host(&host));
1643 }
1644
1645 async fn mock_server_executor() -> (WebScrapeExecutor, wiremock::MockServer) {
1656 let server = wiremock::MockServer::start().await;
1657 let executor = WebScrapeExecutor {
1658 timeout: Duration::from_secs(5),
1659 max_body_bytes: 1_048_576,
1660 allowed_domains: vec![],
1661 denied_domains: vec![],
1662 audit_logger: None,
1663 egress_config: EgressConfig::default(),
1664 egress_tx: None,
1665 egress_dropped: Arc::new(AtomicU64::new(0)),
1666 ipi_filter: IpiFilter::new(0.6),
1667 };
1668 (executor, server)
1669 }
1670
1671 fn server_host_and_addr(server: &wiremock::MockServer) -> (String, Vec<std::net::SocketAddr>) {
1673 let uri = server.uri();
1674 let url = Url::parse(&uri).unwrap();
1675 let host = url.host_str().unwrap_or("127.0.0.1").to_owned();
1676 let port = url.port().unwrap_or(80);
1677 let addr: std::net::SocketAddr = format!("{host}:{port}").parse().unwrap();
1678 (host, vec![addr])
1679 }
1680
1681 async fn follow_redirects_raw(
1685 executor: &WebScrapeExecutor,
1686 start_url: &str,
1687 host: &str,
1688 addrs: &[std::net::SocketAddr],
1689 ) -> Result<String, ToolError> {
1690 const MAX_REDIRECTS: usize = 3;
1691 let mut current_url = start_url.to_owned();
1692 let mut current_host = host.to_owned();
1693 let mut current_addrs = addrs.to_vec();
1694
1695 for hop in 0..=MAX_REDIRECTS {
1696 let client = executor.build_client(¤t_host, ¤t_addrs);
1697 let resp = client
1698 .get(¤t_url)
1699 .send()
1700 .await
1701 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
1702
1703 let status = resp.status();
1704
1705 if status.is_redirection() {
1706 if hop == MAX_REDIRECTS {
1707 return Err(ToolError::Execution(std::io::Error::other(
1708 "too many redirects",
1709 )));
1710 }
1711
1712 let location = resp
1713 .headers()
1714 .get(reqwest::header::LOCATION)
1715 .and_then(|v| v.to_str().ok())
1716 .ok_or_else(|| {
1717 ToolError::Execution(std::io::Error::other("redirect with no Location"))
1718 })?;
1719
1720 let base = Url::parse(¤t_url)
1721 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
1722 let next_url = base
1723 .join(location)
1724 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
1725
1726 current_url = next_url.to_string();
1728 let _ = &mut current_host;
1730 let _ = &mut current_addrs;
1731 continue;
1732 }
1733
1734 if !status.is_success() {
1735 return Err(ToolError::Execution(std::io::Error::other(format!(
1736 "HTTP {status}",
1737 ))));
1738 }
1739
1740 let bytes = resp
1741 .bytes()
1742 .await
1743 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
1744
1745 if bytes.len() > executor.max_body_bytes {
1746 return Err(ToolError::Execution(std::io::Error::other(format!(
1747 "response too large: {} bytes (max: {})",
1748 bytes.len(),
1749 executor.max_body_bytes,
1750 ))));
1751 }
1752
1753 return String::from_utf8(bytes.to_vec())
1754 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())));
1755 }
1756
1757 Err(ToolError::Execution(std::io::Error::other(
1758 "too many redirects",
1759 )))
1760 }
1761
1762 #[tokio::test]
1763 async fn fetch_html_success_returns_body() {
1764 use wiremock::matchers::{method, path};
1765 use wiremock::{Mock, ResponseTemplate};
1766
1767 let (executor, server) = mock_server_executor().await;
1768 Mock::given(method("GET"))
1769 .and(path("/page"))
1770 .respond_with(ResponseTemplate::new(200).set_body_string("<h1>OK</h1>"))
1771 .mount(&server)
1772 .await;
1773
1774 let (host, addrs) = server_host_and_addr(&server);
1775 let url = format!("{}/page", server.uri());
1776 let result = executor
1777 .fetch_html(&url, &host, &addrs, "fetch", "test-cid", None, None)
1778 .await;
1779 assert!(result.is_ok(), "expected Ok, got: {result:?}");
1780 assert_eq!(result.unwrap(), "<h1>OK</h1>");
1781 }
1782
1783 #[tokio::test]
1784 async fn fetch_html_non_2xx_returns_error() {
1785 use wiremock::matchers::{method, path};
1786 use wiremock::{Mock, ResponseTemplate};
1787
1788 let (executor, server) = mock_server_executor().await;
1789 Mock::given(method("GET"))
1790 .and(path("/forbidden"))
1791 .respond_with(ResponseTemplate::new(403))
1792 .mount(&server)
1793 .await;
1794
1795 let (host, addrs) = server_host_and_addr(&server);
1796 let url = format!("{}/forbidden", server.uri());
1797 let result = executor
1798 .fetch_html(&url, &host, &addrs, "fetch", "test-cid", None, None)
1799 .await;
1800 assert!(result.is_err());
1801 let msg = result.unwrap_err().to_string();
1802 assert!(msg.contains("403"), "expected 403 in error: {msg}");
1803 }
1804
1805 #[tokio::test]
1806 async fn fetch_html_404_returns_error() {
1807 use wiremock::matchers::{method, path};
1808 use wiremock::{Mock, ResponseTemplate};
1809
1810 let (executor, server) = mock_server_executor().await;
1811 Mock::given(method("GET"))
1812 .and(path("/missing"))
1813 .respond_with(ResponseTemplate::new(404))
1814 .mount(&server)
1815 .await;
1816
1817 let (host, addrs) = server_host_and_addr(&server);
1818 let url = format!("{}/missing", server.uri());
1819 let result = executor
1820 .fetch_html(&url, &host, &addrs, "fetch", "test-cid", None, None)
1821 .await;
1822 assert!(result.is_err());
1823 let msg = result.unwrap_err().to_string();
1824 assert!(msg.contains("404"), "expected 404 in error: {msg}");
1825 }
1826
1827 #[tokio::test]
1828 async fn fetch_html_redirect_no_location_returns_error() {
1829 use wiremock::matchers::{method, path};
1830 use wiremock::{Mock, ResponseTemplate};
1831
1832 let (executor, server) = mock_server_executor().await;
1833 Mock::given(method("GET"))
1835 .and(path("/redirect-no-loc"))
1836 .respond_with(ResponseTemplate::new(302))
1837 .mount(&server)
1838 .await;
1839
1840 let (host, addrs) = server_host_and_addr(&server);
1841 let url = format!("{}/redirect-no-loc", server.uri());
1842 let result = executor
1843 .fetch_html(&url, &host, &addrs, "fetch", "test-cid", None, None)
1844 .await;
1845 assert!(result.is_err());
1846 let msg = result.unwrap_err().to_string();
1847 assert!(
1848 msg.contains("Location") || msg.contains("location"),
1849 "expected Location-related error: {msg}"
1850 );
1851 }
1852
1853 #[tokio::test]
1854 async fn fetch_html_single_redirect_followed() {
1855 use wiremock::matchers::{method, path};
1856 use wiremock::{Mock, ResponseTemplate};
1857
1858 let (executor, server) = mock_server_executor().await;
1859 let final_url = format!("{}/final", server.uri());
1860
1861 Mock::given(method("GET"))
1862 .and(path("/start"))
1863 .respond_with(ResponseTemplate::new(302).insert_header("location", final_url.as_str()))
1864 .mount(&server)
1865 .await;
1866
1867 Mock::given(method("GET"))
1868 .and(path("/final"))
1869 .respond_with(ResponseTemplate::new(200).set_body_string("<p>final</p>"))
1870 .mount(&server)
1871 .await;
1872
1873 let (host, addrs) = server_host_and_addr(&server);
1874 let url = format!("{}/start", server.uri());
1875 let result = follow_redirects_raw(&executor, &url, &host, &addrs).await;
1876 assert!(result.is_ok(), "single redirect should succeed: {result:?}");
1877 assert_eq!(result.unwrap(), "<p>final</p>");
1878 }
1879
1880 #[tokio::test]
1881 async fn fetch_html_three_redirects_allowed() {
1882 use wiremock::matchers::{method, path};
1883 use wiremock::{Mock, ResponseTemplate};
1884
1885 let (executor, server) = mock_server_executor().await;
1886 let hop2 = format!("{}/hop2", server.uri());
1887 let hop3 = format!("{}/hop3", server.uri());
1888 let final_dest = format!("{}/done", server.uri());
1889
1890 Mock::given(method("GET"))
1891 .and(path("/hop1"))
1892 .respond_with(ResponseTemplate::new(301).insert_header("location", hop2.as_str()))
1893 .mount(&server)
1894 .await;
1895 Mock::given(method("GET"))
1896 .and(path("/hop2"))
1897 .respond_with(ResponseTemplate::new(301).insert_header("location", hop3.as_str()))
1898 .mount(&server)
1899 .await;
1900 Mock::given(method("GET"))
1901 .and(path("/hop3"))
1902 .respond_with(ResponseTemplate::new(301).insert_header("location", final_dest.as_str()))
1903 .mount(&server)
1904 .await;
1905 Mock::given(method("GET"))
1906 .and(path("/done"))
1907 .respond_with(ResponseTemplate::new(200).set_body_string("<p>done</p>"))
1908 .mount(&server)
1909 .await;
1910
1911 let (host, addrs) = server_host_and_addr(&server);
1912 let url = format!("{}/hop1", server.uri());
1913 let result = follow_redirects_raw(&executor, &url, &host, &addrs).await;
1914 assert!(result.is_ok(), "3 redirects should succeed: {result:?}");
1915 assert_eq!(result.unwrap(), "<p>done</p>");
1916 }
1917
1918 #[tokio::test]
1919 async fn fetch_html_four_redirects_rejected() {
1920 use wiremock::matchers::{method, path};
1921 use wiremock::{Mock, ResponseTemplate};
1922
1923 let (executor, server) = mock_server_executor().await;
1924 let hop2 = format!("{}/r2", server.uri());
1925 let hop3 = format!("{}/r3", server.uri());
1926 let hop4 = format!("{}/r4", server.uri());
1927 let hop5 = format!("{}/r5", server.uri());
1928
1929 for (from, to) in [
1930 ("/r1", &hop2),
1931 ("/r2", &hop3),
1932 ("/r3", &hop4),
1933 ("/r4", &hop5),
1934 ] {
1935 Mock::given(method("GET"))
1936 .and(path(from))
1937 .respond_with(ResponseTemplate::new(301).insert_header("location", to.as_str()))
1938 .mount(&server)
1939 .await;
1940 }
1941
1942 let (host, addrs) = server_host_and_addr(&server);
1943 let url = format!("{}/r1", server.uri());
1944 let result = follow_redirects_raw(&executor, &url, &host, &addrs).await;
1945 assert!(result.is_err(), "4 redirects should be rejected");
1946 let msg = result.unwrap_err().to_string();
1947 assert!(
1948 msg.contains("redirect"),
1949 "expected redirect-related error: {msg}"
1950 );
1951 }
1952
1953 #[tokio::test]
1954 async fn fetch_html_body_too_large_returns_error() {
1955 use wiremock::matchers::{method, path};
1956 use wiremock::{Mock, ResponseTemplate};
1957
1958 let small_limit_executor = WebScrapeExecutor {
1959 timeout: Duration::from_secs(5),
1960 max_body_bytes: 10,
1961 allowed_domains: vec![],
1962 denied_domains: vec![],
1963 audit_logger: None,
1964 egress_config: EgressConfig::default(),
1965 egress_tx: None,
1966 egress_dropped: Arc::new(AtomicU64::new(0)),
1967 ipi_filter: IpiFilter::new(0.6),
1968 };
1969 let server = wiremock::MockServer::start().await;
1970 Mock::given(method("GET"))
1971 .and(path("/big"))
1972 .respond_with(
1973 ResponseTemplate::new(200)
1974 .set_body_string("this body is definitely longer than ten bytes"),
1975 )
1976 .mount(&server)
1977 .await;
1978
1979 let (host, addrs) = server_host_and_addr(&server);
1980 let url = format!("{}/big", server.uri());
1981 let result = small_limit_executor
1982 .fetch_html(&url, &host, &addrs, "fetch", "test-cid", None, None)
1983 .await;
1984 assert!(result.is_err());
1985 let msg = result.unwrap_err().to_string();
1986 assert!(msg.contains("too large"), "expected too-large error: {msg}");
1987 }
1988
1989 #[test]
1990 fn extract_scrape_blocks_empty_block_content() {
1991 let text = "```scrape\n\n```";
1992 let blocks = extract_scrape_blocks(text);
1993 assert_eq!(blocks.len(), 1);
1994 assert!(blocks[0].is_empty());
1995 }
1996
1997 #[test]
1998 fn extract_scrape_blocks_whitespace_only() {
1999 let text = "```scrape\n \n```";
2000 let blocks = extract_scrape_blocks(text);
2001 assert_eq!(blocks.len(), 1);
2002 }
2003
2004 #[test]
2005 fn parse_and_extract_multiple_selectors() {
2006 let html = "<div><h1>Title</h1><p>Para</p></div>";
2007 let result = parse_and_extract(html, "h1, p", &ExtractMode::Text, 10).unwrap();
2008 assert!(result.contains("Title"));
2009 assert!(result.contains("Para"));
2010 }
2011
2012 #[test]
2013 fn webscrape_executor_new_with_custom_config() {
2014 let config = ScrapeConfig {
2015 timeout: 60,
2016 max_body_bytes: 512,
2017 ..Default::default()
2018 };
2019 let executor = WebScrapeExecutor::new(&config);
2020 assert_eq!(executor.max_body_bytes, 512);
2021 }
2022
2023 #[test]
2024 fn webscrape_executor_debug() {
2025 let config = ScrapeConfig::default();
2026 let executor = WebScrapeExecutor::new(&config);
2027 let dbg = format!("{executor:?}");
2028 assert!(dbg.contains("WebScrapeExecutor"));
2029 }
2030
2031 #[test]
2032 fn extract_mode_attr_empty_name() {
2033 let mode = ExtractMode::parse("attr:");
2034 assert!(matches!(mode, ExtractMode::Attr(ref s) if s.is_empty()));
2035 }
2036
2037 #[test]
2038 fn default_extract_returns_text() {
2039 assert_eq!(default_extract(), "text");
2040 }
2041
2042 #[test]
2043 fn scrape_instruction_debug() {
2044 let json = r#"{"url":"https://example.com","select":"h1"}"#;
2045 let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
2046 let dbg = format!("{instr:?}");
2047 assert!(dbg.contains("ScrapeInstruction"));
2048 }
2049
2050 #[test]
2051 fn extract_mode_debug() {
2052 let mode = ExtractMode::Text;
2053 let dbg = format!("{mode:?}");
2054 assert!(dbg.contains("Text"));
2055 }
2056
2057 #[test]
2062 fn max_redirects_constant_is_three() {
2063 const MAX_REDIRECTS: usize = 3;
2067 assert_eq!(MAX_REDIRECTS, 3, "fetch_html allows exactly 3 redirects");
2068 }
2069
2070 #[test]
2073 fn redirect_no_location_error_message() {
2074 let err = std::io::Error::other("redirect with no Location");
2075 assert!(err.to_string().contains("redirect with no Location"));
2076 }
2077
2078 #[test]
2080 fn too_many_redirects_error_message() {
2081 let err = std::io::Error::other("too many redirects");
2082 assert!(err.to_string().contains("too many redirects"));
2083 }
2084
2085 #[test]
2087 fn non_2xx_status_error_format() {
2088 let status = reqwest::StatusCode::FORBIDDEN;
2089 let msg = format!("HTTP {status}");
2090 assert!(msg.contains("403"));
2091 }
2092
2093 #[test]
2095 fn not_found_status_error_format() {
2096 let status = reqwest::StatusCode::NOT_FOUND;
2097 let msg = format!("HTTP {status}");
2098 assert!(msg.contains("404"));
2099 }
2100
2101 #[test]
2103 fn relative_redirect_same_host_path() {
2104 let base = Url::parse("https://example.com/current").unwrap();
2105 let resolved = base.join("/other").unwrap();
2106 assert_eq!(resolved.as_str(), "https://example.com/other");
2107 }
2108
2109 #[test]
2111 fn relative_redirect_relative_path() {
2112 let base = Url::parse("https://example.com/a/b").unwrap();
2113 let resolved = base.join("c").unwrap();
2114 assert_eq!(resolved.as_str(), "https://example.com/a/c");
2115 }
2116
2117 #[test]
2119 fn absolute_redirect_overrides_base() {
2120 let base = Url::parse("https://example.com/page").unwrap();
2121 let resolved = base.join("https://other.com/target").unwrap();
2122 assert_eq!(resolved.as_str(), "https://other.com/target");
2123 }
2124
2125 #[test]
2127 fn redirect_http_downgrade_rejected() {
2128 let location = "http://example.com/page";
2129 let base = Url::parse("https://example.com/start").unwrap();
2130 let next = base.join(location).unwrap();
2131 let err = validate_url(next.as_str()).unwrap_err();
2132 assert!(matches!(err, ToolError::Blocked { .. }));
2133 }
2134
2135 #[test]
2137 fn redirect_location_private_ip_blocked() {
2138 let location = "https://192.168.100.1/admin";
2139 let base = Url::parse("https://example.com/start").unwrap();
2140 let next = base.join(location).unwrap();
2141 let err = validate_url(next.as_str()).unwrap_err();
2142 assert!(matches!(err, ToolError::Blocked { .. }));
2143 let ToolError::Blocked { command: cmd } = err else {
2144 panic!("expected Blocked");
2145 };
2146 assert!(
2147 cmd.contains("private") || cmd.contains("scheme"),
2148 "error message should describe the block reason: {cmd}"
2149 );
2150 }
2151
2152 #[test]
2154 fn redirect_location_internal_domain_blocked() {
2155 let location = "https://metadata.internal/latest/meta-data/";
2156 let base = Url::parse("https://example.com/start").unwrap();
2157 let next = base.join(location).unwrap();
2158 let err = validate_url(next.as_str()).unwrap_err();
2159 assert!(matches!(err, ToolError::Blocked { .. }));
2160 }
2161
2162 #[test]
2164 fn redirect_chain_three_hops_all_public() {
2165 let hops = [
2166 "https://redirect1.example.com/hop1",
2167 "https://redirect2.example.com/hop2",
2168 "https://destination.example.com/final",
2169 ];
2170 for hop in hops {
2171 assert!(validate_url(hop).is_ok(), "expected ok for {hop}");
2172 }
2173 }
2174
2175 #[test]
2180 fn redirect_to_private_ip_rejected_by_validate_url() {
2181 let private_targets = [
2183 "https://127.0.0.1/secret",
2184 "https://10.0.0.1/internal",
2185 "https://192.168.1.1/admin",
2186 "https://172.16.0.1/data",
2187 "https://[::1]/path",
2188 "https://[fe80::1]/path",
2189 "https://localhost/path",
2190 "https://service.internal/api",
2191 ];
2192 for target in private_targets {
2193 let result = validate_url(target);
2194 assert!(result.is_err(), "expected error for {target}");
2195 assert!(
2196 matches!(result.unwrap_err(), ToolError::Blocked { .. }),
2197 "expected Blocked for {target}"
2198 );
2199 }
2200 }
2201
2202 #[test]
2204 fn redirect_relative_url_resolves_correctly() {
2205 let base = Url::parse("https://example.com/page").unwrap();
2206 let relative = "/other";
2207 let resolved = base.join(relative).unwrap();
2208 assert_eq!(resolved.as_str(), "https://example.com/other");
2209 }
2210
2211 #[test]
2213 fn redirect_to_http_rejected() {
2214 let err = validate_url("http://example.com/page").unwrap_err();
2215 assert!(matches!(err, ToolError::Blocked { .. }));
2216 }
2217
2218 #[test]
2219 fn ipv4_mapped_ipv6_link_local_blocked() {
2220 let err = validate_url("https://[::ffff:169.254.0.1]/path").unwrap_err();
2221 assert!(matches!(err, ToolError::Blocked { .. }));
2222 }
2223
2224 #[test]
2225 fn ipv4_mapped_ipv6_public_allowed() {
2226 assert!(validate_url("https://[::ffff:93.184.216.34]/path").is_ok());
2227 }
2228
2229 #[tokio::test]
2232 async fn fetch_http_scheme_blocked() {
2233 let config = ScrapeConfig::default();
2234 let executor = WebScrapeExecutor::new(&config);
2235 let call = crate::executor::ToolCall {
2236 tool_id: ToolName::new("fetch"),
2237 params: {
2238 let mut m = serde_json::Map::new();
2239 m.insert("url".to_owned(), serde_json::json!("http://example.com"));
2240 m
2241 },
2242 caller_id: None,
2243 context: None,
2244
2245 tool_call_id: String::new(),
2246 skill_name: None,
2247 };
2248 let result = executor.execute_tool_call(&call).await;
2249 assert!(matches!(result, Err(ToolError::Blocked { .. })));
2250 }
2251
2252 #[tokio::test]
2253 async fn fetch_private_ip_blocked() {
2254 let config = ScrapeConfig::default();
2255 let executor = WebScrapeExecutor::new(&config);
2256 let call = crate::executor::ToolCall {
2257 tool_id: ToolName::new("fetch"),
2258 params: {
2259 let mut m = serde_json::Map::new();
2260 m.insert(
2261 "url".to_owned(),
2262 serde_json::json!("https://192.168.1.1/secret"),
2263 );
2264 m
2265 },
2266 caller_id: None,
2267 context: None,
2268
2269 tool_call_id: String::new(),
2270 skill_name: None,
2271 };
2272 let result = executor.execute_tool_call(&call).await;
2273 assert!(matches!(result, Err(ToolError::Blocked { .. })));
2274 }
2275
2276 #[tokio::test]
2277 async fn fetch_localhost_blocked() {
2278 let config = ScrapeConfig::default();
2279 let executor = WebScrapeExecutor::new(&config);
2280 let call = crate::executor::ToolCall {
2281 tool_id: ToolName::new("fetch"),
2282 params: {
2283 let mut m = serde_json::Map::new();
2284 m.insert(
2285 "url".to_owned(),
2286 serde_json::json!("https://localhost/page"),
2287 );
2288 m
2289 },
2290 caller_id: None,
2291 context: None,
2292
2293 tool_call_id: String::new(),
2294 skill_name: None,
2295 };
2296 let result = executor.execute_tool_call(&call).await;
2297 assert!(matches!(result, Err(ToolError::Blocked { .. })));
2298 }
2299
2300 #[tokio::test]
2301 async fn fetch_unknown_tool_returns_none() {
2302 let config = ScrapeConfig::default();
2303 let executor = WebScrapeExecutor::new(&config);
2304 let call = crate::executor::ToolCall {
2305 tool_id: ToolName::new("unknown_tool"),
2306 params: serde_json::Map::new(),
2307 caller_id: None,
2308 context: None,
2309
2310 tool_call_id: String::new(),
2311 skill_name: None,
2312 };
2313 let result = executor.execute_tool_call(&call).await;
2314 assert!(result.unwrap().is_none());
2315 }
2316
2317 #[tokio::test]
2318 async fn fetch_returns_body_via_mock() {
2319 use wiremock::matchers::{method, path};
2320 use wiremock::{Mock, ResponseTemplate};
2321
2322 let (executor, server) = mock_server_executor().await;
2323 Mock::given(method("GET"))
2324 .and(path("/content"))
2325 .respond_with(ResponseTemplate::new(200).set_body_string("plain text content"))
2326 .mount(&server)
2327 .await;
2328
2329 let (host, addrs) = server_host_and_addr(&server);
2330 let url = format!("{}/content", server.uri());
2331 let result = executor
2332 .fetch_html(&url, &host, &addrs, "fetch", "test-cid", None, None)
2333 .await;
2334 assert!(result.is_ok());
2335 assert_eq!(result.unwrap(), "plain text content");
2336 }
2337
/// The executor advertises exactly two tools: `web_scrape` (fenced `scrape` block)
/// followed by `fetch` (structured tool call).
#[test]
fn tool_definitions_returns_web_scrape_and_fetch() {
    use crate::registry::InvocationHint;

    let config = ScrapeConfig::default();
    let defs = WebScrapeExecutor::new(&config).tool_definitions();
    assert_eq!(defs.len(), 2);

    let scrape = &defs[0];
    assert_eq!(scrape.id, "web_scrape");
    assert_eq!(scrape.invocation, InvocationHint::FencedBlock("scrape"));

    let fetch = &defs[1];
    assert_eq!(fetch.id, "fetch");
    assert_eq!(fetch.invocation, InvocationHint::ToolCall);
}
2355
/// The schema of the first tool definition lists all four parameters and marks
/// `url` and `select` as required while leaving `extract` optional.
#[test]
fn tool_definitions_schema_has_all_params() {
    let config = ScrapeConfig::default();
    let executor = WebScrapeExecutor::new(&config);
    let defs = executor.tool_definitions();
    let schema = defs[0].schema.as_object().unwrap();

    let properties = schema["properties"].as_object().unwrap();
    let has_property = |name: &str| properties.contains_key(name);
    assert!(has_property("url"));
    assert!(has_property("select"));
    assert!(has_property("extract"));
    assert!(has_property("limit"));

    let required = schema["required"].as_array().unwrap();
    let is_required = |name: &str| required.iter().any(|entry| entry.as_str() == Some(name));
    assert!(is_required("url"));
    assert!(is_required("select"));
    assert!(!is_required("extract"));
}
2372
/// Any name under `.localhost` is classified as a private host.
#[test]
fn subdomain_localhost_blocked() {
    let is_private = is_private_host(&url::Host::<&str>::Domain("foo.localhost"));
    assert!(is_private);
}
2380
/// Names ending in `.internal` are classified as private hosts.
#[test]
fn internal_tld_blocked() {
    let is_private = is_private_host(&url::Host::<&str>::Domain("service.internal"));
    assert!(is_private);
}
2386
/// Names ending in `.local` are classified as private hosts.
#[test]
fn local_tld_blocked() {
    let is_private = is_private_host(&url::Host::<&str>::Domain("printer.local"));
    assert!(is_private);
}
2392
/// An ordinary public domain is not classified as a private host.
#[test]
fn public_domain_not_blocked() {
    let is_private = is_private_host(&url::Host::<&str>::Domain("example.com"));
    assert!(!is_private);
}
2398
2399 #[tokio::test]
2402 async fn resolve_loopback_rejected() {
2403 let url = url::Url::parse("https://127.0.0.1/path").unwrap();
2405 let result = resolve_and_validate(&url).await;
2407 assert!(
2408 result.is_err(),
2409 "loopback IP must be rejected by resolve_and_validate"
2410 );
2411 let err = result.unwrap_err();
2412 assert!(matches!(err, crate::executor::ToolError::Blocked { .. }));
2413 }
2414
2415 #[tokio::test]
2416 async fn resolve_private_10_rejected() {
2417 let url = url::Url::parse("https://10.0.0.1/path").unwrap();
2418 let result = resolve_and_validate(&url).await;
2419 assert!(result.is_err());
2420 assert!(matches!(
2421 result.unwrap_err(),
2422 crate::executor::ToolError::Blocked { .. }
2423 ));
2424 }
2425
2426 #[tokio::test]
2427 async fn resolve_private_192_rejected() {
2428 let url = url::Url::parse("https://192.168.1.1/path").unwrap();
2429 let result = resolve_and_validate(&url).await;
2430 assert!(result.is_err());
2431 assert!(matches!(
2432 result.unwrap_err(),
2433 crate::executor::ToolError::Blocked { .. }
2434 ));
2435 }
2436
2437 #[tokio::test]
2438 async fn resolve_ipv6_loopback_rejected() {
2439 let url = url::Url::parse("https://[::1]/path").unwrap();
2440 let result = resolve_and_validate(&url).await;
2441 assert!(result.is_err());
2442 assert!(matches!(
2443 result.unwrap_err(),
2444 crate::executor::ToolError::Blocked { .. }
2445 ));
2446 }
2447
2448 #[tokio::test]
2449 async fn resolve_no_host_returns_ok() {
2450 let url = url::Url::parse("https://example.com/path").unwrap();
2452 let url_no_host = url::Url::parse("data:text/plain,hello").unwrap();
2454 let result = resolve_and_validate(&url_no_host).await;
2456 assert!(result.is_ok());
2457 let (host, addrs) = result.unwrap();
2458 assert!(host.is_empty());
2459 assert!(addrs.is_empty());
2460 drop(url);
2461 drop(url_no_host);
2462 }
2463
2464 async fn make_file_audit_logger(
2468 dir: &tempfile::TempDir,
2469 ) -> (
2470 std::sync::Arc<crate::audit::AuditLogger>,
2471 std::path::PathBuf,
2472 ) {
2473 use crate::audit::AuditLogger;
2474 use crate::config::AuditConfig;
2475 let path = dir.path().join("audit.log");
2476 let config = AuditConfig {
2477 enabled: true,
2478 destination: crate::config::AuditDestination::File(path.clone()),
2479 ..Default::default()
2480 };
2481 let logger = std::sync::Arc::new(AuditLogger::from_config(&config, false).await.unwrap());
2482 (logger, path)
2483 }
2484
2485 #[tokio::test]
2486 async fn with_audit_sets_logger() {
2487 let config = ScrapeConfig::default();
2488 let executor = WebScrapeExecutor::new(&config);
2489 assert!(executor.audit_logger.is_none());
2490
2491 let dir = tempfile::tempdir().unwrap();
2492 let (logger, _path) = make_file_audit_logger(&dir).await;
2493 let executor = executor.with_audit(logger);
2494 assert!(executor.audit_logger.is_some());
2495 }
2496
/// `ToolError::Blocked` maps to `AuditResult::Blocked`, with the command text
/// carried over as the block reason.
#[test]
fn tool_error_to_audit_result_blocked_maps_correctly() {
    let blocked = ToolError::Blocked {
        command: "scheme not allowed: http".into(),
    };
    let mapped = tool_error_to_audit_result(&blocked);
    let is_expected = matches!(
        mapped,
        AuditResult::Blocked { reason } if reason == "scheme not allowed: http"
    );
    assert!(is_expected);
}
2507
/// `ToolError::Timeout` maps to `AuditResult::Timeout`.
#[test]
fn tool_error_to_audit_result_timeout_maps_correctly() {
    let timed_out = ToolError::Timeout { timeout_secs: 15 };
    let mapped = tool_error_to_audit_result(&timed_out);
    let is_timeout = matches!(mapped, AuditResult::Timeout);
    assert!(is_timeout);
}
2514
/// `ToolError::Execution` maps to `AuditResult::Error` whose message includes
/// the text of the underlying I/O error.
#[test]
fn tool_error_to_audit_result_execution_error_maps_correctly() {
    let io_failure = std::io::Error::other("connection refused");
    let mapped = tool_error_to_audit_result(&ToolError::Execution(io_failure));
    let is_expected = matches!(
        mapped,
        AuditResult::Error { message } if message.contains("connection refused")
    );
    assert!(is_expected);
}
2523
2524 #[tokio::test]
2525 async fn fetch_audit_blocked_url_logged() {
2526 let dir = tempfile::tempdir().unwrap();
2527 let (logger, log_path) = make_file_audit_logger(&dir).await;
2528
2529 let config = ScrapeConfig::default();
2530 let executor = WebScrapeExecutor::new(&config).with_audit(logger);
2531
2532 let call = crate::executor::ToolCall {
2533 tool_id: ToolName::new("fetch"),
2534 params: {
2535 let mut m = serde_json::Map::new();
2536 m.insert("url".to_owned(), serde_json::json!("http://example.com"));
2537 m
2538 },
2539 caller_id: None,
2540 context: None,
2541
2542 tool_call_id: String::new(),
2543 skill_name: None,
2544 };
2545 let result = executor.execute_tool_call(&call).await;
2546 assert!(matches!(result, Err(ToolError::Blocked { .. })));
2547
2548 let content = tokio::fs::read_to_string(&log_path).await.unwrap();
2549 assert!(
2550 content.contains("\"tool\":\"fetch\""),
2551 "expected tool=fetch in audit: {content}"
2552 );
2553 assert!(
2554 content.contains("\"type\":\"blocked\""),
2555 "expected type=blocked in audit: {content}"
2556 );
2557 assert!(
2558 content.contains("http://example.com"),
2559 "expected URL in audit command field: {content}"
2560 );
2561 }
2562
2563 #[tokio::test]
2564 async fn log_audit_success_writes_to_file() {
2565 let dir = tempfile::tempdir().unwrap();
2566 let (logger, log_path) = make_file_audit_logger(&dir).await;
2567
2568 let config = ScrapeConfig::default();
2569 let executor = WebScrapeExecutor::new(&config).with_audit(logger);
2570
2571 executor
2572 .log_audit(
2573 "fetch",
2574 "https://example.com/page",
2575 AuditResult::Success,
2576 42,
2577 None,
2578 None,
2579 None,
2580 None,
2581 )
2582 .await;
2583
2584 let content = tokio::fs::read_to_string(&log_path).await.unwrap();
2585 assert!(
2586 content.contains("\"tool\":\"fetch\""),
2587 "expected tool=fetch in audit: {content}"
2588 );
2589 assert!(
2590 content.contains("\"type\":\"success\""),
2591 "expected type=success in audit: {content}"
2592 );
2593 assert!(
2594 content.contains("\"command\":\"https://example.com/page\""),
2595 "expected command URL in audit: {content}"
2596 );
2597 assert!(
2598 content.contains("\"duration_ms\":42"),
2599 "expected duration_ms in audit: {content}"
2600 );
2601 }
2602
2603 #[tokio::test]
2604 async fn log_audit_blocked_writes_to_file() {
2605 let dir = tempfile::tempdir().unwrap();
2606 let (logger, log_path) = make_file_audit_logger(&dir).await;
2607
2608 let config = ScrapeConfig::default();
2609 let executor = WebScrapeExecutor::new(&config).with_audit(logger);
2610
2611 executor
2612 .log_audit(
2613 "web_scrape",
2614 "http://evil.com/page",
2615 AuditResult::Blocked {
2616 reason: "scheme not allowed: http".into(),
2617 },
2618 0,
2619 None,
2620 None,
2621 None,
2622 None,
2623 )
2624 .await;
2625
2626 let content = tokio::fs::read_to_string(&log_path).await.unwrap();
2627 assert!(
2628 content.contains("\"tool\":\"web_scrape\""),
2629 "expected tool=web_scrape in audit: {content}"
2630 );
2631 assert!(
2632 content.contains("\"type\":\"blocked\""),
2633 "expected type=blocked in audit: {content}"
2634 );
2635 assert!(
2636 content.contains("scheme not allowed"),
2637 "expected block reason in audit: {content}"
2638 );
2639 }
2640
2641 #[tokio::test]
2642 async fn web_scrape_audit_blocked_url_logged() {
2643 let dir = tempfile::tempdir().unwrap();
2644 let (logger, log_path) = make_file_audit_logger(&dir).await;
2645
2646 let config = ScrapeConfig::default();
2647 let executor = WebScrapeExecutor::new(&config).with_audit(logger);
2648
2649 let call = crate::executor::ToolCall {
2650 tool_id: ToolName::new("web_scrape"),
2651 params: {
2652 let mut m = serde_json::Map::new();
2653 m.insert("url".to_owned(), serde_json::json!("http://example.com"));
2654 m.insert("select".to_owned(), serde_json::json!("h1"));
2655 m
2656 },
2657 caller_id: None,
2658 context: None,
2659
2660 tool_call_id: String::new(),
2661 skill_name: None,
2662 };
2663 let result = executor.execute_tool_call(&call).await;
2664 assert!(matches!(result, Err(ToolError::Blocked { .. })));
2665
2666 let content = tokio::fs::read_to_string(&log_path).await.unwrap();
2667 assert!(
2668 content.contains("\"tool\":\"web_scrape\""),
2669 "expected tool=web_scrape in audit: {content}"
2670 );
2671 assert!(
2672 content.contains("\"type\":\"blocked\""),
2673 "expected type=blocked in audit: {content}"
2674 );
2675 }
2676
2677 #[tokio::test]
2678 async fn no_audit_logger_does_not_panic_on_blocked_fetch() {
2679 let config = ScrapeConfig::default();
2680 let executor = WebScrapeExecutor::new(&config);
2681 assert!(executor.audit_logger.is_none());
2682
2683 let call = crate::executor::ToolCall {
2684 tool_id: ToolName::new("fetch"),
2685 params: {
2686 let mut m = serde_json::Map::new();
2687 m.insert("url".to_owned(), serde_json::json!("http://example.com"));
2688 m
2689 },
2690 caller_id: None,
2691 context: None,
2692
2693 tool_call_id: String::new(),
2694 skill_name: None,
2695 };
2696 let result = executor.execute_tool_call(&call).await;
2698 assert!(matches!(result, Err(ToolError::Blocked { .. })));
2699 }
2700
2701 #[tokio::test]
2703 async fn fetch_execute_tool_call_end_to_end() {
2704 use wiremock::matchers::{method, path};
2705 use wiremock::{Mock, ResponseTemplate};
2706
2707 let (executor, server) = mock_server_executor().await;
2708 Mock::given(method("GET"))
2709 .and(path("/e2e"))
2710 .respond_with(ResponseTemplate::new(200).set_body_string("<h1>end-to-end</h1>"))
2711 .mount(&server)
2712 .await;
2713
2714 let (host, addrs) = server_host_and_addr(&server);
2715 let result = executor
2717 .fetch_html(
2718 &format!("{}/e2e", server.uri()),
2719 &host,
2720 &addrs,
2721 "fetch",
2722 "test-cid",
2723 None,
2724 None,
2725 )
2726 .await;
2727 assert!(result.is_ok());
2728 assert!(result.unwrap().contains("end-to-end"));
2729 }
2730
/// A pattern without a wildcard matches only the identical host name.
#[test]
fn domain_matches_exact() {
    let identical = domain_matches("example.com", "example.com");
    let unrelated = domain_matches("example.com", "other.com");
    let subdomain = domain_matches("example.com", "sub.example.com");

    assert!(identical);
    assert!(!unrelated);
    assert!(!subdomain);
}
2739
/// A leading `*.` matches exactly one extra label: not the bare domain and not
/// two nested labels.
#[test]
fn domain_matches_wildcard_single_subdomain() {
    let one_label = domain_matches("*.example.com", "sub.example.com");
    let bare_domain = domain_matches("*.example.com", "example.com");
    let two_labels = domain_matches("*.example.com", "sub.sub.example.com");

    assert!(one_label);
    assert!(!bare_domain);
    assert!(!two_labels);
}
2746
/// The wildcard must stand for a non-empty label, so `.example.com` does not match.
#[test]
fn domain_matches_wildcard_does_not_match_empty_label() {
    let empty_label = domain_matches("*.example.com", ".example.com");
    assert!(!empty_label);
}
2752
/// A pattern with more than one wildcard label does not match a two-label
/// subdomain such as `a.b.example.com`.
#[test]
fn domain_matches_multi_wildcard_treated_as_exact() {
    let nested = domain_matches("*.*.example.com", "a.b.example.com");
    assert!(!nested);
}
2758
/// With both the allowlist and the denylist empty, every host is permitted.
#[test]
fn check_domain_policy_empty_lists_allow_all() {
    let ordinary = check_domain_policy("example.com", &[], &[]);
    let suspicious = check_domain_policy("evil.com", &[], &[]);

    assert!(ordinary.is_ok());
    assert!(suspicious.is_ok());
}
2766
/// A host listed in the denylist is rejected with `ToolError::Blocked`.
#[test]
fn check_domain_policy_denylist_blocks() {
    let denied = vec![String::from("evil.com")];
    let verdict = check_domain_policy("evil.com", &[], &denied);
    assert!(matches!(verdict.unwrap_err(), ToolError::Blocked { .. }));
}
2773
/// A denylist entry only affects the host it names; other hosts stay allowed.
#[test]
fn check_domain_policy_denylist_does_not_block_other_domains() {
    let denied = vec![String::from("evil.com")];
    let verdict = check_domain_policy("good.com", &[], &denied);
    assert!(verdict.is_ok());
}
2779
/// Hosts matching an allowlist entry — exact or wildcard — are permitted.
#[test]
fn check_domain_policy_allowlist_permits_matching() {
    let allowed = vec![String::from("docs.rs"), String::from("*.rust-lang.org")];

    let exact = check_domain_policy("docs.rs", &allowed, &[]);
    assert!(exact.is_ok());

    let wildcard = check_domain_policy("blog.rust-lang.org", &allowed, &[]);
    assert!(wildcard.is_ok());
}
2786
/// With a non-empty allowlist, a host that matches no entry is rejected with
/// `ToolError::Blocked`.
#[test]
fn check_domain_policy_allowlist_blocks_unknown() {
    let allowed = vec![String::from("docs.rs")];
    let verdict = check_domain_policy("other.com", &allowed, &[]);
    assert!(matches!(verdict.unwrap_err(), ToolError::Blocked { .. }));
}
2793
/// When a host appears in both lists, the denylist wins and the host is blocked.
#[test]
fn check_domain_policy_deny_overrides_allow() {
    let allowed = vec![String::from("example.com")];
    let denied = vec![String::from("example.com")];
    let verdict = check_domain_policy("example.com", &allowed, &denied);
    assert!(matches!(verdict.unwrap_err(), ToolError::Blocked { .. }));
}
2801
/// A wildcard denylist entry blocks subdomains but leaves the bare domain allowed.
#[test]
fn check_domain_policy_wildcard_in_denylist() {
    let denied = vec![String::from("*.evil.com")];

    let subdomain = check_domain_policy("sub.evil.com", &[], &denied);
    assert!(matches!(subdomain.unwrap_err(), ToolError::Blocked { .. }));

    let bare_domain = check_domain_policy("evil.com", &[], &denied);
    assert!(bare_domain.is_ok());
}
2810}