use argentor_core::{ArgentorResult, ToolCall, ToolResult};
use argentor_security::Capability;
use argentor_skills::skill::{Skill, SkillDescriptor};
use async_trait::async_trait;
use regex::Regex;
use std::time::Duration;
use tracing::info;
/// Text-length cap applied by `execute` when the caller supplies no `max_length`.
const DEFAULT_MAX_LENGTH: usize = 50_000;
/// Largest HTTP response body, in bytes (5 MiB), that `fetch_html` will accept.
const MAX_RESPONSE_SIZE: usize = 5 * 1024 * 1024;
pub fn strip_html_tags(html: &str) -> String {
let re_script = Regex::new(r"(?is)<script[^>]*>.*?</script>").unwrap_or_else(|_| {
Regex::new(r"<script>")
.unwrap_or_else(|_| Regex::new("$^").unwrap_or_else(|_| unreachable!()))
});
let re_style = Regex::new(r"(?is)<style[^>]*>.*?</style>").unwrap_or_else(|_| {
Regex::new(r"<style>")
.unwrap_or_else(|_| Regex::new("$^").unwrap_or_else(|_| unreachable!()))
});
let cleaned = re_script.replace_all(html, " ");
let cleaned = re_style.replace_all(&cleaned, " ");
let re_tags = Regex::new(r"<[^>]+>")
.unwrap_or_else(|_| Regex::new("$^").unwrap_or_else(|_| unreachable!()));
let text = re_tags.replace_all(&cleaned, " ");
let text = text
.replace("&", "&")
.replace("<", "<")
.replace(">", ">")
.replace(""", "\"")
.replace("'", "'")
.replace("'", "'")
.replace(" ", " ");
let re_ws =
Regex::new(r"\s+").unwrap_or_else(|_| Regex::new(" +").unwrap_or_else(|_| unreachable!()));
let result = re_ws.replace_all(&text, " ");
result.trim().to_string()
}
fn extract_by_selector(html: &str, selector: &str) -> String {
if selector == "all" {
return html.to_string();
}
let pattern = format!(r"(?is)<{selector}[^>]*>(.*?)</{selector}>");
if let Ok(re) = Regex::new(&pattern) {
if let Some(caps) = re.captures(html) {
if let Some(m) = caps.get(1) {
return m.as_str().to_string();
}
}
}
html.to_string()
}
/// Collects every `<a href="…">` anchor in `html` as `{"url", "text"}` objects.
///
/// Anchors whose href starts with `#` or `javascript:` are skipped. The
/// anchor text is reduced to plain text via `strip_html_tags`. Returns an
/// empty list if the anchor pattern cannot be compiled.
fn extract_all_links(html: &str) -> Vec<serde_json::Value> {
    Regex::new(r#"(?is)<a\s[^>]*href=["']([^"']+)["'][^>]*>(.*?)</a>"#)
        .map(|anchor_re| {
            anchor_re
                .captures_iter(html)
                .filter_map(|caps| {
                    let href = caps.get(1).map_or("", |m| m.as_str());
                    if href.starts_with('#') || href.starts_with("javascript:") {
                        return None;
                    }
                    let label = strip_html_tags(caps.get(2).map_or("", |m| m.as_str()));
                    Some(serde_json::json!({
                        "url": href,
                        "text": label.trim(),
                    }))
                })
                .collect()
        })
        .unwrap_or_default()
}
/// Builds a JSON object with the page title, the `description` / `keywords`
/// meta tags and the `og:title`, `og:description`, `og:image` properties.
/// Fields that are absent from `html` are empty strings.
fn extract_metadata(html: &str) -> serde_json::Value {
    let meta = |name: &str| extract_meta_content(html, name);
    let og = |property: &str| extract_meta_property(html, property);

    serde_json::json!({
        "title": extract_tag_content(html, "title"),
        "description": meta("description"),
        "keywords": meta("keywords"),
        "og_title": og("og:title"),
        "og_description": og("og:description"),
        "og_image": og("og:image"),
    })
}
/// Returns the trimmed inner content of the first `<tag>…</tag>` element,
/// or an empty string when the element is missing or the pattern is invalid.
fn extract_tag_content(html: &str, tag: &str) -> String {
    let pattern = format!(r"(?is)<{tag}[^>]*>(.*?)</{tag}>");

    Regex::new(&pattern)
        .ok()
        .and_then(|re| {
            re.captures(html)
                .map(|caps| caps.get(1).map_or("", |m| m.as_str()).trim().to_string())
        })
        .unwrap_or_default()
}
fn extract_meta_content(html: &str, name: &str) -> String {
let pattern =
format!(r#"(?is)<meta\s[^>]*name=["']{name}["'][^>]*content=["']([^"']*)["'][^>]*/?\s*>"#);
if let Ok(re) = Regex::new(&pattern) {
if let Some(caps) = re.captures(html) {
return caps.get(1).map_or("", |m| m.as_str()).to_string();
}
}
let pattern_rev =
format!(r#"(?is)<meta\s[^>]*content=["']([^"']*)["'][^>]*name=["']{name}["'][^>]*/?\s*>"#);
if let Ok(re) = Regex::new(&pattern_rev) {
if let Some(caps) = re.captures(html) {
return caps.get(1).map_or("", |m| m.as_str()).to_string();
}
}
String::new()
}
fn extract_meta_property(html: &str, property: &str) -> String {
let pattern = format!(
r#"(?is)<meta\s[^>]*property=["']{property}["'][^>]*content=["']([^"']*)["'][^>]*/?\s*>"#
);
if let Ok(re) = Regex::new(&pattern) {
if let Some(caps) = re.captures(html) {
return caps.get(1).map_or("", |m| m.as_str()).to_string();
}
}
let pattern_rev = format!(
r#"(?is)<meta\s[^>]*content=["']([^"']*)["'][^>]*property=["']{property}["'][^>]*/?\s*>"#
);
if let Ok(re) = Regex::new(&pattern_rev) {
if let Some(caps) = re.captures(html) {
return caps.get(1).map_or("", |m| m.as_str()).to_string();
}
}
String::new()
}
/// Extracts all `<h1>`–`<h6>` headings from `html` in document order.
///
/// Each entry is `{"level": <1-6>, "text": <plain text>}`. Headings whose
/// text is empty after tag stripping are omitted.
fn extract_headings(html: &str) -> Vec<serde_json::Value> {
    // (byte offset of the heading in `html`, JSON entry). The offset lets us
    // restore document order after scanning one heading level at a time.
    let mut positioned: Vec<(usize, serde_json::Value)> = Vec::new();

    for level in 1u8..=6 {
        let pattern = format!(r"(?is)<h{level}[^>]*>(.*?)</h{level}>");
        let re = match Regex::new(&pattern) {
            Ok(re) => re,
            Err(_) => continue,
        };

        // A single `captures_iter` pass yields both the match position
        // (group 0) and the inner HTML (group 1).
        for caps in re.captures_iter(html) {
            let start = caps.get(0).map_or(0, |m| m.start());
            let text = strip_html_tags(caps.get(1).map_or("", |m| m.as_str()));
            if text.is_empty() {
                continue;
            }
            positioned.push((
                start,
                serde_json::json!({
                    "level": level,
                    "text": text.trim(),
                }),
            ));
        }
    }

    positioned.sort_by_key(|(pos, _)| *pos);
    positioned.into_iter().map(|(_, entry)| entry).collect()
}
/// Shortens `text` to at most `max_len` bytes and appends `...[truncated]`.
///
/// Text that already fits is returned unchanged (no marker). The cut point
/// is moved backwards to the nearest UTF-8 character boundary so a
/// multi-byte character is never split.
fn truncate_text(text: &str, max_len: usize) -> String {
    if text.len() <= max_len {
        return text.to_owned();
    }

    // Offset 0 is always a boundary, so the search cannot come up empty.
    let cut = (0..=max_len)
        .rev()
        .find(|&idx| text.is_char_boundary(idx))
        .unwrap_or(0);

    let head = &text[..cut];
    format!("{head}...[truncated]")
}
/// Skill that extracts text, links, metadata and headings from web pages.
pub struct WebScraperSkill {
    /// Static description of the skill, handed out by `descriptor()`.
    descriptor: SkillDescriptor,
    /// HTTP client used by `fetch_html` to download pages.
    client: reqwest::Client,
}
impl WebScraperSkill {
    /// Creates the skill with its descriptor and a preconfigured HTTP client
    /// (30 s timeout, at most 10 redirects, fixed user agent).
    ///
    /// # Panics
    ///
    /// Panics if the HTTP client cannot be constructed.
    pub fn new() -> Self {
        // Construction failure is treated as unrecoverable, hence the
        // deliberate `expect` despite the lint.
        #[allow(clippy::expect_used)]
        let client = reqwest::Client::builder()
            .timeout(Duration::from_secs(30))
            .redirect(reqwest::redirect::Policy::limited(10))
            .user_agent("Argentor/0.1 (AI Agent WebScraper)")
            .build()
            .expect("Failed to create HTTP client -- TLS backend unavailable");
        Self {
            descriptor: SkillDescriptor {
                name: "web_scraper".to_string(),
                description: "Extract clean text, links, metadata, and headings from web pages."
                    .to_string(),
                parameters_schema: serde_json::json!({
                    "type": "object",
                    "properties": {
                        "operation": {
                            "type": "string",
                            "enum": ["scrape", "extract_links", "extract_metadata", "extract_headings", "extract_text"],
                            "description": "The operation to perform"
                        },
                        "url": {
                            "type": "string",
                            "description": "URL to scrape (required for scrape, extract_links, extract_metadata, extract_headings)"
                        },
                        "html": {
                            "type": "string",
                            "description": "Raw HTML string (required for extract_text)"
                        },
                        "selector": {
                            "type": "string",
                            "enum": ["article", "main", "body", "all"],
                            "description": "CSS-like selector to scope content extraction (default: all)"
                        },
                        "max_length": {
                            "type": "integer",
                            "description": "Maximum content length in characters (default: 50000)"
                        }
                    },
                    "required": ["operation"]
                }),
                required_capabilities: vec![Capability::NetworkAccess {
                    allowed_hosts: vec![],
                }],
                requires_approval: false,
            },
            client,
        }
    }

    /// Downloads `url` and returns the body as (lossily decoded) UTF-8 text.
    ///
    /// `call_id` is only used to label the error results.
    ///
    /// # Errors
    ///
    /// Returns an error `ToolResult` when:
    /// * `url` is empty, unparsable, or not `http`/`https`;
    /// * the request fails or the status is not a success code;
    /// * the body cannot be read or exceeds `MAX_RESPONSE_SIZE` bytes.
    async fn fetch_html(&self, url: &str, call_id: &str) -> Result<String, ToolResult> {
        if url.is_empty() {
            return Err(ToolResult::error(
                call_id,
                "URL is required for this operation",
            ));
        }
        let parsed = reqwest::Url::parse(url)
            .map_err(|e| ToolResult::error(call_id, format!("Invalid URL '{url}': {e}")))?;
        match parsed.scheme() {
            "http" | "https" => {}
            scheme => {
                return Err(ToolResult::error(
                    call_id,
                    format!("Unsupported scheme '{scheme}'. Only http/https allowed."),
                ));
            }
        }

        let mut response = self
            .client
            .get(url)
            .send()
            .await
            .map_err(|e| ToolResult::error(call_id, format!("Failed to fetch '{url}': {e}")))?;
        let status = response.status().as_u16();
        if !response.status().is_success() {
            return Err(ToolResult::error(
                call_id,
                format!("HTTP {status} from {url}"),
            ));
        }

        // Fast path: reject before reading anything when the server already
        // announces an oversized body.
        if let Some(declared) = response.content_length() {
            let oversized = usize::try_from(declared).map_or(true, |n| n > MAX_RESPONSE_SIZE);
            if oversized {
                return Err(ToolResult::error(
                    call_id,
                    format!(
                        "Response too large: {} bytes (max {})",
                        declared, MAX_RESPONSE_SIZE
                    ),
                ));
            }
        }

        // Stream the body so the size cap is enforced while reading; the
        // declared length may be absent or wrong, and buffering everything
        // first would let an oversized response exhaust memory.
        let mut body: Vec<u8> = Vec::new();
        while let Some(chunk) = response.chunk().await.map_err(|e| {
            ToolResult::error(call_id, format!("Failed to read response body: {e}"))
        })? {
            // `body.len() <= MAX_RESPONSE_SIZE` always holds here, so the
            // subtraction cannot underflow.
            if chunk.len() > MAX_RESPONSE_SIZE - body.len() {
                return Err(ToolResult::error(
                    call_id,
                    format!(
                        "Response too large: body exceeds the {} byte limit",
                        MAX_RESPONSE_SIZE
                    ),
                ));
            }
            body.extend_from_slice(&chunk);
        }

        Ok(String::from_utf8_lossy(&body).into_owned())
    }
}
impl Default for WebScraperSkill {
fn default() -> Self {
Self::new()
}
}
#[async_trait]
impl Skill for WebScraperSkill {
    /// Returns the static descriptor built in `WebScraperSkill::new`.
    fn descriptor(&self) -> &SkillDescriptor {
        &self.descriptor
    }

    /// Dispatches a tool call on its `operation` argument.
    ///
    /// Arguments read from `call.arguments`:
    /// * `operation` – one of `scrape`, `extract_links`, `extract_metadata`,
    ///   `extract_headings`, `extract_text`;
    /// * `url` – page to download (all operations except `extract_text`);
    /// * `html` – raw markup (only `extract_text`);
    /// * `selector` – scope for `scrape`, defaults to `"all"`;
    /// * `max_length` – truncation limit, defaults to `DEFAULT_MAX_LENGTH`.
    ///
    /// Failures in this method (bad arguments, fetch errors, unknown
    /// operation) are reported as an error `ToolResult` inside `Ok`.
    async fn execute(&self, call: ToolCall) -> ArgentorResult<ToolResult> {
        let args = &call.arguments;
        let operation = args["operation"].as_str().unwrap_or_default();
        // Saturate instead of `as`-casting so a huge value cannot wrap to a
        // tiny limit on targets where `usize` is narrower than 64 bits.
        let max_length = args["max_length"]
            .as_u64()
            .map_or(DEFAULT_MAX_LENGTH, |v| usize::try_from(v).unwrap_or(usize::MAX));
        let selector = args["selector"].as_str().unwrap_or("all");
        info!(operation = %operation, "WebScraper execute");

        match operation {
            // Works on caller-supplied markup; no network access.
            "extract_text" => {
                let html = args["html"].as_str().unwrap_or_default();
                if html.is_empty() {
                    return Ok(ToolResult::error(
                        &call.id,
                        "The 'html' parameter is required for extract_text",
                    ));
                }
                let text = strip_html_tags(html);
                let truncated = truncate_text(&text, max_length);
                let result = serde_json::json!({
                    "text": truncated,
                    "length": text.len(),
                    "truncated": text.len() > max_length,
                });
                Ok(ToolResult::success(&call.id, result.to_string()))
            }
            // Every remaining valid operation starts by downloading `url`.
            "scrape" | "extract_links" | "extract_metadata" | "extract_headings" => {
                let url = args["url"].as_str().unwrap_or_default();
                let html = match self.fetch_html(url, &call.id).await {
                    Ok(body) => body,
                    Err(err_result) => return Ok(err_result),
                };
                let result = match operation {
                    "scrape" => {
                        let scoped = extract_by_selector(&html, selector);
                        let text = strip_html_tags(&scoped);
                        let truncated = truncate_text(&text, max_length);
                        serde_json::json!({
                            "url": url,
                            "selector": selector,
                            "text": truncated,
                            "length": text.len(),
                            "truncated": text.len() > max_length,
                        })
                    }
                    "extract_links" => {
                        let links = extract_all_links(&html);
                        serde_json::json!({
                            "url": url,
                            "links": links,
                            "count": links.len(),
                        })
                    }
                    "extract_metadata" => {
                        let metadata = extract_metadata(&html);
                        serde_json::json!({
                            "url": url,
                            "metadata": metadata,
                        })
                    }
                    // Only "extract_headings" is left in this arm.
                    _ => {
                        let headings = extract_headings(&html);
                        serde_json::json!({
                            "url": url,
                            "headings": headings,
                            "count": headings.len(),
                        })
                    }
                };
                Ok(ToolResult::success(&call.id, result.to_string()))
            }
            _ => Ok(ToolResult::error(
                &call.id,
                format!(
                    "Unknown operation '{operation}'. Valid: scrape, extract_links, extract_metadata, extract_headings, extract_text"
                ),
            )),
        }
    }
}
// Unit tests for the HTML helpers and the offline `extract_text` operation.
// No test performs a network request.
#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::expect_used)]
mod tests {
    use super::*;

    // Fixture page covering metadata, headings, links, entities and
    // script/style blocks. Entities must stay ENCODED in this literal so the
    // decoding path is actually exercised.
    const SAMPLE_HTML: &str = r##"<!DOCTYPE html>
<html>
<head>
<title>Test Page</title>
<meta name="description" content="A test page for scraping">
<meta name="keywords" content="test, scraping, html">
<meta property="og:title" content="OG Test Title">
<meta property="og:description" content="OG test description">
<meta property="og:image" content="https://example.com/image.png">
</head>
<body>
<h1>Main Heading</h1>
<article>
<h2>Article Title</h2>
<p>This is the article content with <strong>bold text</strong> and a
<a href="https://example.com">link to example</a>.</p>
<p>Second paragraph with &amp; entities &lt;here&gt;.</p>
</article>
<main>
<h3>Main Section</h3>
<p>Main content area.</p>
<a href="/about">About Us</a>
<a href="https://other.com/page">Other Page</a>
</main>
<script>var x = "should be removed";</script>
<style>.hidden { display: none; }</style>
<footer>
<a href="#top">Back to top</a>
<a href="javascript:void(0)">JS Link</a>
</footer>
</body>
</html>"##;

    #[test]
    fn test_strip_html_tags_basic() {
        let html = "<p>Hello <b>World</b></p>";
        let result = strip_html_tags(html);
        assert_eq!(result, "Hello World");
    }

    #[test]
    fn test_strip_html_tags_script_removal() {
        let html = "<p>Before</p><script>alert('xss')</script><p>After</p>";
        let result = strip_html_tags(html);
        assert!(result.contains("Before"));
        assert!(result.contains("After"));
        assert!(!result.contains("alert"));
        assert!(!result.contains("xss"));
    }

    #[test]
    fn test_strip_html_tags_style_removal() {
        let html = "<style>.a{color:red}</style><p>Content</p>";
        let result = strip_html_tags(html);
        assert!(result.contains("Content"));
        assert!(!result.contains("color"));
    }

    // Input uses encoded entities; the expected output is their decoded form.
    #[test]
    fn test_strip_html_tags_entities() {
        let html = "<p>A &amp; B &lt; C &gt; D &quot;E&quot; F&#39;s</p>";
        let result = strip_html_tags(html);
        assert!(result.contains("A & B < C > D \"E\" F's"));
    }

    #[test]
    fn test_strip_html_tags_whitespace_collapse() {
        let html = "<p> Multiple spaces here </p>";
        let result = strip_html_tags(html);
        assert_eq!(result, "Multiple spaces here");
    }

    #[test]
    fn test_extract_by_selector_article() {
        let content = extract_by_selector(SAMPLE_HTML, "article");
        assert!(content.contains("Article Title"));
        assert!(content.contains("article content"));
        assert!(!content.contains("Main Section"));
    }

    #[test]
    fn test_extract_by_selector_main() {
        let content = extract_by_selector(SAMPLE_HTML, "main");
        assert!(content.contains("Main Section"));
        assert!(content.contains("Main content area"));
    }

    #[test]
    fn test_extract_by_selector_all() {
        let content = extract_by_selector(SAMPLE_HTML, "all");
        assert_eq!(content, SAMPLE_HTML);
    }

    // A selector with no matching element falls back to the full document.
    #[test]
    fn test_extract_by_selector_missing() {
        let content = extract_by_selector(SAMPLE_HTML, "nav");
        assert_eq!(content, SAMPLE_HTML);
    }

    #[test]
    fn test_extract_all_links() {
        let links = extract_all_links(SAMPLE_HTML);
        let urls: Vec<&str> = links.iter().filter_map(|l| l["url"].as_str()).collect();
        assert!(urls.contains(&"https://example.com"));
        assert!(urls.contains(&"/about"));
        assert!(urls.contains(&"https://other.com/page"));
        assert!(!urls.iter().any(|u| u.starts_with('#')));
        assert!(!urls.iter().any(|u| u.starts_with("javascript:")));
    }

    #[test]
    fn test_extract_all_links_text() {
        let links = extract_all_links(SAMPLE_HTML);
        let example_link = links
            .iter()
            .find(|l| l["url"].as_str() == Some("https://example.com"))
            .unwrap();
        assert_eq!(example_link["text"].as_str().unwrap(), "link to example");
    }

    #[test]
    fn test_extract_metadata() {
        let meta = extract_metadata(SAMPLE_HTML);
        assert_eq!(meta["title"], "Test Page");
        assert_eq!(meta["description"], "A test page for scraping");
        assert_eq!(meta["keywords"], "test, scraping, html");
        assert_eq!(meta["og_title"], "OG Test Title");
        assert_eq!(meta["og_description"], "OG test description");
        assert_eq!(meta["og_image"], "https://example.com/image.png");
    }

    #[test]
    fn test_extract_metadata_empty() {
        let html = "<html><body><p>No metadata here</p></body></html>";
        let meta = extract_metadata(html);
        assert_eq!(meta["title"], "");
        assert_eq!(meta["description"], "");
    }

    #[test]
    fn test_extract_headings() {
        let headings = extract_headings(SAMPLE_HTML);
        assert!(headings.len() >= 3);
        let h1 = headings.iter().find(|h| h["level"] == 1).unwrap();
        assert_eq!(h1["text"].as_str().unwrap(), "Main Heading");
        let h2 = headings.iter().find(|h| h["level"] == 2).unwrap();
        assert_eq!(h2["text"].as_str().unwrap(), "Article Title");
        let h3 = headings.iter().find(|h| h["level"] == 3).unwrap();
        assert_eq!(h3["text"].as_str().unwrap(), "Main Section");
    }

    #[test]
    fn test_truncate_text() {
        let short = "Hello World";
        assert_eq!(truncate_text(short, 100), "Hello World");
        let long = "a".repeat(100);
        let truncated = truncate_text(&long, 50);
        assert!(truncated.len() < 100);
        assert!(truncated.ends_with("...[truncated]"));
    }

    #[tokio::test]
    async fn test_extract_text_operation() {
        let skill = WebScraperSkill::new();
        let call = ToolCall {
            id: "t1".to_string(),
            name: "web_scraper".to_string(),
            arguments: serde_json::json!({
                "operation": "extract_text",
                "html": "<h1>Hello</h1><p>World with <b>bold</b> text</p>"
            }),
        };
        let result = skill.execute(call).await.unwrap();
        assert!(!result.is_error);
        let parsed: serde_json::Value = serde_json::from_str(&result.content).unwrap();
        let text = parsed["text"].as_str().unwrap();
        assert!(text.contains("Hello"));
        assert!(text.contains("World"));
        assert!(text.contains("bold"));
        assert!(!text.contains("<h1>"));
        assert!(!text.contains("<b>"));
    }

    #[tokio::test]
    async fn test_extract_text_missing_html() {
        let skill = WebScraperSkill::new();
        let call = ToolCall {
            id: "t2".to_string(),
            name: "web_scraper".to_string(),
            arguments: serde_json::json!({
                "operation": "extract_text",
                "html": ""
            }),
        };
        let result = skill.execute(call).await.unwrap();
        assert!(result.is_error);
        assert!(result.content.contains("required"));
    }

    #[tokio::test]
    async fn test_unknown_operation() {
        let skill = WebScraperSkill::new();
        let call = ToolCall {
            id: "t3".to_string(),
            name: "web_scraper".to_string(),
            arguments: serde_json::json!({
                "operation": "invalid_op"
            }),
        };
        let result = skill.execute(call).await.unwrap();
        assert!(result.is_error);
        assert!(result.content.contains("Unknown operation"));
    }

    #[tokio::test]
    async fn test_extract_text_with_max_length() {
        let skill = WebScraperSkill::new();
        let long_content = format!("<p>{}</p>", "word ".repeat(20000));
        let call = ToolCall {
            id: "t4".to_string(),
            name: "web_scraper".to_string(),
            arguments: serde_json::json!({
                "operation": "extract_text",
                "html": long_content,
                "max_length": 100
            }),
        };
        let result = skill.execute(call).await.unwrap();
        assert!(!result.is_error);
        let parsed: serde_json::Value = serde_json::from_str(&result.content).unwrap();
        assert!(parsed["truncated"].as_bool().unwrap());
        let text = parsed["text"].as_str().unwrap();
        assert!(text.ends_with("...[truncated]"));
    }

    #[test]
    fn test_descriptor() {
        let skill = WebScraperSkill::new();
        let desc = skill.descriptor();
        assert_eq!(desc.name, "web_scraper");
        assert!(!desc.required_capabilities.is_empty());
    }
}