1use argentor_core::{ArgentorResult, ToolCall, ToolResult};
7use argentor_skills::skill::{Skill, SkillDescriptor};
8use async_trait::async_trait;
9use regex::Regex;
10use serde_json::{json, Value};
11
12use crate::web_scraper::strip_html_tags;
13
14pub struct HtmlLoaderSkill {
16 descriptor: SkillDescriptor,
17}
18
19impl HtmlLoaderSkill {
20 pub fn new() -> Self {
22 Self {
23 descriptor: SkillDescriptor {
24 name: "html_loader".to_string(),
25 description: "HTML document loader: extract_text, extract_links, extract_images, extract_metadata, strip_tags.".to_string(),
26 parameters_schema: json!({
27 "type": "object",
28 "properties": {
29 "operation": {
30 "type": "string",
31 "enum": ["extract_text", "extract_links", "extract_images", "extract_metadata", "strip_tags"],
32 "description": "The HTML operation to perform"
33 },
34 "html": {
35 "type": "string",
36 "description": "HTML content to process"
37 }
38 },
39 "required": ["operation", "html"]
40 }),
41 required_capabilities: vec![],
42 requires_approval: false,
43 },
44 }
45 }
46}
47
48impl Default for HtmlLoaderSkill {
49 fn default() -> Self {
50 Self::new()
51 }
52}
53
54fn extract_links_internal(html: &str) -> Vec<Value> {
56 let mut links = Vec::new();
57 let re = match Regex::new(r#"(?is)<a\s[^>]*href=["']([^"']+)["'][^>]*>(.*?)</a>"#) {
58 Ok(r) => r,
59 Err(_) => return links,
60 };
61 for caps in re.captures_iter(html) {
62 let url = caps.get(1).map_or("", |m| m.as_str()).to_string();
63 let raw_text = caps.get(2).map_or("", |m| m.as_str());
64 let text = strip_html_tags(raw_text);
65 if url.starts_with('#') || url.starts_with("javascript:") {
66 continue;
67 }
68 links.push(json!({
69 "url": url,
70 "text": text.trim(),
71 }));
72 }
73 links
74}
75
76fn extract_images_internal(html: &str) -> Vec<Value> {
78 let mut images = Vec::new();
79 let re_src = match Regex::new(r#"(?is)<img\s[^>]*src=["']([^"']+)["'][^>]*>"#) {
80 Ok(r) => r,
81 Err(_) => return images,
82 };
83 let re_alt = Regex::new(r#"(?is)alt=["']([^"']*)["']"#);
84
85 for caps in re_src.captures_iter(html) {
86 let full_tag = caps.get(0).map_or("", |m| m.as_str());
87 let src = caps.get(1).map_or("", |m| m.as_str()).to_string();
88 let alt = if let Ok(ref re) = re_alt {
89 re.captures(full_tag)
90 .and_then(|c| c.get(1).map(|m| m.as_str().to_string()))
91 .unwrap_or_default()
92 } else {
93 String::new()
94 };
95 images.push(json!({
96 "src": src,
97 "alt": alt,
98 }));
99 }
100 images
101}
102
103fn extract_metadata_internal(html: &str) -> Value {
105 let mut meta = json!({});
106
107 if let Ok(re) = Regex::new(r"(?is)<title[^>]*>(.*?)</title>") {
109 if let Some(caps) = re.captures(html) {
110 if let Some(m) = caps.get(1) {
111 meta["title"] = Value::String(strip_html_tags(m.as_str()));
112 }
113 }
114 }
115
116 if let Ok(re) = Regex::new(
118 r#"(?is)<meta\s[^>]*name=["']description["'][^>]*content=["']([^"']+)["'][^>]*/?>"#,
119 ) {
120 if let Some(caps) = re.captures(html) {
121 if let Some(m) = caps.get(1) {
122 meta["description"] = Value::String(m.as_str().to_string());
123 }
124 }
125 }
126
127 if meta.get("description").is_none() {
129 if let Ok(re) = Regex::new(
130 r#"(?is)<meta\s[^>]*content=["']([^"']+)["'][^>]*name=["']description["'][^>]*/?>"#,
131 ) {
132 if let Some(caps) = re.captures(html) {
133 if let Some(m) = caps.get(1) {
134 meta["description"] = Value::String(m.as_str().to_string());
135 }
136 }
137 }
138 }
139
140 if let Ok(re) =
142 Regex::new(r#"(?is)<meta\s[^>]*name=["']keywords["'][^>]*content=["']([^"']+)["'][^>]*/?>"#)
143 {
144 if let Some(caps) = re.captures(html) {
145 if let Some(m) = caps.get(1) {
146 meta["keywords"] = Value::String(m.as_str().to_string());
147 }
148 }
149 }
150
151 if let Ok(re) = Regex::new(r#"(?is)<html[^>]*lang=["']([^"']+)["']"#) {
153 if let Some(caps) = re.captures(html) {
154 if let Some(m) = caps.get(1) {
155 meta["lang"] = Value::String(m.as_str().to_string());
156 }
157 }
158 }
159
160 if let Ok(re) = Regex::new(r#"(?is)<meta\s[^>]*charset=["']?([A-Za-z0-9\-_]+)["']?"#) {
162 if let Some(caps) = re.captures(html) {
163 if let Some(m) = caps.get(1) {
164 meta["charset"] = Value::String(m.as_str().to_string());
165 }
166 }
167 }
168
169 meta
170}
171
172#[async_trait]
173impl Skill for HtmlLoaderSkill {
174 fn descriptor(&self) -> &SkillDescriptor {
175 &self.descriptor
176 }
177
178 async fn execute(&self, call: ToolCall) -> ArgentorResult<ToolResult> {
179 let operation = match call.arguments["operation"].as_str() {
180 Some(op) => op,
181 None => {
182 return Ok(ToolResult::error(
183 &call.id,
184 "Missing required parameter: 'operation'",
185 ))
186 }
187 };
188
189 let html = match call.arguments["html"].as_str() {
190 Some(v) => v,
191 None => {
192 return Ok(ToolResult::error(
193 &call.id,
194 "Missing required parameter: 'html'",
195 ))
196 }
197 };
198
199 match operation {
200 "extract_text" | "strip_tags" => {
201 let text = strip_html_tags(html);
202 let response = json!({
203 "text": text,
204 "length": text.len(),
205 });
206 Ok(ToolResult::success(&call.id, response.to_string()))
207 }
208 "extract_links" => {
209 let links = extract_links_internal(html);
210 let response = json!({ "links": links, "count": links.len() });
211 Ok(ToolResult::success(&call.id, response.to_string()))
212 }
213 "extract_images" => {
214 let images = extract_images_internal(html);
215 let response = json!({ "images": images, "count": images.len() });
216 Ok(ToolResult::success(&call.id, response.to_string()))
217 }
218 "extract_metadata" => {
219 let meta = extract_metadata_internal(html);
220 Ok(ToolResult::success(&call.id, meta.to_string()))
221 }
222 _ => Ok(ToolResult::error(
223 &call.id,
224 format!("Unknown operation: '{operation}'. Supported: extract_text, extract_links, extract_images, extract_metadata, strip_tags"),
225 )),
226 }
227 }
228}
229
#[cfg(test)]
// Tests are allowed to panic on unexpected values, so `unwrap`/`expect` are fine here.
#[allow(clippy::unwrap_used, clippy::expect_used)]
mod tests {
    use super::*;

    /// Small document exercising every operation: metadata in `<head>`, two
    /// real links plus one in-page anchor, two images (one with an empty
    /// `alt`), and `<script>`/`<style>` blocks that text extraction must drop.
    const SAMPLE_HTML: &str = r##"<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Test Page</title>
    <meta name="description" content="A sample page for testing">
    <meta name="keywords" content="test, html, parser">
</head>
<body>
    <h1>Main Heading</h1>
    <p>This is a paragraph with <strong>bold</strong> text.</p>
    <a href="https://example.com">Example Link</a>
    <a href="https://github.com">GitHub</a>
    <a href="#anchor">Skip anchor</a>
    <img src="photo.jpg" alt="A photo">
    <img src="icon.png" alt="">
    <script>alert('bad');</script>
    <style>body { color: red; }</style>
</body>
</html>"##;

    /// Wraps `args` in a `ToolCall` addressed to this skill.
    fn make_call(args: Value) -> ToolCall {
        ToolCall {
            id: "test".to_string(),
            name: "html_loader".to_string(),
            arguments: args,
        }
    }

    /// Executes a fresh skill instance with `args` and returns its result.
    async fn run(args: Value) -> ToolResult {
        HtmlLoaderSkill::new()
            .execute(make_call(args))
            .await
            .unwrap()
    }

    /// Parses the JSON payload carried by a result.
    fn parse(result: &ToolResult) -> Value {
        serde_json::from_str(&result.content).unwrap()
    }

    #[tokio::test]
    async fn test_extract_text() {
        let result = run(json!({"operation": "extract_text", "html": SAMPLE_HTML})).await;
        assert!(!result.is_error, "Result: {}", result.content);
        let parsed = parse(&result);
        let text = parsed["text"].as_str().unwrap();
        assert!(text.contains("Main Heading"));
        assert!(text.contains("paragraph"));
        assert!(text.contains("bold"));
        assert!(!text.contains("alert"), "Scripts should be stripped");
    }

    #[tokio::test]
    async fn test_strip_tags_alias() {
        let result =
            run(json!({"operation": "strip_tags", "html": "<p>Hello <b>World</b></p>"})).await;
        assert!(!result.is_error);
        let parsed = parse(&result);
        let text = parsed["text"].as_str().unwrap();
        assert!(text.contains("Hello"));
        assert!(!text.contains('<'));
    }

    #[tokio::test]
    async fn test_extract_links() {
        let result = run(json!({"operation": "extract_links", "html": SAMPLE_HTML})).await;
        assert!(!result.is_error);
        let parsed = parse(&result);
        // The `#anchor` link is skipped, leaving the two absolute URLs.
        assert_eq!(parsed["count"], 2);
        let links = parsed["links"].as_array().unwrap();
        assert_eq!(links[0]["url"], "https://example.com");
        assert_eq!(links[0]["text"], "Example Link");
    }

    #[tokio::test]
    async fn test_extract_images() {
        let result = run(json!({"operation": "extract_images", "html": SAMPLE_HTML})).await;
        assert!(!result.is_error);
        let parsed = parse(&result);
        assert_eq!(parsed["count"], 2);
        let images = parsed["images"].as_array().unwrap();
        assert_eq!(images[0]["src"], "photo.jpg");
        assert_eq!(images[0]["alt"], "A photo");
        assert_eq!(images[1]["alt"], "");
    }

    #[tokio::test]
    async fn test_extract_metadata() {
        let result = run(json!({"operation": "extract_metadata", "html": SAMPLE_HTML})).await;
        assert!(!result.is_error);
        let parsed = parse(&result);
        assert_eq!(parsed["title"], "Test Page");
        assert_eq!(parsed["description"], "A sample page for testing");
        assert_eq!(parsed["keywords"], "test, html, parser");
        assert_eq!(parsed["lang"], "en");
        assert_eq!(parsed["charset"], "UTF-8");
    }

    #[tokio::test]
    async fn test_extract_metadata_missing_fields() {
        let html = "<html><body><p>no meta</p></body></html>";
        let result = run(json!({"operation": "extract_metadata", "html": html})).await;
        assert!(!result.is_error);
        let parsed = parse(&result);
        assert!(parsed.get("title").is_none() || parsed["title"].is_null());
        assert!(parsed.get("description").is_none() || parsed["description"].is_null());
    }

    #[tokio::test]
    async fn test_extract_text_empty_html() {
        let result = run(json!({"operation": "extract_text", "html": ""})).await;
        assert!(!result.is_error);
        let parsed = parse(&result);
        assert_eq!(parsed["text"], "");
        assert_eq!(parsed["length"], 0);
    }

    #[tokio::test]
    async fn test_extract_links_no_links() {
        let html = "<p>No links at all.</p>";
        let result = run(json!({"operation": "extract_links", "html": html})).await;
        assert!(!result.is_error);
        let parsed = parse(&result);
        assert_eq!(parsed["count"], 0);
    }

    #[tokio::test]
    async fn test_extract_links_skips_javascript() {
        let html = r#"<a href="javascript:alert(1)">bad</a><a href="https://ok.com">ok</a>"#;
        let result = run(json!({"operation": "extract_links", "html": html})).await;
        assert!(!result.is_error, "Result: {}", result.content);
        let parsed = parse(&result);
        assert_eq!(parsed["count"], 1);
        assert_eq!(parsed["links"][0]["url"], "https://ok.com");
    }

    #[tokio::test]
    async fn test_decodes_entities() {
        // The fixture must contain real character references; with a literal
        // `&` the assertion below would hold even if nothing were decoded.
        let html = "<p>Tom &amp; Jerry &lt;3</p>";
        let result = run(json!({"operation": "extract_text", "html": html})).await;
        assert!(!result.is_error, "Result: {}", result.content);
        let parsed = parse(&result);
        let text = parsed["text"].as_str().unwrap();
        assert!(text.contains("Tom & Jerry"), "Text: {text}");
    }

    #[tokio::test]
    async fn test_missing_operation() {
        let result = run(json!({"html": "<p>hi</p>"})).await;
        assert!(result.is_error);
        assert!(result.content.contains("operation"));
    }

    #[tokio::test]
    async fn test_missing_html() {
        let result = run(json!({"operation": "extract_text"})).await;
        assert!(result.is_error);
        assert!(result.content.contains("html"));
    }

    #[tokio::test]
    async fn test_unknown_operation() {
        let result = run(json!({"operation": "parse_dom", "html": "<p/>"})).await;
        assert!(result.is_error);
        assert!(result.content.contains("Unknown operation"));
    }

    #[test]
    fn test_descriptor_name() {
        let skill = HtmlLoaderSkill::new();
        assert_eq!(skill.descriptor().name, "html_loader");
    }

    #[test]
    fn test_descriptor_no_capabilities_required() {
        let skill = HtmlLoaderSkill::new();
        assert!(skill.descriptor().required_capabilities.is_empty());
    }
}