Skip to main content

argentor_builtins/
html_loader.rs

1//! HTML document loader skill for the Argentor framework.
2//!
3//! Strips HTML tags, decodes common entities, extracts metadata, links, and
4//! images. Designed for RAG pipelines that ingest raw HTML content.
5
6use argentor_core::{ArgentorResult, ToolCall, ToolResult};
7use argentor_skills::skill::{Skill, SkillDescriptor};
8use async_trait::async_trait;
9use regex::Regex;
10use serde_json::{json, Value};
11
12use crate::web_scraper::strip_html_tags;
13
14/// HTML document loader skill: tag stripping, metadata, links, images.
15pub struct HtmlLoaderSkill {
16    descriptor: SkillDescriptor,
17}
18
19impl HtmlLoaderSkill {
20    /// Create a new HTML loader skill.
21    pub fn new() -> Self {
22        Self {
23            descriptor: SkillDescriptor {
24                name: "html_loader".to_string(),
25                description: "HTML document loader: extract_text, extract_links, extract_images, extract_metadata, strip_tags.".to_string(),
26                parameters_schema: json!({
27                    "type": "object",
28                    "properties": {
29                        "operation": {
30                            "type": "string",
31                            "enum": ["extract_text", "extract_links", "extract_images", "extract_metadata", "strip_tags"],
32                            "description": "The HTML operation to perform"
33                        },
34                        "html": {
35                            "type": "string",
36                            "description": "HTML content to process"
37                        }
38                    },
39                    "required": ["operation", "html"]
40                }),
41                required_capabilities: vec![],
42                requires_approval: false,
43            },
44        }
45    }
46}
47
48impl Default for HtmlLoaderSkill {
49    fn default() -> Self {
50        Self::new()
51    }
52}
53
54/// Extract `<a href="...">text</a>` links from HTML.
55fn extract_links_internal(html: &str) -> Vec<Value> {
56    let mut links = Vec::new();
57    let re = match Regex::new(r#"(?is)<a\s[^>]*href=["']([^"']+)["'][^>]*>(.*?)</a>"#) {
58        Ok(r) => r,
59        Err(_) => return links,
60    };
61    for caps in re.captures_iter(html) {
62        let url = caps.get(1).map_or("", |m| m.as_str()).to_string();
63        let raw_text = caps.get(2).map_or("", |m| m.as_str());
64        let text = strip_html_tags(raw_text);
65        if url.starts_with('#') || url.starts_with("javascript:") {
66            continue;
67        }
68        links.push(json!({
69            "url": url,
70            "text": text.trim(),
71        }));
72    }
73    links
74}
75
76/// Extract `<img src="..." alt="...">` images from HTML.
77fn extract_images_internal(html: &str) -> Vec<Value> {
78    let mut images = Vec::new();
79    let re_src = match Regex::new(r#"(?is)<img\s[^>]*src=["']([^"']+)["'][^>]*>"#) {
80        Ok(r) => r,
81        Err(_) => return images,
82    };
83    let re_alt = Regex::new(r#"(?is)alt=["']([^"']*)["']"#);
84
85    for caps in re_src.captures_iter(html) {
86        let full_tag = caps.get(0).map_or("", |m| m.as_str());
87        let src = caps.get(1).map_or("", |m| m.as_str()).to_string();
88        let alt = if let Ok(ref re) = re_alt {
89            re.captures(full_tag)
90                .and_then(|c| c.get(1).map(|m| m.as_str().to_string()))
91                .unwrap_or_default()
92        } else {
93            String::new()
94        };
95        images.push(json!({
96            "src": src,
97            "alt": alt,
98        }));
99    }
100    images
101}
102
103/// Extract document title and meta description from HTML.
104fn extract_metadata_internal(html: &str) -> Value {
105    let mut meta = json!({});
106
107    // Title
108    if let Ok(re) = Regex::new(r"(?is)<title[^>]*>(.*?)</title>") {
109        if let Some(caps) = re.captures(html) {
110            if let Some(m) = caps.get(1) {
111                meta["title"] = Value::String(strip_html_tags(m.as_str()));
112            }
113        }
114    }
115
116    // Meta description
117    if let Ok(re) = Regex::new(
118        r#"(?is)<meta\s[^>]*name=["']description["'][^>]*content=["']([^"']+)["'][^>]*/?>"#,
119    ) {
120        if let Some(caps) = re.captures(html) {
121            if let Some(m) = caps.get(1) {
122                meta["description"] = Value::String(m.as_str().to_string());
123            }
124        }
125    }
126
127    // Also try content before name (common alternate order)
128    if meta.get("description").is_none() {
129        if let Ok(re) = Regex::new(
130            r#"(?is)<meta\s[^>]*content=["']([^"']+)["'][^>]*name=["']description["'][^>]*/?>"#,
131        ) {
132            if let Some(caps) = re.captures(html) {
133                if let Some(m) = caps.get(1) {
134                    meta["description"] = Value::String(m.as_str().to_string());
135                }
136            }
137        }
138    }
139
140    // Meta keywords
141    if let Ok(re) =
142        Regex::new(r#"(?is)<meta\s[^>]*name=["']keywords["'][^>]*content=["']([^"']+)["'][^>]*/?>"#)
143    {
144        if let Some(caps) = re.captures(html) {
145            if let Some(m) = caps.get(1) {
146                meta["keywords"] = Value::String(m.as_str().to_string());
147            }
148        }
149    }
150
151    // Language (html lang attribute)
152    if let Ok(re) = Regex::new(r#"(?is)<html[^>]*lang=["']([^"']+)["']"#) {
153        if let Some(caps) = re.captures(html) {
154            if let Some(m) = caps.get(1) {
155                meta["lang"] = Value::String(m.as_str().to_string());
156            }
157        }
158    }
159
160    // Charset
161    if let Ok(re) = Regex::new(r#"(?is)<meta\s[^>]*charset=["']?([A-Za-z0-9\-_]+)["']?"#) {
162        if let Some(caps) = re.captures(html) {
163            if let Some(m) = caps.get(1) {
164                meta["charset"] = Value::String(m.as_str().to_string());
165            }
166        }
167    }
168
169    meta
170}
171
172#[async_trait]
173impl Skill for HtmlLoaderSkill {
174    fn descriptor(&self) -> &SkillDescriptor {
175        &self.descriptor
176    }
177
178    async fn execute(&self, call: ToolCall) -> ArgentorResult<ToolResult> {
179        let operation = match call.arguments["operation"].as_str() {
180            Some(op) => op,
181            None => {
182                return Ok(ToolResult::error(
183                    &call.id,
184                    "Missing required parameter: 'operation'",
185                ))
186            }
187        };
188
189        let html = match call.arguments["html"].as_str() {
190            Some(v) => v,
191            None => {
192                return Ok(ToolResult::error(
193                    &call.id,
194                    "Missing required parameter: 'html'",
195                ))
196            }
197        };
198
199        match operation {
200            "extract_text" | "strip_tags" => {
201                let text = strip_html_tags(html);
202                let response = json!({
203                    "text": text,
204                    "length": text.len(),
205                });
206                Ok(ToolResult::success(&call.id, response.to_string()))
207            }
208            "extract_links" => {
209                let links = extract_links_internal(html);
210                let response = json!({ "links": links, "count": links.len() });
211                Ok(ToolResult::success(&call.id, response.to_string()))
212            }
213            "extract_images" => {
214                let images = extract_images_internal(html);
215                let response = json!({ "images": images, "count": images.len() });
216                Ok(ToolResult::success(&call.id, response.to_string()))
217            }
218            "extract_metadata" => {
219                let meta = extract_metadata_internal(html);
220                Ok(ToolResult::success(&call.id, meta.to_string()))
221            }
222            _ => Ok(ToolResult::error(
223                &call.id,
224                format!("Unknown operation: '{operation}'. Supported: extract_text, extract_links, extract_images, extract_metadata, strip_tags"),
225            )),
226        }
227    }
228}
229
#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::expect_used)]
mod tests {
    use super::*;

    /// Small but complete document exercising every operation.
    const SAMPLE_HTML: &str = r##"<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Test Page</title>
    <meta name="description" content="A sample page for testing">
    <meta name="keywords" content="test, html, parser">
</head>
<body>
    <h1>Main Heading</h1>
    <p>This is a paragraph with <strong>bold</strong> text.</p>
    <a href="https://example.com">Example Link</a>
    <a href="https://github.com">GitHub</a>
    <a href="#anchor">Skip anchor</a>
    <img src="photo.jpg" alt="A photo">
    <img src="icon.png" alt="">
    <script>alert('bad');</script>
    <style>body { color: red; }</style>
</body>
</html>"##;

    /// Wrap raw JSON arguments in a `ToolCall` addressed to this skill.
    fn make_call(args: Value) -> ToolCall {
        ToolCall {
            id: "test".to_string(),
            name: "html_loader".to_string(),
            arguments: args,
        }
    }

    /// Execute a fresh skill instance with `args` and return its result.
    async fn run(args: Value) -> ToolResult {
        HtmlLoaderSkill::new()
            .execute(make_call(args))
            .await
            .unwrap()
    }

    /// Parse the JSON payload carried in a result's `content`.
    fn payload(result: &ToolResult) -> Value {
        serde_json::from_str(&result.content).unwrap()
    }

    #[tokio::test]
    async fn test_extract_text() {
        let result = run(json!({"operation": "extract_text", "html": SAMPLE_HTML})).await;
        assert!(!result.is_error, "Result: {}", result.content);
        let body = payload(&result);
        let text = body["text"].as_str().unwrap();
        for expected in ["Main Heading", "paragraph", "bold"] {
            assert!(text.contains(expected));
        }
        assert!(!text.contains("alert"), "Scripts should be stripped");
    }

    #[tokio::test]
    async fn test_strip_tags_alias() {
        let result =
            run(json!({"operation": "strip_tags", "html": "<p>Hello <b>World</b></p>"})).await;
        assert!(!result.is_error);
        let body = payload(&result);
        let text = body["text"].as_str().unwrap();
        assert!(text.contains("Hello"));
        assert!(!text.contains('<'));
    }

    #[tokio::test]
    async fn test_extract_links() {
        let result = run(json!({"operation": "extract_links", "html": SAMPLE_HTML})).await;
        assert!(!result.is_error);
        let body = payload(&result);
        // 2 external links, skips the #anchor
        assert_eq!(body["count"], 2);
        let first = &body["links"].as_array().unwrap()[0];
        assert_eq!(first["url"], "https://example.com");
        assert_eq!(first["text"], "Example Link");
    }

    #[tokio::test]
    async fn test_extract_images() {
        let result = run(json!({"operation": "extract_images", "html": SAMPLE_HTML})).await;
        assert!(!result.is_error);
        let body = payload(&result);
        assert_eq!(body["count"], 2);
        let images = body["images"].as_array().unwrap();
        assert_eq!(images[0]["src"], "photo.jpg");
        assert_eq!(images[0]["alt"], "A photo");
        assert_eq!(images[1]["alt"], "");
    }

    #[tokio::test]
    async fn test_extract_metadata() {
        let result = run(json!({"operation": "extract_metadata", "html": SAMPLE_HTML})).await;
        assert!(!result.is_error);
        let body = payload(&result);
        assert_eq!(body["title"], "Test Page");
        assert_eq!(body["description"], "A sample page for testing");
        assert_eq!(body["keywords"], "test, html, parser");
        assert_eq!(body["lang"], "en");
        assert_eq!(body["charset"], "UTF-8");
    }

    #[tokio::test]
    async fn test_extract_metadata_missing_fields() {
        let html = "<html><body><p>no meta</p></body></html>";
        let result = run(json!({"operation": "extract_metadata", "html": html})).await;
        assert!(!result.is_error);
        let body = payload(&result);
        assert!(body.get("title").is_none() || body["title"].is_null());
        assert!(body.get("description").is_none() || body["description"].is_null());
    }

    #[tokio::test]
    async fn test_extract_text_empty_html() {
        let result = run(json!({"operation": "extract_text", "html": ""})).await;
        assert!(!result.is_error);
        let body = payload(&result);
        assert_eq!(body["text"], "");
        assert_eq!(body["length"], 0);
    }

    #[tokio::test]
    async fn test_extract_links_no_links() {
        let html = "<p>No links at all.</p>";
        let result = run(json!({"operation": "extract_links", "html": html})).await;
        assert!(!result.is_error);
        assert_eq!(payload(&result)["count"], 0);
    }

    #[tokio::test]
    async fn test_extract_links_skips_javascript() {
        let html = r#"<a href="javascript:alert(1)">bad</a><a href="https://ok.com">ok</a>"#;
        let result = run(json!({"operation": "extract_links", "html": html})).await;
        let body = payload(&result);
        assert_eq!(body["count"], 1);
        assert_eq!(body["links"][0]["url"], "https://ok.com");
    }

    #[tokio::test]
    async fn test_decodes_entities() {
        let html = "<p>Tom &amp; Jerry &lt;3</p>";
        let result = run(json!({"operation": "extract_text", "html": html})).await;
        let body = payload(&result);
        let text = body["text"].as_str().unwrap();
        assert!(text.contains("Tom & Jerry"));
    }

    #[tokio::test]
    async fn test_missing_operation() {
        let result = run(json!({"html": "<p>hi</p>"})).await;
        assert!(result.is_error);
    }

    #[tokio::test]
    async fn test_missing_html() {
        let result = run(json!({"operation": "extract_text"})).await;
        assert!(result.is_error);
    }

    #[tokio::test]
    async fn test_unknown_operation() {
        let result = run(json!({"operation": "parse_dom", "html": "<p/>"})).await;
        assert!(result.is_error);
        assert!(result.content.contains("Unknown operation"));
    }

    #[test]
    fn test_descriptor_name() {
        let skill = HtmlLoaderSkill::new();
        assert_eq!(skill.descriptor().name, "html_loader");
    }

    #[test]
    fn test_descriptor_no_capabilities_required() {
        let skill = HtmlLoaderSkill::new();
        assert!(skill.descriptor().required_capabilities.is_empty());
    }
}