Skip to main content

robinpath_modules/modules/
html_mod.rs

1use robinpath::{RobinPath, Value};
2
3pub fn register(rp: &mut RobinPath) {
4    rp.register_builtin("html.stripTags", |args, _| {
5        let s = args.first().map(|v| v.to_display_string()).unwrap_or_default();
6        let re = regex::Regex::new(r"<[^>]*>").unwrap();
7        Ok(Value::String(re.replace_all(&s, "").to_string()))
8    });
9
10    rp.register_builtin("html.extractText", |args, _| {
11        let s = args.first().map(|v| v.to_display_string()).unwrap_or_default();
12        let tag = args.get(1).map(|v| v.to_display_string()).unwrap_or_default();
13        let pattern = format!(r"(?is)<{0}[^>]*>(.*?)</{0}>", regex::escape(&tag));
14        let re = regex::Regex::new(&pattern).unwrap();
15        let texts: Vec<Value> = re.captures_iter(&s)
16            .map(|c| {
17                let inner_re = regex::Regex::new(r"<[^>]*>").unwrap();
18                Value::String(inner_re.replace_all(&c[1], "").trim().to_string())
19            })
20            .collect();
21        Ok(Value::Array(texts))
22    });
23
24    rp.register_builtin("html.extractLinks", |args, _| {
25        let s = args.first().map(|v| v.to_display_string()).unwrap_or_default();
26        let re = regex::Regex::new(r#"(?is)<a[^>]*href=["']([^"']+)["'][^>]*>(.*?)</a>"#).unwrap();
27        let links: Vec<Value> = re.captures_iter(&s)
28            .map(|c| {
29                let mut obj = indexmap::IndexMap::new();
30                obj.insert("href".to_string(), Value::String(c[1].to_string()));
31                let tag_re = regex::Regex::new(r"<[^>]*>").unwrap();
32                obj.insert("text".to_string(), Value::String(tag_re.replace_all(&c[2], "").trim().to_string()));
33                Value::Object(obj)
34            })
35            .collect();
36        Ok(Value::Array(links))
37    });
38
39    rp.register_builtin("html.extractImages", |args, _| {
40        let s = args.first().map(|v| v.to_display_string()).unwrap_or_default();
41        let re = regex::Regex::new(r#"(?i)<img[^>]*src=["']([^"']+)["'][^>]*>"#).unwrap();
42        let alt_re = regex::Regex::new(r#"(?i)alt=["']([^"']*)["']"#).unwrap();
43        let images: Vec<Value> = re.captures_iter(&s)
44            .map(|c| {
45                let mut obj = indexmap::IndexMap::new();
46                obj.insert("src".to_string(), Value::String(c[1].to_string()));
47                let alt = alt_re.captures(&c[0]).map(|a| a[1].to_string()).unwrap_or_default();
48                obj.insert("alt".to_string(), Value::String(alt));
49                Value::Object(obj)
50            })
51            .collect();
52        Ok(Value::Array(images))
53    });
54
55    rp.register_builtin("html.getAttribute", |args, _| {
56        let s = args.first().map(|v| v.to_display_string()).unwrap_or_default();
57        let tag = args.get(1).map(|v| v.to_display_string()).unwrap_or_default();
58        let attr = args.get(2).map(|v| v.to_display_string()).unwrap_or_default();
59        let pattern = format!(r#"(?i)<{}[^>]*\s{}=["']([^"']*)["'][^>]*>"#, regex::escape(&tag), regex::escape(&attr));
60        let re = regex::Regex::new(&pattern).unwrap();
61        let values: Vec<Value> = re.captures_iter(&s)
62            .map(|c| Value::String(c[1].to_string()))
63            .collect();
64        Ok(Value::Array(values))
65    });
66
67    rp.register_builtin("html.escape", |args, _| {
68        let s = args.first().map(|v| v.to_display_string()).unwrap_or_default();
69        Ok(Value::String(escape_html(&s)))
70    });
71
72    rp.register_builtin("html.unescape", |args, _| {
73        let s = args.first().map(|v| v.to_display_string()).unwrap_or_default();
74        Ok(Value::String(unescape_html(&s)))
75    });
76
77    rp.register_builtin("html.extractMeta", |args, _| {
78        let s = args.first().map(|v| v.to_display_string()).unwrap_or_default();
79        let re = regex::Regex::new(r#"(?i)<meta[^>]*name=["']([^"']+)["'][^>]*content=["']([^"']*)["'][^>]*>"#).unwrap();
80        let re2 = regex::Regex::new(r#"(?i)<meta[^>]*content=["']([^"']*)["'][^>]*name=["']([^"']+)["'][^>]*>"#).unwrap();
81        let mut obj = indexmap::IndexMap::new();
82        for c in re.captures_iter(&s) {
83            obj.insert(c[1].to_string(), Value::String(c[2].to_string()));
84        }
85        for c in re2.captures_iter(&s) {
86            obj.insert(c[2].to_string(), Value::String(c[1].to_string()));
87        }
88        Ok(Value::Object(obj))
89    });
90
91    rp.register_builtin("html.getTitle", |args, _| {
92        let s = args.first().map(|v| v.to_display_string()).unwrap_or_default();
93        let re = regex::Regex::new(r"(?is)<title[^>]*>(.*?)</title>").unwrap();
94        Ok(match re.captures(&s) {
95            Some(c) => Value::String(c[1].trim().to_string()),
96            None => Value::Null,
97        })
98    });
99
100    rp.register_builtin("html.wrap", |args, _| {
101        let text = args.first().map(|v| v.to_display_string()).unwrap_or_default();
102        let tag = args.get(1).map(|v| v.to_display_string()).unwrap_or_else(|| "div".to_string());
103        let attrs = args.get(2).cloned().unwrap_or(Value::Null);
104        let attr_str = if let Value::Object(obj) = &attrs {
105            let parts: Vec<String> = obj.iter().map(|(k, v)| format!(" {}=\"{}\"", k, v.to_display_string())).collect();
106            parts.join("")
107        } else {
108            String::new()
109        };
110        Ok(Value::String(format!("<{}{}>{}</{}>", tag, attr_str, text, tag)))
111    });
112
113    rp.register_builtin("html.minify", |args, _| {
114        let s = args.first().map(|v| v.to_display_string()).unwrap_or_default();
115        let ws_re = regex::Regex::new(r">\s+<").unwrap();
116        let result = ws_re.replace_all(&s, "><");
117        let nl_re = regex::Regex::new(r"\s+").unwrap();
118        let result = nl_re.replace_all(&result, " ");
119        Ok(Value::String(result.trim().to_string()))
120    });
121
122    rp.register_builtin("html.extractTables", |args, _| {
123        let s = args.first().map(|v| v.to_display_string()).unwrap_or_default();
124        let table_re = regex::Regex::new(r"(?is)<table[^>]*>(.*?)</table>").unwrap();
125        let row_re = regex::Regex::new(r"(?is)<tr[^>]*>(.*?)</tr>").unwrap();
126        let cell_re = regex::Regex::new(r"(?is)<t[dh][^>]*>(.*?)</t[dh]>").unwrap();
127        let tag_re = regex::Regex::new(r"<[^>]*>").unwrap();
128
129        let tables: Vec<Value> = table_re.captures_iter(&s)
130            .map(|tc| {
131                let rows: Vec<Value> = row_re.captures_iter(&tc[1])
132                    .map(|rc| {
133                        let cells: Vec<Value> = cell_re.captures_iter(&rc[1])
134                            .map(|cc| Value::String(tag_re.replace_all(&cc[1], "").trim().to_string()))
135                            .collect();
136                        Value::Array(cells)
137                    })
138                    .collect();
139                Value::Array(rows)
140            })
141            .collect();
142        Ok(Value::Array(tables))
143    });
144}
145
/// Returns `s` with the five HTML-significant characters replaced by entities:
/// `&` → `&amp;`, `<` → `&lt;`, `>` → `&gt;`, `"` → `&quot;`, `'` → `&#39;`.
/// All other characters are copied unchanged.
fn escape_html(s: &str) -> String {
    // Single pass: each input character is emitted exactly once, so the `&`
    // introduced by an entity is never escaped again.
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
153
/// Decodes the entities produced by `escape_html` (`&lt;`, `&gt;`, `&quot;`,
/// `&#39;` and `&amp;`) back into their characters. Any other entity is left
/// as written.
fn unescape_html(s: &str) -> String {
    // `&amp;` must be decoded last: decoding it first would turn `&amp;lt;`
    // into `&lt;` and then into `<`, i.e. unescape the text twice and break
    // the round trip with `escape_html`.
    s.replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#39;", "'")
        .replace("&amp;", "&")
}