robinpath_modules/modules/
html_mod.rs1use robinpath::{RobinPath, Value};
2
3pub fn register(rp: &mut RobinPath) {
4 rp.register_builtin("html.stripTags", |args, _| {
5 let s = args.first().map(|v| v.to_display_string()).unwrap_or_default();
6 let re = regex::Regex::new(r"<[^>]*>").unwrap();
7 Ok(Value::String(re.replace_all(&s, "").to_string()))
8 });
9
10 rp.register_builtin("html.extractText", |args, _| {
11 let s = args.first().map(|v| v.to_display_string()).unwrap_or_default();
12 let tag = args.get(1).map(|v| v.to_display_string()).unwrap_or_default();
13 let pattern = format!(r"(?is)<{0}[^>]*>(.*?)</{0}>", regex::escape(&tag));
14 let re = regex::Regex::new(&pattern).unwrap();
15 let texts: Vec<Value> = re.captures_iter(&s)
16 .map(|c| {
17 let inner_re = regex::Regex::new(r"<[^>]*>").unwrap();
18 Value::String(inner_re.replace_all(&c[1], "").trim().to_string())
19 })
20 .collect();
21 Ok(Value::Array(texts))
22 });
23
24 rp.register_builtin("html.extractLinks", |args, _| {
25 let s = args.first().map(|v| v.to_display_string()).unwrap_or_default();
26 let re = regex::Regex::new(r#"(?is)<a[^>]*href=["']([^"']+)["'][^>]*>(.*?)</a>"#).unwrap();
27 let links: Vec<Value> = re.captures_iter(&s)
28 .map(|c| {
29 let mut obj = indexmap::IndexMap::new();
30 obj.insert("href".to_string(), Value::String(c[1].to_string()));
31 let tag_re = regex::Regex::new(r"<[^>]*>").unwrap();
32 obj.insert("text".to_string(), Value::String(tag_re.replace_all(&c[2], "").trim().to_string()));
33 Value::Object(obj)
34 })
35 .collect();
36 Ok(Value::Array(links))
37 });
38
39 rp.register_builtin("html.extractImages", |args, _| {
40 let s = args.first().map(|v| v.to_display_string()).unwrap_or_default();
41 let re = regex::Regex::new(r#"(?i)<img[^>]*src=["']([^"']+)["'][^>]*>"#).unwrap();
42 let alt_re = regex::Regex::new(r#"(?i)alt=["']([^"']*)["']"#).unwrap();
43 let images: Vec<Value> = re.captures_iter(&s)
44 .map(|c| {
45 let mut obj = indexmap::IndexMap::new();
46 obj.insert("src".to_string(), Value::String(c[1].to_string()));
47 let alt = alt_re.captures(&c[0]).map(|a| a[1].to_string()).unwrap_or_default();
48 obj.insert("alt".to_string(), Value::String(alt));
49 Value::Object(obj)
50 })
51 .collect();
52 Ok(Value::Array(images))
53 });
54
55 rp.register_builtin("html.getAttribute", |args, _| {
56 let s = args.first().map(|v| v.to_display_string()).unwrap_or_default();
57 let tag = args.get(1).map(|v| v.to_display_string()).unwrap_or_default();
58 let attr = args.get(2).map(|v| v.to_display_string()).unwrap_or_default();
59 let pattern = format!(r#"(?i)<{}[^>]*\s{}=["']([^"']*)["'][^>]*>"#, regex::escape(&tag), regex::escape(&attr));
60 let re = regex::Regex::new(&pattern).unwrap();
61 let values: Vec<Value> = re.captures_iter(&s)
62 .map(|c| Value::String(c[1].to_string()))
63 .collect();
64 Ok(Value::Array(values))
65 });
66
67 rp.register_builtin("html.escape", |args, _| {
68 let s = args.first().map(|v| v.to_display_string()).unwrap_or_default();
69 Ok(Value::String(escape_html(&s)))
70 });
71
72 rp.register_builtin("html.unescape", |args, _| {
73 let s = args.first().map(|v| v.to_display_string()).unwrap_or_default();
74 Ok(Value::String(unescape_html(&s)))
75 });
76
77 rp.register_builtin("html.extractMeta", |args, _| {
78 let s = args.first().map(|v| v.to_display_string()).unwrap_or_default();
79 let re = regex::Regex::new(r#"(?i)<meta[^>]*name=["']([^"']+)["'][^>]*content=["']([^"']*)["'][^>]*>"#).unwrap();
80 let re2 = regex::Regex::new(r#"(?i)<meta[^>]*content=["']([^"']*)["'][^>]*name=["']([^"']+)["'][^>]*>"#).unwrap();
81 let mut obj = indexmap::IndexMap::new();
82 for c in re.captures_iter(&s) {
83 obj.insert(c[1].to_string(), Value::String(c[2].to_string()));
84 }
85 for c in re2.captures_iter(&s) {
86 obj.insert(c[2].to_string(), Value::String(c[1].to_string()));
87 }
88 Ok(Value::Object(obj))
89 });
90
91 rp.register_builtin("html.getTitle", |args, _| {
92 let s = args.first().map(|v| v.to_display_string()).unwrap_or_default();
93 let re = regex::Regex::new(r"(?is)<title[^>]*>(.*?)</title>").unwrap();
94 Ok(match re.captures(&s) {
95 Some(c) => Value::String(c[1].trim().to_string()),
96 None => Value::Null,
97 })
98 });
99
100 rp.register_builtin("html.wrap", |args, _| {
101 let text = args.first().map(|v| v.to_display_string()).unwrap_or_default();
102 let tag = args.get(1).map(|v| v.to_display_string()).unwrap_or_else(|| "div".to_string());
103 let attrs = args.get(2).cloned().unwrap_or(Value::Null);
104 let attr_str = if let Value::Object(obj) = &attrs {
105 let parts: Vec<String> = obj.iter().map(|(k, v)| format!(" {}=\"{}\"", k, v.to_display_string())).collect();
106 parts.join("")
107 } else {
108 String::new()
109 };
110 Ok(Value::String(format!("<{}{}>{}</{}>", tag, attr_str, text, tag)))
111 });
112
113 rp.register_builtin("html.minify", |args, _| {
114 let s = args.first().map(|v| v.to_display_string()).unwrap_or_default();
115 let ws_re = regex::Regex::new(r">\s+<").unwrap();
116 let result = ws_re.replace_all(&s, "><");
117 let nl_re = regex::Regex::new(r"\s+").unwrap();
118 let result = nl_re.replace_all(&result, " ");
119 Ok(Value::String(result.trim().to_string()))
120 });
121
122 rp.register_builtin("html.extractTables", |args, _| {
123 let s = args.first().map(|v| v.to_display_string()).unwrap_or_default();
124 let table_re = regex::Regex::new(r"(?is)<table[^>]*>(.*?)</table>").unwrap();
125 let row_re = regex::Regex::new(r"(?is)<tr[^>]*>(.*?)</tr>").unwrap();
126 let cell_re = regex::Regex::new(r"(?is)<t[dh][^>]*>(.*?)</t[dh]>").unwrap();
127 let tag_re = regex::Regex::new(r"<[^>]*>").unwrap();
128
129 let tables: Vec<Value> = table_re.captures_iter(&s)
130 .map(|tc| {
131 let rows: Vec<Value> = row_re.captures_iter(&tc[1])
132 .map(|rc| {
133 let cells: Vec<Value> = cell_re.captures_iter(&rc[1])
134 .map(|cc| Value::String(tag_re.replace_all(&cc[1], "").trim().to_string()))
135 .collect();
136 Value::Array(cells)
137 })
138 .collect();
139 Value::Array(rows)
140 })
141 .collect();
142 Ok(Value::Array(tables))
143 });
144}
145
/// Escapes the characters that are significant in HTML text and attribute
/// values.
///
/// `&`, `<`, `>`, `"` and `'` become `&amp;`, `&lt;`, `&gt;`, `&quot;` and
/// `&#39;` respectively; every other character is copied unchanged. The input
/// is processed in a single pass, so an `&` produced by one replacement is
/// never escaped a second time.
fn escape_html(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => out.push_str("&amp;"),
            '<' => out.push_str("&lt;"),
            '>' => out.push_str("&gt;"),
            '"' => out.push_str("&quot;"),
            '\'' => out.push_str("&#39;"),
            other => out.push(other),
        }
    }
    out
}
153
/// Decodes the character references produced by HTML escaping.
///
/// Recognised references are `&amp;`, `&lt;`, `&gt;`, `&quot;` and the three
/// common spellings of the apostrophe (`&#39;`, `&#x27;`, `&apos;`). Any other
/// `&` is copied through unchanged.
///
/// The input is scanned once from left to right and decoded text is never
/// re-scanned, so `&amp;lt;` decodes to the literal text `&lt;` rather than
/// to `<`.
fn unescape_html(s: &str) -> String {
    const ENTITIES: [(&str, char); 7] = [
        ("&amp;", '&'),
        ("&lt;", '<'),
        ("&gt;", '>'),
        ("&quot;", '"'),
        ("&#39;", '\''),
        ("&#x27;", '\''),
        ("&apos;", '\''),
    ];

    let mut out = String::with_capacity(s.len());
    let mut rest = s;
    while let Some(pos) = rest.find('&') {
        out.push_str(&rest[..pos]);
        rest = &rest[pos..];
        match ENTITIES.iter().find(|(name, _)| rest.starts_with(*name)) {
            Some(&(name, ch)) => {
                out.push(ch);
                rest = &rest[name.len()..];
            }
            None => {
                // Not a reference we know: keep the ampersand as-is. `&` is a
                // single byte, so slicing at 1 stays on a char boundary.
                out.push('&');
                rest = &rest[1..];
            }
        }
    }
    out.push_str(rest);
    out
}