// agent_chain_core/utils/html.rs

use regex::Regex;
use std::collections::HashSet;
use std::sync::OnceLock;
7
// Link prefixes that mark non-navigable hrefs (inline scripts, e-mail
// links, same-page anchors); links starting with any of these are dropped.
pub const PREFIXES_TO_IGNORE: &[&str] = &["javascript:", "mailto:", "#"];
10
// File-extension suffixes for non-HTML assets (styles, scripts, images,
// archives); links ending with any of these are dropped.
pub const SUFFIXES_TO_IGNORE: &[&str] = &[
    ".css", ".js", ".ico", ".png", ".jpg", ".jpeg", ".gif", ".svg", ".csv", ".bz2", ".zip", ".epub",
];
15
16pub fn default_link_regex() -> Regex {
19 Regex::new(r#"href=["']([^"'#]+)["'#]"#).expect("Failed to compile default link regex")
22}
23
24fn should_ignore_prefix(link: &str) -> bool {
26 PREFIXES_TO_IGNORE
27 .iter()
28 .any(|prefix| link.starts_with(prefix))
29}
30
31fn should_ignore_suffix(link: &str) -> bool {
33 SUFFIXES_TO_IGNORE
34 .iter()
35 .any(|suffix| link.ends_with(suffix))
36}
37
38pub fn find_all_links(raw_html: &str, pattern: Option<&Regex>) -> Vec<String> {
59 let default_regex = default_link_regex();
60 let regex = pattern.unwrap_or(&default_regex);
61
62 regex
63 .captures_iter(raw_html)
64 .filter_map(|cap| cap.get(1).map(|m| m.as_str().to_string()))
65 .filter(|link| !should_ignore_prefix(link) && !should_ignore_suffix(link))
66 .collect::<HashSet<_>>()
67 .into_iter()
68 .collect()
69}
70
/// Extracts absolute sub-links from `raw_html`, resolved against `url`.
///
/// * `raw_html` – the page markup to scan.
/// * `url` – the URL the HTML was fetched from; relative links are joined
///   against it and protocol-relative links inherit its scheme.
/// * `base_url` – optional base used for the outside-link checks; falls
///   back to `url` when `None`.
/// * `pattern` – optional custom link regex forwarded to [`find_all_links`].
/// * `prevent_outside` – when `true`, drop links whose host differs from
///   the base URL's host or whose string form does not start with the base
///   URL.
/// * `exclude_prefixes` – absolute-URL prefixes to drop unconditionally.
///
/// Returns an empty `Vec` when `base_url`/`url` fails to parse. Result
/// order is unspecified (links pass through a `HashSet`).
pub fn extract_sub_links(
    raw_html: &str,
    url: &str,
    base_url: Option<&str>,
    pattern: Option<&Regex>,
    prevent_outside: bool,
    exclude_prefixes: &[&str],
) -> Vec<String> {
    let base_url_to_use = base_url.unwrap_or(url);

    // Both the base and the page URL must be absolute, parseable URLs;
    // without them there is nothing to resolve or compare against.
    let parsed_base_url = match url::Url::parse(base_url_to_use) {
        Ok(u) => u,
        Err(_) => return vec![],
    };

    let parsed_url = match url::Url::parse(url) {
        Ok(u) => u,
        Err(_) => return vec![],
    };

    let all_links = find_all_links(raw_html, pattern);
    let mut absolute_paths = HashSet::new();

    for link in all_links {
        let absolute_path = match url::Url::parse(&link) {
            Ok(parsed_link) => {
                // Already absolute: keep only http(s); any other scheme
                // (ftp:, data:, file:, …) is skipped entirely.
                if parsed_link.scheme() == "http" || parsed_link.scheme() == "https" {
                    link
                } else {
                    continue;
                }
            }
            Err(_) => {
                if link.starts_with("//") {
                    // Protocol-relative link: prepend the page's scheme.
                    format!("{}:{}", parsed_url.scheme(), link)
                } else {
                    // Relative link: resolve against the page URL; links
                    // that cannot be joined are silently dropped.
                    match parsed_url.join(&link) {
                        Ok(joined) => joined.to_string(),
                        Err(_) => continue,
                    }
                }
            }
        };

        absolute_paths.insert(absolute_path);
    }

    let mut results = Vec::new();

    for path in absolute_paths {
        // Unconditional exclusion by absolute-URL prefix.
        if exclude_prefixes
            .iter()
            .any(|prefix| path.starts_with(prefix))
        {
            continue;
        }

        if prevent_outside {
            let parsed_path = match url::Url::parse(&path) {
                Ok(u) => u,
                Err(_) => continue,
            };

            // The link's host must match the base URL's host…
            if parsed_base_url.host_str() != parsed_path.host_str() {
                continue;
            }

            // …and its string form must sit under the base URL.
            // NOTE(review): this is a plain string-prefix test, so a base of
            // "https://example.com" also admits "https://example.community/…"
            // — confirm whether a path-boundary-aware check is intended.
            if !path.starts_with(base_url_to_use) {
                continue;
            }
        }

        results.push(path);
    }

    results
}
172
// Unit tests covering link extraction, the ignore-lists, and sub-link
// filtering behaviour.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_find_all_links() {
        let html = r#"
        <a href="https://example.com/page1">Link 1</a>
        <a href="/page2">Link 2</a>
        <a href="https://example.com/page1">Duplicate</a>
        "#;

        let links = find_all_links(html, None);
        // Absolute and relative hrefs are both captured; duplicates collapse.
        assert!(links.contains(&"https://example.com/page1".to_string()));
        assert!(links.contains(&"/page2".to_string()));
    }

    #[test]
    fn test_find_all_links_ignores_prefixes() {
        // r##…## delimiters because the fixture itself contains a `#`.
        let html = r##"
        <a href="javascript:void(0)">JS Link</a>
        <a href="mailto:test@example.com">Email</a>
        <a href="#section">Anchor</a>
        <a href="https://example.com">Valid</a>
        "##;

        let links = find_all_links(html, None);
        // Every PREFIXES_TO_IGNORE entry must be filtered out.
        assert!(!links.iter().any(|l| l.starts_with("javascript:")));
        assert!(!links.iter().any(|l| l.starts_with("mailto:")));
        assert!(!links.iter().any(|l| l.starts_with("#")));
    }

    #[test]
    fn test_find_all_links_ignores_suffixes() {
        let html = r##"
        <a href="style.css">CSS</a>
        <a href="script.js">JS</a>
        <a href="image.png">Image</a>
        <a href="https://example.com/page">Valid</a>
        "##;

        let links = find_all_links(html, None);
        // Asset-extension links (SUFFIXES_TO_IGNORE) must be filtered out.
        assert!(!links.iter().any(|l| l.ends_with(".css")));
        assert!(!links.iter().any(|l| l.ends_with(".js")));
        assert!(!links.iter().any(|l| l.ends_with(".png")));
    }

    #[test]
    fn test_extract_sub_links() {
        let html = r#"
        <a href="/page1">Link 1</a>
        <a href="https://example.com/page2">Link 2</a>
        "#;

        let links = extract_sub_links(html, "https://example.com", None, None, true, &[]);

        // With prevent_outside, every surviving link stays under the base URL.
        for link in &links {
            assert!(link.starts_with("https://example.com"));
        }
    }

    #[test]
    fn test_extract_sub_links_prevent_outside() {
        let html = r#"
        <a href="https://example.com/page">Internal</a>
        <a href="https://other.com/page">External</a>
        "#;

        let links = extract_sub_links(html, "https://example.com", None, None, true, &[]);

        // Same-host links survive; foreign hosts are dropped.
        assert!(links.iter().any(|l| l.contains("example.com")));
        assert!(!links.iter().any(|l| l.contains("other.com")));
    }

    #[test]
    fn test_extract_sub_links_exclude_prefixes() {
        let html = r#"
        <a href="https://example.com/api/v1">API</a>
        <a href="https://example.com/page">Page</a>
        "#;

        let links = extract_sub_links(
            html,
            "https://example.com",
            None,
            None,
            false,
            &["https://example.com/api"],
        );

        // Excluded prefixes are removed even with prevent_outside disabled.
        assert!(!links.iter().any(|l| l.contains("/api/")));
        assert!(links.iter().any(|l| l.contains("/page")));
    }
}