// agent_chain_core/utils/html.rs

1//! Utilities for working with HTML.
2//!
3//! Adapted from langchain_core/utils/html.py
4
5use regex::Regex;
6use std::collections::HashSet;
7
/// Prefixes to ignore when extracting links.
///
/// These mark non-navigable hrefs: script pseudo-URLs, e-mail links, and
/// in-page fragment anchors.
pub const PREFIXES_TO_IGNORE: &[&str] = &["javascript:", "mailto:", "#"];
10
/// Suffixes to ignore when extracting links.
///
/// Links ending in these extensions point at static assets or archives
/// rather than crawlable HTML pages.
pub const SUFFIXES_TO_IGNORE: &[&str] = &[
    ".css", ".js", ".ico", ".png", ".jpg", ".jpeg", ".gif", ".svg", ".csv", ".bz2", ".zip", ".epub",
];
15
16/// Default regex pattern for extracting links from HTML.
17/// This captures all href values, filtering is done in Rust code.
18pub fn default_link_regex() -> Regex {
19    // Simple pattern to match href values - filtering is done afterwards
20    // since Rust regex doesn't support look-around assertions
21    Regex::new(r#"href=["']([^"'#]+)["'#]"#).expect("Failed to compile default link regex")
22}
23
24/// Check if a link should be ignored based on its prefix.
25fn should_ignore_prefix(link: &str) -> bool {
26    PREFIXES_TO_IGNORE
27        .iter()
28        .any(|prefix| link.starts_with(prefix))
29}
30
31/// Check if a link should be ignored based on its suffix.
32fn should_ignore_suffix(link: &str) -> bool {
33    SUFFIXES_TO_IGNORE
34        .iter()
35        .any(|suffix| link.ends_with(suffix))
36}
37
38/// Extract all links from a raw HTML string.
39///
40/// # Arguments
41///
42/// * `raw_html` - The original HTML string.
43/// * `pattern` - Optional regex pattern for extracting links. If not provided, uses the default pattern.
44///
45/// # Returns
46///
47/// A list of all unique links found in the HTML.
48///
49/// # Example
50///
51/// ```
52/// use agent_chain_core::utils::html::find_all_links;
53///
54/// let html = r#"<a href="https://example.com">Link</a>"#;
55/// let links = find_all_links(html, None);
56/// assert!(links.contains(&"https://example.com".to_string()));
57/// ```
58pub fn find_all_links(raw_html: &str, pattern: Option<&Regex>) -> Vec<String> {
59    let default_regex = default_link_regex();
60    let regex = pattern.unwrap_or(&default_regex);
61
62    regex
63        .captures_iter(raw_html)
64        .filter_map(|cap| cap.get(1).map(|m| m.as_str().to_string()))
65        .filter(|link| !should_ignore_prefix(link) && !should_ignore_suffix(link))
66        .collect::<HashSet<_>>()
67        .into_iter()
68        .collect()
69}
70
71/// Extract all links from a raw HTML string and convert into absolute paths.
72///
73/// # Arguments
74///
75/// * `raw_html` - The original HTML string.
76/// * `url` - The URL of the HTML page.
77/// * `base_url` - The base URL to check for outside links against. If not provided, uses `url`.
78/// * `pattern` - Optional regex pattern for extracting links.
79/// * `prevent_outside` - If `true`, ignore external links which are not children of the base URL.
80/// * `exclude_prefixes` - Exclude any URLs that start with one of these prefixes.
81///
82/// # Returns
83///
84/// A list of absolute URLs found in the HTML.
85///
86/// # Example
87///
88/// ```
89/// use agent_chain_core::utils::html::extract_sub_links;
90///
91/// let html = r#"<a href="/page">Link</a>"#;
92/// let links = extract_sub_links(html, "https://example.com", None, None, true, &[]);
93/// // Would contain "https://example.com/page"
94/// ```
95pub fn extract_sub_links(
96    raw_html: &str,
97    url: &str,
98    base_url: Option<&str>,
99    pattern: Option<&Regex>,
100    prevent_outside: bool,
101    exclude_prefixes: &[&str],
102) -> Vec<String> {
103    let base_url_to_use = base_url.unwrap_or(url);
104
105    let parsed_base_url = match url::Url::parse(base_url_to_use) {
106        Ok(u) => u,
107        Err(_) => return vec![],
108    };
109
110    let parsed_url = match url::Url::parse(url) {
111        Ok(u) => u,
112        Err(_) => return vec![],
113    };
114
115    let all_links = find_all_links(raw_html, pattern);
116    let mut absolute_paths = HashSet::new();
117
118    for link in all_links {
119        let absolute_path = match url::Url::parse(&link) {
120            Ok(parsed_link) => {
121                if parsed_link.scheme() == "http" || parsed_link.scheme() == "https" {
122                    link
123                } else {
124                    continue;
125                }
126            }
127            Err(_) => {
128                if link.starts_with("//") {
129                    format!("{}:{}", parsed_url.scheme(), link)
130                } else {
131                    match parsed_url.join(&link) {
132                        Ok(joined) => joined.to_string(),
133                        Err(_) => continue,
134                    }
135                }
136            }
137        };
138
139        absolute_paths.insert(absolute_path);
140    }
141
142    let mut results = Vec::new();
143
144    for path in absolute_paths {
145        if exclude_prefixes
146            .iter()
147            .any(|prefix| path.starts_with(prefix))
148        {
149            continue;
150        }
151
152        if prevent_outside {
153            let parsed_path = match url::Url::parse(&path) {
154                Ok(u) => u,
155                Err(_) => continue,
156            };
157
158            if parsed_base_url.host_str() != parsed_path.host_str() {
159                continue;
160            }
161
162            if !path.starts_with(base_url_to_use) {
163                continue;
164            }
165        }
166
167        results.push(path);
168    }
169
170    results
171}
172
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_find_all_links() {
        let page = r#"
            <a href="https://example.com/page1">Link 1</a>
            <a href="/page2">Link 2</a>
            <a href="https://example.com/page1">Duplicate</a>
        "#;

        let found = find_all_links(page, None);
        assert!(found.iter().any(|l| l == "https://example.com/page1"));
        assert!(found.iter().any(|l| l == "/page2"));
    }

    #[test]
    fn test_find_all_links_ignores_prefixes() {
        let page = r##"
            <a href="javascript:void(0)">JS Link</a>
            <a href="mailto:test@example.com">Email</a>
            <a href="#section">Anchor</a>
            <a href="https://example.com">Valid</a>
        "##;

        let found = find_all_links(page, None);
        for bad in ["javascript:", "mailto:", "#"] {
            assert!(found.iter().all(|l| !l.starts_with(bad)));
        }
    }

    #[test]
    fn test_find_all_links_ignores_suffixes() {
        let page = r##"
            <a href="style.css">CSS</a>
            <a href="script.js">JS</a>
            <a href="image.png">Image</a>
            <a href="https://example.com/page">Valid</a>
        "##;

        let found = find_all_links(page, None);
        for bad in [".css", ".js", ".png"] {
            assert!(found.iter().all(|l| !l.ends_with(bad)));
        }
    }

    #[test]
    fn test_extract_sub_links() {
        let page = r#"
            <a href="/page1">Link 1</a>
            <a href="https://example.com/page2">Link 2</a>
        "#;

        let found = extract_sub_links(page, "https://example.com", None, None, true, &[]);

        // With prevent_outside, every result stays under the base URL.
        assert!(found.iter().all(|l| l.starts_with("https://example.com")));
    }

    #[test]
    fn test_extract_sub_links_prevent_outside() {
        let page = r#"
            <a href="https://example.com/page">Internal</a>
            <a href="https://other.com/page">External</a>
        "#;

        let found = extract_sub_links(page, "https://example.com", None, None, true, &[]);

        let has_internal = found.iter().any(|l| l.contains("example.com"));
        let has_external = found.iter().any(|l| l.contains("other.com"));
        assert!(has_internal);
        assert!(!has_external);
    }

    #[test]
    fn test_extract_sub_links_exclude_prefixes() {
        let page = r#"
            <a href="https://example.com/api/v1">API</a>
            <a href="https://example.com/page">Page</a>
        "#;

        let found = extract_sub_links(
            page,
            "https://example.com",
            None,
            None,
            false,
            &["https://example.com/api"],
        );

        assert!(found.iter().all(|l| !l.contains("/api/")));
        assert!(found.iter().any(|l| l.contains("/page")));
    }
}