reasonkit_web/extraction/
links.rs

1//! Link extraction
2//!
3//! This module extracts all links from web pages with context and metadata.
4
5use crate::browser::PageHandle;
6use crate::error::{ExtractionError, Result};
7use serde::{Deserialize, Serialize};
8use tracing::{debug, info, instrument};
9
10/// Type of link
11#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
12#[serde(rename_all = "lowercase")]
13pub enum LinkType {
14    /// Internal link (same domain)
15    Internal,
16    /// External link (different domain)
17    External,
18    /// Anchor link (same page)
19    Anchor,
20    /// mailto: link
21    Email,
22    /// tel: link
23    Phone,
24    /// JavaScript link
25    JavaScript,
26    /// Other/unknown
27    Other,
28}
29
30/// An extracted link with context
31#[derive(Debug, Clone, Serialize, Deserialize)]
32pub struct ExtractedLink {
33    /// The href URL
34    pub url: String,
35    /// Link text content
36    pub text: String,
37    /// Title attribute
38    pub title: Option<String>,
39    /// Type of link
40    pub link_type: LinkType,
41    /// Rel attribute
42    pub rel: Option<String>,
43    /// Whether it opens in a new tab
44    pub new_tab: bool,
45    /// Surrounding context (nearby text)
46    pub context: Option<String>,
47    /// Position in document (order found)
48    pub position: usize,
49}
50
51/// Link extraction functionality
52pub struct LinkExtractor;
53
54impl LinkExtractor {
55    /// Extract all links from the page
56    #[instrument(skip(page))]
57    pub async fn extract_all(page: &PageHandle) -> Result<Vec<ExtractedLink>> {
58        info!("Extracting all links");
59
60        let script = r#"
61            (() => {
62                const links = [];
63                const baseUrl = window.location.origin;
64                const currentHost = window.location.hostname;
65
66                document.querySelectorAll('a[href]').forEach((el, index) => {
67                    const href = el.getAttribute('href') || '';
68                    const text = el.innerText.trim() || el.textContent.trim();
69                    const title = el.getAttribute('title');
70                    const rel = el.getAttribute('rel');
71                    const target = el.getAttribute('target');
72
73                    // Get context (parent text or siblings)
74                    let context = '';
75                    try {
76                        const parent = el.parentElement;
77                        if (parent) {
78                            context = parent.innerText.substring(0, 200);
79                        }
80                    } catch (e) {}
81
82                    // Determine link type
83                    let linkType = 'other';
84                    if (href.startsWith('#')) {
85                        linkType = 'anchor';
86                    } else if (href.startsWith('mailto:')) {
87                        linkType = 'email';
88                    } else if (href.startsWith('tel:')) {
89                        linkType = 'phone';
90                    } else if (href.startsWith('javascript:')) {
91                        linkType = 'javascript';
92                    } else {
93                        try {
94                            const url = new URL(href, baseUrl);
95                            if (url.hostname === currentHost) {
96                                linkType = 'internal';
97                            } else {
98                                linkType = 'external';
99                            }
100                        } catch (e) {
101                            linkType = 'other';
102                        }
103                    }
104
105                    // Resolve relative URLs
106                    let fullUrl = href;
107                    if (!href.startsWith('http') && !href.startsWith('mailto:') &&
108                        !href.startsWith('tel:') && !href.startsWith('javascript:') &&
109                        !href.startsWith('#')) {
110                        try {
111                            fullUrl = new URL(href, baseUrl).href;
112                        } catch (e) {}
113                    }
114
115                    links.push({
116                        url: fullUrl,
117                        text: text.substring(0, 500),
118                        title: title,
119                        linkType: linkType,
120                        rel: rel,
121                        newTab: target === '_blank',
122                        context: context,
123                        position: index
124                    });
125                });
126
127                return links;
128            })()
129        "#;
130
131        let result: Vec<serde_json::Value> = page
132            .page
133            .evaluate(script)
134            .await
135            .map_err(|e| ExtractionError::ExtractionFailed(e.to_string()))?
136            .into_value()
137            .map_err(|e| ExtractionError::ExtractionFailed(e.to_string()))?;
138
139        let links: Vec<ExtractedLink> = result
140            .into_iter()
141            .map(|v| {
142                let link_type_str = v["linkType"].as_str().unwrap_or("other");
143                let link_type = match link_type_str {
144                    "internal" => LinkType::Internal,
145                    "external" => LinkType::External,
146                    "anchor" => LinkType::Anchor,
147                    "email" => LinkType::Email,
148                    "phone" => LinkType::Phone,
149                    "javascript" => LinkType::JavaScript,
150                    _ => LinkType::Other,
151                };
152
153                ExtractedLink {
154                    url: v["url"].as_str().unwrap_or("").to_string(),
155                    text: v["text"].as_str().unwrap_or("").to_string(),
156                    title: v["title"].as_str().map(String::from),
157                    link_type,
158                    rel: v["rel"].as_str().map(String::from),
159                    new_tab: v["newTab"].as_bool().unwrap_or(false),
160                    context: v["context"].as_str().map(String::from),
161                    position: v["position"].as_u64().unwrap_or(0) as usize,
162                }
163            })
164            .collect();
165
166        debug!("Extracted {} links", links.len());
167        Ok(links)
168    }
169
170    /// Extract only external links
171    #[instrument(skip(page))]
172    pub async fn extract_external(page: &PageHandle) -> Result<Vec<ExtractedLink>> {
173        let all = Self::extract_all(page).await?;
174        Ok(all
175            .into_iter()
176            .filter(|l| l.link_type == LinkType::External)
177            .collect())
178    }
179
180    /// Extract only internal links
181    #[instrument(skip(page))]
182    pub async fn extract_internal(page: &PageHandle) -> Result<Vec<ExtractedLink>> {
183        let all = Self::extract_all(page).await?;
184        Ok(all
185            .into_iter()
186            .filter(|l| l.link_type == LinkType::Internal)
187            .collect())
188    }
189
190    /// Extract links matching a pattern
191    #[instrument(skip(page))]
192    pub async fn extract_matching(page: &PageHandle, pattern: &str) -> Result<Vec<ExtractedLink>> {
193        let all = Self::extract_all(page).await?;
194        let regex = regex::Regex::new(pattern)
195            .map_err(|e| ExtractionError::InvalidSelector(format!("Invalid regex: {}", e)))?;
196
197        Ok(all.into_iter().filter(|l| regex.is_match(&l.url)).collect())
198    }
199
200    /// Extract links from a specific container
201    #[instrument(skip(page))]
202    pub async fn extract_from_selector(
203        page: &PageHandle,
204        selector: &str,
205    ) -> Result<Vec<ExtractedLink>> {
206        let script = format!(
207            r#"
208            (() => {{
209                const container = document.querySelector('{}');
210                if (!container) return [];
211
212                const links = [];
213                const baseUrl = window.location.origin;
214                const currentHost = window.location.hostname;
215
216                container.querySelectorAll('a[href]').forEach((el, index) => {{
217                    const href = el.getAttribute('href') || '';
218                    const text = el.innerText.trim() || el.textContent.trim();
219                    const title = el.getAttribute('title');
220                    const rel = el.getAttribute('rel');
221                    const target = el.getAttribute('target');
222
223                    let linkType = 'other';
224                    if (href.startsWith('#')) {{
225                        linkType = 'anchor';
226                    }} else if (href.startsWith('mailto:')) {{
227                        linkType = 'email';
228                    }} else if (href.startsWith('tel:')) {{
229                        linkType = 'phone';
230                    }} else if (href.startsWith('javascript:')) {{
231                        linkType = 'javascript';
232                    }} else {{
233                        try {{
234                            const url = new URL(href, baseUrl);
235                            linkType = url.hostname === currentHost ? 'internal' : 'external';
236                        }} catch (e) {{}}
237                    }}
238
239                    let fullUrl = href;
240                    if (!href.startsWith('http') && !href.startsWith('mailto:') &&
241                        !href.startsWith('tel:') && !href.startsWith('javascript:') &&
242                        !href.startsWith('#')) {{
243                        try {{
244                            fullUrl = new URL(href, baseUrl).href;
245                        }} catch (e) {{}}
246                    }}
247
248                    links.push({{
249                        url: fullUrl,
250                        text: text.substring(0, 500),
251                        title: title,
252                        linkType: linkType,
253                        rel: rel,
254                        newTab: target === '_blank',
255                        context: null,
256                        position: index
257                    }});
258                }});
259
260                return links;
261            }})()
262            "#,
263            selector.replace('\'', "\\'")
264        );
265
266        let result: Vec<serde_json::Value> = page
267            .page
268            .evaluate(script.as_str())
269            .await
270            .map_err(|e| ExtractionError::ExtractionFailed(e.to_string()))?
271            .into_value()
272            .map_err(|e| ExtractionError::ExtractionFailed(e.to_string()))?;
273
274        let links: Vec<ExtractedLink> = result
275            .into_iter()
276            .map(|v| {
277                let link_type_str = v["linkType"].as_str().unwrap_or("other");
278                let link_type = match link_type_str {
279                    "internal" => LinkType::Internal,
280                    "external" => LinkType::External,
281                    "anchor" => LinkType::Anchor,
282                    "email" => LinkType::Email,
283                    "phone" => LinkType::Phone,
284                    "javascript" => LinkType::JavaScript,
285                    _ => LinkType::Other,
286                };
287
288                ExtractedLink {
289                    url: v["url"].as_str().unwrap_or("").to_string(),
290                    text: v["text"].as_str().unwrap_or("").to_string(),
291                    title: v["title"].as_str().map(String::from),
292                    link_type,
293                    rel: v["rel"].as_str().map(String::from),
294                    new_tab: v["newTab"].as_bool().unwrap_or(false),
295                    context: None,
296                    position: v["position"].as_u64().unwrap_or(0) as usize,
297                }
298            })
299            .collect();
300
301        debug!("Extracted {} links from {}", links.len(), selector);
302        Ok(links)
303    }
304}
305
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a fully populated link shared by the struct tests.
    fn sample_link() -> ExtractedLink {
        ExtractedLink {
            url: "https://example.com".to_string(),
            text: "Example".to_string(),
            title: Some("Example Site".to_string()),
            link_type: LinkType::External,
            rel: Some("nofollow".to_string()),
            new_tab: true,
            context: Some("Click here: Example to visit".to_string()),
            position: 0,
        }
    }

    #[test]
    fn test_link_type_serialization() {
        let json = serde_json::to_string(&LinkType::External).unwrap();
        assert_eq!(json, "\"external\"");
    }

    /// Every variant must use the lowercase wire name, in both directions.
    #[test]
    fn test_link_type_wire_names_round_trip() {
        let cases = [
            (LinkType::Internal, "\"internal\""),
            (LinkType::External, "\"external\""),
            (LinkType::Anchor, "\"anchor\""),
            (LinkType::Email, "\"email\""),
            (LinkType::Phone, "\"phone\""),
            (LinkType::JavaScript, "\"javascript\""),
            (LinkType::Other, "\"other\""),
        ];

        for &(variant, wire) in cases.iter() {
            assert_eq!(serde_json::to_string(&variant).unwrap(), wire);
            let parsed: LinkType = serde_json::from_str(wire).unwrap();
            assert_eq!(parsed, variant);
        }
    }

    #[test]
    fn test_extracted_link_structure() {
        let link = sample_link();

        assert_eq!(link.link_type, LinkType::External);
        assert!(link.new_tab);
        assert!(link.title.is_some());
    }

    /// Serializing and deserializing a link must preserve every field.
    #[test]
    fn test_extracted_link_serde_round_trip() {
        let link = sample_link();
        let json = serde_json::to_string(&link).unwrap();
        let back: ExtractedLink = serde_json::from_str(&json).unwrap();

        assert_eq!(back.url, link.url);
        assert_eq!(back.text, link.text);
        assert_eq!(back.title, link.title);
        assert_eq!(back.link_type, link.link_type);
        assert_eq!(back.rel, link.rel);
        assert_eq!(back.new_tab, link.new_tab);
        assert_eq!(back.context, link.context);
        assert_eq!(back.position, link.position);
    }
}
reasonkit_web/extraction/links.rs

reasonkit_web/extraction/
links.rs