reasonkit_web/extraction/
links.rs1use crate::browser::PageHandle;
6use crate::error::{ExtractionError, Result};
7use serde::{Deserialize, Serialize};
8use tracing::{debug, info, instrument};
9
10#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
12#[serde(rename_all = "lowercase")]
13pub enum LinkType {
14 Internal,
16 External,
18 Anchor,
20 Email,
22 Phone,
24 JavaScript,
26 Other,
28}
29
30#[derive(Debug, Clone, Serialize, Deserialize)]
32pub struct ExtractedLink {
33 pub url: String,
35 pub text: String,
37 pub title: Option<String>,
39 pub link_type: LinkType,
41 pub rel: Option<String>,
43 pub new_tab: bool,
45 pub context: Option<String>,
47 pub position: usize,
49}
50
51pub struct LinkExtractor;
53
54impl LinkExtractor {
55 #[instrument(skip(page))]
57 pub async fn extract_all(page: &PageHandle) -> Result<Vec<ExtractedLink>> {
58 info!("Extracting all links");
59
60 let script = r#"
61 (() => {
62 const links = [];
63 const baseUrl = window.location.origin;
64 const currentHost = window.location.hostname;
65
66 document.querySelectorAll('a[href]').forEach((el, index) => {
67 const href = el.getAttribute('href') || '';
68 const text = el.innerText.trim() || el.textContent.trim();
69 const title = el.getAttribute('title');
70 const rel = el.getAttribute('rel');
71 const target = el.getAttribute('target');
72
73 // Get context (parent text or siblings)
74 let context = '';
75 try {
76 const parent = el.parentElement;
77 if (parent) {
78 context = parent.innerText.substring(0, 200);
79 }
80 } catch (e) {}
81
82 // Determine link type
83 let linkType = 'other';
84 if (href.startsWith('#')) {
85 linkType = 'anchor';
86 } else if (href.startsWith('mailto:')) {
87 linkType = 'email';
88 } else if (href.startsWith('tel:')) {
89 linkType = 'phone';
90 } else if (href.startsWith('javascript:')) {
91 linkType = 'javascript';
92 } else {
93 try {
94 const url = new URL(href, baseUrl);
95 if (url.hostname === currentHost) {
96 linkType = 'internal';
97 } else {
98 linkType = 'external';
99 }
100 } catch (e) {
101 linkType = 'other';
102 }
103 }
104
105 // Resolve relative URLs
106 let fullUrl = href;
107 if (!href.startsWith('http') && !href.startsWith('mailto:') &&
108 !href.startsWith('tel:') && !href.startsWith('javascript:') &&
109 !href.startsWith('#')) {
110 try {
111 fullUrl = new URL(href, baseUrl).href;
112 } catch (e) {}
113 }
114
115 links.push({
116 url: fullUrl,
117 text: text.substring(0, 500),
118 title: title,
119 linkType: linkType,
120 rel: rel,
121 newTab: target === '_blank',
122 context: context,
123 position: index
124 });
125 });
126
127 return links;
128 })()
129 "#;
130
131 let result: Vec<serde_json::Value> = page
132 .page
133 .evaluate(script)
134 .await
135 .map_err(|e| ExtractionError::ExtractionFailed(e.to_string()))?
136 .into_value()
137 .map_err(|e| ExtractionError::ExtractionFailed(e.to_string()))?;
138
139 let links: Vec<ExtractedLink> = result
140 .into_iter()
141 .map(|v| {
142 let link_type_str = v["linkType"].as_str().unwrap_or("other");
143 let link_type = match link_type_str {
144 "internal" => LinkType::Internal,
145 "external" => LinkType::External,
146 "anchor" => LinkType::Anchor,
147 "email" => LinkType::Email,
148 "phone" => LinkType::Phone,
149 "javascript" => LinkType::JavaScript,
150 _ => LinkType::Other,
151 };
152
153 ExtractedLink {
154 url: v["url"].as_str().unwrap_or("").to_string(),
155 text: v["text"].as_str().unwrap_or("").to_string(),
156 title: v["title"].as_str().map(String::from),
157 link_type,
158 rel: v["rel"].as_str().map(String::from),
159 new_tab: v["newTab"].as_bool().unwrap_or(false),
160 context: v["context"].as_str().map(String::from),
161 position: v["position"].as_u64().unwrap_or(0) as usize,
162 }
163 })
164 .collect();
165
166 debug!("Extracted {} links", links.len());
167 Ok(links)
168 }
169
170 #[instrument(skip(page))]
172 pub async fn extract_external(page: &PageHandle) -> Result<Vec<ExtractedLink>> {
173 let all = Self::extract_all(page).await?;
174 Ok(all
175 .into_iter()
176 .filter(|l| l.link_type == LinkType::External)
177 .collect())
178 }
179
180 #[instrument(skip(page))]
182 pub async fn extract_internal(page: &PageHandle) -> Result<Vec<ExtractedLink>> {
183 let all = Self::extract_all(page).await?;
184 Ok(all
185 .into_iter()
186 .filter(|l| l.link_type == LinkType::Internal)
187 .collect())
188 }
189
190 #[instrument(skip(page))]
192 pub async fn extract_matching(page: &PageHandle, pattern: &str) -> Result<Vec<ExtractedLink>> {
193 let all = Self::extract_all(page).await?;
194 let regex = regex::Regex::new(pattern)
195 .map_err(|e| ExtractionError::InvalidSelector(format!("Invalid regex: {}", e)))?;
196
197 Ok(all.into_iter().filter(|l| regex.is_match(&l.url)).collect())
198 }
199
200 #[instrument(skip(page))]
202 pub async fn extract_from_selector(
203 page: &PageHandle,
204 selector: &str,
205 ) -> Result<Vec<ExtractedLink>> {
206 let script = format!(
207 r#"
208 (() => {{
209 const container = document.querySelector('{}');
210 if (!container) return [];
211
212 const links = [];
213 const baseUrl = window.location.origin;
214 const currentHost = window.location.hostname;
215
216 container.querySelectorAll('a[href]').forEach((el, index) => {{
217 const href = el.getAttribute('href') || '';
218 const text = el.innerText.trim() || el.textContent.trim();
219 const title = el.getAttribute('title');
220 const rel = el.getAttribute('rel');
221 const target = el.getAttribute('target');
222
223 let linkType = 'other';
224 if (href.startsWith('#')) {{
225 linkType = 'anchor';
226 }} else if (href.startsWith('mailto:')) {{
227 linkType = 'email';
228 }} else if (href.startsWith('tel:')) {{
229 linkType = 'phone';
230 }} else if (href.startsWith('javascript:')) {{
231 linkType = 'javascript';
232 }} else {{
233 try {{
234 const url = new URL(href, baseUrl);
235 linkType = url.hostname === currentHost ? 'internal' : 'external';
236 }} catch (e) {{}}
237 }}
238
239 let fullUrl = href;
240 if (!href.startsWith('http') && !href.startsWith('mailto:') &&
241 !href.startsWith('tel:') && !href.startsWith('javascript:') &&
242 !href.startsWith('#')) {{
243 try {{
244 fullUrl = new URL(href, baseUrl).href;
245 }} catch (e) {{}}
246 }}
247
248 links.push({{
249 url: fullUrl,
250 text: text.substring(0, 500),
251 title: title,
252 linkType: linkType,
253 rel: rel,
254 newTab: target === '_blank',
255 context: null,
256 position: index
257 }});
258 }});
259
260 return links;
261 }})()
262 "#,
263 selector.replace('\'', "\\'")
264 );
265
266 let result: Vec<serde_json::Value> = page
267 .page
268 .evaluate(script.as_str())
269 .await
270 .map_err(|e| ExtractionError::ExtractionFailed(e.to_string()))?
271 .into_value()
272 .map_err(|e| ExtractionError::ExtractionFailed(e.to_string()))?;
273
274 let links: Vec<ExtractedLink> = result
275 .into_iter()
276 .map(|v| {
277 let link_type_str = v["linkType"].as_str().unwrap_or("other");
278 let link_type = match link_type_str {
279 "internal" => LinkType::Internal,
280 "external" => LinkType::External,
281 "anchor" => LinkType::Anchor,
282 "email" => LinkType::Email,
283 "phone" => LinkType::Phone,
284 "javascript" => LinkType::JavaScript,
285 _ => LinkType::Other,
286 };
287
288 ExtractedLink {
289 url: v["url"].as_str().unwrap_or("").to_string(),
290 text: v["text"].as_str().unwrap_or("").to_string(),
291 title: v["title"].as_str().map(String::from),
292 link_type,
293 rel: v["rel"].as_str().map(String::from),
294 new_tab: v["newTab"].as_bool().unwrap_or(false),
295 context: None,
296 position: v["position"].as_u64().unwrap_or(0) as usize,
297 }
298 })
299 .collect();
300
301 debug!("Extracted {} links from {}", links.len(), selector);
302 Ok(links)
303 }
304}
305
#[cfg(test)]
mod tests {
    use super::*;

    /// `LinkType` serializes to its lowercase variant name.
    #[test]
    fn test_link_type_serialization() {
        let link_type = LinkType::External;
        let json = serde_json::to_string(&link_type).unwrap();
        assert_eq!(json, "\"external\"");
    }

    /// Lowercase names deserialize back to the matching variant, including
    /// the mixed-case `JavaScript` variant.
    #[test]
    fn test_link_type_deserialization() {
        let parsed: LinkType = serde_json::from_str("\"javascript\"").unwrap();
        assert_eq!(parsed, LinkType::JavaScript);
    }

    /// An `ExtractedLink` can be built field by field and read back.
    #[test]
    fn test_extracted_link_structure() {
        let link = ExtractedLink {
            url: "https://example.com".to_string(),
            text: "Example".to_string(),
            title: Some("Example Site".to_string()),
            link_type: LinkType::External,
            rel: Some("nofollow".to_string()),
            new_tab: true,
            context: Some("Click here: Example to visit".to_string()),
            position: 0,
        };

        assert_eq!(link.link_type, LinkType::External);
        assert!(link.new_tab);
        assert!(link.title.is_some());
    }
}