use lazy_static::lazy_static;
use regex::Regex;
use scraper::{Html, Selector, ElementRef};
use std::collections::HashSet;
use url::Url;
use crate::types::{
LinkMedia, LinkType, MediaType, MediaResult,
};
lazy_static! {
    /// Matches a `mailto:` href; capture group 1 is everything after the scheme.
    /// URI schemes are case-insensitive, hence `(?i)`.
    static ref EMAIL: Regex = Regex::new(
        r"(?i)^mailto:(.+)$"
    ).expect("hard-coded regex must compile");
    /// Matches a `tel:` href; capture group 1 is everything after the scheme.
    static ref PHONE: Regex = Regex::new(
        r"(?i)^tel:(.+)$"
    ).expect("hard-coded regex must compile");
    /// Matches a same-document fragment reference (`#...`); group 1 is the fragment.
    static ref ANCHOR: Regex = Regex::new(
        r"^#(.*)$"
    ).expect("hard-coded regex must compile");
    /// Matches `javascript:` pseudo-links in any letter case (`JavaScript:void(0)`
    /// is common in real markup), tolerating leading whitespace.
    static ref JS_VOID: Regex = Regex::new(
        r"(?i)^\s*javascript:"
    ).expect("hard-coded regex must compile");
    // The extension patterns below are written in lowercase only; the functions in
    // this file lowercase the URL before matching. An extension counts only when
    // it is followed by a query (`?`), a fragment (`#`) or the end of the string,
    // so `/doc.pdf#page=2` is recognised as well as `/doc.pdf?dl=1`.
    /// File extensions treated as downloadable documents/archives/binaries.
    static ref DOWNLOAD_EXT: Regex = Regex::new(
        r"\.(pdf|doc|docx|xls|xlsx|ppt|pptx|zip|rar|7z|tar|gz|exe|dmg|apk|ipa|csv|txt|epub|mobi)(\?|#|$)"
    ).expect("hard-coded regex must compile");
    /// Image file extensions.
    static ref IMAGE_EXT: Regex = Regex::new(
        r"\.(jpg|jpeg|png|gif|webp|svg|avif|ico|bmp)(\?|#|$)"
    ).expect("hard-coded regex must compile");
    /// Video file extensions.
    static ref VIDEO_EXT: Regex = Regex::new(
        r"\.(mp4|webm|avi|mov|mkv|m4v|wmv|flv)(\?|#|$)"
    ).expect("hard-coded regex must compile");
    /// Audio file extensions.
    static ref AUDIO_EXT: Regex = Regex::new(
        r"\.(mp3|wav|ogg|flac|aac|m4a|wma)(\?|#|$)"
    ).expect("hard-coded regex must compile");
}
/// Collects every `<a href>` element of `document` as a [`LinkMedia`].
///
/// Links whose raw `href` is matched by the `JS_VOID` pattern are dropped.
/// Duplicates are removed, keeping the first occurrence in document order; two
/// links are duplicates when their absolute URL is equal, or their raw `href`
/// when no absolute URL could be resolved.
///
/// `base_url` is forwarded to the per-element extraction for URL resolution and
/// internal/external classification.
pub fn extract_links(document: &Html, base_url: Option<&Url>) -> Vec<LinkMedia> {
    let anchor_selector = match Selector::parse("a[href]") {
        Ok(selector) => selector,
        // The selector is a constant; an unparsable selector simply yields no links.
        Err(_) => return Vec::new(),
    };

    let mut seen_urls: HashSet<String> = HashSet::new();
    document
        .select(&anchor_selector)
        .filter_map(|anchor| extract_link_element(&anchor, base_url))
        .filter(|candidate| !JS_VOID.is_match(&candidate.href))
        .filter(|candidate| {
            let dedup_key = candidate
                .absolute_url
                .clone()
                .unwrap_or_else(|| candidate.href.clone());
            // `insert` returns false when the key was already present.
            seen_urls.insert(dedup_key)
        })
        .collect()
}
/// Builds a [`LinkMedia`] from a single anchor element.
///
/// Returns `None` when the element has no `href` attribute or the attribute is
/// empty after trimming surrounding whitespace (HTML allows a URL attribute to
/// be surrounded by spaces, so `href=" /page "` is treated as `/page`).
///
/// * `rel` keeps the whitespace-separated tokens exactly as written; the
///   `is_nofollow` / `is_sponsored` / `is_ugc` flags compare those tokens
///   ASCII case-insensitively, so `rel="NoFollow"` is recognised.
/// * `download` is the attribute value when non-empty; for a bare `download`
///   attribute the file name is derived from the href instead.
/// * `text` is the concatenation of all descendant text nodes, trimmed.
fn extract_link_element(el: &ElementRef, base_url: Option<&Url>) -> Option<LinkMedia> {
    let element = el.value();
    let owned_attr = |name: &str| element.attr(name).map(|value| value.to_string());

    let href = element.attr("href")?.trim();
    if href.is_empty() {
        return None;
    }

    let rel: Vec<String> = element
        .attr("rel")
        .map(|tokens| tokens.split_whitespace().map(String::from).collect())
        .unwrap_or_default();
    // Link-type keywords are ASCII case-insensitive in HTML.
    let has_rel = |keyword: &str| rel.iter().any(|token| token.eq_ignore_ascii_case(keyword));
    let is_nofollow = has_rel("nofollow");
    let is_sponsored = has_rel("sponsored");
    let is_ugc = has_rel("ugc");

    let download = element.attr("download").and_then(|suggested| {
        if suggested.is_empty() {
            extract_filename(href)
        } else {
            Some(suggested.to_string())
        }
    });

    Some(LinkMedia {
        href: href.to_string(),
        absolute_url: resolve_url(href, base_url),
        text: el.text().collect::<String>().trim().to_string(),
        title: owned_attr("title"),
        link_type: detect_link_type(href, base_url),
        is_nofollow,
        is_sponsored,
        is_ugc,
        target: owned_attr("target"),
        download,
        hreflang: owned_attr("hreflang"),
        media_type: detect_media_type_from_url(href),
        rel,
    })
}
/// Classifies a raw `href`.
///
/// Checks run in a fixed priority order: `mailto:`, `tel:`, `#fragment`, then a
/// downloadable file extension (matched against the lowercased href). Anything
/// else is classified as internal or external:
///
/// * with a base URL, a link is external when it parses as an absolute URL
///   (protocol-relative `//host/...` is parsed with an `https:` prefix) and its
///   host differs from the base host; everything else is internal;
/// * without a base URL, hrefs starting with `http://`, `https://` or `//` are
///   external and everything else is internal.
fn detect_link_type(href: &str, base_url: Option<&Url>) -> LinkType {
    if EMAIL.is_match(href) {
        LinkType::Mailto
    } else if PHONE.is_match(href) {
        LinkType::Tel
    } else if ANCHOR.is_match(href) {
        LinkType::Anchor
    } else if DOWNLOAD_EXT.is_match(&href.to_lowercase()) {
        LinkType::Download
    } else {
        match base_url {
            Some(base) => {
                // Absolute URLs parse directly; protocol-relative ones need a scheme
                // before they can be parsed. Only the host is inspected afterwards.
                let target = match Url::parse(href) {
                    Ok(url) => Some(url),
                    Err(_) if href.starts_with("//") => {
                        Url::parse(&format!("https:{}", href)).ok()
                    }
                    Err(_) => None,
                };
                match target {
                    Some(url) if url.host() != base.host() => LinkType::External,
                    _ => LinkType::Internal,
                }
            }
            None => {
                let looks_absolute = ["http://", "https://", "//"]
                    .iter()
                    .any(|prefix| href.starts_with(*prefix));
                if looks_absolute {
                    LinkType::External
                } else {
                    LinkType::Internal
                }
            }
        }
    }
}
/// Guesses the media category of `url` from its file extension.
///
/// The URL is lowercased and tested against the image, video, audio and
/// download extension patterns in that order; the first match wins. Returns
/// `None` when no pattern matches.
fn detect_media_type_from_url(url: &str) -> Option<MediaType> {
    let lowered = url.to_lowercase();
    if IMAGE_EXT.is_match(&lowered) {
        Some(MediaType::Image)
    } else if VIDEO_EXT.is_match(&lowered) {
        Some(MediaType::Video)
    } else if AUDIO_EXT.is_match(&lowered) {
        Some(MediaType::Audio)
    } else if DOWNLOAD_EXT.is_match(&lowered) {
        Some(MediaType::Document)
    } else {
        None
    }
}
/// Derives a file name from the path portion of `url`.
///
/// The query (`?...`) and fragment (`#...`) are discarded first, so
/// `/files/doc.pdf#page=2` yields `doc.pdf`. The last `/`-separated segment is
/// then percent-decoded and returned.
///
/// Returns `None` when that segment is empty, contains no `.`, or does not
/// decode to valid UTF-8.
fn extract_filename(url: &str) -> Option<String> {
    // Whichever of `?` / `#` comes first ends the path component.
    let path = url
        .split(|c: char| c == '?' || c == '#')
        .next()
        .unwrap_or(url);
    let filename = path.rsplit('/').next().unwrap_or(path);
    if filename.is_empty() || !filename.contains('.') {
        return None;
    }
    urlencoding::decode(filename)
        .ok()
        .map(|decoded| decoded.into_owned())
}
/// Resolves a raw `href` to the string stored as the link's absolute URL.
///
/// * `http://` / `https://` hrefs are returned unchanged.
/// * Protocol-relative hrefs (`//host/path`) inherit the scheme of the base
///   document: `http` when the base URL is `http`, otherwise `https` (also the
///   fallback when there is no base URL).
/// * `mailto:`, `tel:` and `#fragment` hrefs are returned unchanged.
/// * Anything else is joined onto `base_url`; the result is `None` when there is
///   no base URL or the join fails.
fn resolve_url(href: &str, base_url: Option<&Url>) -> Option<String> {
    if href.starts_with("http://") || href.starts_with("https://") {
        return Some(href.to_string());
    }
    if href.starts_with("//") {
        // A network-path reference takes its scheme from the base document;
        // forcing `https` would rewrite links found on plain-http pages.
        let scheme = match base_url.map(|base| base.scheme()) {
            Some("http") => "http",
            _ => "https",
        };
        return Some(format!("{}:{}", scheme, href));
    }
    if href.starts_with("mailto:") || href.starts_with("tel:") || href.starts_with('#') {
        return Some(href.to_string());
    }
    base_url.and_then(|base| base.join(href).ok().map(|joined| joined.to_string()))
}
pub fn extract_links_from_html(html: &str, base_url: Option<&str>) -> MediaResult<Vec<LinkMedia>> {
let document = Html::parse_document(html);
let base = base_url.and_then(|u| Url::parse(u).ok());
Ok(extract_links(&document, base.as_ref()))
}
/// Returns the absolute URL of every link found in `html`.
///
/// Links for which no absolute URL could be resolved are omitted. An extraction
/// error yields an empty list.
pub fn get_link_urls(html: &str, base_url: Option<&str>) -> Vec<String> {
    let extracted = extract_links_from_html(html, base_url).unwrap_or_default();
    let mut urls = Vec::new();
    for link in extracted {
        if let Some(absolute) = link.absolute_url {
            urls.push(absolute);
        }
    }
    urls
}
/// Reports whether `document` contains at least one `<a>` element with an
/// `href` attribute.
pub fn has_links(document: &Html) -> bool {
    match Selector::parse("a[href]") {
        Ok(anchor_selector) => {
            let found = document.select(&anchor_selector).next().is_some();
            found
        }
        Err(_) => false,
    }
}
/// Returns references to the links classified as [`LinkType::Internal`],
/// preserving input order.
pub fn get_internal_links(links: &[LinkMedia]) -> Vec<&LinkMedia> {
    let mut internal = Vec::new();
    for link in links {
        if link.link_type == LinkType::Internal {
            internal.push(link);
        }
    }
    internal
}
/// Returns references to the links classified as [`LinkType::External`],
/// preserving input order.
pub fn get_external_links(links: &[LinkMedia]) -> Vec<&LinkMedia> {
    let mut external = Vec::new();
    for link in links {
        if link.link_type == LinkType::External {
            external.push(link);
        }
    }
    external
}
/// Returns references to the links classified as [`LinkType::Download`],
/// preserving input order.
pub fn get_download_links(links: &[LinkMedia]) -> Vec<&LinkMedia> {
    let mut downloads = Vec::new();
    for link in links {
        if link.link_type == LinkType::Download {
            downloads.push(link);
        }
    }
    downloads
}
/// Returns references to the links whose `is_nofollow` flag is set, preserving
/// input order.
pub fn get_nofollow_links(links: &[LinkMedia]) -> Vec<&LinkMedia> {
    let mut nofollow = Vec::new();
    for link in links {
        if link.is_nofollow {
            nofollow.push(link);
        }
    }
    nofollow
}
/// Returns references to the links classified as [`LinkType::Mailto`],
/// preserving input order.
pub fn get_mailto_links(links: &[LinkMedia]) -> Vec<&LinkMedia> {
    let mut mailto = Vec::new();
    for link in links {
        if link.link_type == LinkType::Mailto {
            mailto.push(link);
        }
    }
    mailto
}
/// Extracts the address part of every `mailto:` link.
///
/// Only links classified as [`LinkType::Mailto`] are considered. The text after
/// the scheme is cut at the first `?`, so header fields such as
/// `?subject=Hello` are not returned as part of the address, and the result is
/// trimmed. Links that leave an empty address (e.g. `mailto:?subject=Hi`) are
/// skipped. Addresses are returned as written: no percent-decoding and no
/// splitting of comma-separated recipient lists is performed.
pub fn extract_emails(links: &[LinkMedia]) -> Vec<String> {
    links
        .iter()
        .filter(|link| link.link_type == LinkType::Mailto)
        .filter_map(|link| {
            EMAIL
                .captures(&link.href)
                .and_then(|caps| caps.get(1))
                .and_then(|after_scheme| {
                    // Everything from `?` onwards is the query, not the address.
                    let address = after_scheme
                        .as_str()
                        .split('?')
                        .next()
                        .unwrap_or("")
                        .trim();
                    if address.is_empty() {
                        None
                    } else {
                        Some(address.to_string())
                    }
                })
        })
        .collect()
}
/// Extracts the text following `tel:` from every link classified as
/// [`LinkType::Tel`], preserving input order.
///
/// Links whose `href` does not match the `PHONE` pattern are skipped.
pub fn extract_phones(links: &[LinkMedia]) -> Vec<String> {
    let mut numbers = Vec::new();
    for link in links {
        if link.link_type != LinkType::Tel {
            continue;
        }
        let after_scheme = PHONE.captures(&link.href).and_then(|caps| caps.get(1));
        if let Some(number) = after_scheme {
            numbers.push(number.as_str().to_string());
        }
    }
    numbers
}
/// Counts how many links fall under each [`LinkType`].
///
/// Types that do not occur in `links` have no entry in the returned map.
pub fn count_by_type(links: &[LinkMedia]) -> std::collections::HashMap<LinkType, usize> {
    let empty: std::collections::HashMap<LinkType, usize> = std::collections::HashMap::new();
    links.iter().fold(empty, |mut tally, link| {
        *tally.entry(link.link_type).or_insert(0) += 1;
        tally
    })
}
/// Returns the links whose absolute URL points at `domain` or one of its
/// subdomains.
///
/// Matching is done on whole host labels: `example.com` matches
/// `example.com` and `www.example.com`, but not `notexample.com` or
/// `example.com.evil.org`. (A plain substring test would accept both of the
/// latter.) The comparison is ASCII case-insensitive, and leading/trailing dots
/// and surrounding whitespace in `domain` are ignored.
///
/// Links without an absolute URL, with an unparsable URL, or with a URL that
/// has no host are excluded. An empty `domain` matches nothing.
pub fn filter_by_domain<'a>(links: &'a [LinkMedia], domain: &str) -> Vec<&'a LinkMedia> {
    let wanted = domain
        .trim()
        .trim_start_matches('.')
        .trim_end_matches('.')
        .to_ascii_lowercase();
    if wanted.is_empty() {
        return Vec::new();
    }
    // A subdomain must end with ".<domain>", i.e. match on a label boundary.
    let subdomain_suffix = format!(".{}", wanted);

    links
        .iter()
        .filter(|link| {
            link.absolute_url
                .as_deref()
                .and_then(|raw| Url::parse(raw).ok())
                .and_then(|url| {
                    url.host_str().map(|host| {
                        let host = host.to_ascii_lowercase();
                        host == wanted || host.ends_with(&subdomain_suffix)
                    })
                })
                .unwrap_or(false)
        })
        .collect()
}
/// Returns the distinct hosts found in the links' absolute URLs, sorted
/// ascending.
///
/// Links without an absolute URL, with an unparsable URL, or with a URL that
/// has no host contribute nothing. The result is sorted so that the output is
/// deterministic; the iteration order of a `HashSet` is not.
pub fn get_unique_domains(links: &[LinkMedia]) -> Vec<String> {
    let unique: HashSet<String> = links
        .iter()
        .filter_map(|link| link.absolute_url.as_deref())
        .filter_map(|raw| Url::parse(raw).ok())
        .filter_map(|url| url.host_str().map(String::from))
        .collect();

    let mut domains: Vec<String> = unique.into_iter().collect();
    domains.sort_unstable();
    domains
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Extracts the links of `markup` without a base URL.
    fn links_in(markup: &str) -> Vec<LinkMedia> {
        extract_links(&Html::parse_document(markup), None)
    }

    /// Extracts the links of `markup`, resolving against `base`.
    fn links_in_with_base(markup: &str, base: &str) -> Vec<LinkMedia> {
        let base = Url::parse(base).unwrap();
        extract_links(&Html::parse_document(markup), Some(&base))
    }

    /// Default link carrying only a classification.
    fn link_of_type(link_type: LinkType) -> LinkMedia {
        LinkMedia { link_type, ..Default::default() }
    }

    /// Default link carrying a raw href and a classification.
    fn link_with_href(href: &str, link_type: LinkType) -> LinkMedia {
        LinkMedia { href: href.to_string(), link_type, ..Default::default() }
    }

    /// Default link carrying only an absolute URL.
    fn link_with_absolute_url(url: &str) -> LinkMedia {
        LinkMedia { absolute_url: Some(url.to_string()), ..Default::default() }
    }

    #[test]
    fn test_extract_basic_link() {
        let found = links_in(r#"<a href="https://example.com/page">Click here</a>"#);
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].href, "https://example.com/page");
        assert_eq!(found[0].text, "Click here");
    }

    #[test]
    fn test_extract_relative_link() {
        let found = links_in_with_base(r#"<a href="/about">About</a>"#, "https://example.com");
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].absolute_url, Some("https://example.com/about".to_string()));
        assert_eq!(found[0].link_type, LinkType::Internal);
    }

    #[test]
    fn test_external_link() {
        let found = links_in_with_base(
            r#"<a href="https://other.com">External</a>"#,
            "https://example.com",
        );
        assert_eq!(found[0].link_type, LinkType::External);
    }

    #[test]
    fn test_mailto_link() {
        let found = links_in(r#"<a href="mailto:test@example.com">Email</a>"#);
        assert_eq!(found[0].link_type, LinkType::Mailto);
    }

    #[test]
    fn test_tel_link() {
        let found = links_in(r#"<a href="tel:+1234567890">Call</a>"#);
        assert_eq!(found[0].link_type, LinkType::Tel);
    }

    #[test]
    fn test_anchor_link() {
        let found = links_in(r#"<a href='#section'>Jump</a>"#);
        assert_eq!(found[0].link_type, LinkType::Anchor);
    }

    #[test]
    fn test_download_link() {
        let found = links_in(r#"<a href="/files/doc.pdf">PDF</a>"#);
        assert_eq!(found[0].link_type, LinkType::Download);
        assert_eq!(found[0].media_type, Some(MediaType::Document));
    }

    #[test]
    fn test_nofollow_link() {
        let found = links_in(r#"<a href="https://example.com" rel="nofollow">Link</a>"#);
        assert!(found[0].is_nofollow);
    }

    #[test]
    fn test_sponsored_ugc_link() {
        let found = links_in(r#"<a href="https://example.com" rel="sponsored ugc nofollow">Ad</a>"#);
        let ad = &found[0];
        assert!(ad.is_nofollow);
        assert!(ad.is_sponsored);
        assert!(ad.is_ugc);
    }

    #[test]
    fn test_download_attribute() {
        let found = links_in(r#"<a href="/file.zip" download="archive.zip">Download</a>"#);
        assert_eq!(found[0].download, Some("archive.zip".to_string()));
    }

    #[test]
    fn test_has_links() {
        let with_link = Html::parse_document(r#"<a href="/page">Link</a>"#);
        let without_link = Html::parse_document(r#"<div>No links</div>"#);
        assert!(has_links(&with_link));
        assert!(!has_links(&without_link));
    }

    #[test]
    fn test_get_internal_external() {
        let mixed = vec![
            link_of_type(LinkType::Internal),
            link_of_type(LinkType::External),
            link_of_type(LinkType::Internal),
        ];
        assert_eq!(get_internal_links(&mixed).len(), 2);
        assert_eq!(get_external_links(&mixed).len(), 1);
    }

    #[test]
    fn test_extract_emails() {
        let mixed = vec![
            link_with_href("mailto:a@b.com", LinkType::Mailto),
            link_with_href("mailto:x@y.com", LinkType::Mailto),
            link_with_href("/page", LinkType::Internal),
        ];
        let emails = extract_emails(&mixed);
        assert_eq!(emails.len(), 2);
        assert!(emails.contains(&"a@b.com".to_string()));
    }

    #[test]
    fn test_extract_phones() {
        let only_tel = vec![link_with_href("tel:+1234567890", LinkType::Tel)];
        let phones = extract_phones(&only_tel);
        assert_eq!(phones, vec!["+1234567890"]);
    }

    #[test]
    fn test_count_by_type() {
        let mixed = vec![
            link_of_type(LinkType::Internal),
            link_of_type(LinkType::Internal),
            link_of_type(LinkType::External),
        ];
        let tally = count_by_type(&mixed);
        assert_eq!(tally.get(&LinkType::Internal), Some(&2));
        assert_eq!(tally.get(&LinkType::External), Some(&1));
    }

    #[test]
    fn test_get_unique_domains() {
        let across_two_hosts = vec![
            link_with_absolute_url("https://a.com/1"),
            link_with_absolute_url("https://a.com/2"),
            link_with_absolute_url("https://b.com/1"),
        ];
        let domains = get_unique_domains(&across_two_hosts);
        assert_eq!(domains.len(), 2);
    }

    #[test]
    fn test_skip_javascript_links() {
        let found = links_in(r#"<a href="javascript:void(0)">Click</a><a href="/real">Real</a>"#);
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].href, "/real");
    }

    #[test]
    fn test_detect_media_type() {
        let cases = vec![
            ("/img.jpg", Some(MediaType::Image)),
            ("/vid.mp4", Some(MediaType::Video)),
            ("/aud.mp3", Some(MediaType::Audio)),
            ("/doc.pdf", Some(MediaType::Document)),
            ("/page", None),
        ];
        for (url, expected) in cases {
            assert_eq!(detect_media_type_from_url(url), expected);
        }
    }

    #[test]
    fn test_hreflang() {
        let found = links_in(r#"<a href="/fr" hreflang="fr">Français</a>"#);
        assert_eq!(found[0].hreflang, Some("fr".to_string()));
    }
}