use scraper::{Html, Selector};
use serde::{Deserialize, Serialize};
use std::sync::LazyLock;
use url::Url;
use crate::discovery::subdomains::registrable_domain;
// Pre-parsed CSS selectors. Each one is compiled on first access and reused
// afterwards. Every pattern is a fixed literal, which is why a parse failure
// is treated as a programming error (`expect`) instead of being propagated.
/// `<script>` elements that carry a `src` attribute.
static SCRIPT_SRC_SELECTOR: LazyLock<Selector> =
LazyLock::new(|| Selector::parse("script[src]").expect("static selector"));
/// `<link>` elements that carry an `href` attribute.
static LINK_HREF_SELECTOR: LazyLock<Selector> =
LazyLock::new(|| Selector::parse("link[href]").expect("static selector"));
/// Every `<img>` element; `src` and `srcset` are read separately by the extractor.
static IMG_SELECTOR: LazyLock<Selector> =
LazyLock::new(|| Selector::parse("img").expect("static selector"));
/// Every `<source>` element; `src` and `srcset` are read separately by the extractor.
static SOURCE_SELECTOR: LazyLock<Selector> =
LazyLock::new(|| Selector::parse("source").expect("static selector"));
/// `<iframe>` elements that carry a `src` attribute.
static IFRAME_SRC_SELECTOR: LazyLock<Selector> =
LazyLock::new(|| Selector::parse("iframe[src]").expect("static selector"));
/// Any element that carries a `data-sitekey` attribute.
static DATA_SITEKEY_SELECTOR: LazyLock<Selector> =
LazyLock::new(|| Selector::parse("[data-sitekey]").expect("static selector"));
/// `<video>` elements that carry a `src` attribute.
static VIDEO_SRC_SELECTOR: LazyLock<Selector> =
LazyLock::new(|| Selector::parse("video[src]").expect("static selector"));
/// `<audio>` elements that carry a `src` attribute.
static AUDIO_SRC_SELECTOR: LazyLock<Selector> =
LazyLock::new(|| Selector::parse("audio[src]").expect("static selector"));
/// `<a>` elements that carry an `href` attribute.
static A_HREF_SELECTOR: LazyLock<Selector> =
LazyLock::new(|| Selector::parse("a[href]").expect("static selector"));
/// Coarse classification of a third-party host.
///
/// Serialized with kebab-case variant names (`tag-manager`, `font-service`, …).
/// `Ord` is derived, so variants compare in declaration order; `categorise`
/// relies on that to return its result sorted.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum Category {
/// Analytics / usage tracking (e.g. `google-analytics.com`).
Analytics,
/// Content delivery networks (e.g. `cloudfront.net`).
Cdn,
/// Advertising networks (e.g. `doubleclick.net`).
Ads,
/// Social platforms (e.g. `facebook.com`).
Social,
/// Hosted web-font services (e.g. `fonts.googleapis.com`).
FontService,
/// Tag managers and similar script loaders (e.g. `googletagmanager.com`).
TagManager,
/// Object/blob storage endpoints (e.g. `storage.googleapis.com`).
CloudStorage,
/// Video hosting (e.g. `vimeo.com`).
VideoHost,
/// Payment providers (e.g. `stripe.com`).
Payments,
/// Identity / login providers (e.g. `auth0.com`).
Auth,
/// Customer-support widgets and help desks (e.g. `intercom.io`).
Support,
/// Map services (e.g. `mapbox.com`).
Maps,
/// CAPTCHA / bot-challenge providers (e.g. `hcaptcha.com`).
Captcha,
// NOTE(review): no entry of the lookup table in this file assigns `Other`;
// presumably a catch-all used by callers — confirm.
Other,
}
impl Category {
pub fn as_str(self) -> &'static str {
match self {
Self::Analytics => "analytics",
Self::Cdn => "cdn",
Self::Ads => "ads",
Self::Social => "social",
Self::FontService => "font-service",
Self::TagManager => "tag-manager",
Self::CloudStorage => "cloud-storage",
Self::VideoHost => "video-host",
Self::Payments => "payments",
Self::Auth => "auth",
Self::Support => "support",
Self::Maps => "maps",
Self::Captcha => "captcha",
Self::Other => "other",
}
}
}
/// Returns every [`Category`] that the lookup table associates with `host`.
///
/// `host` is trimmed and ASCII-lowercased before matching. A table entry
/// applies when its domain part equals the host, equals the host's registrable
/// domain, or is a dot-separated suffix of the host (see `matches_host`).
/// Path-qualified patterns such as `google.com/ads` are matched on their
/// domain part only, so any `google.com` host receives all of their categories.
///
/// The result is de-duplicated and sorted in `Category` declaration order.
/// An empty (or all-whitespace) host, or one that matches nothing, yields an
/// empty vector.
pub fn categorise(host: &str) -> Vec<Category> {
    use std::collections::BTreeSet;

    let host_lc = host.trim().to_ascii_lowercase();
    if host_lc.is_empty() {
        return Vec::new();
    }

    // Fall back to the full host when no registrable domain can be derived.
    let registrable = registrable_domain(&host_lc).unwrap_or_else(|| host_lc.clone());

    // A BTreeSet both removes duplicates (several entries may match one host)
    // and yields the categories in a deterministic order.
    let mut found: BTreeSet<Category> = BTreeSet::new();
    for (pattern, cats) in CATEGORY_TABLE {
        if matches_host(pattern, &host_lc, &registrable) {
            found.extend(cats.iter().copied());
        }
    }
    found.into_iter().collect()
}
/// Decides whether a table `pattern` applies to `host`.
///
/// Only the part of `pattern` before its first `/` is considered; an empty
/// domain part never matches. The pattern applies when that domain part
/// * is exactly `host`, or
/// * is exactly `registrable` (the host's registrable domain), or
/// * is a suffix of `host` preceded by a `.` with at least one character in
///   front of the dot (so `www.facebook.com` matches `facebook.com`, while
///   `notfacebook.com` and `.facebook.com` do not).
fn matches_host(pattern: &str, host: &str, registrable: &str) -> bool {
    let domain = match pattern.split_once('/') {
        Some((before_path, _)) => before_path,
        None => pattern,
    };
    if domain.is_empty() {
        return false;
    }

    host == domain
        || registrable == domain
        || host
            .strip_suffix(domain)
            // What remains must be "<something>." — a bare "." is not a subdomain.
            .is_some_and(|labels| labels.len() > 1 && labels.ends_with('.'))
}
/// Lookup table from host pattern to the categories it implies.
///
/// A pattern is a domain, optionally followed by `/path`. `matches_host` only
/// compares the part before the first `/`, so the path is informational here.
/// An entry applies to the domain itself and to any of its subdomains. A host
/// may match several entries; `categorise` returns the union of their categories.
const CATEGORY_TABLE: &[(&str, &[Category])] = &[
// --- Analytics (some double as tag managers) ---
("google-analytics.com", &[Category::Analytics]),
(
"googletagmanager.com",
&[Category::Analytics, Category::TagManager],
),
("amplitude.com", &[Category::Analytics]),
("mixpanel.com", &[Category::Analytics]),
("segment.com", &[Category::Analytics, Category::TagManager]),
("segment.io", &[Category::Analytics, Category::TagManager]),
("hotjar.com", &[Category::Analytics]),
("clarity.ms", &[Category::Analytics]),
("fullstory.com", &[Category::Analytics]),
("heap.io", &[Category::Analytics]),
("plausible.io", &[Category::Analytics]),
("matomo.org", &[Category::Analytics]),
("clicky.com", &[Category::Analytics]),
("tealium.com", &[Category::Analytics, Category::TagManager]),
// --- CDNs ---
("cloudflare.com", &[Category::Cdn]),
("cloudfront.net", &[Category::Cdn]),
("fastly.net", &[Category::Cdn]),
("fastlylb.net", &[Category::Cdn]),
("akamai.net", &[Category::Cdn]),
("akamaihd.net", &[Category::Cdn]),
("akamaized.net", &[Category::Cdn]),
("jsdelivr.net", &[Category::Cdn]),
("unpkg.com", &[Category::Cdn]),
("bunnycdn.com", &[Category::Cdn]),
("keycdn.com", &[Category::Cdn]),
// The two subdomain entries below are already covered by the suffix match on
// `jsdelivr.net` / `cloudflare.com` above and add no further categories.
("cdn.jsdelivr.net", &[Category::Cdn]),
("cdnjs.cloudflare.com", &[Category::Cdn]),
// --- Advertising ---
("doubleclick.net", &[Category::Ads]),
("googlesyndication.com", &[Category::Ads]),
("googleadservices.com", &[Category::Ads]),
// Matched on `google.com` only; the `/ads` path is not compared.
("google.com/ads", &[Category::Ads]),
("adnxs.com", &[Category::Ads]),
("rubiconproject.com", &[Category::Ads]),
("criteo.com", &[Category::Ads]),
("taboola.com", &[Category::Ads]),
("outbrain.com", &[Category::Ads]),
// --- Social ---
("facebook.com", &[Category::Social]),
("fbcdn.net", &[Category::Social]),
("twitter.com", &[Category::Social]),
("x.com", &[Category::Social]),
("t.co", &[Category::Social]),
("linkedin.com", &[Category::Social]),
("instagram.com", &[Category::Social]),
("youtube.com", &[Category::Social, Category::VideoHost]),
("tiktok.com", &[Category::Social]),
("pinterest.com", &[Category::Social]),
("reddit.com", &[Category::Social]),
("discord.gg", &[Category::Social]),
("discord.com", &[Category::Social]),
// --- Font services ---
("fonts.googleapis.com", &[Category::FontService]),
("fonts.gstatic.com", &[Category::FontService]),
("use.typekit.net", &[Category::FontService]),
("kit.fontawesome.com", &[Category::FontService]),
("cloud.typenetwork.com", &[Category::FontService]),
// --- Tag management ---
("launchdarkly.com", &[Category::TagManager]),
// --- Cloud storage ---
("amazonaws.com", &[Category::CloudStorage]),
("s3.amazonaws.com", &[Category::CloudStorage]),
("storage.googleapis.com", &[Category::CloudStorage]),
("blob.core.windows.net", &[Category::CloudStorage]),
// --- Video hosting ---
("youtu.be", &[Category::VideoHost]),
("vimeo.com", &[Category::VideoHost]),
("wistia.com", &[Category::VideoHost]),
("jwplatform.com", &[Category::VideoHost]),
// --- Payments ---
("stripe.com", &[Category::Payments]),
("paypal.com", &[Category::Payments]),
("pagar.me", &[Category::Payments]),
("braintreegateway.com", &[Category::Payments]),
("adyen.com", &[Category::Payments]),
("checkout.com", &[Category::Payments]),
// --- Authentication ---
("auth0.com", &[Category::Auth]),
("okta.com", &[Category::Auth]),
("onelogin.com", &[Category::Auth]),
// --- Customer support ---
("intercom.io", &[Category::Support]),
("zendesk.com", &[Category::Support]),
("freshdesk.com", &[Category::Support]),
("drift.com", &[Category::Support]),
("tidio.com", &[Category::Support]),
// --- Maps ---
// Matched on `google.com` only; the `/maps` path is not compared.
("google.com/maps", &[Category::Maps]),
("maps.googleapis.com", &[Category::Maps]),
("mapbox.com", &[Category::Maps]),
// --- CAPTCHA ---
// Matched on `google.com` only; the `/recaptcha` path is not compared.
("google.com/recaptcha", &[Category::Captcha]),
("hcaptcha.com", &[Category::Captcha]),
("arkoselabs.com", &[Category::Captcha]),
// Also matches the `cloudflare.com` CDN entry, so this host gets both categories.
("challenges.cloudflare.com", &[Category::Captcha]),
];
/// The kind of markup that produced an [`AssetRef`].
///
/// Serialized with snake_case variant names.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum AssetRefKind {
    /// `<script src>`.
    Script,
    /// `<link href>` whose `rel` contains `stylesheet`.
    Style,
    /// `<link href>` that preloads a font or whose target has a font file extension.
    Font,
    /// `<img>` `src`/`srcset`, and `<source>` without a `video/…` or `audio/…` type.
    Image,
    /// `<video src>`, or `<source>` with a `video/…` type.
    Video,
    /// `<audio src>`, or `<source>` with an `audio/…` type.
    Audio,
    /// `<iframe src>`.
    Iframe,
    /// `<a href>`.
    Link,
    /// Synthetic reference emitted for a Cloudflare Turnstile widget
    /// (an element with class `cf-turnstile` and a non-empty `data-sitekey`).
    Turnstile,
    /// Any other `<link href>`.
    Other,
}
impl AssetRefKind {
pub fn as_str(self) -> &'static str {
match self {
Self::Script => "script",
Self::Style => "style",
Self::Font => "font",
Self::Image => "image",
Self::Video => "video",
Self::Audio => "audio",
Self::Iframe => "iframe",
Self::Link => "link",
Self::Turnstile => "turnstile",
Self::Other => "other",
}
}
}
/// One outbound reference discovered on a page.
#[derive(Debug, Clone)]
pub struct AssetRef {
/// The page the reference was found on: the extractor's `base` URL, stringified.
pub from_page_url: String,
/// Absolute URL of the referenced resource, after resolving the raw attribute
/// value against the page URL.
pub to_url: String,
/// Registrable domain of `to_url`'s host, or the lowercased host itself when
/// `registrable_domain` yields nothing.
pub to_domain: String,
/// Which element/attribute produced the reference.
pub kind: AssetRefKind,
/// `true` when `to_domain` equals the extractor's `target_root`
/// (ASCII case-insensitive comparison).
pub is_internal: bool,
}
/// Parses `html` as a complete document and returns the references it contains.
///
/// Convenience wrapper around [`extract_asset_refs_from_document`] for callers
/// that hold raw markup instead of a parsed [`Html`] tree. `base` is the page
/// URL used to resolve relative references and `target_root` is the domain
/// that counts as internal.
pub fn extract_asset_refs(base: &Url, html: &str, target_root: &str) -> Vec<AssetRef> {
    extract_asset_refs_from_document(base, &Html::parse_document(html), target_root)
}
/// Collects the outbound references of an already-parsed document.
///
/// Walks scripts, `<link>`s, images, `<source>`s, iframes, Cloudflare Turnstile
/// widgets, `<video>`/`<audio>` elements and anchors, resolving every raw
/// attribute value against `base`.
///
/// * `base` – URL of the page; relative references are joined onto it and it
///   is recorded as `from_page_url` on every result.
/// * `doc` – the parsed HTML document.
/// * `target_root` – domain treated as internal; compared ASCII
///   case-insensitively with each reference's `to_domain`.
///
/// References that fail to resolve, use a scheme other than `http`/`https`, or
/// have no host are silently dropped. The result is sorted by `to_url`, then
/// by kind label, and contains each `(to_url, kind)` pair at most once.
pub fn extract_asset_refs_from_document(
    base: &Url,
    doc: &Html,
    target_root: &str,
) -> Vec<AssetRef> {
    let mut out: Vec<AssetRef> = Vec::new();
    let from_page_url = base.to_string();

    // Resolves `raw` against the page URL and appends it to `out` unless it is
    // unusable (unparseable, non-web scheme, or host-less).
    let push = |out: &mut Vec<AssetRef>, raw: &str, kind: AssetRefKind| {
        let Ok(u) = base.join(raw) else { return };
        if !matches!(u.scheme(), "http" | "https") {
            return;
        }
        let host = match u.host_str() {
            Some(h) => h.to_ascii_lowercase(),
            None => return,
        };
        // Lazy fallback: only copy the host when no registrable domain exists.
        let to_domain = registrable_domain(&host).unwrap_or_else(|| host.clone());
        let is_internal = to_domain.eq_ignore_ascii_case(target_root);
        out.push(AssetRef {
            from_page_url: from_page_url.clone(),
            to_url: u.to_string(),
            to_domain,
            kind,
            is_internal,
        });
    };

    // <script src>
    for el in doc.select(&SCRIPT_SRC_SELECTOR) {
        if let Some(src) = el.value().attr("src") {
            push(&mut out, src, AssetRefKind::Script);
        }
    }

    // <link href>: stylesheet, font (preload hint or file extension), or other.
    for el in doc.select(&LINK_HREF_SELECTOR) {
        let Some(href) = el.value().attr("href") else {
            continue;
        };
        let rel = el.value().attr("rel").unwrap_or("").to_ascii_lowercase();
        let as_ = el.value().attr("as").unwrap_or("").to_ascii_lowercase();
        // `rel` is a space-separated token list, so test individual tokens.
        let is_stylesheet = rel.split_whitespace().any(|r| r == "stylesheet");
        let is_preload_font = rel.split_whitespace().any(|r| r == "preload") && as_ == "font";
        let looks_like_font = href_ends_with_font(href);
        let kind = if is_stylesheet {
            AssetRefKind::Style
        } else if is_preload_font || looks_like_font {
            AssetRefKind::Font
        } else {
            AssetRefKind::Other
        };
        push(&mut out, href, kind);
    }

    // <img src> and every <img srcset> candidate.
    for el in doc.select(&IMG_SELECTOR) {
        if let Some(src) = el.value().attr("src") {
            push(&mut out, src, AssetRefKind::Image);
        }
        if let Some(set) = el.value().attr("srcset") {
            for candidate in split_srcset(set) {
                push(&mut out, candidate, AssetRefKind::Image);
            }
        }
    }

    // <source>: the MIME `type` decides between video, audio and image.
    // NOTE(review): a type-less <source> inside <video>/<audio> is reported as
    // Image because only the `type` attribute is consulted, not the parent.
    for el in doc.select(&SOURCE_SELECTOR) {
        let ty = el.value().attr("type").unwrap_or("").to_ascii_lowercase();
        let kind = if ty.starts_with("video/") {
            AssetRefKind::Video
        } else if ty.starts_with("audio/") {
            AssetRefKind::Audio
        } else {
            AssetRefKind::Image
        };
        if let Some(src) = el.value().attr("src") {
            push(&mut out, src, kind);
        }
        if let Some(set) = el.value().attr("srcset") {
            for candidate in split_srcset(set) {
                push(&mut out, candidate, kind);
            }
        }
    }

    // <iframe src>
    for el in doc.select(&IFRAME_SRC_SELECTOR) {
        if let Some(src) = el.value().attr("src") {
            push(&mut out, src, AssetRefKind::Iframe);
        }
    }

    // Cloudflare Turnstile widgets: an element with class `cf-turnstile` and a
    // non-empty `data-sitekey`. Other `data-sitekey` carriers are ignored.
    for el in doc.select(&DATA_SITEKEY_SELECTOR) {
        let class = el.value().attr("class").unwrap_or("");
        let is_turnstile = class
            .split_ascii_whitespace()
            .any(|c| c.eq_ignore_ascii_case("cf-turnstile"));
        if !is_turnstile {
            continue;
        }
        let Some(sitekey) = el.value().attr("data-sitekey").map(str::trim) else {
            continue;
        };
        if sitekey.is_empty() {
            continue;
        }
        // The widget has no URL of its own, so a synthetic one is recorded that
        // points at the Turnstile script and carries the sitekey in its fragment.
        let synthetic = format!(
            "https://challenges.cloudflare.com/turnstile/v0/api.js#sitekey={}",
            sitekey
        );
        push(&mut out, &synthetic, AssetRefKind::Turnstile);
    }

    // <video src>
    for el in doc.select(&VIDEO_SRC_SELECTOR) {
        if let Some(src) = el.value().attr("src") {
            push(&mut out, src, AssetRefKind::Video);
        }
    }

    // <audio src>
    for el in doc.select(&AUDIO_SRC_SELECTOR) {
        if let Some(src) = el.value().attr("src") {
            push(&mut out, src, AssetRefKind::Audio);
        }
    }

    // <a href>
    for el in doc.select(&A_HREF_SELECTOR) {
        if let Some(href) = el.value().attr("href") {
            push(&mut out, href, AssetRefKind::Link);
        }
    }

    // Sorting first makes duplicates adjacent so `dedup_by` can remove them;
    // it also gives callers a deterministic order.
    out.sort_by(|a, b| a.to_url.cmp(&b.to_url).then(a.kind.cmp(&b.kind)));
    out.dedup_by(|a, b| a.to_url == b.to_url && a.kind == b.kind);
    out
}
/// Partial ordering that always agrees with the total order defined by `Ord`.
impl PartialOrd for AssetRefKind {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        // Delegate so the two orderings can never disagree.
        Some(Ord::cmp(self, other))
    }
}
/// Orders kinds by their `as_str` label, i.e. alphabetically by label and not
/// by variant declaration order.
impl Ord for AssetRefKind {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        let (lhs, rhs) = (self.as_str(), other.as_str());
        lhs.cmp(rhs)
    }
}
/// Yields the URL of every image candidate in a `srcset` attribute value.
///
/// Follows the candidate-splitting rules of the HTML "parse a srcset
/// attribute" algorithm instead of a plain `split(',')`:
///
/// * a URL is a maximal run of non-whitespace characters, so commas *inside*
///   a URL (image-CDN transforms such as `c_fill,w_400`, `data:` URIs) are kept;
/// * commas at the very end of that run terminate the candidate and are not
///   part of the URL;
/// * otherwise the descriptors (`2x`, `400w`, …) are skipped up to the next
///   comma that is not inside parentheses.
///
/// Descriptors are discarded; empty candidates produce nothing.
fn split_srcset(v: &str) -> impl Iterator<Item = &str> {
    let mut urls: Vec<&str> = Vec::new();
    let mut rest = v;

    loop {
        // Skip whitespace and stray commas between candidates.
        rest = rest.trim_start_matches(|c: char| c.is_ascii_whitespace() || c == ',');
        if rest.is_empty() {
            break;
        }

        // The URL runs up to the next whitespace character (or end of input).
        let url_len = rest
            .find(|c: char| c.is_ascii_whitespace())
            .unwrap_or(rest.len());
        let (raw_url, tail) = rest.split_at(url_len);
        // `raw_url` starts with a non-comma character, so `url` is never empty.
        let url = raw_url.trim_end_matches(',');
        urls.push(url);

        if url.len() < raw_url.len() {
            // A trailing comma ended the candidate: there are no descriptors.
            rest = tail;
            continue;
        }

        // Skip the descriptors: everything up to the next top-level comma.
        let mut in_parens = false;
        let mut end = tail.len();
        for (idx, c) in tail.char_indices() {
            match c {
                '(' => in_parens = true,
                ')' => in_parens = false,
                ',' if !in_parens => {
                    end = idx;
                    break;
                }
                _ => {}
            }
        }
        rest = &tail[end..];
    }

    urls.into_iter()
}
/// Reports whether `href` points at a file with a known web-font extension.
///
/// Anything from the first `?` or `#` onwards is ignored, and the comparison
/// is ASCII case-insensitive. Recognised extensions: `.woff`, `.woff2`,
/// `.ttf`, `.otf` and `.eot`.
fn href_ends_with_font(href: &str) -> bool {
    const FONT_EXTENSIONS: [&str; 5] = [".woff", ".woff2", ".ttf", ".otf", ".eot"];

    // Drop the query string / fragment, whichever starts first.
    let cut = href
        .find(|c: char| c == '?' || c == '#')
        .unwrap_or(href.len());
    let path = href[..cut].to_ascii_lowercase();

    FONT_EXTENSIONS.iter().any(|ext| path.ends_with(*ext))
}
// Unit tests for reference extraction and host categorisation.
#[cfg(test)]
mod tests {
use super::*;
// Page URL shared by the extraction tests; relative references resolve against it.
fn base() -> Url {
Url::parse("https://example.com/page").unwrap()
}
// One script, one stylesheet, one preloaded font and one image must each
// yield a reference of the matching kind.
#[test]
fn classifies_script_style_image() {
let html = r#"
<html><head>
<script src="https://cdn.example.com/app.js"></script>
<link rel="stylesheet" href="/assets/site.css">
<link rel="preload" as="font" href="/fonts/Inter.woff2">
</head><body>
<img src="https://images.fastly.net/x.jpg">
</body></html>
"#;
let refs = extract_asset_refs(&base(), html, "example.com");
let by_kind: std::collections::HashMap<AssetRefKind, Vec<String>> =
refs.iter()
.fold(std::collections::HashMap::new(), |mut m, r| {
m.entry(r.kind).or_default().push(r.to_url.clone());
m
});
assert!(by_kind.contains_key(&AssetRefKind::Script));
assert!(by_kind.contains_key(&AssetRefKind::Style));
assert!(by_kind.contains_key(&AssetRefKind::Font));
assert!(by_kind.contains_key(&AssetRefKind::Image));
}
// Links on the target root are internal; links to another domain are not.
#[test]
fn internal_vs_external_flag() {
let html = r#"
<html><body>
<a href="/about">about</a>
<a href="https://sub.example.com/x">sub</a>
<a href="https://other.net/y">other</a>
</body></html>
"#;
let refs = extract_asset_refs(&base(), html, "example.com");
let map: std::collections::HashMap<String, bool> = refs
.iter()
.map(|r| (r.to_domain.clone(), r.is_internal))
.collect();
assert_eq!(map.get("example.com"), Some(&true));
assert_eq!(map.get("other.net"), Some(&false));
}
// `mailto:`, `javascript:` and `data:` references are dropped; only the
// https link survives.
#[test]
fn skips_non_http_schemes() {
let html = r#"
<html><body>
<a href="mailto:test@example.com">mail</a>
<a href="javascript:void(0)">js</a>
<img src="data:image/png;base64,iVBOR">
<a href="https://real.test/">real</a>
</body></html>
"#;
let refs = extract_asset_refs(&base(), html, "example.com");
assert_eq!(refs.len(), 1);
assert_eq!(refs[0].to_domain, "real.test");
}
// Every `srcset` candidate produces its own reference.
#[test]
fn srcset_splits_candidates() {
let html = r#"
<html><body>
<img srcset="https://a.test/one.jpg 1x, https://b.test/two.jpg 2x">
</body></html>
"#;
let refs = extract_asset_refs(&base(), html, "example.com");
let domains: std::collections::HashSet<_> =
refs.iter().map(|r| r.to_domain.clone()).collect();
assert!(domains.contains("a.test"));
assert!(domains.contains("b.test"));
}
// The same URL referenced three times with the same kind is reported once.
#[test]
fn dedupes_same_url_same_kind() {
let html = r#"
<html><body>
<img src="https://a.test/x.jpg">
<img src="https://a.test/x.jpg">
<img src="https://a.test/x.jpg" alt="dup">
</body></html>
"#;
let refs = extract_asset_refs(&base(), html, "example.com");
assert_eq!(refs.len(), 1);
}
// A `cf-turnstile` element with a sitekey yields exactly one synthetic
// Turnstile reference that carries the sitekey.
#[test]
fn detects_turnstile_widget_with_sitekey() {
let html = r#"
<html><body>
<div class="cf-turnstile" data-sitekey="0x4AAAAAAAdummySitekey"></div>
<script src="https://challenges.cloudflare.com/turnstile/v0/api.js"></script>
</body></html>
"#;
let refs = extract_asset_refs(&base(), html, "example.com");
let turnstile: Vec<_> = refs
.iter()
.filter(|r| r.kind == AssetRefKind::Turnstile)
.collect();
assert_eq!(turnstile.len(), 1, "refs: {:?}", refs);
assert!(
turnstile[0]
.to_url
.contains("sitekey=0x4AAAAAAAdummySitekey"),
"got: {}",
turnstile[0].to_url
);
assert_eq!(turnstile[0].to_domain, "cloudflare.com");
}
// A `data-sitekey` on an element without the `cf-turnstile` class is ignored.
#[test]
fn turnstile_ignored_when_class_missing() {
let html = r#"
<html><body>
<div class="h-captcha" data-sitekey="not-cf"></div>
</body></html>
"#;
let refs = extract_asset_refs(&base(), html, "example.com");
assert!(refs.iter().all(|r| r.kind != AssetRefKind::Turnstile));
}
// A single table entry may contribute several categories.
#[test]
fn categorises_googletagmanager_as_analytics_and_tagmanager() {
let cats = categorise("googletagmanager.com");
assert!(cats.contains(&Category::Analytics), "got {:?}", cats);
assert!(cats.contains(&Category::TagManager), "got {:?}", cats);
}
// Both the bare domain and a subdomain are categorised.
#[test]
fn categorises_cloudfront_as_cdn() {
assert_eq!(categorise("cloudfront.net"), vec![Category::Cdn]);
assert_eq!(categorise("d1234abcd.cloudfront.net"), vec![Category::Cdn]);
}
// Unknown hosts and the empty string produce no categories.
#[test]
fn categorises_unknown_as_empty_vec() {
assert!(categorise("some-random-vendor.example").is_empty());
assert!(categorise("").is_empty());
}
// Exact host match and subdomain (suffix) match give the same result.
#[test]
fn exact_host_vs_suffix_match() {
assert_eq!(categorise("facebook.com"), vec![Category::Social]);
assert_eq!(categorise("www.facebook.com"), vec![Category::Social]);
}
// `Category::as_str` labels are kebab-case.
#[test]
fn category_as_str_is_kebab_case() {
assert_eq!(Category::TagManager.as_str(), "tag-manager");
assert_eq!(Category::FontService.as_str(), "font-service");
assert_eq!(Category::Cdn.as_str(), "cdn");
}
// Fonts are detected through `rel="preload" as="font"` and through the file
// extension; the stylesheet stays a Style and the canonical link is Other.
#[test]
fn font_detection_by_extension_and_rel() {
let html = r#"
<html><head>
<link rel="preload" as="font" href="/a.woff2" crossorigin>
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Inter">
<link rel="canonical" href="https://example.com/page">
<link href="/bare.ttf">
</head></html>
"#;
let refs = extract_asset_refs(&base(), html, "example.com");
let by_kind: std::collections::HashMap<AssetRefKind, usize> =
refs.iter()
.fold(std::collections::HashMap::new(), |mut m, r| {
*m.entry(r.kind).or_insert(0) += 1;
m
});
assert_eq!(by_kind.get(&AssetRefKind::Font).copied().unwrap_or(0), 2);
assert_eq!(by_kind.get(&AssetRefKind::Style).copied().unwrap_or(0), 1);
assert_eq!(by_kind.get(&AssetRefKind::Other).copied().unwrap_or(0), 1);
}
}