use std::sync::LazyLock;
use comfy_table::presets::UTF8_FULL_CONDENSED;
use comfy_table::{Attribute, Cell, CellAlignment, ContentArrangement, Table};
use dom_content_extraction::scraper::{Html, Selector};
use serde::{Deserialize, Serialize};
use url::Url;
use crate::output::RenderOutput;
/// CSS selector matching every `<a>` element that carries an `href` attribute.
///
/// Wrapped in `LazyLock` so the selector is parsed once, on first use.
// The `unwrap` guards a fixed selector literal, so a failure here would be a
// programming error rather than a runtime condition.
static A_HREF: LazyLock<Selector> =
LazyLock::new(|| Selector::parse("a[href]").unwrap());
/// An anchor as found in the document, before its `href` is resolved against
/// a base URL.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RawLink {
/// The `href` attribute value.
pub href: String,
/// Text content of the anchor, or `None` when there is none to report.
pub text: Option<String>,
/// The `rel` attribute value, when the anchor has one.
pub rel: Option<String>,
}
/// A link with both its original `href` text and its parsed URL.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Link {
/// The `href` as it was written in the document.
pub raw_url: String,
/// The parsed URL for this link.
pub url: Url,
/// Anchor text, if any.
pub text: Option<String>,
/// The `rel` attribute value, if any.
pub rel: Option<String>,
/// Whether the link is classified as belonging to the same site as the page.
pub is_internal: bool,
}
/// Which class of links a report is restricted to.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LinkFilter {
    /// No restriction.
    All,
    /// Internal links only.
    Internal,
    /// External links only.
    External,
}

impl LinkFilter {
    /// Parses the lowercase name of a filter (`"all"`, `"internal"`,
    /// `"external"`).
    ///
    /// Matching is exact and case-sensitive; any other input yields `None`.
    pub fn parse(value: &str) -> Option<Self> {
        // Look the name up through `as_str` so the two directions of the
        // mapping share one table.
        [Self::All, Self::Internal, Self::External]
            .iter()
            .copied()
            .find(|candidate| candidate.as_str() == value)
    }

    /// Returns the canonical lowercase name of the filter.
    pub fn as_str(self) -> &'static str {
        match self {
            Self::External => "external",
            Self::Internal => "internal",
            Self::All => "all",
        }
    }
}
/// One row of the "URL Groups" report section.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LinkGroup {
/// Name of the section the group represents.
pub section: String,
/// Number of links attributed to the section.
pub count: usize,
/// Sample URLs for the section; the text report prints one per line.
pub samples: Vec<String>,
}
/// Everything needed to render a links report as text, JSON or TOON.
#[derive(Debug, Clone)]
pub struct LinksOutput {
/// URL shown in the report header.
pub url: String,
/// Filter the report was produced with.
pub filter: LinkFilter,
/// Count of internal links.
pub total_internal: usize,
/// Count of external links.
pub total_external: usize,
/// Links listed in the report.
pub links: Vec<Link>,
/// Rows of the "URL Groups" section.
pub groups: Vec<LinkGroup>,
/// `(depth, count)` pairs for the "Path Depth" section.
pub depth_distribution: Vec<(usize, usize)>,
/// URLs listed in the "Utility URLs" section.
pub utility_urls: Vec<String>,
}
/// Collects every `<a href>` element of `document`, in the order the selector
/// yields them, without resolving or validating the `href`.
///
/// The anchor's text nodes are concatenated and trimmed; an anchor whose text
/// is empty after trimming gets `text: None`. The `href` itself is copied
/// verbatim.
pub fn extract_raw_links(document: &Html) -> Vec<RawLink> {
    let mut found = Vec::new();
    for anchor in document.select(&A_HREF) {
        let attrs = anchor.value();
        let Some(href) = attrs.attr("href") else {
            continue;
        };
        // Join all descendant text nodes, then trim the combined string.
        let joined: String = anchor.text().collect();
        let label = joined.trim();
        found.push(RawLink {
            href: href.to_owned(),
            text: (!label.is_empty()).then(|| label.to_owned()),
            rel: attrs.attr("rel").map(str::to_owned),
        });
    }
    found
}
pub fn extract_links(document: &Html, base_url: &Url) -> Vec<Link> {
let opts = LinkOptions {
normalize: true,
..Default::default()
};
extract_links_inner(document, base_url, &opts)
}
/// Resolves every raw link of `document` against `base_url` and applies the
/// post-processing selected in `opts`.
///
/// * Anchors whose `href` cannot be joined onto `base_url` are dropped.
/// * A link is internal only when both the page and the link have a
///   registered domain and the two are equal; if either side has none, the
///   link is classified as external.
/// * `opts.max == 0` means "no limit"; otherwise only the first `opts.max`
///   resolved links are returned.
fn extract_links_inner(
    document: &Html,
    base_url: &Url,
    opts: &LinkOptions,
) -> Vec<Link> {
    let page_domain = extract_registered_domain(base_url);
    let mut collected: Vec<Link> = Vec::new();

    for raw in extract_raw_links(document) {
        let Ok(target) = base_url.join(&raw.href) else {
            continue;
        };
        let target_domain = extract_registered_domain(&target);
        // Classified on the URL as resolved, before any normalization below.
        let is_internal = page_domain.is_some() && page_domain == target_domain;

        let mut link = Link {
            raw_url: raw.href,
            url: target,
            text: raw.text,
            rel: raw.rel,
            is_internal,
        };
        if opts.normalize {
            link.normalize();
        }
        if opts.strip_tracking_params {
            link.strip_tracking();
        }
        collected.push(link);
    }

    // `truncate` is a no-op when the list is already short enough.
    if opts.max != 0 {
        collected.truncate(opts.max);
    }
    collected
}
/// Returns the registrable domain of `url`'s host as reported by the `psl`
/// crate (e.g. `example.co.uk` for `www.example.co.uk`).
///
/// Yields `None` when the URL has no host or when `psl` finds no registrable
/// domain for it.
pub fn extract_registered_domain(url: &Url) -> Option<String> {
    let host = url.host_str()?;
    let registrable = psl::domain_str(host)?;
    Some(registrable.to_owned())
}
impl Link {
    /// Canonicalises `url` in place: ASCII-lowercases the host (when there is
    /// one) and removes the fragment. Path and query are left untouched.
    pub fn normalize(&mut self) {
        if let Some(host) = self.url.host_str().map(|h| h.to_ascii_lowercase()) {
            // Best effort: if the URL rejects the host, keep the original one.
            let _ = self.url.set_host(Some(&host));
        }
        self.url.set_fragment(None);
    }

    /// Removes well-known tracking parameters (`utm_*`, `fbclid`, `gclid`)
    /// from the query string.
    ///
    /// Parameter names are compared ASCII case-insensitively. When nothing
    /// survives the filter the query is removed altogether; otherwise the
    /// query is rebuilt from the surviving pairs in their original order.
    pub fn strip_tracking(&mut self) {
        const TRACKING: &[&str] = &[
            "utm_source",
            "utm_medium",
            "utm_campaign",
            "utm_term",
            "utm_content",
            "utm_id",
            "fbclid",
            "gclid",
        ];
        // Collect owned pairs first: `query_pairs` borrows the URL, which must
        // be released before it can be mutated below.
        let pairs: Vec<(String, String)> = self
            .url
            .query_pairs()
            .filter(|(k, _)| !TRACKING.iter().any(|tk| k.eq_ignore_ascii_case(tk)))
            .map(|(k, v)| (k.into_owned(), v.into_owned()))
            .collect();
        if pairs.is_empty() {
            // Avoid leaving a dangling `?` behind.
            self.url.set_query(None);
        } else {
            self.url.query_pairs_mut().clear().extend_pairs(pairs);
        }
    }

    /// Returns `true` when this link's host string equals `other`'s exactly
    /// (two URLs without a host also compare equal).
    #[allow(dead_code)]
    pub fn is_same_host(&self, other: &Url) -> bool {
        self.url.host_str() == other.host_str()
    }

    /// Returns `true` when the last path segment ends in a known static-asset
    /// or document extension (compared ASCII case-insensitively).
    ///
    /// Only the URL path is inspected, so a query string after the file name
    /// does not affect the result. A final segment without a `.` has no
    /// extension and is never an asset.
    #[allow(dead_code)]
    pub fn is_asset(&self) -> bool {
        const EXTS: &[&str] = &[
            "css", "js", "mjs", "png", "jpg", "jpeg", "gif", "svg", "webp", "ico",
            "bmp", "woff", "woff2", "ttf", "eot", "otf", "mp4", "webm", "mp3",
            "ogg", "wav", "pdf", "doc", "docx", "xls", "xlsx", "zip", "tar", "gz",
        ];
        let path = self.url.path();
        // Restrict the search to the final segment so dots in directory names
        // are ignored.
        let file_name = path.rsplit('/').next().unwrap_or(path);
        // `rsplit_once` yields `None` for dot-less names, so a bare path such
        // as `zip` is not mistaken for an extension.
        file_name
            .rsplit_once('.')
            .is_some_and(|(_, ext)| EXTS.iter().any(|known| known.eq_ignore_ascii_case(ext)))
    }
}
impl LinksOutput {
    /// Builds the JSON value that both the JSON and TOON renderers encode.
    ///
    /// When the filter is `Internal` the `total_external` key is omitted, and
    /// when it is `External` the `total_internal` key is omitted.
    fn render_value(&self) -> serde_json::Value {
        let mut links = Vec::with_capacity(self.links.len());
        for link in &self.links {
            links.push(serde_json::json!({
                "raw_url": link.raw_url,
                "url": link.url.as_str(),
                "text": link.text,
                "rel": link.rel,
                "is_internal": link.is_internal,
            }));
        }

        // Each (depth, count) pair is emitted as a two-element array.
        let mut depth_distribution = Vec::with_capacity(self.depth_distribution.len());
        for (depth, count) in &self.depth_distribution {
            depth_distribution.push(serde_json::json!([depth, count]));
        }

        let mut value = serde_json::json!({
            "url": &self.url,
            "filter": self.filter.as_str(),
            "total_internal": self.total_internal,
            "total_external": self.total_external,
            "links": links,
            "groups": &self.groups,
            "depth_distribution": depth_distribution,
            "utility_urls": &self.utility_urls,
        });

        // Drop the total for the side the filter excludes.
        let suppressed = match self.filter {
            LinkFilter::All => None,
            LinkFilter::Internal => Some("total_external"),
            LinkFilter::External => Some("total_internal"),
        };
        if let Some(key) = suppressed {
            // `value` was built from an object literal, so this always matches.
            if let Some(fields) = value.as_object_mut() {
                fields.remove(key);
            }
        }
        value
    }
}
impl RenderOutput for LinksOutput {
    /// Renders the report as plain text.
    ///
    /// The output starts with a header (URL, filter, internal/external
    /// totals), followed by a table of links or the line
    /// `(no links matched)`. The "URL Groups", "Path Depth" and
    /// "Utility URLs" sections are only emitted when they have content.
    fn render_text(&self) -> String {
        let mut out = String::new();
        out.push_str("\n## Links\n");
        out.push_str(&format!("URL: {}\n", self.url));
        out.push_str(&format!("Filter: {}\n", self.filter.as_str()));
        out.push_str(&format!("Internal: {}\n", self.total_internal));
        out.push_str(&format!("External: {}\n", self.total_external));

        if self.links.is_empty() {
            out.push_str("(no links matched)\n");
        } else {
            // Only build the table when there is something to show.
            let mut links_table = Table::new();
            links_table.set_content_arrangement(ContentArrangement::Dynamic);
            links_table.load_preset(UTF8_FULL_CONDENSED);
            links_table.set_header(vec![
                Cell::new("Type").add_attribute(Attribute::Bold),
                Cell::new("URL").add_attribute(Attribute::Bold),
                Cell::new("Raw").add_attribute(Attribute::Bold),
                Cell::new("Text").add_attribute(Attribute::Bold),
                Cell::new("Rel").add_attribute(Attribute::Bold),
            ]);
            for link in &self.links {
                links_table.add_row(vec![
                    Cell::new(if link.is_internal {
                        "internal"
                    } else {
                        "external"
                    }),
                    Cell::new(link.url.as_str()),
                    Cell::new(&link.raw_url),
                    Cell::new(link.text.as_deref().unwrap_or("")),
                    Cell::new(link.rel.as_deref().unwrap_or("")),
                ]);
            }
            out.push_str(&links_table.to_string());
            out.push('\n');
        }

        if !self.groups.is_empty() {
            out.push_str("\n## URL Groups\n");
            let mut sections_table = Table::new();
            sections_table.set_content_arrangement(ContentArrangement::Dynamic);
            sections_table.load_preset(UTF8_FULL_CONDENSED);
            sections_table.set_header(vec![
                Cell::new("Section").add_attribute(Attribute::Bold),
                Cell::new("Links")
                    .add_attribute(Attribute::Bold)
                    .set_alignment(CellAlignment::Right),
                Cell::new("Sample URLs").add_attribute(Attribute::Bold),
            ]);
            for group in &self.groups {
                sections_table.add_row(vec![
                    Cell::new(&group.section),
                    Cell::new(group.count).set_alignment(CellAlignment::Right),
                    // One sample per line inside the cell.
                    Cell::new(group.samples.join("\n")),
                ]);
            }
            out.push_str(&sections_table.to_string());
            out.push('\n');
        }

        if !self.depth_distribution.is_empty() {
            out.push_str("\n## Path Depth\n");
            let mut depth_table = Table::new();
            depth_table.set_content_arrangement(ContentArrangement::Dynamic);
            depth_table.load_preset(UTF8_FULL_CONDENSED);
            depth_table.set_header(vec![
                Cell::new("Depth").add_attribute(Attribute::Bold),
                Cell::new("Count").add_attribute(Attribute::Bold),
            ]);
            for (depth, count) in &self.depth_distribution {
                depth_table.add_row(vec![Cell::new(depth), Cell::new(count)]);
            }
            out.push_str(&depth_table.to_string());
            out.push('\n');
        }

        if !self.utility_urls.is_empty() {
            out.push_str("\n## Utility URLs\n");
            // Single-column table without a header row.
            let mut util_table = Table::new();
            util_table.set_content_arrangement(ContentArrangement::Dynamic);
            util_table.load_preset(UTF8_FULL_CONDENSED);
            for url in &self.utility_urls {
                util_table.add_row(vec![Cell::new(url)]);
            }
            out.push_str(&util_table.to_string());
            out.push('\n');
        }

        out
    }

    /// Renders the report as pretty-printed JSON; yields an empty string if
    /// serialization fails.
    fn render_json(&self) -> String {
        serde_json::to_string_pretty(&self.render_value()).unwrap_or_default()
    }

    /// Renders the same value as `render_json` in TOON format; yields an
    /// empty string if encoding fails.
    fn render_toon(&self) -> String {
        toon_format::encode_default(&self.render_value()).unwrap_or_default()
    }
}
/// Post-processing switches for link extraction. The derived `Default`
/// disables everything and sets no limit.
#[derive(Debug, Clone, Default)]
pub struct LinkOptions {
/// Enables link normalization.
pub normalize: bool,
/// Enables removal of tracking query parameters.
pub strip_tracking_params: bool,
/// Maximum number of links to return; `0` means unlimited.
pub max: usize,
}
// Unit tests for domain extraction, link extraction, URL normalization,
// tracking-parameter stripping, report rendering and the `Link` helpers.
#[cfg(test)]
mod tests {
use super::*;
// Builds a `Link` from an absolute URL with no text or `rel`; panics if the
// URL does not parse.
fn make_link(url: &str, is_internal: bool) -> Link {
Link {
raw_url: url.to_string(),
url: Url::parse(url).unwrap(),
text: None,
rel: None,
is_internal,
}
}
// --- extract_registered_domain ---
#[test]
fn extract_registered_domain_simple() {
let url = Url::parse("https://example.com/page").unwrap();
assert_eq!(extract_registered_domain(&url), Some("example.com".into()));
}
#[test]
fn extract_registered_domain_www() {
let url = Url::parse("https://www.example.com/page").unwrap();
assert_eq!(extract_registered_domain(&url), Some("example.com".into()));
}
#[test]
fn extract_registered_domain_multi_part_tld() {
let url = Url::parse("https://www.example.co.uk/page").unwrap();
assert_eq!(
extract_registered_domain(&url),
Some("example.co.uk".into())
);
}
#[test]
fn extract_registered_domain_multi_part_jp() {
let url = Url::parse("https://www.example.co.jp/page").unwrap();
assert_eq!(
extract_registered_domain(&url),
Some("example.co.jp".into())
);
}
// A `file:` URL has no host, so there is no domain to report.
#[test]
fn extract_registered_domain_no_host() {
let url = Url::parse("file:///path/to/file").unwrap();
assert_eq!(extract_registered_domain(&url), None);
}
// --- extract_raw_links ---
// The raw `href` keeps its surrounding whitespace, while anchor text is trimmed.
#[test]
fn extract_raw_links_preserves_href_text_rel_and_order() {
let html = r#"<html><body>
<a href="/about" rel="nofollow"> About </a>
<a href=" contact "></a>
<a href="https://other.com/page">External</a>
</body></html>"#;
let doc = Html::parse_document(html);
let links = extract_raw_links(&doc);
assert_eq!(links.len(), 3);
assert_eq!(links[0].href, "/about");
assert_eq!(links[0].text.as_deref(), Some("About"));
assert_eq!(links[0].rel.as_deref(), Some("nofollow"));
assert_eq!(links[1].href, " contact ");
assert!(links[1].text.is_none());
assert_eq!(links[2].href, "https://other.com/page");
}
// --- extract_links ---
#[test]
fn extract_links_basic() {
let html = r#"<html><body>
<a href="https://example.com/about">About Us</a>
<a href="https://other.com/page">External</a>
</body></html>"#;
let doc = Html::parse_document(html);
let base = Url::parse("https://example.com/").unwrap();
let links = extract_links(&doc, &base);
assert_eq!(links.len(), 2);
assert!(links[0].is_internal);
assert!(!links[1].is_internal);
}
// `raw_url` keeps the original text; `url` holds the resolved absolute URL.
#[test]
fn extract_links_resolves_relative() {
let html = r#"<html><body>
<a href="/about">About</a>
<a href="contact">Contact</a>
</body></html>"#;
let doc = Html::parse_document(html);
let base = Url::parse("https://example.com/").unwrap();
let links = extract_links(&doc, &base);
assert_eq!(links[0].raw_url, "/about");
assert_eq!(links[0].url.as_str(), "https://example.com/about");
assert_eq!(links[1].raw_url, "contact");
assert_eq!(links[1].url.as_str(), "https://example.com/contact");
}
#[test]
fn extract_links_text_and_rel() {
let html = r#"<html><body>
<a href="https://example.com" rel="nofollow">Click here</a>
</body></html>"#;
let doc = Html::parse_document(html);
let base = Url::parse("https://example.com/").unwrap();
let links = extract_links(&doc, &base);
assert_eq!(links[0].text.as_deref(), Some("Click here"));
assert_eq!(links[0].rel.as_deref(), Some("nofollow"));
}
// Whitespace-only anchor text is reported as `None`.
#[test]
fn extract_links_empty_text() {
let html = r#"<html><body>
<a href="https://example.com"> </a>
</body></html>"#;
let doc = Html::parse_document(html);
let base = Url::parse("https://example.com/").unwrap();
let links = extract_links(&doc, &base);
assert!(links[0].text.is_none());
}
#[test]
fn extract_links_no_href_ignored() {
let html = r#"<html><body>
<a name="anchor">No href</a>
<a href="https://example.com">Has href</a>
</body></html>"#;
let doc = Html::parse_document(html);
let base = Url::parse("https://example.com/").unwrap();
let links = extract_links(&doc, &base);
assert_eq!(links.len(), 1);
}
// --- Link::normalize ---
// Only the host is lowercased; the path keeps its case.
#[test]
fn normalize_lowercases_host() {
let mut link = make_link("https://EXAMPLE.COM/Page", true);
link.normalize();
assert_eq!(link.url.as_str(), "https://example.com/Page");
}
#[test]
fn normalize_drops_fragment() {
let mut link = make_link("https://example.com/page#section", true);
link.normalize();
assert_eq!(link.url.as_str(), "https://example.com/page");
}
#[test]
fn normalize_preserves_query() {
let mut link = make_link("https://example.com/page?foo=bar", true);
link.normalize();
assert_eq!(link.url.as_str(), "https://example.com/page?foo=bar");
}
// A `mailto:` URL must come through normalization unchanged.
#[test]
fn normalize_handles_mailto() {
let mut link = make_link("mailto:foo@bar.com", false);
link.normalize();
assert_eq!(link.url.as_str(), "mailto:foo@bar.com");
}
// --- Link::strip_tracking ---
#[test]
fn strip_tracking_removes_utm() {
let mut link =
make_link("https://example.com/page?utm_source=fb&keep=1", true);
link.strip_tracking();
assert_eq!(link.url.as_str(), "https://example.com/page?keep=1");
}
#[test]
fn strip_tracking_removes_fbclid_gclid() {
let mut link =
make_link("https://example.com/page?fbclid=abc&gclid=def&real=1", true);
link.strip_tracking();
assert_eq!(link.url.as_str(), "https://example.com/page?real=1");
}
#[test]
fn strip_tracking_case_insensitive() {
let mut link =
make_link("https://example.com/page?UTM_SOURCE=x&utm_medium=y", true);
link.strip_tracking();
assert_eq!(link.url.as_str(), "https://example.com/page");
}
// No trailing `?` may remain once every parameter has been stripped.
#[test]
fn strip_tracking_removes_all_query_when_only_tracking() {
let mut link =
make_link("https://example.com/page?utm_source=x&utm_medium=y", true);
link.strip_tracking();
assert_eq!(link.url.as_str(), "https://example.com/page");
}
#[test]
fn strip_tracking_preserves_no_query_url() {
let mut link = make_link("https://example.com/page", true);
link.strip_tracking();
assert_eq!(link.url.as_str(), "https://example.com/page");
}
// --- LinkOptions ---
#[test]
fn link_options_defaults() {
let opts = LinkOptions::default();
assert!(!opts.normalize);
assert!(!opts.strip_tracking_params);
assert_eq!(opts.max, 0);
}
// --- LinksOutput rendering ---
#[test]
fn links_output_json_includes_selected_facts() {
let output = LinksOutput {
url: "https://example.com/".to_string(),
filter: LinkFilter::All,
total_internal: 2,
total_external: 1,
links: vec![Link {
raw_url: "/docs".to_string(),
url: Url::parse("https://example.com/docs").unwrap(),
text: Some("Docs".to_string()),
rel: None,
is_internal: true,
}],
groups: vec![LinkGroup {
section: "docs".to_string(),
count: 2,
samples: vec!["/docs".to_string()],
}],
depth_distribution: vec![(1, 2)],
utility_urls: vec!["https://example.com/privacy".to_string()],
};
let parsed: serde_json::Value =
serde_json::from_str(&output.render_json()).unwrap();
assert_eq!(parsed["url"], "https://example.com/");
assert_eq!(parsed["filter"], "all");
assert_eq!(parsed["total_internal"], 2);
assert_eq!(parsed["total_external"], 1);
assert_eq!(parsed["links"][0]["raw_url"], "/docs");
assert_eq!(parsed["links"][0]["url"], "https://example.com/docs");
assert_eq!(parsed["links"][0]["text"], "Docs");
assert_eq!(parsed["links"][0]["is_internal"], true);
assert_eq!(parsed["groups"][0]["section"], "docs");
assert_eq!(parsed["depth_distribution"][0][0], 1);
}
// TOON output must encode the same value as JSON, and the `Internal` filter
// must hide the external total.
#[test]
fn links_output_toon_uses_same_value() {
let output = LinksOutput {
url: "https://example.com/".to_string(),
filter: LinkFilter::Internal,
total_internal: 2,
total_external: 1,
links: Vec::new(),
groups: Vec::new(),
depth_distribution: Vec::new(),
utility_urls: Vec::new(),
};
let expected = toon_format::encode_default(&output.render_value()).unwrap();
assert_eq!(output.render_toon(), expected);
assert!(!output.render_json().contains("total_external"));
}
// --- Link::is_same_host ---
#[test]
fn is_same_host_match() {
let link = make_link("https://example.com/page", true);
let other = Url::parse("https://example.com/other").unwrap();
assert!(link.is_same_host(&other));
}
// Host comparison is exact: sibling subdomains do not match.
#[test]
fn is_same_host_different_subdomain() {
let link = make_link("https://www.example.com/page", true);
let other = Url::parse("https://blog.example.com/other").unwrap();
assert!(!link.is_same_host(&other));
}
#[test]
fn is_same_host_different_domain() {
let link = make_link("https://example.com/page", true);
let other = Url::parse("https://other.com/page").unwrap();
assert!(!link.is_same_host(&other));
}
// --- Link::is_asset ---
#[test]
fn is_asset_css() {
let link = make_link("https://example.com/static/style.css", true);
assert!(link.is_asset());
}
#[test]
fn is_asset_js() {
let link = make_link("https://example.com/js/main.js", true);
assert!(link.is_asset());
}
#[test]
fn is_asset_image() {
let link = make_link("https://example.com/img/photo.png", true);
assert!(link.is_asset());
}
#[test]
fn is_asset_font() {
let link = make_link("https://example.com/fonts/roboto.woff2", true);
assert!(link.is_asset());
}
#[test]
fn is_asset_html_page() {
let link = make_link("https://example.com/about", true);
assert!(!link.is_asset());
}
// The query string is not part of the path, so the extension is still found.
#[test]
fn is_asset_query_after_extension() {
let link = make_link("https://example.com/bundle.js?v=3", true);
assert!(link.is_asset());
}
#[test]
fn is_asset_no_extension() {
let link = make_link("https://example.com/page", true);
assert!(!link.is_asset());
}
}