use regex::Regex;
use std::collections::HashSet;
use std::sync::LazyLock;
use texting_robots::Robot;
use url::Url;
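/// File extensions that are never crawled (binary assets, media, fonts, and styling files).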
static FILE_EXTENSIONS: &[&str] = &[
".png", ".jpg", ".jpeg", ".gif", ".css", ".js", ".ico", ".svg", ".tiff", ".zip", ".exe",
".dmg", ".mp4", ".mp3", ".wav", ".pptx", ".xlsx", ".avi", ".flv", ".woff", ".ttf", ".woff2",
".webp", ".inc",
];
static FILE_EXT_SET: LazyLock<HashSet<&'static str>> =
LazyLock::new(|| FILE_EXTENSIONS.iter().copied().collect());
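/// URL schemes other than HTTP(S) that the crawler rejects outright.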
const NON_WEB_PROTOCOLS: &[&str] = &[
"mailto:", "tel:", "telnet:", "ftp:", "ftps:", "ssh:", "file:",
];
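/// External domains (social networks and similar platforms) whose links are never followed.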
const SOCIAL_MEDIA_OR_EMAIL: &[&str] = &[
"facebook.com",
"twitter.com",
"linkedin.com",
"instagram.com",
"pinterest.com",
"github.com",
"calendly.com",
"discord.gg",
"discord.com",
];
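/// Reason a candidate link was rejected by `filter_links`.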
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DenyReason {
UrlParse,
DepthLimit,
FileType,
SectionLink,
BackwardCrawling,
ExcludePattern,
IncludePattern,
RobotsTxt,
NonWebProtocol,
SocialMedia,
ExternalLink,
}
impl DenyReason {
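    /// Kebab-case string form of the denial reason.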
pub fn as_str(self) -> &'static str {
match self {
Self::UrlParse => "url-parse",
Self::DepthLimit => "depth-limit",
Self::FileType => "file-type",
Self::SectionLink => "section-link",
Self::BackwardCrawling => "backward-crawling",
Self::ExcludePattern => "exclude-pattern",
Self::IncludePattern => "include-pattern",
Self::RobotsTxt => "robots-txt",
Self::NonWebProtocol => "non-web-protocol",
Self::SocialMedia => "social-media",
Self::ExternalLink => "external-link",
}
}
}
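/// Candidate links plus the crawl configuration consumed by `filter_links`.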
pub struct FilterLinksInput<'a> {
pub links: Vec<String>,
pub limit: Option<usize>,
pub max_depth: u32,
pub base_url: &'a Url,
pub initial_url: &'a Url,
pub regex_on_full_url: bool,
pub excludes: &'a [Regex],
pub includes: &'a [Regex],
pub allow_backward_crawling: bool,
pub ignore_robots_txt: bool,
pub robots_txt: &'a str,
pub robots_user_agent: Option<&'a str>,
pub allow_external_content_links: bool,
pub allow_subdomains: bool,
}
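/// Accepted links, plus each rejected link paired with its denial reason.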
#[derive(Debug, Clone)]
pub struct FilterLinksResult {
pub links: Vec<String>,
pub denials: Vec<(String, DenyReason)>,
}
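/// True when the path ends in one of the skipped file extensions (case-insensitive).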
pub fn is_file(path: &str) -> bool {
if let Some(dot_pos) = path.rfind('.') {
let extension = path[dot_pos..].to_ascii_lowercase();
FILE_EXT_SET.contains(extension.as_str())
} else {
false
}
}
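/// Number of non-empty path segments, ignoring `index.php` and `index.html`.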
pub fn get_url_depth(path: &str) -> u32 {
path.split('/')
.filter(|seg| !seg.is_empty() && *seg != "index.php" && *seg != "index.html")
.count() as u32
}
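/// True when both URLs share the same host, ignoring a leading `www.`.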
pub fn is_internal_link(url: &Url, base_url: &Url) -> bool {
let base_domain = url_host_bare(base_url);
let link_domain = url_host_bare(url);
link_domain == base_domain
}
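/// Host of the URL, lowercased and with a leading `www.` stripped; empty if the URL has no host.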
fn url_host_bare(u: &Url) -> String {
    u.host_str()
        .unwrap_or("")
        .to_ascii_lowercase()
        .trim_start_matches("www.")
        .to_string()
}
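/// True when the URL has no fragment, or when the fragment looks like a hash route
/// (longer than one character and containing `/`) rather than an in-page anchor.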
pub fn no_sections(url_str: &str) -> bool {
if !url_str.contains('#') {
return true;
}
if let Some(hash_part) = url_str.split('#').nth(1) {
hash_part.len() > 1 && hash_part.contains('/')
} else {
false
}
}
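/// True when the URL uses one of the schemes in `NON_WEB_PROTOCOLS`.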
pub fn is_non_web_protocol(url_str: &str) -> bool {
NON_WEB_PROTOCOLS.iter().any(|p| url_str.starts_with(p))
}
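/// True when the URL mentions one of the domains in `SOCIAL_MEDIA_OR_EMAIL` (substring match).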
pub fn is_social_media_or_email(url_str: &str) -> bool {
SOCIAL_MEDIA_OR_EMAIL
.iter()
.any(|domain| url_str.contains(domain))
}
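/// True when both hosts share the same registrable domain according to the public suffix list,
/// e.g. a link to `docs.example.com` seen while crawling `example.com`.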
pub fn is_subdomain(url: &Url, base_url: &Url) -> bool {
match (url.host_str(), base_url.host_str()) {
(Some(link_host), Some(base_host)) => {
match (psl::domain_str(link_host), psl::domain_str(base_host)) {
(Some(a), Some(b)) => a == b,
_ => false,
}
}
_ => false,
}
}
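/// True when the URL parses and points at a site root (no non-empty path segments);
/// false when it fails to parse.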
pub fn is_external_main_page(url_str: &str) -> bool {
    if let Ok(url) = Url::parse(url_str) {
        url.path_segments()
            .map(|mut segs| segs.all(|seg| seg.is_empty()))
            .unwrap_or(true)
    } else {
        false
    }
}
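/// Parses `robots_txt` for the given user agent (falling back to `"crawlex"`); returns `None`
/// when robots.txt handling is disabled, the file is empty, or it fails to parse.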
fn build_robot(
ignore_robots_txt: bool,
robots_txt: &str,
robots_user_agent: Option<&str>,
) -> Option<Robot> {
if ignore_robots_txt || robots_txt.is_empty() {
return None;
}
if let Some(ua) = robots_user_agent {
return Robot::new(ua, robots_txt.as_bytes()).ok();
}
Robot::new("crawlex", robots_txt.as_bytes()).ok()
}
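/// Resolves each candidate link against `base_url` and applies the configured filters
/// (protocol, depth, file type, fragments, backward crawling, include/exclude patterns,
/// robots.txt, and external-link policy), stopping once `limit` links have been accepted.
/// Returns the accepted links together with every rejected link and its `DenyReason`.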
pub fn filter_links(data: FilterLinksInput<'_>) -> FilterLinksResult {
let limit = data.limit.unwrap_or(usize::MAX);
if limit == 0 {
return FilterLinksResult {
links: Vec::new(),
denials: Vec::new(),
};
}
let initial_path = data.initial_url.path().to_string();
let robot = build_robot(
data.ignore_robots_txt,
data.robots_txt,
data.robots_user_agent,
);
let mut out = Vec::new();
let mut denials: Vec<(String, DenyReason)> = Vec::new();
let push_deny = |denials: &mut Vec<(String, DenyReason)>, link: String, r: DenyReason| {
denials.push((link, r));
};
for link in data.links {
if out.len() >= limit {
break;
}
let url = match data.base_url.join(&link) {
Ok(u) => u,
Err(_) => {
push_deny(&mut denials, link, DenyReason::UrlParse);
continue;
}
};
let path = url.path().to_string();
let url_str = url.as_str().to_string();
if is_non_web_protocol(&url_str) {
push_deny(&mut denials, link, DenyReason::NonWebProtocol);
continue;
}
if get_url_depth(&path) > data.max_depth {
push_deny(&mut denials, link, DenyReason::DepthLimit);
continue;
}
if is_file(&path) {
push_deny(&mut denials, link, DenyReason::FileType);
continue;
}
if is_internal_link(&url, data.base_url) {
if !no_sections(&url_str) {
push_deny(&mut denials, link, DenyReason::SectionLink);
continue;
}
if !data.allow_backward_crawling && !path.starts_with(&initial_path) {
push_deny(&mut denials, link, DenyReason::BackwardCrawling);
continue;
}
let match_target: &str = if data.regex_on_full_url {
&url_str
} else {
&path
};
if !data.excludes.is_empty() && data.excludes.iter().any(|r| r.is_match(match_target)) {
push_deny(&mut denials, link, DenyReason::ExcludePattern);
continue;
}
if !data.includes.is_empty() && !data.includes.iter().any(|r| r.is_match(match_target))
{
push_deny(&mut denials, link, DenyReason::IncludePattern);
continue;
}
if let Some(ref robot) = robot {
if !robot.allowed(&url_str) {
push_deny(&mut denials, link, DenyReason::RobotsTxt);
continue;
}
}
out.push(link);
} else {
if is_social_media_or_email(&url_str) {
push_deny(&mut denials, link, DenyReason::SocialMedia);
continue;
}
if !data.excludes.is_empty() && data.excludes.iter().any(|r| r.is_match(&url_str)) {
push_deny(&mut denials, link, DenyReason::ExcludePattern);
continue;
}
if is_internal_link(data.initial_url, data.base_url)
&& data.allow_external_content_links
&& !is_external_main_page(&url_str)
{
out.push(link);
continue;
}
if data.allow_subdomains
&& !is_social_media_or_email(&url_str)
&& is_subdomain(&url, data.base_url)
{
let match_target: &str = if data.regex_on_full_url {
&url_str
} else {
&path
};
if !data.includes.is_empty()
&& !data.includes.iter().any(|r| r.is_match(match_target))
{
push_deny(&mut denials, link, DenyReason::IncludePattern);
continue;
}
out.push(link);
continue;
}
push_deny(&mut denials, link, DenyReason::ExternalLink);
}
}
FilterLinksResult {
links: out,
denials,
}
}
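// Minimal usage sketch: the URLs below are illustrative and the assertions simply trace
// the filter rules implemented above; adapt the configuration to a real crawl as needed.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn filters_links_by_kind() {
        let base_url = Url::parse("https://example.com/docs/").unwrap();
        let initial_url = Url::parse("https://example.com/docs/").unwrap();
        let result = filter_links(FilterLinksInput {
            links: vec![
                "getting-started".to_string(),             // internal page: kept
                "/docs/assets/logo.png".to_string(),       // skipped extension: FileType
                "mailto:team@example.com".to_string(),     // non-web scheme: NonWebProtocol
                "https://twitter.com/example".to_string(), // social domain: SocialMedia
            ],
            limit: None,
            max_depth: 10,
            base_url: &base_url,
            initial_url: &initial_url,
            regex_on_full_url: false,
            excludes: &[],
            includes: &[],
            allow_backward_crawling: false,
            ignore_robots_txt: true,
            robots_txt: "",
            robots_user_agent: None,
            allow_external_content_links: false,
            allow_subdomains: false,
        });
        assert_eq!(result.links, vec!["getting-started".to_string()]);
        assert_eq!(result.denials.len(), 3);
        assert!(result
            .denials
            .iter()
            .any(|(_, reason)| *reason == DenyReason::FileType));
    }
}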