use color_eyre::eyre::{Context, Result};
use lazy_static::lazy_static;
use scraper::{Html, Selector};
use std::fmt;
use url::Url;
lazy_static! {
static ref ABSOLUTE_URL_REGEX: regex::Regex = regex::Regex::new(r"(https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*))").unwrap();
static ref RELATIVE_URL_REGEX: regex::Regex = regex::Regex::new(r"^/.*").unwrap();
}
const ATTRIBUTES: [&str; 4] = ["href", "src", "data-src", "content"];
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
pub enum DocumentType {
Html,
PlainText,
}
pub struct Document {
pub base: Url,
pub body: String,
pub document_type: DocumentType,
}
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum LinkType {
Internal,
External,
}
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct Link {
pub url: Url,
pub link_type: LinkType,
}
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct GraphNode {
pub url: Url,
pub document_type: DocumentType,
}
impl GraphNode {
pub fn new(url: Url, document_type: DocumentType) -> Self {
Self { url, document_type }
}
}
impl fmt::Display for GraphNode {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{} ({:?})", self.url, self.document_type)
}
}
impl Link {
pub fn new(url: Url, link_type: LinkType) -> Self {
Self { url, link_type }
}
}
impl fmt::Display for Link {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self.link_type {
LinkType::Internal => write!(f, "Internal: {}", self.url),
LinkType::External => write!(f, "External: {}", self.url),
}
}
}
pub fn is_same_domain(url: &Url, base: &Url, allow_subdomain: bool) -> Result<bool> {
let url_domain = url.domain().ok_or_else(|| {
color_eyre::eyre::eyre!("Could not parse domain from URL: {}", url.to_string())
})?;
let base_domain = base.domain().ok_or_else(|| {
color_eyre::eyre::eyre!("Could not parse domain from URL: {}", base.to_string())
})?;
if allow_subdomain {
Ok(url_domain == base_domain || url_domain.ends_with(&format!(".{}", base_domain)))
} else {
Ok(url_domain == base_domain)
}
}
impl Document {
pub fn parse(base: &Url, body: &str) -> Self {
let document_type = if body.trim().starts_with("<!DOCTYPE html>") {
DocumentType::Html
} else {
DocumentType::PlainText
};
Self {
base: base.clone(),
body: body.to_string(),
document_type,
}
}
pub fn links(
&self,
allow_subdomain: bool,
attributes: Option<Vec<String>>,
) -> Result<Vec<Link>> {
match self.document_type {
DocumentType::Html => {
let html = Html::parse_document(&self.body);
let mut links = Vec::new();
for attribute in if let Some(attributes) = attributes {
attributes
} else {
ATTRIBUTES.iter().map(|s| s.to_string()).collect()
} {
for element in
html.select(&Selector::parse(&format!("[{}]", attribute)).unwrap())
{
let value = element.value().attr(&attribute).unwrap_or_default();
let maybe_absolute_url = ABSOLUTE_URL_REGEX.find(value);
let maybe_relative_url = RELATIVE_URL_REGEX.find(value);
let link = match (maybe_absolute_url, maybe_relative_url) {
(Some(absolute_url), _) => {
let url = Url::parse(absolute_url.as_str()).context(format!(
"Could not parse URL: {}",
absolute_url.as_str()
))?;
if is_same_domain(&url, &self.base, allow_subdomain)? {
Link::new(url, LinkType::Internal)
} else {
Link::new(url, LinkType::External)
}
}
(_, Some(relative_url)) => {
let url = self.base.join(relative_url.as_str())?;
if is_same_domain(&url, &self.base, allow_subdomain)? {
Link::new(url, LinkType::Internal)
} else {
Link::new(url, LinkType::External)
}
}
_ => continue,
};
links.push(link);
}
}
links.sort_unstable();
links.dedup();
Ok(links)
}
DocumentType::PlainText => {
let mut links = Vec::new();
for url in ABSOLUTE_URL_REGEX.find_iter(&self.body) {
let url = Url::parse(url.as_str())?;
if is_same_domain(&url, &self.base, allow_subdomain)? {
links.push(Link::new(url, LinkType::Internal));
} else {
links.push(Link::new(url, LinkType::External));
}
}
links.sort_unstable();
links.dedup();
Ok(links)
}
}
}
}