use core::fmt;
use url::{ParseError, ParseOptions, Url};
use super::{
dom_walker::{self, DomVisitor},
Script,
};
/// Path suffixes that disqualify a resolved link from being recorded as a page.
const BANNED_EXTENSIONS: [&str; 3] = [".pdf", ".png", ".jpg"];
/// Accumulates the page links and scripts discovered while visiting the
/// elements of a single HTML document.
pub(crate) struct UrlExtractor<'html> {
/// URL stored alongside every embedded script that gets recorded.
page_url: &'html Url,
/// Parse options carrying the base URL that extracted links are resolved against.
opts: ParseOptions<'html>,
/// Resolved link targets that passed the page filters.
pages: Vec<Url>,
/// Scripts recorded so far, both remote (`Script::Url`) and embedded (`Script::Embedded`).
scripts: Vec<Script>,
}
impl fmt::Debug for UrlExtractor<'_> {
    /// Formats the extractor showing only what it has collected so far
    /// (`pages` and `scripts`); the page URL and parse options are left out.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self { pages, scripts, .. } = self;

        let mut builder = f.debug_struct("UrlExtractor");
        builder.field("pages", pages);
        builder.field("scripts", scripts);
        builder.finish()
    }
}
impl<'html> UrlExtractor<'html> {
    /// Creates an extractor for one page.
    ///
    /// * `base_url` - URL every extracted link is resolved against.
    /// * `page_url` - URL cloned into each embedded script that is recorded.
    ///
    /// # Panics
    ///
    /// In debug builds, panics when `base_url` cannot be used as a base.
    pub fn new(base_url: &'html Url, page_url: &'html Url) -> Self {
        // Initial capacity reserved for each of the two output lists.
        const CAP: usize = 10;
        debug_assert!(!base_url.cannot_be_a_base());
        Self {
            page_url,
            opts: Url::options().base_url(Some(base_url)),
            pages: Vec::with_capacity(CAP),
            scripts: Vec::with_capacity(CAP),
        }
    }

    /// Consumes the extractor and returns the collected `(pages, scripts)`.
    #[must_use]
    pub fn into_inner(self) -> (Vec<Url>, Vec<Script>) {
        (self.pages, self.scripts)
    }

    /// Parses `url` using the stored options, so relative references are
    /// resolved against the base URL.
    ///
    /// # Errors
    ///
    /// Returns the [`ParseError`] reported by the `url` crate.
    fn resolve(&self, url: &str) -> Result<Url, ParseError> {
        self.opts.parse(url)
    }

    /// Records a script referenced through a `src` attribute.
    ///
    /// References that fail to resolve are silently dropped.
    fn record_remote_script(&mut self, script_url: &str) {
        if let Ok(script_url) = self.resolve(script_url) {
            self.scripts.push(Script::Url(script_url));
        }
    }

    /// Records the source text of an inline script together with the URL of
    /// the page it was found on. Empty text is ignored.
    fn record_embedded_script(&mut self, script: &str) {
        if script.is_empty() {
            return;
        }
        self.scripts
            .push(Script::Embedded(script.to_owned(), self.page_url.clone()));
    }

    /// Records a link target as a page to visit.
    ///
    /// The link is skipped when it is blank, is a same-page fragment, uses the
    /// `mailto:` or `javascript:` scheme, fails to resolve, or its path ends
    /// with one of [`BANNED_EXTENSIONS`]. Scheme and extension comparisons
    /// ignore ASCII case, so `MAILTO:` and `.PDF` are rejected as well.
    fn record_page(&mut self, page_url: &str) {
        // Schemes that never lead to a crawlable page.
        const SKIPPED_SCHEMES: [&str; 2] = ["mailto:", "javascript:"];

        let page_url = page_url.trim();
        if page_url.is_empty()
            || page_url.starts_with('#')
            || SKIPPED_SCHEMES
                .iter()
                .any(|scheme| Self::starts_with_ignore_ascii_case(page_url, scheme))
        {
            return;
        }

        let Ok(page_url) = self.resolve(page_url) else {
            return;
        };

        // Only the path is inspected, so a query string or fragment after the
        // extension does not hide it.
        let path = page_url.path();
        if BANNED_EXTENSIONS
            .iter()
            .any(|ext| Self::ends_with_ignore_ascii_case(path, ext))
        {
            return;
        }

        self.pages.push(page_url);
    }

    /// Returns `true` when `text` begins with `prefix`, ignoring ASCII case.
    fn starts_with_ignore_ascii_case(text: &str, prefix: &str) -> bool {
        // Compare bytes so a multi-byte character at the cut can never panic.
        text.as_bytes()
            .get(..prefix.len())
            .is_some_and(|head| head.eq_ignore_ascii_case(prefix.as_bytes()))
    }

    /// Returns `true` when `text` ends with `suffix`, ignoring ASCII case.
    fn ends_with_ignore_ascii_case(text: &str, suffix: &str) -> bool {
        text.len()
            .checked_sub(suffix.len())
            .and_then(|start| text.as_bytes().get(start..))
            .is_some_and(|tail| tail.eq_ignore_ascii_case(suffix.as_bytes()))
    }
}
impl<'dom> DomVisitor<'dom> for UrlExtractor<'dom> {
    /// Inspects one element and records what it contributes:
    ///
    /// * `<script>` - a remote script when it has a `src` attribute, otherwise
    ///   its trimmed text as an embedded script. Scripts whose `type` names
    ///   something other than JavaScript are skipped.
    /// * `<a>` - its `href`, passed through the page filters.
    ///
    /// Every other element is ignored.
    fn visit_element(&mut self, node: dom_walker::ElementRef<'dom>) {
        match node.name() {
            "script" => {
                // The HTML Standard treats a missing or empty `type` as classic
                // JavaScript and compares MIME types ASCII case-insensitively.
                // NOTE(review): `type="module"` is also JavaScript but is still
                // skipped here — confirm whether `Script` consumers can parse modules.
                let declares_other_language = node.attr("type").is_some_and(|declared| {
                    !declared.is_empty()
                        && !declared.to_ascii_lowercase().contains("javascript")
                });
                if declares_other_language {
                    return;
                }

                if let Some(script_url) = node.attr("src") {
                    self.record_remote_script(script_url);
                } else {
                    let source = node.text().collect::<String>();
                    self.record_embedded_script(source.trim());
                }
            }
            "a" => {
                if let Some(page_url) = node.attr("href") {
                    self.record_page(page_url);
                }
            }
            _ => {}
        }
    }
}
#[cfg(test)]
mod test {
// Each test walks a small HTML document with a `UrlExtractor` whose base URL
// and page URL are both `https://example.com`, then inspects `into_inner()`.
use crate::walk::website::dom_walker::DomWalker;
use super::*;
use url::Url;
// A remote script plus absolute, relative and root-relative links are all
// resolved against the base URL and recorded.
#[test]
fn test_basic() {
let url = Url::parse("https://example.com").unwrap();
let html = r#"
<html>
<head>
<script src="main.js"></script>
</head>
<body>
<a href="https://example.com/foo">foo</a>
<a href="bar">bar</a>
<a href="/baz">baz</a>
</body>
</html>
"#;
let mut extractor = UrlExtractor::new(&url, &url);
let dom = DomWalker::new(html).unwrap();
dom.walk(&mut extractor);
let (pages, scripts) = extractor.into_inner();
assert_eq!(
scripts,
vec![Script::Url(
Url::parse("https://example.com/main.js").unwrap()
)]
);
assert_eq!(pages.len(), 3);
// Order of `pages` is not asserted, only membership.
for expected in [
"https://example.com/foo",
"https://example.com/bar",
"https://example.com/baz",
] {
let u = Url::parse(expected).unwrap();
assert!(pages.contains(&u), "{u} is not in extracted pages list");
}
}
// Fragment-only links, `mailto:` and `javascript:` links, and links to a
// banned extension (here followed by a query string) yield no pages.
#[test]
fn test_ignored() {
let url = Url::parse("https://example.com").unwrap();
let html = r"
<html>
<body>
<a href='#section'>intra-page links</a>
<a href='mailto:foo@example.com'>emails</a>
<a href='javascript:void(0)'>js</a>
<a href='/assets/pic.jpg?id=123'>images</a>
</body>
</html>
";
let mut extractor = UrlExtractor::new(&url, &url);
let dom = DomWalker::new(html).unwrap();
dom.walk(&mut extractor);
let (pages, scripts) = extractor.into_inner();
assert!(pages.is_empty(), "found pages: {pages:#?}");
assert!(scripts.is_empty());
}
// Inline scripts are recorded in document order with surrounding whitespace
// trimmed, each paired with the page URL.
#[test]
fn test_embedded_script() {
let url = Url::parse("https://example.com").unwrap();
let html = r#"
<html>
<head>
<script>
console.log("hello, world");
</script>
<script>
console.log("goodbye, world");
</script>
</head>
<body></body>
</html>
"#;
let mut extractor = UrlExtractor::new(&url, &url);
let dom = DomWalker::new(html).unwrap();
dom.walk(&mut extractor);
let (pages, scripts) = extractor.into_inner();
assert!(pages.is_empty(), "found pages: {pages:#?}");
assert_eq!(
scripts,
vec![
Script::Embedded("console.log(\"hello, world\");".to_string(), url.clone()),
Script::Embedded("console.log(\"goodbye, world\");".to_string(), url),
]
);
}
// Inline scripts whose text is empty or whitespace-only are not recorded.
#[test]
fn test_embedded_script_empty() {
let url = Url::parse("https://example.com").unwrap();
let html = "
<html>
<head>
<script></script>
<script> </script>
<script>
\t
</script>
</head>
<body></body>
</html>
";
let mut extractor = UrlExtractor::new(&url, &url);
let dom = DomWalker::new(html).unwrap();
dom.walk(&mut extractor);
let (pages, scripts) = extractor.into_inner();
assert!(pages.is_empty(), "found pages: {pages:#?}");
assert!(scripts.is_empty());
}
// A script typed `application/json` is skipped while a `text/javascript`
// one in the same document is recorded.
#[test]
fn test_non_js_embedded_script() {
let url = Url::parse("https://example.com").unwrap();
let html = r#"
<html>
<head>
<script type="application/json">
{ "foo": "bar" }
</script>
<script type="text/javascript">
console.log("hello, world");
</script>
</head>
<body></body>
</html>
"#;
let mut extractor = UrlExtractor::new(&url, &url);
let dom = DomWalker::new(html).unwrap();
dom.walk(&mut extractor);
let (pages, scripts) = extractor.into_inner();
assert!(pages.is_empty(), "found pages: {pages:#?}");
assert_eq!(
scripts,
vec![Script::Embedded(
"console.log(\"hello, world\");".to_string(),
url.clone()
),]
);
}
}