use codas::types::Text;
use lol_html::{RewriteStrSettings, element, rewrite_str};
use url::Url;
use super::{Asset, Context, MediaType, ProcessesAssets, ProcessingError};
/// Asset processor that rewrites relative URLs found in HTML and CSS assets
/// into absolute URLs under a fixed root.
pub struct CanonicalizeProcessor {
/// Root URL that references are resolved against. [`CanonicalizeProcessor::new`]
/// appends a trailing `/` to its input (when missing) before parsing it.
root: Url,
}
impl CanonicalizeProcessor {
/// Creates a processor that canonicalizes URLs against `root`.
///
/// A trailing `/` is appended to `root` when it is missing, so the value is
/// always parsed in its directory form. Returns `None` when the resulting
/// text is not a valid URL.
pub fn new(root: impl AsRef<str>) -> Option<Self> {
    let given = root.as_ref();
    let normalized = if given.ends_with('/') {
        given.to_owned()
    } else {
        format!("{given}/")
    };
    Url::parse(&normalized).ok().map(|root| Self { root })
}
/// Resolves `url`, as referenced from the asset at `asset_path`, to an
/// absolute URL under the processor's root.
///
/// Surrounding whitespace is trimmed first. The reference is then handled
/// as follows:
///
/// - empty, or starting with `http://`, `https://`, `//`, `data:`,
///   `javascript:`, `mailto:` or `#`: returned as-is (trimmed);
/// - starting with `/`: resolved against the root URL (including any path
///   the root carries), independent of `asset_path`;
/// - anything else: resolved against the asset's own URL, so `x`, `./x`,
///   `../x` and query-only references such as `?page=2` all follow standard
///   relative-reference resolution.
///
/// If resolution fails, the trimmed input is returned unchanged.
fn canonicalize_url(&self, url: &str, asset_path: &str) -> String {
    // References that are already qualified, or that only address a fragment
    // of the current document, must not be rewritten.
    const PRESERVED_PREFIXES: [&str; 7] = [
        "http://",
        "https://",
        "//",
        "data:",
        "javascript:",
        "mailto:",
        "#",
    ];

    let url = url.trim();
    if url.is_empty() || PRESERVED_PREFIXES.iter().any(|prefix| url.starts_with(prefix)) {
        return url.to_string();
    }

    let resolved = match url.strip_prefix('/') {
        // The `./` prefix forces the remainder to be parsed as a path. Without
        // it, a first segment containing `:` (e.g. `/a:b.png`) would be read
        // as an absolute URL with scheme `a`.
        Some(stripped) => self.root.join(&format!("./{stripped}")),
        None => {
            // Resolve against the asset's full URL rather than its directory:
            // path references still drop the file name during resolution, but
            // a query-only reference (`?page=2`) correctly keeps it.
            let asset = asset_path.trim_start_matches('/');
            let base = self
                .root
                .join(&format!("./{asset}"))
                .unwrap_or_else(|_| self.root.clone());
            base.join(url)
        }
    };

    resolved.map_or_else(|_| url.to_string(), |absolute| absolute.to_string())
}
/// Rewrites every `url(...)` reference in `css` to its canonical form and
/// copies all other text through unchanged.
///
/// The `url(` token is matched ASCII case-insensitively (CSS function names
/// are case-insensitive) and its original casing is kept in the output.
/// Whitespace after `(` and an optional `'` / `"` quote are copied through.
/// The URL itself runs to the matching quote when quoted, otherwise to the
/// first `)` or whitespace character, and is passed to `canonicalize_url`
/// together with `asset_path`.
fn process_css(&self, css: &str, asset_path: &str) -> String {
    const URL_FN: &str = "url(";

    let mut result = String::with_capacity(css.len());
    let mut chars = css.char_indices().peekable();

    while let Some((i, c)) = chars.next() {
        // `get` yields `None` when fewer than four bytes remain or when the
        // range would end inside a multi-byte character, so this never panics.
        let opener = css
            .get(i..i + URL_FN.len())
            .filter(|head| head.eq_ignore_ascii_case(URL_FN));
        let Some(opener) = opener else {
            result.push(c);
            continue;
        };

        // Keep the author's casing; `c` was the first char of the token, so
        // skip the remaining (ASCII, single-byte) ones.
        result.push_str(opener);
        for _ in 1..URL_FN.len() {
            chars.next();
        }

        // Leading whitespace inside the parentheses is preserved verbatim.
        while let Some((_, ws)) = chars.next_if(|&(_, ch)| ch.is_whitespace()) {
            result.push(ws);
        }

        let quote = chars
            .next_if(|&(_, ch)| ch == '"' || ch == '\'')
            .map(|(_, q)| q);
        result.extend(quote);

        // Advance to the end of the URL and slice it straight out of `css`.
        let start = chars.peek().map_or(css.len(), |&(at, _)| at);
        while chars
            .next_if(|&(_, ch)| match quote {
                Some(q) => ch != q,
                None => ch != ')' && !ch.is_whitespace(),
            })
            .is_some()
        {}
        let end = chars.peek().map_or(css.len(), |&(at, _)| at);

        result.push_str(&self.canonicalize_url(&css[start..end], asset_path));

        // Re-emit the closing quote when present; an unterminated quoted URL
        // simply ends at the end of input.
        if let Some(q) = quote
            && chars.next_if(|&(_, ch)| ch == q).is_some()
        {
            result.push(q);
        }
    }

    result
}
/// Rewrites URL-bearing attributes of every element in `html` so they are
/// canonical relative to `asset_path`.
///
/// For each element:
///
/// - `<script>`: only `src` is canonicalized, nothing else is touched;
/// - any other element: `href`, `src`, `action`, `poster`, `data`, `cite`
///   and `formaction` are canonicalized;
/// - `<meta>`: `content` is canonicalized when its trimmed value starts with
///   `/`, `./` or `../`;
/// - a `style` attribute is run through `process_css`.
///
/// Attributes are only written back when their value actually changes.
///
/// # Errors
///
/// Returns [`ProcessingError::Malformed`] carrying the rewriter's message
/// when `lol_html` fails to rewrite the document.
fn process_html(&self, html: &str, asset_path: &Text) -> Result<String, ProcessingError> {
    const URL_ATTRS: [&str; 7] = [
        "href",
        "src",
        "action",
        "poster",
        "data",
        "cite",
        "formaction",
    ];
    const SCRIPT_ATTRS: [&str; 1] = ["src"];
    const PATH_PREFIXES: [&str; 3] = ["/", "./", "../"];

    let path = asset_path.clone();
    rewrite_str(
        html,
        RewriteStrSettings {
            element_content_handlers: vec![element!("*", move |el| {
                let tag = el.tag_name();
                let is_script = tag == "script";

                // Scripts get a restricted attribute set; everything else
                // gets the full list.
                let attrs: &[&str] = if is_script { &SCRIPT_ATTRS } else { &URL_ATTRS };
                for &attr in attrs {
                    if let Some(current) = el.get_attribute(attr) {
                        let rewritten = self.canonicalize_url(&current, &path);
                        if rewritten != current {
                            el.set_attribute(attr, &rewritten).ok();
                        }
                    }
                }
                if is_script {
                    return Ok(());
                }

                // `content` is free-form text, so only values that look like
                // a path are treated as URLs.
                if tag == "meta"
                    && let Some(content) = el.get_attribute("content")
                    && PATH_PREFIXES
                        .iter()
                        .any(|&prefix| content.trim().starts_with(prefix))
                {
                    let rewritten = self.canonicalize_url(&content, &path);
                    if rewritten != content {
                        el.set_attribute("content", &rewritten).ok();
                    }
                }

                if let Some(inline_css) = el.get_attribute("style") {
                    let rewritten = self.process_css(&inline_css, &path);
                    if rewritten != inline_css {
                        el.set_attribute("style", &rewritten).ok();
                    }
                }

                Ok(())
            })],
            ..Default::default()
        },
    )
    .map_err(|e| ProcessingError::Malformed {
        message: e.to_string().into(),
    })
}
}
impl ProcessesAssets for CanonicalizeProcessor {
    /// Canonicalizes the URLs inside HTML and CSS assets in place.
    ///
    /// Assets of any other media type are left untouched. The asset's text is
    /// replaced with the rewritten text and keeps its media type.
    ///
    /// # Errors
    ///
    /// Propagates failures from reading the asset as text and from rewriting
    /// HTML.
    fn process(&self, _context: &mut Context, asset: &mut Asset) -> Result<(), ProcessingError> {
        // Decide up front whether this asset is handled at all.
        let target = match asset.media_type() {
            MediaType::Html => MediaType::Html,
            MediaType::Css => MediaType::Css,
            _ => return Ok(()),
        };

        tracing::trace!("canonicalize: {}", asset.path());
        let canonical = match target {
            MediaType::Html => self.process_html(asset.as_text()?, asset.path())?,
            // Only `Css` can reach this arm; see the early return above.
            _ => self.process_css(asset.as_text()?, asset.path()),
        };
        asset.replace_with_text(canonical.into(), target);
        Ok(())
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Processor rooted at `https://example.com`, used by most tests.
    fn processor() -> CanonicalizeProcessor {
        CanonicalizeProcessor::new("https://example.com").unwrap()
    }

    /// Asserts that `canonicalize_url(url, asset_path)` equals `expected` for
    /// every `(url, asset_path, expected)` row.
    fn assert_canonical(rows: &[(&str, &str, &str)]) {
        let canonicalizer = processor();
        for &(url, asset_path, expected) in rows {
            assert_eq!(canonicalizer.canonicalize_url(url, asset_path), expected);
        }
    }

    /// Rewrites `html` as if it lived at `asset_path` and asserts that every
    /// entry of `fragments` occurs in the output.
    fn assert_html_contains(html: &str, asset_path: &'static str, fragments: &[&str]) {
        let rewritten = processor()
            .process_html(html, &asset_path.into())
            .unwrap();
        for &fragment in fragments {
            assert!(rewritten.contains(fragment));
        }
    }

    #[test]
    fn canonicalizes_absolute_paths() {
        assert_canonical(&[
            (
                "/path/to/file.css",
                "/some/asset.html",
                "https://example.com/path/to/file.css",
            ),
            (
                "/images/logo.png",
                "/deep/nested/page.html",
                "https://example.com/images/logo.png",
            ),
        ]);
    }

    #[test]
    fn canonicalizes_relative_paths_with_asset_context() {
        assert_canonical(&[
            (
                "./styles.css",
                "/path/to/file.html",
                "https://example.com/path/to/styles.css",
            ),
            (
                "../styles.css",
                "/path/to/file.html",
                "https://example.com/path/styles.css",
            ),
            (
                "../../styles.css",
                "/path/to/deep/file.html",
                "https://example.com/path/styles.css",
            ),
            (
                "styles.css",
                "/path/to/file.html",
                "https://example.com/path/to/styles.css",
            ),
        ]);
    }

    #[test]
    fn canonicalizes_from_root_asset() {
        assert_canonical(&[
            ("./styles.css", "index.html", "https://example.com/styles.css"),
            ("styles.css", "index.html", "https://example.com/styles.css"),
        ]);
    }

    #[test]
    fn preserves_qualified_urls() {
        assert_canonical(&[
            (
                "https://cdn.example.com/lib.js",
                "/any/path.html",
                "https://cdn.example.com/lib.js",
            ),
            (
                "http://example.com/page",
                "/any/path.html",
                "http://example.com/page",
            ),
            (
                "//cdn.example.com/lib.js",
                "/any/path.html",
                "//cdn.example.com/lib.js",
            ),
        ]);
    }

    #[test]
    fn preserves_special_urls() {
        assert_canonical(&[
            ("#section", "/any/path.html", "#section"),
            (
                "data:image/png;base64,abc",
                "/any/path.html",
                "data:image/png;base64,abc",
            ),
            (
                "javascript:void(0)",
                "/any/path.html",
                "javascript:void(0)",
            ),
            (
                "mailto:test@example.com",
                "/any/path.html",
                "mailto:test@example.com",
            ),
        ]);
    }

    #[test]
    fn processes_html_attributes() {
        // The raw string's lines stay unindented so the input is unchanged.
        let html = r#"
<a href="/about">About</a>
<img src="./images/photo.jpg" alt="Photo">
<link rel="stylesheet" href="../styles.css">
<script src="/app.js"></script>
"#;
        assert_html_contains(
            html,
            "/path/to/page.html",
            &[
                r#"href="https://example.com/about""#,
                r#"src="https://example.com/path/to/images/photo.jpg""#,
                r#"href="https://example.com/path/styles.css""#,
                r#"src="https://example.com/app.js""#,
            ],
        );
    }

    #[test]
    fn processes_inline_styles() {
        assert_html_contains(
            r#"<div style="background: url(../bg.png)">Content</div>"#,
            "/path/to/page.html",
            &["url(https://example.com/path/bg.png)"],
        );
    }

    #[test]
    fn handles_root_with_trailing_slash() {
        let with_slash = CanonicalizeProcessor::new("https://example.com/").unwrap();
        let canonical = with_slash.canonicalize_url("/path", "/index.html");
        assert_eq!(canonical, "https://example.com/path");
    }

    #[test]
    fn skips_non_html_css_assets() {
        let mut script = Asset::new("script.js".into(), b"const x = '/api'".to_vec());
        processor()
            .process(&mut Context::default(), &mut script)
            .unwrap();
        assert_eq!(script.as_text().unwrap(), "const x = '/api'");
    }

    #[test]
    fn processes_css_asset() {
        let mut stylesheet = Asset::new(
            "styles/main.css".into(),
            b"@font-face { src: url('/fonts/test.ttf'); }".to_vec(),
        );
        assert_eq!(stylesheet.media_type(), &MediaType::Css);
        processor()
            .process(&mut Context::default(), &mut stylesheet)
            .unwrap();
        assert_eq!(
            stylesheet.as_text().unwrap(),
            "@font-face { src: url('https://example.com/fonts/test.ttf'); }"
        );
    }

    #[test]
    fn processes_html_asset_with_path() {
        let mut page = Asset::new(
            "/blog/posts/article.html".into(),
            b"<a href=\"../index.html\">Back</a>".to_vec(),
        );
        processor()
            .process(&mut Context::default(), &mut page)
            .unwrap();
        let rewritten = page.as_text().unwrap();
        assert!(rewritten.contains("https://example.com/blog/index.html"));
    }

    #[test]
    fn canonicalizes_meta_content_attribute() {
        assert_html_contains(
            r#"<meta property="og:image" content="/images/og.png">"#,
            "/page.html",
            &[r#"content="https://example.com/images/og.png""#],
        );
    }

    #[test]
    fn preserves_non_url_meta_content() {
        assert_html_contains(
            r#"<meta name="description" content="A description">"#,
            "/page.html",
            &[r#"content="A description""#],
        );
    }
}