use super::helpers::rfc2822_to_iso8601;
use crate::plugin::{Plugin, PluginContext};
use anyhow::{Context, Result};
use std::fs;
#[derive(Debug, Clone, Copy)]
pub struct HtmlFixPlugin;
impl Plugin for HtmlFixPlugin {
fn name(&self) -> &'static str {
"html-fix"
}
fn after_compile(&self, ctx: &PluginContext) -> Result<()> {
if !ctx.site_dir.exists() {
return Ok(());
}
let all_html = crate::walk::walk_files(&ctx.site_dir, "html")?;
let cache = ctx.cache.as_ref();
let html_files: Vec<_> = all_html
.into_iter()
.filter(|p| cache.is_none_or(|c| c.has_changed(p)))
.collect();
let fixed = std::sync::atomic::AtomicUsize::new(0);
html_files.iter().try_for_each(|path| -> Result<()> {
let html = fs::read_to_string(path)
.with_context(|| format!("cannot read {}", path.display()))?;
let modified = apply_html_fixes(&html);
if modified != html {
fs::write(path, &modified).with_context(|| {
format!("cannot write {}", path.display())
})?;
let _ =
fixed.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
}
Ok(())
})?;
let count = fixed.load(std::sync::atomic::Ordering::Relaxed);
if count > 0 {
log::info!("[html-fix] Repaired {count} HTML file(s)");
}
Ok(())
}
}
fn apply_html_fixes(html: &str) -> String {
let mut modified = html.to_string();
if needs_schema_context_fix(&modified) {
modified = modified
.replace("\"http://schema.org/\"", "\"https://schema.org\"")
.replace("\"http://schema.org\"", "\"https://schema.org\"");
}
if modified.contains("application/ld+json") {
modified = fix_jsonld_dates(&modified);
}
if modified.contains("<p src=") {
modified = fix_broken_img_tags(&modified);
}
if needs_class_syntax_fix(&modified) {
modified = fix_literal_class_syntax(&modified);
}
if needs_mobile_web_app_capable_meta(&modified) {
modified = inject_mobile_web_app_capable_meta(&modified);
}
if has_empty_preload(&modified) {
modified = remove_empty_preload_links(&modified);
}
modified
}
fn needs_schema_context_fix(html: &str) -> bool {
html.contains("\"http://schema.org/\"")
|| html.contains("\"http://schema.org\"")
}
fn needs_class_syntax_fix(html: &str) -> bool {
html.contains(".class="") || html.contains(".class=\"")
}
fn has_empty_preload(html: &str) -> bool {
let has_preload = html.contains("rel=preload")
|| html.contains("rel=\"preload\"")
|| html.contains("rel='preload'");
let has_empty_href = html.contains("href=\"\"")
|| html.contains("href=''")
|| html.contains(" href ")
|| html.contains(" href>")
|| html.contains(" href/>");
has_preload && has_empty_href
}
pub(super) fn remove_empty_preload_links(html: &str) -> String {
let mut out = String::with_capacity(html.len());
let mut cursor = 0;
while cursor < html.len() {
let Some(rel_offset) =
html[cursor..].to_ascii_lowercase().find("<link")
else {
out.push_str(&html[cursor..]);
break;
};
let tag_start = cursor + rel_offset;
out.push_str(&html[cursor..tag_start]);
let bytes = html.as_bytes();
let mut j = tag_start;
let mut quote: Option<u8> = None;
while j < bytes.len() {
let b = bytes[j];
match quote {
Some(q) if b == q => quote = None,
Some(_) => {}
None => match b {
b'"' | b'\'' => quote = Some(b),
b'>' => break,
_ => {}
},
}
j += 1;
}
let tag_end = (j + 1).min(html.len());
let tag = &html[tag_start..tag_end];
let lower = tag.to_ascii_lowercase();
let is_preload = lower.contains("rel=\"preload\"")
|| lower.contains("rel='preload'")
|| lower.contains("rel=preload");
let has_real_href = href_is_present_and_non_empty(&lower);
if !is_preload || has_real_href {
out.push_str(tag);
}
cursor = tag_end;
}
out
}
fn href_is_present_and_non_empty(lower_tag: &str) -> bool {
if lower_tag.contains("href=\"\"") || lower_tag.contains("href=''") {
return false;
}
let Some(idx) = lower_tag.find("href") else {
return false;
};
let after = lower_tag[idx + 4..].trim_start();
let Some(rest) = after.strip_prefix('=') else {
return false;
};
let rest = rest.trim_start();
match rest.chars().next() {
None | Some('>') => false,
Some('"') => rest.len() > 1 && !rest.starts_with("\"\""),
Some('\'') => rest.len() > 1 && !rest.starts_with("''"),
Some(c) if c.is_whitespace() => false,
Some(_) => true,
}
}
fn needs_mobile_web_app_capable_meta(html: &str) -> bool {
let has_legacy = html.contains("apple-mobile-web-app-capable");
let has_modern = find_modern_mobile_web_app_capable(html).is_some();
has_legacy && !has_modern
}
fn find_modern_mobile_web_app_capable(html: &str) -> Option<usize> {
let needles = [
"name=\"mobile-web-app-capable\"",
"name='mobile-web-app-capable'",
"name=mobile-web-app-capable",
];
for n in &needles {
if let Some(pos) = html.find(n) {
return Some(pos);
}
}
None
}
pub(super) fn inject_mobile_web_app_capable_meta(html: &str) -> String {
let modern = "<meta name=\"mobile-web-app-capable\" content=\"yes\">";
let candidates = [
"name=\"apple-mobile-web-app-capable\"",
"name='apple-mobile-web-app-capable'",
"name=apple-mobile-web-app-capable",
];
let name_pos = candidates.iter().find_map(|n| html.find(n));
let Some(name_pos) = name_pos else {
return html.to_string();
};
let after = &html[name_pos..];
let Some(rel_close) = after.find('>') else {
return html.to_string();
};
let insert_at = name_pos + rel_close + 1;
format!("{}{modern}{}", &html[..insert_at], &html[insert_at..])
}
pub(super) fn fix_jsonld_dates(html: &str) -> String {
let mut result = html.to_string();
for field in &["datePublished", "dateModified"] {
let pattern = format!("\"{field}\":\"");
let mut search_from = 0;
while let Some(start) = result[search_from..].find(&pattern) {
let abs_start = search_from + start + pattern.len();
if let Some(end) = result[abs_start..].find('"') {
let date_str = &result[abs_start..abs_start + end];
if date_str.len() > 5
&& date_str.as_bytes()[3] == b','
&& date_str.as_bytes()[0].is_ascii_alphabetic()
{
let iso = rfc2822_to_iso8601(date_str);
if iso != date_str {
result = format!(
"{}{}{}",
&result[..abs_start],
iso,
&result[abs_start + end..]
);
}
}
search_from = abs_start + 1;
} else {
break;
}
}
}
result
}
pub(super) fn fix_broken_img_tags(html: &str) -> String {
let mut result = html.to_string();
while let Some(p_pos) = result.find("<p src=") {
let before = &result[..p_pos];
if let Some(img_start) = before.rfind("<img") {
let after_p = &result[p_pos..]; if let Some(quote_start) = after_p.find("src=\"") {
let val_start = quote_start + 5; let remaining = &after_p[val_start..];
if let Some(quote_end) = remaining.find('"') {
let src_value = remaining[..quote_end].to_string();
let close_offset = remaining[quote_end..]
.find('>')
.map_or(result.len(), |i| {
p_pos + val_start + quote_end + i + 1
});
let img_attrs = result[img_start + 4..p_pos].trim();
let img_attrs_clean =
img_attrs.trim_end_matches(|c: char| {
c.is_whitespace() || c == '<'
});
let new_img = format!(
"<img {img_attrs_clean} src=\"{src_value}\" />"
);
result = format!(
"{}{}{}",
&result[..img_start],
new_img,
&result[close_offset..]
);
continue;
}
}
}
break;
}
result
}
pub(super) fn fix_literal_class_syntax(html: &str) -> String {
let mut result = html.to_string();
result = fix_class_syntax_variant(&result, ".class="", """);
result = fix_class_syntax_variant(&result, ".class=\"", "\"");
result
}
fn fix_class_syntax_variant(
html: &str,
open_pattern: &str,
close_pattern: &str,
) -> String {
let mut result = html.to_string();
while let Some(start) = result.find(open_pattern) {
let after = &result[start + open_pattern.len()..];
if let Some(end) = after.find(close_pattern) {
let class_value = after[..end].to_string();
let remove_end =
start + open_pattern.len() + end + close_pattern.len();
result = format!("{}{}", &result[..start], &result[remove_end..]);
inject_class_attr(&mut result, start, &class_value);
} else {
break;
}
}
result
}
fn inject_class_attr(html: &mut String, pos: usize, class_value: &str) {
if let Some(tag_end) = html[..pos].rfind('>') {
if let Some(tag_start) = html[..tag_end].rfind('<') {
let tag = &html[tag_start..tag_end];
if !tag.contains("class=") {
let insert_pos = tag_end;
*html = format!(
"{} class=\"{}\"{}",
&html[..insert_pos],
class_value,
&html[insert_pos..]
);
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::plugin::PluginContext;
use std::path::Path;
use tempfile::tempdir;
fn test_ctx(site_dir: &Path) -> PluginContext {
crate::test_support::init_logger();
PluginContext::new(
Path::new("content"),
Path::new("build"),
site_dir,
Path::new("templates"),
)
}
#[test]
fn test_html_fix_upgrades_jsonld_context() -> Result<()> {
let tmp = tempdir()?;
let html_path = tmp.path().join("index.html");
fs::write(
&html_path,
r#"<html><head>
<script type="application/ld+json">
{"@context":"http://schema.org/","@type":"WebPage"}
</script>
</head><body></body></html>"#,
)?;
let ctx = test_ctx(tmp.path());
HtmlFixPlugin.after_compile(&ctx)?;
let result = fs::read_to_string(&html_path)?;
assert!(result.contains("\"https://schema.org\""));
assert!(!result.contains("\"http://schema.org/\""));
Ok(())
}
#[test]
fn test_html_fix_converts_jsonld_dates() -> Result<()> {
let tmp = tempdir()?;
let html_path = tmp.path().join("article.html");
fs::write(
&html_path,
r#"<html><head>
<script type="application/ld+json">
{"@context":"https://schema.org","@type":"Article","datePublished":"Thu, 11 Apr 2026 06:06:06 +0000","dateModified":"Mon, 01 Sep 2025 06:06:06 +0000"}
</script>
</head><body></body></html>"#,
)?;
let ctx = test_ctx(tmp.path());
HtmlFixPlugin.after_compile(&ctx)?;
let result = fs::read_to_string(&html_path)?;
assert!(
result.contains("\"datePublished\":\"2026-04-11"),
"Expected ISO date, got: {result}"
);
assert!(
result.contains("\"dateModified\":\"2025-09-01"),
"Expected ISO date, got: {result}"
);
assert!(!result.contains("Thu, 11 Apr"));
Ok(())
}
#[test]
fn test_fix_broken_img_tags() {
let input =
r#"<img alt="test" class="w-25" title="test" <p src="image.jpg">"#;
let result = fix_broken_img_tags(input);
assert!(result.contains("src=\"image.jpg\""));
assert!(!result.contains("<p src="));
}
#[test]
fn test_fix_literal_class_syntax() {
let input = r#"<img alt="test" src="img.jpg">.class="w-25 float-start""#;
let result = fix_literal_class_syntax(input);
assert!(!result.contains(".class=""));
}
#[test]
fn test_fix_jsonld_dates_iso_passthrough() {
let input =
r#"{"datePublished":"2026-04-11","dateModified":"2025-09-01"}"#;
let result = fix_jsonld_dates(input);
assert_eq!(result, input, "ISO dates should pass through unchanged");
}
#[test]
fn test_fix_jsonld_dates_converts_rfc2822() {
let input = r#"{"datePublished":"Thu, 11 Apr 2026 06:06:06 +0000"}"#;
let result = fix_jsonld_dates(input);
assert!(
result.contains("\"datePublished\":\"2026-04-11T06:06:06+00:00\""),
"Should convert RFC 2822 to ISO 8601, got: {result}"
);
}
#[test]
fn test_fix_jsonld_dates_both_fields() {
let input = r#"{"datePublished":"Mon, 01 Sep 2025 12:00:00 +0000","dateModified":"Tue, 02 Sep 2025 14:30:00 +0000"}"#;
let result = fix_jsonld_dates(input);
assert!(result.contains("2025-09-01T12:00:00+00:00"));
assert!(result.contains("2025-09-02T14:30:00+00:00"));
}
#[test]
fn test_fix_broken_img_tags_multiple() {
let input =
r#"<img alt="a" <p src="one.jpg"><img alt="b" <p src="two.jpg">"#;
let result = fix_broken_img_tags(input);
assert!(result.contains("src=\"one.jpg\""), "first img: {result}");
assert!(result.contains("src=\"two.jpg\""), "second img: {result}");
assert!(
!result.contains("<p src="),
"no broken tags remain: {result}"
);
}
#[test]
fn test_fix_broken_img_tags_none() {
let input = r#"<img alt="ok" src="good.jpg" />"#;
let result = fix_broken_img_tags(input);
assert_eq!(
result, input,
"No broken tags should leave input unchanged"
);
}
#[test]
fn test_fix_literal_class_syntax_html_encoded() {
let input =
r#"<img src="img.jpg">.class="w-25 float-start" rest"#;
let result = fix_literal_class_syntax(input);
assert!(
!result.contains(".class=""),
"should remove .class=""
);
assert!(
result.contains("class=\"w-25 float-start\""),
"should inject class attr, got: {result}"
);
}
#[test]
fn test_fix_literal_class_syntax_literal_quotes() {
let input = r#"<img src="img.jpg">.class="my-class" rest"#;
let result = fix_literal_class_syntax(input);
assert!(
!result.contains(".class=\""),
"should remove .class=\", got: {result}"
);
assert!(
result.contains("class=\"my-class\""),
"should inject class attr, got: {result}"
);
}
#[test]
fn test_fix_literal_class_syntax_no_class() {
let input = r#"<img src="img.jpg"> some text"#;
let result = fix_literal_class_syntax(input);
assert_eq!(result, input, "No .class= should leave input unchanged");
}
#[test]
fn test_inject_mobile_web_app_capable_meta_added() {
let input = r#"<head><meta name="apple-mobile-web-app-capable" content="yes"></head>"#;
let result = inject_mobile_web_app_capable_meta(input);
assert!(
result.contains(
r#"<meta name="mobile-web-app-capable" content="yes">"#
),
"modern meta should be injected, got: {result}"
);
assert!(
result.contains(
r#"<meta name="apple-mobile-web-app-capable" content="yes">"#
),
"legacy meta must remain for backwards compatibility"
);
}
#[test]
fn test_remove_empty_preload_drops_bare_href() {
let input = r#"<head><link as=image fetchpriority=high href rel=preload type=image/webp><title>x</title></head>"#;
let result = remove_empty_preload_links(input);
assert!(
!result.contains("rel=preload"),
"empty preload should be removed, got: {result}"
);
assert!(result.contains("<title>x</title>"), "rest preserved");
}
#[test]
fn test_remove_empty_preload_drops_quoted_empty_href() {
let input = r#"<link rel="preload" href="" as="image">"#;
let result = remove_empty_preload_links(input);
assert_eq!(result, "");
}
#[test]
fn test_remove_empty_preload_keeps_valid_preload() {
let input = r#"<link rel="preload" href="/banner.webp" as="image">"#;
let result = remove_empty_preload_links(input);
assert_eq!(result, input);
}
#[test]
fn test_remove_empty_preload_preserves_utf8() {
let input = r#"<title>日本語</title><link rel=preload href as=image><p>テスト</p>"#;
let result = remove_empty_preload_links(input);
assert!(result.contains("日本語"));
assert!(result.contains("テスト"));
assert!(!result.contains("rel=preload"));
}
#[test]
fn test_apply_html_fixes_idempotent_on_modern_meta() {
let input = r#"<head><meta name="apple-mobile-web-app-capable" content="yes"><meta name="mobile-web-app-capable" content="yes"></head>"#;
let result = apply_html_fixes(input);
let count = result.matches("name=\"mobile-web-app-capable\"").count();
assert_eq!(count, 1, "no duplicate injection, got: {result}");
}
}