use crate::plugin::{Plugin, PluginContext};
use anyhow::{Context, Result};
use rayon::prelude::*;
use std::fs;
use std::path::{Path, PathBuf};
/// Returns the visible text of the first `<title>…</title>` element in `html`.
///
/// Only a literal `<title>` opening tag (no attributes) followed by a literal
/// `</title>` is recognised. Markup nested inside the title is removed with
/// `strip_tags`. An empty string is returned when the opening tag is missing,
/// when no closing tag follows it, or when the title holds no visible text.
fn extract_title(html: &str) -> String {
    const OPEN: &str = "<title>";
    const CLOSE: &str = "</title>";

    html.split_once(OPEN)
        .and_then(|(_, after_open)| after_open.split_once(CLOSE))
        .map(|(inner, _)| strip_tags(inner).trim().to_string())
        .unwrap_or_default()
}
/// Builds a plain-text description of at most `max_len` bytes from `html`.
///
/// Content selection:
/// * if a `<main` opening tag exists, only the inside of that element is used
///   and `<script>`/`<style>` elements are removed from it;
/// * otherwise the inside of `<body` is used (or the whole input when there is
///   no `<body` tag either) and `<script>`, `<style>`, `<nav>`, `<header>` and
///   `<footer>` elements are removed.
///
/// The remaining markup is flattened with `strip_tags`. Text longer than
/// `max_len` bytes is cut at a UTF-8 character boundary and then, when the
/// cut piece contains a space, shortened to the last complete word.
fn extract_description(html: &str, max_len: usize) -> String {
    const CODE_TAGS: &[&str] = &["script", "style"];
    const BODY_NOISE_TAGS: &[&str] = &["script", "style", "nav", "header", "footer"];

    // A present-but-malformed `<main` yields `Some("")`, which deliberately
    // does NOT fall back to `<body>`; only a missing `<main` does.
    let (scope, noise) = match element_inner(html, "main") {
        Some(main) => (main, CODE_TAGS),
        None => (element_inner(html, "body").unwrap_or(html), BODY_NOISE_TAGS),
    };

    let mut content = scope.to_string();
    remove_elements(&mut content, noise);

    let text = strip_tags(&content);
    truncate_at_word(text.trim(), max_len).to_string()
}

/// Returns the text after the first `<name…>` opening tag, up to the first
/// `</name>` that follows it (or to the end of the input when unclosed).
///
/// `None` means no `<name` was found at all; `Some("")` means the opening tag
/// was found but never terminated by `>`.
fn element_inner<'a>(html: &'a str, name: &str) -> Option<&'a str> {
    let open = format!("<{name}");
    let close = format!("</{name}>");

    let from_open = &html[html.find(&open)?..];
    let inner = match from_open.find('>') {
        Some(gt) => &from_open[gt + 1..],
        None => return Some(""),
    };
    Some(inner.find(&close).map_or(inner, |end| &inner[..end]))
}

/// Replaces every `<tag…>…</tag>` span in `html` with a single space, for each
/// name in `tags`. Processing of a tag stops at the first opening tag that has
/// no closing tag after it.
fn remove_elements(html: &mut String, tags: &[&str]) {
    for tag in tags {
        let open = format!("<{tag}");
        let close = format!("</{tag}>");
        while let Some(start) = html.find(&open) {
            match html[start..].find(&close) {
                Some(offset) => {
                    html.replace_range(start..start + offset + close.len(), " ");
                }
                None => break,
            }
        }
    }
}

/// Shortens `text` to at most `max_len` bytes without splitting a character,
/// preferring to end on a word boundary when the cut piece contains a space.
fn truncate_at_word(text: &str, max_len: usize) -> &str {
    if text.len() <= max_len {
        return text;
    }
    // Walk back to the nearest char boundary so slicing cannot panic.
    let mut end = max_len;
    while end > 0 && !text.is_char_boundary(end) {
        end -= 1;
    }
    let head = &text[..end];
    head.rfind(' ').map_or(head, |space| &head[..space])
}
/// Flattens `html` to plain text.
///
/// Everything between `<` and the next `>` is dropped and each `>` is treated
/// as a word separator. Runs of whitespace are collapsed to a single space and
/// the result carries no leading or trailing whitespace. A `<` that is never
/// closed swallows the rest of the input.
fn strip_tags(html: &str) -> String {
    let mut out = String::with_capacity(html.len());
    let mut inside_tag = false;
    // Set when whitespace was seen since the last emitted character; the
    // separating space is only written once another visible character follows,
    // which collapses runs and trims both ends in a single pass.
    let mut gap_pending = false;

    for ch in html.chars() {
        let emitted = match ch {
            '<' => {
                inside_tag = true;
                continue;
            }
            '>' => {
                inside_tag = false;
                ' '
            }
            _ if inside_tag => continue,
            other => other,
        };

        if emitted.is_whitespace() {
            gap_pending = true;
        } else {
            if gap_pending && !out.is_empty() {
                out.push(' ');
            }
            gap_pending = false;
            out.push(emitted);
        }
    }

    out
}
/// Lists the HTML files under `dir`.
///
/// Delegates to `crate::walk::walk_files`, asking for the `"html"` extension;
/// any error from that helper is returned unchanged.
fn collect_html_files(dir: &Path) -> Result<Vec<PathBuf>> {
    const HTML_EXTENSION: &str = "html";
    crate::walk::walk_files(dir, HTML_EXTENSION)
}
/// Escapes `s` for use inside a double-quoted HTML attribute value.
///
/// `&`, `"`, `<` and `>` are replaced by `&amp;`, `&quot;`, `&lt;` and `&gt;`.
/// Every other character, including `'`, is copied through unchanged. The
/// input is processed in one pass, so an ampersand produced by an earlier
/// replacement is never escaped a second time.
fn escape_attr(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '"' => escaped.push_str("&quot;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            other => escaped.push(other),
        }
    }
    escaped
}
/// Plugin that adds missing SEO `<meta>` tags to the generated HTML pages.
///
/// Its `after_compile` hook runs `inject_seo_tags` on every HTML file found
/// under the context's `site_dir`. The type carries no state.
#[derive(Debug, Clone, Copy)]
pub struct SeoPlugin;
impl Plugin for SeoPlugin {
    /// Identifier under which this plugin is registered.
    fn name(&self) -> &'static str {
        "seo"
    }

    /// Injects SEO meta tags into every HTML file under `ctx.site_dir`.
    ///
    /// Does nothing when the site directory does not exist. Files are
    /// processed in parallel; the first error reported by file collection or
    /// by `inject_seo_tags` is returned.
    fn after_compile(&self, ctx: &PluginContext) -> Result<()> {
        if ctx.site_dir.exists() {
            collect_html_files(&ctx.site_dir)?
                .par_iter()
                .try_for_each(|page| inject_seo_tags(page))?;
        }
        Ok(())
    }
}
/// Adds missing description, Open Graph and Twitter card `<meta>` tags to the
/// HTML file at `path`.
///
/// A tag is added only when the page does not already contain a `<meta`
/// element starting with the same `name`/`property` attribute (double- or
/// single-quoted). The description and `og:title` tags are additionally
/// skipped when no description or title text could be extracted. New tags are
/// inserted immediately before the first `</head>`.
///
/// The file is left untouched, and not rewritten, when it has no `</head>` or
/// when nothing needs to be added.
///
/// # Errors
///
/// Returns an error carrying the file path when the file cannot be read or
/// written.
fn inject_seo_tags(path: &Path) -> Result<()> {
    let html = fs::read_to_string(path)
        .with_context(|| format!("cannot read {}", path.display()))?;

    // Without a closing head tag there is no insertion point, so skip the
    // extraction work and avoid rewriting identical bytes to disk.
    let head_end = match html.find("</head>") {
        Some(pos) => pos,
        None => return Ok(()),
    };

    // True when the page already declares `<meta {attr}="{key}"` (either
    // quote style).
    let has_meta = |attr: &str, key: &str| {
        html.contains(&format!("<meta {attr}=\"{key}\""))
            || html.contains(&format!("<meta {attr}='{key}'"))
    };

    let title = extract_title(&html);
    let description = extract_description(&html, 160);

    let mut tags = Vec::new();
    if !has_meta("name", "description") && !description.is_empty() {
        tags.push(format!(
            "<meta name=\"description\" content=\"{}\">",
            escape_attr(&description)
        ));
    }
    if !has_meta("property", "og:title") && !title.is_empty() {
        tags.push(format!(
            "<meta property=\"og:title\" content=\"{}\">",
            escape_attr(&title)
        ));
    }
    if !has_meta("property", "og:description") && !description.is_empty() {
        tags.push(format!(
            "<meta property=\"og:description\" content=\"{}\">",
            escape_attr(&description)
        ));
    }
    if !has_meta("property", "og:type") {
        tags.push("<meta property=\"og:type\" content=\"website\">".to_string());
    }
    if !has_meta("name", "twitter:card") {
        tags.push("<meta name=\"twitter:card\" content=\"summary\">".to_string());
    }

    if tags.is_empty() {
        return Ok(());
    }

    let injection = tags.join("\n");
    let result = format!(
        "{}{}\n{}",
        &html[..head_end],
        injection,
        &html[head_end..]
    );
    fs::write(path, result)
        .with_context(|| format!("cannot write {}", path.display()))?;
    Ok(())
}
/// Plugin that writes a default `robots.txt` into the site directory.
#[derive(Debug, Clone)]
pub struct RobotsPlugin {
    /// Site base URL used to build the `Sitemap:` line; trailing slashes are
    /// trimmed when the file is generated.
    base_url: String,
}

impl RobotsPlugin {
    /// Creates the plugin for the site served at `base_url`.
    ///
    /// The value is stored as given.
    pub fn new(base_url: impl Into<String>) -> Self {
        let base_url = base_url.into();
        Self { base_url }
    }
}
impl Plugin for RobotsPlugin {
    /// Identifier under which this plugin is registered.
    fn name(&self) -> &'static str {
        "robots"
    }

    /// Writes `robots.txt` into `ctx.site_dir` unless one is already there.
    ///
    /// The generated file allows all user agents and points to
    /// `<base_url>/sitemap.xml`, with trailing slashes trimmed from the base
    /// URL. Nothing happens when the site directory does not exist or when it
    /// already contains a `robots.txt`.
    ///
    /// # Errors
    ///
    /// Returns an error carrying the target path when the file cannot be
    /// written.
    fn after_compile(&self, ctx: &PluginContext) -> Result<()> {
        let robots_path = ctx.site_dir.join("robots.txt");
        if !ctx.site_dir.exists() || robots_path.exists() {
            return Ok(());
        }

        let site_root = self.base_url.trim_end_matches('/');
        let content = format!(
            "User-agent: *\nAllow: /\nSitemap: {site_root}/sitemap.xml\n"
        );

        fs::write(&robots_path, content)
            .with_context(|| format!("cannot write {}", robots_path.display()))
    }
}
/// Plugin that adds a `<link rel="canonical">` tag to generated HTML pages.
#[derive(Debug, Clone)]
pub struct CanonicalPlugin {
    /// Site base URL that canonical links are built from; trailing slashes
    /// are trimmed when links are generated.
    base_url: String,
}

impl CanonicalPlugin {
    /// Creates the plugin for the site served at `base_url`.
    ///
    /// The value is stored as given.
    pub fn new(base_url: impl Into<String>) -> Self {
        let base_url = base_url.into();
        Self { base_url }
    }
}
impl Plugin for CanonicalPlugin {
    /// Identifier under which this plugin is registered.
    fn name(&self) -> &'static str {
        "canonical"
    }

    /// Inserts `<link rel="canonical" href="…">` before the first `</head>`
    /// of every HTML file under `ctx.site_dir`.
    ///
    /// The link target is `<base_url>/<path relative to site_dir>`, with
    /// trailing slashes trimmed from the base URL and backslashes in the
    /// relative path turned into `/`. Pages that already contain a canonical
    /// link (double- or single-quoted `rel`) and pages without a `</head>` are
    /// skipped without being rewritten. Files are processed in parallel.
    ///
    /// # Errors
    ///
    /// Returns the first error from file collection, or an error carrying the
    /// file path when a page cannot be read or written.
    fn after_compile(&self, ctx: &PluginContext) -> Result<()> {
        if !ctx.site_dir.exists() {
            return Ok(());
        }

        let html_files = collect_html_files(&ctx.site_dir)?;
        let base = self.base_url.trim_end_matches('/');
        let site_dir = &ctx.site_dir;

        html_files.par_iter().try_for_each(|path| -> Result<()> {
            let html = fs::read_to_string(path)
                .with_context(|| format!("cannot read {}", path.display()))?;

            if html.contains("<link rel=\"canonical\"")
                || html.contains("<link rel='canonical'")
            {
                return Ok(());
            }

            // No insertion point: leave the file alone rather than writing
            // back identical content.
            let head_end = match html.find("</head>") {
                Some(pos) => pos,
                None => return Ok(()),
            };

            let rel_path = path
                .strip_prefix(site_dir)
                .unwrap_or(path)
                .to_string_lossy()
                .replace('\\', "/");
            let canonical_url = format!("{base}/{rel_path}");
            let tag = format!(
                "<link rel=\"canonical\" href=\"{}\">",
                escape_attr(&canonical_url)
            );

            let result =
                format!("{}{}\n{}", &html[..head_end], tag, &html[head_end..]);
            fs::write(path, result)
                .with_context(|| format!("cannot write {}", path.display()))?;
            Ok(())
        })?;
        Ok(())
    }
}
/// Settings for the JSON-LD structured-data plugin.
#[derive(Debug, Clone)]
pub struct JsonLdConfig {
/// Site base URL that page and breadcrumb URLs are built from; trailing
/// slashes are trimmed before use.
pub base_url: String,
/// Organization name emitted as the `publisher` of `Article` entries.
pub org_name: String,
/// Whether a `BreadcrumbList` entry is generated for each page.
pub breadcrumbs: bool,
}
/// Plugin that injects JSON-LD structured data into generated HTML pages.
#[derive(Debug, Clone)]
pub struct JsonLdPlugin {
    /// Settings controlling URL construction, publisher name and breadcrumbs.
    config: JsonLdConfig,
}

impl JsonLdPlugin {
    /// Creates the plugin from an explicit configuration.
    #[must_use]
    pub const fn new(config: JsonLdConfig) -> Self {
        Self { config }
    }

    /// Creates the plugin from a base URL and site name, with breadcrumb
    /// generation enabled. `site_name` becomes the organization name.
    #[must_use]
    pub fn from_site(base_url: &str, site_name: &str) -> Self {
        Self::new(JsonLdConfig {
            base_url: String::from(base_url),
            org_name: String::from(site_name),
            breadcrumbs: true,
        })
    }
}
impl Plugin for JsonLdPlugin {
    /// Identifier under which this plugin is registered.
    fn name(&self) -> &'static str {
        "json-ld"
    }

    /// Injects `<script type="application/ld+json">` blocks before the first
    /// `</head>` of every HTML file under `ctx.site_dir`.
    ///
    /// Each page receives an `Article` entry when it contains `<article`,
    /// otherwise a `WebPage` entry, plus a `BreadcrumbList` when breadcrumbs
    /// are enabled and the page path has at least one segment other than
    /// `index.html`. Pages that already contain `application/ld+json` and
    /// pages without `</head>` are skipped. Files are processed in parallel
    /// and the number of modified pages is logged.
    ///
    /// # Errors
    ///
    /// Returns the first error from file collection or JSON serialization, or
    /// an error carrying the file path when a page cannot be read or written.
    fn after_compile(&self, ctx: &PluginContext) -> Result<()> {
        if !ctx.site_dir.exists() {
            return Ok(());
        }

        let html_files = collect_html_files_recursive(&ctx.site_dir)?;
        let injected = std::sync::atomic::AtomicUsize::new(0);
        let base = self.config.base_url.trim_end_matches('/');
        let site_dir = &ctx.site_dir;

        html_files.par_iter().try_for_each(|path| -> Result<()> {
            let html = fs::read_to_string(path)
                .with_context(|| format!("cannot read {}", path.display()))?;
            if html.contains("application/ld+json") {
                return Ok(());
            }
            let head_pos = match html.find("</head>") {
                Some(pos) => pos,
                None => return Ok(()),
            };

            let title = extract_title(&html);
            let description = extract_description(&html, 160);
            let rel_path = path
                .strip_prefix(site_dir)
                .unwrap_or(path)
                .to_string_lossy()
                .replace('\\', "/");
            let page_url = format!("{base}/{rel_path}");

            let mut scripts = vec![jsonld_page_schema(
                html.contains("<article"),
                &title,
                &description,
                &page_url,
                &self.config.org_name,
            )];
            if self.config.breadcrumbs {
                scripts.extend(jsonld_breadcrumbs(base, &rel_path));
            }

            let mut injection = String::new();
            for script in &scripts {
                // A literal `</script` inside the payload would terminate the
                // element early; `<\/` is an equivalent JSON spelling of `</`.
                let json = serde_json::to_string(script)?.replace("</", "<\\/");
                injection.push_str("<script type=\"application/ld+json\">");
                injection.push_str(&json);
                injection.push_str("</script>\n");
            }

            let result = format!(
                "{}{}{}",
                &html[..head_pos],
                injection,
                &html[head_pos..]
            );
            fs::write(path, result)
                .with_context(|| format!("cannot write {}", path.display()))?;
            let _ = injected.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
            Ok(())
        })?;

        let count = injected.load(std::sync::atomic::Ordering::Relaxed);
        if count > 0 {
            log::info!(
                "[json-ld] Injected structured data into {count} page(s)"
            );
        }
        Ok(())
    }
}

/// Builds the page-level schema.org object: an `Article` (with publisher)
/// when `is_article` is set, otherwise a `WebPage`.
fn jsonld_page_schema(
    is_article: bool,
    title: &str,
    description: &str,
    page_url: &str,
    org_name: &str,
) -> serde_json::Value {
    if is_article {
        serde_json::json!({
            "@context": "https://schema.org",
            "@type": "Article",
            "headline": title,
            "description": description,
            "url": page_url,
            "mainEntityOfPage": {
                "@type": "WebPage",
                "@id": page_url
            },
            "publisher": {
                "@type": "Organization",
                "name": org_name
            }
        })
    } else {
        serde_json::json!({
            "@context": "https://schema.org",
            "@type": "WebPage",
            "name": title,
            "description": description,
            "url": page_url
        })
    }
}

/// Builds a `BreadcrumbList` for `rel_path`, or `None` when the path has no
/// segment other than `index.html`.
///
/// The list starts with a "Home" item pointing at `<base>/`, followed by one
/// item per path segment. Segment names drop a trailing `.html` and show `-`
/// as a space.
fn jsonld_breadcrumbs(base: &str, rel_path: &str) -> Option<serde_json::Value> {
    let parts: Vec<&str> = rel_path
        .trim_matches('/')
        .split('/')
        .filter(|part| !part.is_empty() && *part != "index.html")
        .collect();
    if parts.is_empty() {
        return None;
    }

    let mut items = vec![serde_json::json!({
        "@type": "ListItem",
        "position": 1,
        "name": "Home",
        "item": format!("{}/", base)
    })];

    // Grows by one segment per iteration instead of being rebuilt each time.
    let mut item_url = String::from(base);
    for (i, part) in parts.iter().enumerate() {
        item_url.push('/');
        item_url.push_str(part);
        let name = part.trim_end_matches(".html").replace('-', " ");
        items.push(serde_json::json!({
            "@type": "ListItem",
            "position": i + 2,
            "name": name,
            "item": item_url
        }));
    }

    Some(serde_json::json!({
        "@context": "https://schema.org",
        "@type": "BreadcrumbList",
        "itemListElement": items
    }))
}
/// Lists the HTML files under `dir`.
///
/// Delegates to `crate::walk::walk_files`, asking for the `"html"` extension;
/// any error from that helper is returned unchanged. The unit tests in this
/// file expect files in subdirectories to be included and a missing directory
/// to produce an empty list.
fn collect_html_files_recursive(dir: &Path) -> Result<Vec<PathBuf>> {
    const HTML_EXTENSION: &str = "html";
    crate::walk::walk_files(dir, HTML_EXTENSION)
}
/// Unit tests for the extraction helpers and the SEO, robots, canonical and
/// JSON-LD plugins. Plugin tests run against temporary directories.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::plugin::PluginManager;
    use tempfile::tempdir;

    /// Builds a minimal HTML document with the given title and body markup.
    fn make_html(title: &str, body: &str) -> String {
        format!(
            "<html><head><title>{title}</title></head>\
             <body>{body}</body></html>"
        )
    }

    /// Builds a plugin context whose site directory is `site_dir`; the other
    /// directories are placeholders that these tests never touch.
    fn test_ctx(site_dir: &Path) -> PluginContext {
        crate::test_support::init_logger();
        PluginContext::new(
            Path::new("content"),
            Path::new("build"),
            site_dir,
            Path::new("templates"),
        )
    }

    #[test]
    fn test_extract_title_present() {
        let html = "<html><head><title>My Page</title></head></html>";
        assert_eq!(extract_title(html), "My Page");
    }

    #[test]
    fn test_extract_title_missing() {
        let html = "<html><head></head><body></body></html>";
        assert_eq!(extract_title(html), "");
    }

    #[test]
    fn test_extract_description_truncates() {
        let long = "word ".repeat(100);
        let html =
            format!("<html><head></head><body><p>{long}</p></body></html>");
        let desc = extract_description(&html, 160);
        assert!(desc.len() <= 160);
        assert!(!desc.is_empty());
    }

    #[test]
    fn test_seo_plugin_name() {
        assert_eq!(SeoPlugin.name(), "seo");
    }

    #[test]
    fn test_seo_plugin_injects_meta_tags() -> Result<()> {
        let tmp = tempdir()?;
        fs::write(
            tmp.path().join("index.html"),
            make_html("Hello World", "<p>Some content here</p>"),
        )?;
        let ctx = test_ctx(tmp.path());
        SeoPlugin.after_compile(&ctx)?;
        let result = fs::read_to_string(tmp.path().join("index.html"))?;
        assert!(result.contains("<meta name=\"description\""));
        assert!(result.contains("<meta property=\"og:title\""));
        assert!(result.contains("Hello World"));
        assert!(result.contains("<meta property=\"og:description\""));
        assert!(
            result.contains("<meta property=\"og:type\" content=\"website\"")
        );
        assert!(
            result.contains("<meta name=\"twitter:card\" content=\"summary\"")
        );
        Ok(())
    }

    #[test]
    fn test_seo_plugin_idempotent() -> Result<()> {
        let tmp = tempdir()?;
        fs::write(
            tmp.path().join("page.html"),
            make_html("Test", "<p>Content</p>"),
        )?;
        let ctx = test_ctx(tmp.path());
        SeoPlugin.after_compile(&ctx)?;
        let first = fs::read_to_string(tmp.path().join("page.html"))?;
        SeoPlugin.after_compile(&ctx)?;
        let second = fs::read_to_string(tmp.path().join("page.html"))?;
        assert_eq!(first, second);
        Ok(())
    }

    #[test]
    fn test_extract_description_excludes_nav_header_footer() {
        let html = r##"<html><head></head><body>
<a href="#main">Skip to content</a>
<nav><ul><li>Home</li><li>About</li><li>Search</li></ul></nav>
<header><h1>Site Header</h1></header>
<main><p>This is the actual page content that should be extracted.</p></main>
<footer><p>Copyright 2026</p></footer>
</body></html>"##;
        let desc = extract_description(html, 160);
        assert!(
            desc.contains("actual page content"),
            "description should contain main content, got: {desc}"
        );
        assert!(
            !desc.contains("Skip to content"),
            "description should not contain skip link text"
        );
        assert!(
            !desc.contains("Site Header"),
            "description should not contain header text"
        );
        assert!(
            !desc.contains("Copyright"),
            "description should not contain footer text"
        );
    }

    #[test]
    fn test_seo_plugin_handles_missing_title() -> Result<()> {
        let tmp = tempdir()?;
        fs::write(
            tmp.path().join("no-title.html"),
            "<html><head></head><body><p>No title here</p></body></html>",
        )?;
        let ctx = test_ctx(tmp.path());
        SeoPlugin.after_compile(&ctx)?;
        let result = fs::read_to_string(tmp.path().join("no-title.html"))?;
        assert!(result.contains("<meta property=\"og:type\""));
        assert!(result.contains("<meta name=\"twitter:card\""));
        assert!(!result.contains("<meta property=\"og:title\""));
        Ok(())
    }

    #[test]
    fn test_seo_plugin_empty_dir() -> Result<()> {
        let tmp = tempdir()?;
        let ctx = test_ctx(tmp.path());
        assert!(SeoPlugin.after_compile(&ctx).is_ok());
        Ok(())
    }

    #[test]
    fn test_seo_plugin_nonexistent_dir() -> Result<()> {
        let ctx = test_ctx(Path::new("/nonexistent/path"));
        assert!(SeoPlugin.after_compile(&ctx).is_ok());
        Ok(())
    }

    #[test]
    fn test_robots_plugin_name() {
        let plugin = RobotsPlugin::new("https://example.com");
        assert_eq!(plugin.name(), "robots");
    }

    #[test]
    fn test_robots_plugin_creates_file() -> Result<()> {
        let tmp = tempdir()?;
        let ctx = test_ctx(tmp.path());
        let plugin = RobotsPlugin::new("https://example.com");
        plugin.after_compile(&ctx)?;
        let path = tmp.path().join("robots.txt");
        assert!(path.exists());
        Ok(())
    }

    #[test]
    fn test_robots_plugin_correct_content() -> Result<()> {
        let tmp = tempdir()?;
        let ctx = test_ctx(tmp.path());
        let plugin = RobotsPlugin::new("https://example.com");
        plugin.after_compile(&ctx)?;
        let content = fs::read_to_string(tmp.path().join("robots.txt"))?;
        assert!(content.contains("User-agent: *"));
        assert!(content.contains("Allow: /"));
        assert!(content.contains("Sitemap: https://example.com/sitemap.xml"));
        Ok(())
    }

    #[test]
    fn test_robots_plugin_does_not_overwrite() -> Result<()> {
        let tmp = tempdir()?;
        let robots_path = tmp.path().join("robots.txt");
        fs::write(&robots_path, "User-agent: *\nDisallow: /secret\n")?;
        let ctx = test_ctx(tmp.path());
        let plugin = RobotsPlugin::new("https://example.com");
        plugin.after_compile(&ctx)?;
        let content = fs::read_to_string(&robots_path)?;
        assert!(content.contains("Disallow: /secret"));
        assert!(!content.contains("Sitemap:"));
        Ok(())
    }

    #[test]
    fn test_robots_plugin_custom_base_url() -> Result<()> {
        let tmp = tempdir()?;
        let ctx = test_ctx(tmp.path());
        let plugin = RobotsPlugin::new("https://my-site.org");
        plugin.after_compile(&ctx)?;
        let content = fs::read_to_string(tmp.path().join("robots.txt"))?;
        assert!(content.contains("Sitemap: https://my-site.org/sitemap.xml"));
        Ok(())
    }

    #[test]
    fn test_canonical_plugin_name() {
        let plugin = CanonicalPlugin::new("https://example.com");
        assert_eq!(plugin.name(), "canonical");
    }

    #[test]
    fn test_canonical_plugin_injects_tag() -> Result<()> {
        let tmp = tempdir()?;
        fs::write(
            tmp.path().join("index.html"),
            make_html("Home", "<p>Welcome</p>"),
        )?;
        let ctx = test_ctx(tmp.path());
        let plugin = CanonicalPlugin::new("https://example.com");
        plugin.after_compile(&ctx)?;
        let result = fs::read_to_string(tmp.path().join("index.html"))?;
        assert!(result.contains("<link rel=\"canonical\""));
        assert!(result.contains("https://example.com/index.html"));
        Ok(())
    }

    #[test]
    fn test_canonical_plugin_idempotent() -> Result<()> {
        let tmp = tempdir()?;
        fs::write(
            tmp.path().join("page.html"),
            make_html("Page", "<p>Content</p>"),
        )?;
        let ctx = test_ctx(tmp.path());
        let plugin = CanonicalPlugin::new("https://example.com");
        plugin.after_compile(&ctx)?;
        let first = fs::read_to_string(tmp.path().join("page.html"))?;
        plugin.after_compile(&ctx)?;
        let second = fs::read_to_string(tmp.path().join("page.html"))?;
        assert_eq!(first, second);
        Ok(())
    }

    #[test]
    fn test_canonical_plugin_nested_files() -> Result<()> {
        let tmp = tempdir()?;
        fs::create_dir_all(tmp.path().join("blog"))?;
        fs::write(
            tmp.path().join("blog/post.html"),
            make_html("Post", "<p>Blog post</p>"),
        )?;
        let ctx = test_ctx(tmp.path());
        let plugin = CanonicalPlugin::new("https://example.com");
        plugin.after_compile(&ctx)?;
        let result = fs::read_to_string(tmp.path().join("blog/post.html"))?;
        assert!(result.contains("https://example.com/blog/post.html"));
        Ok(())
    }

    #[test]
    fn test_all_plugins_register() {
        let mut pm = PluginManager::new();
        pm.register(SeoPlugin);
        pm.register(RobotsPlugin::new("https://example.com"));
        pm.register(CanonicalPlugin::new("https://example.com"));
        assert_eq!(pm.len(), 3);
        assert_eq!(pm.names(), vec!["seo", "robots", "canonical"]);
    }

    #[test]
    fn extract_description_unicode_truncation_respects_char_boundary() {
        let text = "café 日本語 ".repeat(30);
        let html =
            format!("<html><head></head><body><p>{text}</p></body></html>");
        let desc = extract_description(&html, 50);
        assert!(desc.len() <= 50);
        assert!(!desc.is_empty());
        let _ = desc.chars().count();
    }

    // Despite its name, this test pins the current behavior: an empty
    // `<main>` yields an empty description and does NOT fall back to the body.
    #[test]
    fn extract_description_empty_main_falls_back_to_body() {
        let html = "<html><head></head><body>\
                    <main></main>\
                    <p>Body fallback text</p>\
                    </body></html>";
        let desc = extract_description(html, 160);
        assert!(
            desc.is_empty(),
            "expected empty description from empty <main>, got: {desc}"
        );
    }

    #[test]
    fn extract_description_no_body_uses_raw_html() {
        let html = "<div><p>Raw content without body</p></div>";
        let desc = extract_description(html, 160);
        assert!(
            desc.contains("Raw content without body"),
            "expected raw content fallback, got: {desc}"
        );
    }

    #[test]
    fn extract_title_with_nested_tags() {
        let html = "<html><head><title><span>Foo</span></title></head></html>";
        let title = extract_title(html);
        assert_eq!(title, "Foo");
    }

    #[test]
    fn escape_attr_all_special_chars() {
        let input = r#"Tom & "Jerry" <script>alert('xss')</script>"#;
        let escaped = escape_attr(input);
        assert!(escaped.contains("&amp;"), "& should be escaped");
        assert!(escaped.contains("&quot;"), "\" should be escaped");
        assert!(escaped.contains("&lt;"), "< should be escaped");
        assert!(escaped.contains("&gt;"), "> should be escaped");
        assert_eq!(
            escaped,
            "Tom &amp; &quot;Jerry&quot; &lt;script&gt;alert('xss')&lt;/script&gt;"
        );
    }

    #[test]
    fn seo_plugin_skips_existing_single_quote_meta() -> Result<()> {
        let html = "<html><head>\
                    <meta name='description' content='Already set'>\
                    <meta property='og:title' content='Title'>\
                    <meta property='og:description' content='Desc'>\
                    <meta property='og:type' content='website'>\
                    <meta name='twitter:card' content='summary'>\
                    <title>Test</title></head>\
                    <body><p>Content</p></body></html>";
        let tmp = tempdir()?;
        let path = tmp.path().join("single-quote.html");
        fs::write(&path, html)?;
        let ctx = test_ctx(tmp.path());
        SeoPlugin.after_compile(&ctx)?;
        let result = fs::read_to_string(&path)?;
        assert_eq!(
            result.matches("meta name=\"description\"").count()
                + result.matches("meta name='description'").count(),
            1,
            "description meta should not be duplicated"
        );
        assert_eq!(
            result.matches("og:title").count(),
            1,
            "og:title should not be duplicated"
        );
        Ok(())
    }

    #[test]
    fn canonical_plugin_trailing_slash_base_url() -> Result<()> {
        let tmp = tempdir()?;
        fs::write(
            tmp.path().join("index.html"),
            make_html("Home", "<p>Welcome</p>"),
        )?;
        let ctx = test_ctx(tmp.path());
        let plugin = CanonicalPlugin::new("https://example.com/");
        plugin.after_compile(&ctx)?;
        let result = fs::read_to_string(tmp.path().join("index.html"))?;
        assert!(
            result.contains("https://example.com/index.html"),
            "should produce clean URL without double slash"
        );
        assert!(
            !result.contains("https://example.com//"),
            "should not contain double slash in canonical URL"
        );
        Ok(())
    }

    #[test]
    fn robots_plugin_trailing_slash_base_url() -> Result<()> {
        let tmp = tempdir()?;
        let ctx = test_ctx(tmp.path());
        let plugin = RobotsPlugin::new("https://example.com/");
        plugin.after_compile(&ctx)?;
        let content = fs::read_to_string(tmp.path().join("robots.txt"))?;
        assert!(
            content.contains("Sitemap: https://example.com/sitemap.xml"),
            "sitemap URL should not have double slash, got: {content}"
        );
        assert!(
            !content.contains("https://example.com//"),
            "should not contain double slash"
        );
        Ok(())
    }

    #[test]
    fn extract_description_nested_script_in_main() {
        let html = "<html><head></head><body>\
                    <main>\
                    <script>var x = 'ignore me';</script>\
                    <p>Visible text after script</p>\
                    </main></body></html>";
        let desc = extract_description(html, 160);
        assert!(
            desc.contains("Visible text after script"),
            "should contain the paragraph text, got: {desc}"
        );
        assert!(
            !desc.contains("ignore me"),
            "should not contain script content, got: {desc}"
        );
    }

    #[test]
    fn test_jsonld_injects_webpage() {
        let dir = tempdir().unwrap();
        let site = dir.path().join("site");
        fs::create_dir_all(&site).unwrap();
        let html = make_html("About", "<p>About us</p>");
        fs::write(site.join("about.html"), &html).unwrap();
        let ctx = test_ctx(&site);
        let plugin = JsonLdPlugin::from_site("https://example.com", "Test Org");
        plugin.after_compile(&ctx).unwrap();
        let output = fs::read_to_string(site.join("about.html")).unwrap();
        assert!(output.contains("application/ld+json"));
        assert!(output.contains("\"@type\":\"WebPage\""));
        assert!(output.contains("\"name\":\"About\""));
    }

    #[test]
    fn test_jsonld_injects_article() {
        let dir = tempdir().unwrap();
        let site = dir.path().join("site");
        fs::create_dir_all(&site).unwrap();
        let html = "<html><head><title>Post</title></head>\
                    <body><article><h1>Post</h1></article></body></html>";
        fs::write(site.join("post.html"), html).unwrap();
        let ctx = test_ctx(&site);
        let plugin = JsonLdPlugin::from_site("https://example.com", "My Org");
        plugin.after_compile(&ctx).unwrap();
        let output = fs::read_to_string(site.join("post.html")).unwrap();
        assert!(output.contains("\"@type\":\"Article\""));
        assert!(output.contains("\"headline\":\"Post\""));
        assert!(output.contains("My Org"));
    }

    #[test]
    fn test_jsonld_breadcrumbs() {
        let dir = tempdir().unwrap();
        let site = dir.path().join("site");
        let blog = site.join("blog");
        fs::create_dir_all(&blog).unwrap();
        let html = make_html("My Post", "<p>Content</p>");
        fs::write(blog.join("my-post.html"), &html).unwrap();
        let ctx = test_ctx(&site);
        let plugin = JsonLdPlugin::from_site("https://example.com", "Org");
        plugin.after_compile(&ctx).unwrap();
        let output = fs::read_to_string(blog.join("my-post.html")).unwrap();
        assert!(output.contains("BreadcrumbList"));
        assert!(output.contains("\"name\":\"Home\""));
        assert!(output.contains("\"name\":\"blog\""));
    }

    #[test]
    fn test_jsonld_idempotent() {
        let dir = tempdir().unwrap();
        let site = dir.path().join("site");
        fs::create_dir_all(&site).unwrap();
        let html = "<html><head><title>X</title>\
                    <script type=\"application/ld+json\">{}</script>\
                    </head><body></body></html>";
        fs::write(site.join("x.html"), html).unwrap();
        let ctx = test_ctx(&site);
        let plugin = JsonLdPlugin::from_site("https://example.com", "Org");
        plugin.after_compile(&ctx).unwrap();
        let output = fs::read_to_string(site.join("x.html")).unwrap();
        let count = output.matches("application/ld+json").count();
        assert_eq!(count, 1);
    }

    #[test]
    fn extract_title_empty_tag_returns_empty_string() {
        assert_eq!(extract_title("<title></title>"), "");
        assert_eq!(extract_title("<title> </title>"), "");
        assert_eq!(extract_title("<title>\n\t </title>"), "");
    }

    #[test]
    fn extract_title_without_closing_tag_returns_empty() {
        assert_eq!(extract_title("<title>Unterminated"), "");
    }

    #[test]
    fn extract_title_strips_inner_html_tags() {
        let out = extract_title("<title>Hello <em>World</em></title>");
        assert!(out.contains("Hello"));
        assert!(out.contains("World"));
    }

    #[test]
    fn extract_description_prefers_main_over_body() {
        let html = r#"<html><head></head><body>
<nav>menu</nav>
<main>The primary content.</main>
<footer>Bottom</footer>
</body></html>"#;
        let desc = extract_description(html, 200);
        assert!(desc.contains("primary content"));
        assert!(!desc.contains("menu"));
    }

    #[test]
    fn extract_description_main_without_closing_tag_takes_rest() {
        let html = r#"<html><body><main>content without close"#;
        let desc = extract_description(html, 200);
        assert!(desc.contains("content without close"));
    }

    #[test]
    fn extract_description_main_without_angle_bracket_returns_empty_fallback() {
        let html = "<html><body><main";
        let desc = extract_description(html, 200);
        assert_eq!(desc, "");
    }

    #[test]
    fn extract_description_fallback_to_body_strips_script_and_style() {
        let html = r#"<html><head></head><body>
<script>alert('skip');</script>
<style>body { color: red; }</style>
<nav>menu items here</nav>
<header>site title</header>
<p>The body text.</p>
<footer>copyright</footer>
</body></html>"#;
        let desc = extract_description(html, 200);
        assert!(desc.contains("body text"));
        assert!(!desc.contains("alert"));
        assert!(!desc.contains("color: red"));
        assert!(!desc.contains("menu items"));
        assert!(!desc.contains("site title"));
        assert!(!desc.contains("copyright"));
    }

    #[test]
    fn extract_description_body_without_closing_tag_uses_rest() {
        let html = "<html><body><p>open-ended body paragraph";
        let desc = extract_description(html, 200);
        assert!(desc.contains("open-ended body paragraph"));
    }

    #[test]
    fn extract_description_body_without_angle_bracket_returns_empty() {
        let html = "<html><body";
        let desc = extract_description(html, 200);
        assert_eq!(desc, "");
    }

    #[test]
    fn extract_description_no_body_no_main_uses_entire_html() {
        let html = "just plain text no tags here";
        let desc = extract_description(html, 200);
        assert!(desc.contains("just plain text"));
    }

    // Smoke test: an unterminated <script> must not panic or loop forever.
    #[test]
    fn extract_description_unterminated_script_breaks_out() {
        let html = "<html><body><main><script>unterminated<p>x</p>";
        let desc = extract_description(html, 200);
        let _ = desc;
    }

    #[test]
    fn extract_description_truncates_at_word_boundary() {
        let html = "<html><body><main>one two three four five six seven eight nine ten eleven twelve thirteen fourteen fifteen sixteen seventeen eighteen nineteen twenty twenty-one twenty-two twenty-three twenty-four twenty-five</main></body></html>";
        let desc = extract_description(html, 80);
        assert!(desc.len() <= 80);
        assert!(!desc.ends_with('-'));
    }

    #[test]
    fn extract_description_truncates_without_space_falls_to_byte_cut() {
        let html =
            "<html><body><main>oneverylongwordwithnospacesanywherehere</main></body></html>";
        let desc = extract_description(html, 10);
        assert!(desc.len() <= 10);
    }

    #[test]
    fn extract_description_respects_char_boundary_on_truncation() {
        let html = "<html><body><main>Rust programming — é ñ ü characters everywhere in this text that we want to truncate mid-char</main></body></html>";
        let desc = extract_description(html, 30);
        assert!(desc.is_ascii() || !desc.is_empty());
        assert!(desc.len() <= 30);
    }

    #[test]
    fn extract_description_truncation_walks_back_multiple_bytes() {
        let mut input = String::from("<html><body><main>");
        input.push_str(&"a".repeat(20));
        input.push('🎉');
        input.push_str(&"b".repeat(20));
        input.push_str("</main></body></html>");
        let desc = extract_description(&input, 22);
        assert!(!desc.is_empty(), "expected non-empty desc");
        assert!(desc.len() <= 22);
    }

    // Smoke test: an unterminated <nav> in the body fallback must not panic.
    #[test]
    fn extract_description_body_fallback_unterminated_nav_breaks() {
        let html = "<html><body><nav>unterminated nav block<p>visible</p>";
        let desc = extract_description(html, 200);
        let _ = desc;
    }

    #[test]
    fn seo_plugin_file_without_head_tag_is_unchanged() {
        let dir = tempdir().unwrap();
        fs::write(
            dir.path().join("fragment.html"),
            "<p>no html/head/body structure</p>",
        )
        .unwrap();
        let ctx = test_ctx(dir.path());
        SeoPlugin.after_compile(&ctx).unwrap();
        let out = fs::read_to_string(dir.path().join("fragment.html")).unwrap();
        assert_eq!(out, "<p>no html/head/body structure</p>");
    }

    #[test]
    fn seo_plugin_missing_site_dir_returns_ok() {
        let dir = tempdir().unwrap();
        let missing = dir.path().join("missing");
        let ctx = test_ctx(&missing);
        SeoPlugin.after_compile(&ctx).unwrap();
    }

    #[test]
    fn robots_plugin_skips_existing_robots_txt() {
        let dir = tempdir().unwrap();
        let existing = dir.path().join("robots.txt");
        fs::write(&existing, "USER: existing").unwrap();
        let plugin = RobotsPlugin::new("https://example.com");
        let ctx = test_ctx(dir.path());
        plugin.after_compile(&ctx).unwrap();
        assert_eq!(fs::read_to_string(&existing).unwrap(), "USER: existing");
    }

    #[test]
    fn robots_plugin_writes_user_agent_and_sitemap() {
        let dir = tempdir().unwrap();
        let plugin = RobotsPlugin::new("https://example.com/");
        let ctx = test_ctx(dir.path());
        plugin.after_compile(&ctx).unwrap();
        let body = fs::read_to_string(dir.path().join("robots.txt")).unwrap();
        assert!(body.contains("User-agent: *"));
        assert!(body.contains("Sitemap: https://example.com/sitemap.xml"));
    }

    #[test]
    fn robots_plugin_missing_site_dir_returns_ok() {
        let dir = tempdir().unwrap();
        let missing = dir.path().join("missing");
        let plugin = RobotsPlugin::new("https://example.com");
        let ctx = test_ctx(&missing);
        plugin.after_compile(&ctx).unwrap();
    }

    #[test]
    fn robots_plugin_name_returns_static_identifier() {
        assert_eq!(RobotsPlugin::new("").name(), "robots");
    }

    #[test]
    fn canonical_plugin_missing_site_dir_returns_ok() {
        let dir = tempdir().unwrap();
        let missing = dir.path().join("missing");
        let plugin = CanonicalPlugin::new("https://example.com");
        let ctx = test_ctx(&missing);
        plugin.after_compile(&ctx).unwrap();
    }

    #[test]
    fn canonical_plugin_skips_pages_with_existing_canonical_link() {
        let dir = tempdir().unwrap();
        let html = r#"<html><head><link rel="canonical" href="/original"></head><body></body></html>"#;
        fs::write(dir.path().join("p.html"), html).unwrap();
        let plugin = CanonicalPlugin::new("https://example.com");
        let ctx = test_ctx(dir.path());
        plugin.after_compile(&ctx).unwrap();
        let out = fs::read_to_string(dir.path().join("p.html")).unwrap();
        assert_eq!(out.matches(r#"rel="canonical""#).count(), 1);
        assert!(out.contains("/original"));
    }

    #[test]
    fn canonical_plugin_skips_pages_with_single_quoted_canonical() {
        let dir = tempdir().unwrap();
        let html =
            r"<html><head><link rel='canonical' href='/x'></head></html>";
        fs::write(dir.path().join("p.html"), html).unwrap();
        let plugin = CanonicalPlugin::new("https://example.com");
        let ctx = test_ctx(dir.path());
        plugin.after_compile(&ctx).unwrap();
        let out = fs::read_to_string(dir.path().join("p.html")).unwrap();
        assert_eq!(out.matches("canonical").count(), 1);
    }

    #[test]
    fn canonical_plugin_page_without_head_is_left_unchanged() {
        let dir = tempdir().unwrap();
        let html = "<p>no structure</p>";
        fs::write(dir.path().join("frag.html"), html).unwrap();
        let plugin = CanonicalPlugin::new("https://example.com");
        let ctx = test_ctx(dir.path());
        plugin.after_compile(&ctx).unwrap();
        let out = fs::read_to_string(dir.path().join("frag.html")).unwrap();
        assert_eq!(out, html);
    }

    #[test]
    fn canonical_plugin_injects_canonical_link_before_head_close() {
        let dir = tempdir().unwrap();
        let html = "<html><head><title>T</title></head><body></body></html>";
        fs::write(dir.path().join("a.html"), html).unwrap();
        let plugin = CanonicalPlugin::new("https://example.com/");
        let ctx = test_ctx(dir.path());
        plugin.after_compile(&ctx).unwrap();
        let out = fs::read_to_string(dir.path().join("a.html")).unwrap();
        assert!(out.contains(r#"rel="canonical""#));
        assert!(out.contains("https://example.com/a.html"));
    }

    #[test]
    fn canonical_plugin_name_returns_static_identifier() {
        assert_eq!(CanonicalPlugin::new("").name(), "canonical");
    }

    #[test]
    fn jsonld_plugin_missing_site_dir_returns_ok() {
        let dir = tempdir().unwrap();
        let missing = dir.path().join("missing");
        let plugin = JsonLdPlugin::from_site("https://example.com", "Org");
        let ctx = test_ctx(&missing);
        plugin.after_compile(&ctx).unwrap();
    }

    #[test]
    fn jsonld_plugin_skips_pages_without_head_tag() {
        let dir = tempdir().unwrap();
        let site = dir.path().join("site");
        fs::create_dir_all(&site).unwrap();
        fs::write(site.join("frag.html"), "<p>no head</p>").unwrap();
        let ctx = test_ctx(&site);
        let plugin = JsonLdPlugin::from_site("https://example.com", "Org");
        plugin.after_compile(&ctx).unwrap();
        let out = fs::read_to_string(site.join("frag.html")).unwrap();
        assert_eq!(out, "<p>no head</p>");
    }

    #[test]
    fn jsonld_plugin_generates_webpage_when_no_article_element() {
        let dir = tempdir().unwrap();
        let site = dir.path().join("site");
        fs::create_dir_all(&site).unwrap();
        let html = "<html><head><title>Hello</title></head><body><p>content</p></body></html>";
        fs::write(site.join("index.html"), html).unwrap();
        let ctx = test_ctx(&site);
        let plugin = JsonLdPlugin::from_site("https://example.com", "Org");
        plugin.after_compile(&ctx).unwrap();
        let out = fs::read_to_string(site.join("index.html")).unwrap();
        assert!(out.contains("application/ld+json"));
        assert!(out.contains("WebPage"));
    }

    #[test]
    fn jsonld_plugin_generates_article_when_article_element_present() {
        let dir = tempdir().unwrap();
        let site = dir.path().join("site");
        fs::create_dir_all(&site).unwrap();
        let html = "<html><head><title>Post</title></head><body><article><h1>Post</h1></article></body></html>";
        fs::write(site.join("post.html"), html).unwrap();
        let ctx = test_ctx(&site);
        let plugin = JsonLdPlugin::from_site("https://example.com", "Org");
        plugin.after_compile(&ctx).unwrap();
        let out = fs::read_to_string(site.join("post.html")).unwrap();
        assert!(out.contains("application/ld+json"));
        assert!(out.contains(r#""Article""#));
    }

    #[test]
    fn jsonld_plugin_new_stores_supplied_config() {
        let cfg = JsonLdConfig {
            base_url: "https://a".to_string(),
            org_name: "Org".to_string(),
            breadcrumbs: false,
        };
        let plugin = JsonLdPlugin::new(cfg.clone());
        assert_eq!(plugin.config.base_url, "https://a");
        assert_eq!(plugin.config.org_name, "Org");
        assert!(!plugin.config.breadcrumbs);
    }

    #[test]
    fn jsonld_plugin_name_returns_static_identifier() {
        let plugin = JsonLdPlugin::from_site("https://example.com", "Org");
        assert_eq!(plugin.name(), "json-ld");
    }

    #[test]
    fn collect_html_files_recursive_filters_and_sorts() {
        let dir = tempdir().unwrap();
        let sub = dir.path().join("sub");
        fs::create_dir(&sub).unwrap();
        fs::write(dir.path().join("z.html"), "").unwrap();
        fs::write(dir.path().join("a.html"), "").unwrap();
        fs::write(sub.join("m.html"), "").unwrap();
        fs::write(dir.path().join("ignore.css"), "").unwrap();
        let files = collect_html_files_recursive(dir.path()).unwrap();
        assert_eq!(files.len(), 3);
    }

    #[test]
    fn collect_html_files_recursive_missing_dir_returns_empty() {
        let dir = tempdir().unwrap();
        let result =
            collect_html_files_recursive(&dir.path().join("missing")).unwrap();
        assert!(result.is_empty());
    }
}