use crate::plugin::{Plugin, PluginContext};
use anyhow::Result;
use std::{
collections::BTreeMap,
fs,
path::{Path, PathBuf},
};
/// Build plugin that prepares the generated site for AI/LLM consumers.
///
/// After compilation it writes `llms.txt` and `llms-full.txt` into the site
/// directory, injects a `max-snippet` robots meta tag into HTML pages that
/// lack one, and logs warnings for images without alt text.
#[derive(Debug, Clone, Copy)]
pub struct AiPlugin;
impl Plugin for AiPlugin {
    /// Identifier reported for this plugin.
    fn name(&self) -> &'static str {
        "ai"
    }

    /// Post-compile hook.
    ///
    /// Does nothing when the site directory is absent. Otherwise it writes
    /// `llms.txt` and `llms-full.txt`, then post-processes every HTML file
    /// (robots meta tag injection and alt-text audit), emitting one summary
    /// warning when at least one page has images without alt text.
    ///
    /// # Errors
    ///
    /// Propagates any error from generating the two text files, listing the
    /// HTML files, or reading/writing an HTML page.
    fn after_compile(&self, ctx: &PluginContext) -> Result<()> {
        let site_dir = &ctx.site_dir;
        if site_dir.exists() {
            let config = ctx.config.as_ref();
            generate_llms_txt(site_dir, config)?;
            generate_llms_full_txt(site_dir, config)?;

            let pages = collect_html_files(site_dir)?;
            match process_html_for_ai(&pages, site_dir)? {
                0 => {}
                pages_with_missing_alt => log::warn!(
                    "[ai] {pages_with_missing_alt} page(s) have images without alt text"
                ),
            }
        }
        Ok(())
    }
}
/// Post-processes each HTML file for AI consumers.
///
/// For every path: reads the file, injects the `max-snippet` robots meta tag
/// when applicable, audits the result for images without alt text, and writes
/// the file back only if its content changed.
///
/// Returns the number of pages that contain at least one image without alt
/// text.
///
/// # Errors
///
/// Returns the first I/O error hit while reading or writing a page.
fn process_html_for_ai(html_files: &[PathBuf], site_dir: &Path) -> Result<usize> {
    let mut flagged_pages = 0usize;
    for page in html_files {
        let original = fs::read_to_string(page)?;
        let updated = inject_max_snippet(&original);
        check_alt_text(page, &updated, site_dir, &mut flagged_pages);
        if updated == original {
            // Nothing was injected; leave the file on disk untouched.
            continue;
        }
        fs::write(page, updated)?;
    }
    Ok(flagged_pages)
}
/// Inserts a permissive `robots` meta tag immediately before the closing
/// `</head>` tag.
///
/// The input is returned unchanged when it already mentions `max-snippet`
/// (so repeated runs do not duplicate the tag) or when it has no closing
/// head tag. The closing tag is matched ASCII case-insensitively, so
/// `</HEAD>` is recognised as well.
fn inject_max_snippet(html: &str) -> String {
    const TAG: &str = "<meta name=\"robots\" content=\"max-snippet:-1, max-image-preview:large, max-video-preview:-1\">\n";
    const CLOSING_HEAD: &[u8] = b"</head>";

    if html.contains("max-snippet") {
        return html.to_string();
    }

    // A single scan locates the insertion point. The match starts at an
    // ASCII `<`, which is always a UTF-8 character boundary, so slicing the
    // string at `pos` is safe.
    let Some(pos) = html
        .as_bytes()
        .windows(CLOSING_HEAD.len())
        .position(|window| window.eq_ignore_ascii_case(CLOSING_HEAD))
    else {
        return html.to_string();
    };

    let mut modified = String::with_capacity(html.len() + TAG.len());
    modified.push_str(&html[..pos]);
    modified.push_str(TAG);
    modified.push_str(&html[pos..]);
    modified
}
/// Audits one page for images without alt text.
///
/// When at least one such image is found, logs a warning naming the page
/// (relative to `site_dir` when possible, otherwise the path as given) and
/// increments `counter` by one. The counter therefore counts pages, not
/// images.
fn check_alt_text(path: &Path, html: &str, site_dir: &Path, counter: &mut usize) {
    let missing = count_missing_alt(html);
    if missing == 0 {
        return;
    }
    let rel = match path.strip_prefix(site_dir) {
        Ok(relative) => relative,
        Err(_) => path,
    }
    .display();
    log::warn!("[ai] {missing} image(s) missing alt text in {rel}");
    *counter += 1;
}
/// Builds the `(title, url, description)` entries listed in `llms.txt`.
///
/// Each HTML file under `site_dir` is paired with its `*.meta.json` sidecar.
/// A missing, unreadable, or unparsable sidecar is treated as empty metadata.
/// Pages rejected by `is_excluded_page`, and pages without a non-empty
/// `title`, are skipped. URLs are site-relative, start with `/`, and use
/// forward slashes.
///
/// # Errors
///
/// Fails only if listing the HTML files fails.
fn collect_page_entries(site_dir: &Path) -> Result<Vec<(String, String, String)>> {
    // Reads a string field from the sidecar, defaulting to an empty string.
    let text_field = |meta: &serde_json::Map<String, serde_json::Value>, key: &str| -> String {
        meta.get(key)
            .and_then(serde_json::Value::as_str)
            .unwrap_or_default()
            .to_string()
    };

    let mut entries = Vec::new();
    for page in collect_html_files(site_dir)? {
        let rel = page.strip_prefix(site_dir).unwrap_or(page.as_path());
        let meta: serde_json::Map<String, serde_json::Value> =
            fs::read_to_string(page.with_extension("meta.json"))
                .ok()
                .and_then(|raw| serde_json::from_str(&raw).ok())
                .unwrap_or_default();

        if is_excluded_page(rel, &meta) {
            continue;
        }

        let title = text_field(&meta, "title");
        if title.is_empty() {
            continue;
        }
        let description = text_field(&meta, "description");
        let url = format!("/{}", rel.to_string_lossy().replace('\\', "/"));
        entries.push((title, url, description));
    }
    Ok(entries)
}
/// Decides whether a page must be left out of the generated LLM indexes.
///
/// A page is excluded when its file name (compared in lowercase) is
/// `404.html` or starts with `error`, or when its metadata has `draft` or
/// `private` set to boolean `true` or the string `"true"`.
fn is_excluded_page(
    path: &Path,
    frontmatter: &serde_json::Map<String, serde_json::Value>,
) -> bool {
    let file_name = match path.file_name() {
        Some(name) => name.to_string_lossy().to_lowercase(),
        None => String::new(),
    };
    if file_name == "404.html" || file_name.starts_with("error") {
        return true;
    }

    ["draft", "private"]
        .iter()
        .any(|key| match frontmatter.get(*key) {
            Some(serde_json::Value::Bool(flag)) => *flag,
            Some(serde_json::Value::String(text)) => text == "true",
            _ => false,
        })
}
/// Groups page entries by the first directory component of their URL.
///
/// The section name is the title-cased directory (`/blog/x.html` goes to
/// `Blog`); pages directly under the root go to `Pages`. A `BTreeMap` keeps
/// the sections in sorted order, and entries keep their input order within
/// each section.
fn group_pages_by_section(
    entries: &[(String, String, String)],
) -> BTreeMap<String, Vec<(String, String, String)>> {
    entries.iter().fold(
        BTreeMap::new(),
        |mut grouped: BTreeMap<String, Vec<(String, String, String)>>, entry| {
            let path = entry.1.trim_start_matches('/');
            let heading = match path.split_once('/') {
                Some((dir, _)) => titlecase_word(dir),
                None => "Pages".to_string(),
            };
            grouped.entry(heading).or_default().push(entry.clone());
            grouped
        },
    )
}
/// Upper-cases the first character of `s` and lower-cases the rest.
///
/// Returns an empty string for empty input.
fn titlecase_word(s: &str) -> String {
    let mut rest = s.chars();
    let Some(initial) = rest.next() else {
        return String::new();
    };
    let mut out: String = initial.to_uppercase().collect();
    out.push_str(&rest.as_str().to_lowercase());
    out
}
/// Extracts the non-empty `Disallow` patterns from `<site_dir>/robots.txt`.
///
/// Parsing follows the robots.txt line grammar: the directive name is
/// matched case-insensitively, whitespace around the `:` is tolerated, and
/// anything after a `#` is treated as a comment. Patterns from every
/// user-agent group are returned together, in file order.
///
/// Returns an empty vector when the file is missing or unreadable.
fn parse_robots_disallow(site_dir: &Path) -> Vec<String> {
    let Ok(content) = fs::read_to_string(site_dir.join("robots.txt")) else {
        return Vec::new();
    };

    content
        .lines()
        .filter_map(|line| {
            // Drop any trailing comment before looking at the directive.
            let directive = line.split('#').next().unwrap_or_default();
            let (key, value) = directive.split_once(':')?;
            if !key.trim().eq_ignore_ascii_case("disallow") {
                return None;
            }
            let pattern = value.trim();
            // An empty `Disallow:` means "allow everything"; nothing to list.
            (!pattern.is_empty()).then(|| pattern.to_string())
        })
        .collect()
}
/// Writes `<site_dir>/llms.txt`, a Markdown index of the site for LLM
/// consumers.
///
/// The file starts with the site name, description, and language (falling
/// back to `Site`, an empty description, and `en` when no config or an empty
/// language is given). Pages follow, grouped into `##` sections, each as a
/// Markdown link with an optional description. A `## Disallow` section
/// listing the `robots.txt` patterns closes the file when any exist.
///
/// Page URLs are prefixed with the configured base URL (trailing slashes
/// trimmed) when one is set; otherwise they stay site-relative.
///
/// # Errors
///
/// Returns an error if `llms.txt` cannot be written.
fn generate_llms_txt(
    site_dir: &Path,
    config: Option<&crate::cmd::SsgConfig>,
) -> Result<()> {
    let site_name = config.map_or("Site", |c| c.site_name.as_str());
    let base_url = config.map_or("", |c| c.base_url.as_str());
    let description = config.map_or("", |c| c.site_description.as_str());
    let language = config
        .map(|c| c.language.as_str())
        .filter(|l| !l.is_empty())
        .unwrap_or("en");
    let canonical_root = base_url.trim_end_matches('/');

    let mut content =
        format!("# {site_name}\n\n> {description}\n\nLanguage: {language}\n");

    // Page discovery stays best-effort so the header is still written, but a
    // failure is logged rather than silently discarded.
    let entries = collect_page_entries(site_dir).unwrap_or_else(|err| {
        log::warn!("[ai] could not collect pages for llms.txt: {err}");
        Vec::new()
    });
    let sections = group_pages_by_section(&entries);
    for (section, pages) in &sections {
        content.push_str(&format!("\n## {section}\n"));
        for (title, url, desc) in pages {
            let full_url = if canonical_root.is_empty() {
                url.clone()
            } else {
                format!("{canonical_root}{url}")
            };
            if desc.is_empty() {
                content.push_str(&format!("- [{title}]({full_url})\n"));
            } else {
                content.push_str(&format!("- [{title}]({full_url}): {desc}\n"));
            }
        }
    }

    let disallow = parse_robots_disallow(site_dir);
    if !disallow.is_empty() {
        content.push_str("\n## Disallow\n");
        for pattern in &disallow {
            content.push_str(&format!("- {pattern}\n"));
        }
    }

    fs::write(site_dir.join("llms.txt"), content)?;
    log::info!("[ai] Generated llms.txt");
    Ok(())
}
/// Writes `<site_dir>/llms-full.txt`, the full-text companion to `llms.txt`.
///
/// After the same header as `llms.txt`, every non-excluded page with a
/// non-empty sidecar `title` contributes a `---` separator, a linked `##`
/// heading, and its body text with tags stripped and whitespace collapsed.
/// A page whose HTML cannot be read contributes only its heading.
///
/// # Errors
///
/// Returns an error if the HTML files cannot be listed or the output file
/// cannot be written.
fn generate_llms_full_txt(
    site_dir: &Path,
    config: Option<&crate::cmd::SsgConfig>,
) -> Result<()> {
    let (site_name, base_url, description) = match config {
        Some(c) => (
            c.site_name.as_str(),
            c.base_url.as_str(),
            c.site_description.as_str(),
        ),
        None => ("Site", "", ""),
    };
    let language = match config {
        Some(c) if !c.language.is_empty() => c.language.as_str(),
        _ => "en",
    };
    let canonical_root = base_url.trim_end_matches('/');

    let mut content =
        format!("# {site_name}\n\n> {description}\n\nLanguage: {language}\n");

    for html_path in collect_html_files(site_dir)? {
        let rel = html_path
            .strip_prefix(site_dir)
            .unwrap_or(html_path.as_path());

        // Missing, unreadable, or invalid sidecars all mean "no metadata".
        let meta: serde_json::Map<String, serde_json::Value> =
            fs::read_to_string(html_path.with_extension("meta.json"))
                .ok()
                .and_then(|raw| serde_json::from_str(&raw).ok())
                .unwrap_or_default();
        if is_excluded_page(rel, &meta) {
            continue;
        }

        let Some(title) = meta
            .get("title")
            .and_then(serde_json::Value::as_str)
            .filter(|t| !t.is_empty())
        else {
            continue;
        };

        let url = format!("/{}", rel.to_string_lossy().replace('\\', "/"));
        let full_url = if canonical_root.is_empty() {
            url
        } else {
            format!("{canonical_root}{url}")
        };

        let html = fs::read_to_string(&html_path).unwrap_or_default();
        let trimmed = collapse_whitespace(&strip_html_tags(&extract_body(&html)));

        content.push_str(&format!("\n---\n\n## [{title}]({full_url})\n\n"));
        if !trimmed.is_empty() {
            content.push_str(&trimmed);
            content.push('\n');
        }
    }

    fs::write(site_dir.join("llms-full.txt"), content)?;
    log::info!("[ai] Generated llms-full.txt");
    Ok(())
}
/// Returns the text between the opening `<body ...>` tag and the first
/// `</body>` that follows it, matching both tags ASCII case-insensitively.
///
/// Without an opening tag the slice starts at the beginning of the input;
/// without a closing tag it runs to the end. A fragment with no body tags is
/// therefore returned whole.
fn extract_body(html: &str) -> String {
    // ASCII-only lowering keeps every byte offset identical to `html`.
    // Full Unicode lowering can change byte lengths (e.g. `İ`), which would
    // make offsets found in the lowered copy invalid for the original.
    let lower = html.to_ascii_lowercase();
    let start = lower
        .find("<body")
        .and_then(|open| lower[open..].find('>').map(|close| open + close + 1))
        .unwrap_or(0);
    // Search only after `start` so a stray earlier `</body>` cannot produce
    // an inverted range.
    let end = lower[start..]
        .find("</body>")
        .map_or(html.len(), |offset| start + offset);
    html[start..end].to_string()
}
/// Removes everything between `<` and `>` (inclusive) from `html`.
///
/// This is a plain character scan, not an HTML parser: a stray `>` outside a
/// tag is dropped too, and an unterminated `<` swallows the rest of the
/// input.
fn strip_html_tags(html: &str) -> String {
    let mut inside_tag = false;
    html.chars()
        .filter(|&ch| match ch {
            '<' => {
                inside_tag = true;
                false
            }
            '>' => {
                inside_tag = false;
                false
            }
            _ => !inside_tag,
        })
        .collect()
}
/// Trims `s` and replaces every run of whitespace with a single space.
fn collapse_whitespace(s: &str) -> String {
    s.split_whitespace().collect::<Vec<_>>().join(" ")
}
/// Counts `<img` tags that have no `alt=` attribute or an empty one
/// (`alt=""` or `alt=''`), ignoring letter case.
///
/// Each tag runs from `<img` to the next `>`, or to the end of the input if
/// the tag is unterminated. Scanning resumes after that point.
fn count_missing_alt(html: &str) -> usize {
    let lowered = html.to_lowercase();
    let mut remaining = lowered.as_str();
    let mut missing = 0;

    while let Some(open) = remaining.find("<img") {
        let from_tag = &remaining[open..];
        let (tag, tail) = match from_tag.find('>') {
            Some(close) => from_tag.split_at(close + 1),
            None => (from_tag, ""),
        };

        let described = tag.contains("alt=")
            && !tag.contains("alt=\"\"")
            && !tag.contains("alt=''");
        if !described {
            missing += 1;
        }
        remaining = tail;
    }
    missing
}
/// Lists the HTML files under `dir` by delegating to
/// `crate::walk::walk_files` with `"html"` as its second argument.
///
/// Any error from `walk_files` is returned unchanged.
// NOTE(review): the tests in this module expect the listing to be recursive,
// sorted, limited to `.html` files, and empty (not an error) for a missing
// directory — confirm against `walk_files`.
fn collect_html_files(dir: &Path) -> Result<Vec<PathBuf>> {
crate::walk::walk_files(dir, "html")
}
#[cfg(test)]
mod tests {
#![allow(clippy::unwrap_used, clippy::expect_used)]
use super::*;
use crate::cmd::SsgConfig;
use crate::test_support::init_logger;
use std::path::PathBuf;
use tempfile::{tempdir, TempDir};
/// Test fixture: initialises logging, creates a temp directory containing an
/// empty `site/` subdirectory, and builds a `PluginContext` whose site
/// directory is that subdirectory (the temp root is passed for every other
/// path argument).
///
/// The `TempDir` is returned alongside the site path and context so the
/// caller holds on to it for the duration of the test.
fn make_site() -> (TempDir, PathBuf, PluginContext) {
init_logger();
let dir = tempdir().expect("create tempdir");
let site = dir.path().join("site");
fs::create_dir_all(&site).expect("mkdir site");
let ctx = PluginContext::new(dir.path(), dir.path(), &site, dir.path());
(dir, site, ctx)
}
/// Test fixture: writes a minimal HTML page at `site/rel_path` together with
/// its `*.meta.json` sidecar, creating parent directories as needed.
///
/// `title` and `description` appear both in the HTML and in the sidecar.
/// `extra_fields`, when non-empty, is appended verbatim inside the sidecar's
/// JSON object (e.g. `"draft": true`).
fn write_page(
site: &Path,
rel_path: &str,
title: &str,
description: &str,
extra_fields: &str,
) {
let html_path = site.join(rel_path);
if let Some(parent) = html_path.parent() {
fs::create_dir_all(parent).unwrap();
}
let html = format!(
"<html><head><title>{title}</title></head>\
<body><h1>{title}</h1><p>{description}</p></body></html>"
);
fs::write(&html_path, html).unwrap();
// The sidecar JSON is assembled by string interpolation with no escaping,
// so callers must pass values that are already JSON-safe.
let mut sidecar_json =
format!(r#"{{"title": "{title}", "description": "{description}""#);
if !extra_fields.is_empty() {
sidecar_json.push_str(", ");
sidecar_json.push_str(extra_fields);
}
sidecar_json.push('}');
fs::write(html_path.with_extension("meta.json"), sidecar_json).unwrap();
}
#[test]
fn ai_plugin_is_copy_after_move() {
let plugin = AiPlugin;
let _copy = plugin;
assert_eq!(plugin.name(), "ai");
}
#[test]
fn name_returns_static_ai_identifier() {
assert_eq!(AiPlugin.name(), "ai");
}
#[test]
fn count_missing_alt_table_driven() {
let cases: &[(&str, usize, &str)] = &[
(
r#"<img src="a.jpg" alt="ok">"#,
0,
"alt present and non-empty",
),
(r#"<img src="a.jpg">"#, 1, "no alt attribute at all"),
(r#"<img src="a.jpg" alt="">"#, 1, "empty double-quoted alt"),
(r#"<img src="a.jpg" alt=''>"#, 1, "empty single-quoted alt"),
(
r#"<img src="a.jpg"><img src="b.jpg" alt="ok">"#,
1,
"first missing, second ok",
),
(
r#"<img src="a.jpg"><img src="b.jpg">"#,
2,
"both missing — sequential scan progresses",
),
("", 0, "empty input → zero"),
("<p>no images here</p>", 0, "no <img> tags at all"),
(r#"<IMG SRC="a.jpg" ALT="ok">"#, 0, "case-insensitive ALT"),
(r#"<IMG SRC="a.jpg">"#, 1, "uppercase tag, no alt"),
];
for (input, expected, comment) in cases {
assert_eq!(
count_missing_alt(input),
*expected,
"{comment}: count_missing_alt({input:?})"
);
}
}
#[test]
fn count_missing_alt_unterminated_tag_does_not_panic() {
let result = count_missing_alt("<img src=foo");
assert!(result <= 1);
}
#[test]
fn test_parse_robots_disallow() {
let dir = tempdir().expect("tempdir");
fs::write(
dir.path().join("robots.txt"),
"User-agent: *\nDisallow: /admin/\nDisallow: /private/\nAllow: /\n",
)
.unwrap();
let result = parse_robots_disallow(dir.path());
assert_eq!(result, vec!["/admin/", "/private/"]);
}
#[test]
fn test_parse_robots_disallow_empty_file() {
let dir = tempdir().expect("tempdir");
fs::write(dir.path().join("robots.txt"), "").unwrap();
let result = parse_robots_disallow(dir.path());
assert!(result.is_empty());
}
#[test]
fn test_parse_robots_disallow_no_disallow_lines() {
let dir = tempdir().expect("tempdir");
fs::write(
dir.path().join("robots.txt"),
"User-agent: *\nAllow: /\nSitemap: https://example.com/sitemap.xml\n",
)
.unwrap();
let result = parse_robots_disallow(dir.path());
assert!(result.is_empty());
}
#[test]
fn test_parse_robots_disallow_multiple_user_agents() {
let dir = tempdir().expect("tempdir");
fs::write(
dir.path().join("robots.txt"),
"User-agent: Googlebot\nDisallow: /nogoogle/\n\n\
User-agent: *\nDisallow: /secret/\n",
)
.unwrap();
let result = parse_robots_disallow(dir.path());
assert_eq!(result, vec!["/nogoogle/", "/secret/"]);
}
#[test]
fn test_parse_robots_disallow_missing_file() {
let dir = tempdir().expect("tempdir");
let result = parse_robots_disallow(dir.path());
assert!(result.is_empty());
}
#[test]
fn test_parse_robots_disallow_empty_pattern_skipped() {
let dir = tempdir().expect("tempdir");
fs::write(
dir.path().join("robots.txt"),
"User-agent: *\nDisallow:\nDisallow: /blocked/\n",
)
.unwrap();
let result = parse_robots_disallow(dir.path());
assert_eq!(result, vec!["/blocked/"]);
}
#[test]
fn test_is_excluded_page_draft() {
let mut meta = serde_json::Map::new();
let _ = meta.insert("draft".to_string(), serde_json::Value::Bool(true));
assert!(is_excluded_page(Path::new("post.html"), &meta));
}
#[test]
fn test_is_excluded_page_draft_string() {
let mut meta = serde_json::Map::new();
let _ = meta.insert(
"draft".to_string(),
serde_json::Value::String("true".to_string()),
);
assert!(is_excluded_page(Path::new("post.html"), &meta));
}
#[test]
fn test_is_excluded_page_private() {
let mut meta = serde_json::Map::new();
let _ =
meta.insert("private".to_string(), serde_json::Value::Bool(true));
assert!(is_excluded_page(Path::new("post.html"), &meta));
}
#[test]
fn test_is_excluded_page_404() {
let meta = serde_json::Map::new();
assert!(is_excluded_page(Path::new("404.html"), &meta));
}
#[test]
fn test_is_excluded_page_normal() {
let mut meta = serde_json::Map::new();
let _ = meta.insert(
"title".to_string(),
serde_json::Value::String("Hello".to_string()),
);
assert!(!is_excluded_page(Path::new("index.html"), &meta));
}
#[test]
fn test_is_excluded_page_error_page() {
let meta = serde_json::Map::new();
assert!(is_excluded_page(Path::new("error500.html"), &meta));
}
#[test]
fn test_group_pages_by_section() {
let entries = vec![
(
"Home".to_string(),
"/index.html".to_string(),
"Welcome".to_string(),
),
(
"Post 1".to_string(),
"/blog/post1.html".to_string(),
"First".to_string(),
),
(
"Post 2".to_string(),
"/blog/post2.html".to_string(),
"Second".to_string(),
),
(
"API Ref".to_string(),
"/docs/api.html".to_string(),
"API docs".to_string(),
),
];
let grouped = group_pages_by_section(&entries);
assert_eq!(grouped.len(), 3);
assert!(grouped.contains_key("Pages"));
assert!(grouped.contains_key("Blog"));
assert!(grouped.contains_key("Docs"));
assert_eq!(grouped["Pages"].len(), 1);
assert_eq!(grouped["Blog"].len(), 2);
assert_eq!(grouped["Docs"].len(), 1);
}
#[test]
fn test_group_pages_by_section_root_only() {
let entries = vec![
(
"About".to_string(),
"/about.html".to_string(),
String::new(),
),
(
"Contact".to_string(),
"/contact.html".to_string(),
String::new(),
),
];
let grouped = group_pages_by_section(&entries);
assert_eq!(grouped.len(), 1);
assert_eq!(grouped["Pages"].len(), 2);
}
#[test]
fn test_group_pages_by_section_deterministic_order() {
let entries = vec![
("Z".to_string(), "/zebra/z.html".to_string(), String::new()),
("A".to_string(), "/alpha/a.html".to_string(), String::new()),
("M".to_string(), "/middle/m.html".to_string(), String::new()),
];
let grouped = group_pages_by_section(&entries);
let keys: Vec<&String> = grouped.keys().collect();
assert_eq!(keys, vec!["Alpha", "Middle", "Zebra"]);
}
#[test]
fn generate_llms_txt_with_full_config_includes_all_fields() {
let dir = tempdir().expect("tempdir");
let config = SsgConfig {
site_name: "My Site".to_string(),
site_description: "A great site".to_string(),
base_url: "https://example.com".to_string(),
language: "en".to_string(),
..Default::default()
};
generate_llms_txt(dir.path(), Some(&config)).unwrap();
let body = fs::read_to_string(dir.path().join("llms.txt")).unwrap();
assert!(body.contains("# My Site"));
assert!(body.contains("> A great site"));
assert!(body.contains("Language: en"));
}
#[test]
fn generate_llms_txt_without_config_uses_defaults() {
let dir = tempdir().expect("tempdir");
generate_llms_txt(dir.path(), None).unwrap();
let body = fs::read_to_string(dir.path().join("llms.txt")).unwrap();
assert!(body.contains("# Site"));
assert!(body.contains("Language: en"));
}
#[test]
fn generate_llms_txt_strips_trailing_slash_from_base_url() {
let dir = tempdir().expect("tempdir");
let config = SsgConfig {
site_name: "S".to_string(),
site_description: "D".to_string(),
base_url: "https://example.com/".to_string(),
..Default::default()
};
write_page(dir.path(), "index.html", "Home", "Welcome", "");
generate_llms_txt(dir.path(), Some(&config)).unwrap();
let body = fs::read_to_string(dir.path().join("llms.txt")).unwrap();
assert!(
!body.contains("//index.html"),
"trailing slash should be normalised:\n{body}"
);
}
#[test]
fn generate_llms_txt_into_missing_parent_returns_err() {
let bogus = Path::new("/this/path/should/not/exist");
assert!(generate_llms_txt(bogus, None).is_err());
}
#[test]
fn test_llms_txt_contains_language() {
let dir = tempdir().expect("tempdir");
let config = SsgConfig {
language: "fr".to_string(),
..Default::default()
};
generate_llms_txt(dir.path(), Some(&config)).unwrap();
let body = fs::read_to_string(dir.path().join("llms.txt")).unwrap();
assert!(
body.contains("Language: fr"),
"llms.txt must include Language field:\n{body}"
);
}
#[test]
fn test_llms_txt_contains_language_defaults_to_en() {
let dir = tempdir().expect("tempdir");
let config = SsgConfig {
language: String::new(),
..Default::default()
};
generate_llms_txt(dir.path(), Some(&config)).unwrap();
let body = fs::read_to_string(dir.path().join("llms.txt")).unwrap();
assert!(
body.contains("Language: en"),
"empty language should default to en:\n{body}"
);
}
#[test]
fn test_llms_txt_excludes_drafts() {
let dir = tempdir().expect("tempdir");
write_page(dir.path(), "published.html", "Published", "Visible", "");
write_page(
dir.path(),
"draft.html",
"Draft Post",
"Hidden",
r#""draft": true"#,
);
generate_llms_txt(dir.path(), None).unwrap();
let body = fs::read_to_string(dir.path().join("llms.txt")).unwrap();
assert!(
body.contains("Published"),
"published page must appear:\n{body}"
);
assert!(
!body.contains("Draft Post"),
"draft page must be excluded:\n{body}"
);
}
#[test]
fn test_llms_txt_excludes_private() {
let dir = tempdir().expect("tempdir");
write_page(dir.path(), "public.html", "Public", "Visible", "");
write_page(
dir.path(),
"secret.html",
"Secret",
"Hidden",
r#""private": true"#,
);
generate_llms_txt(dir.path(), None).unwrap();
let body = fs::read_to_string(dir.path().join("llms.txt")).unwrap();
assert!(
!body.contains("Secret"),
"private page must be excluded:\n{body}"
);
}
#[test]
fn test_llms_txt_excludes_404() {
let dir = tempdir().expect("tempdir");
write_page(dir.path(), "index.html", "Home", "Welcome", "");
write_page(dir.path(), "404.html", "Not Found", "Error page", "");
generate_llms_txt(dir.path(), None).unwrap();
let body = fs::read_to_string(dir.path().join("llms.txt")).unwrap();
assert!(
!body.contains("Not Found"),
"404 page must be excluded:\n{body}"
);
}
#[test]
fn test_llms_txt_contains_sections() {
let dir = tempdir().expect("tempdir");
write_page(dir.path(), "index.html", "Home", "Welcome", "");
write_page(dir.path(), "blog/post.html", "My Post", "A blog post", "");
write_page(
dir.path(),
"docs/api.html",
"API Docs",
"API reference",
"",
);
generate_llms_txt(dir.path(), None).unwrap();
let body = fs::read_to_string(dir.path().join("llms.txt")).unwrap();
assert!(
body.contains("## Pages"),
"should have Pages section:\n{body}"
);
assert!(
body.contains("## Blog"),
"should have Blog section:\n{body}"
);
assert!(
body.contains("## Docs"),
"should have Docs section:\n{body}"
);
assert!(
body.contains("- [My Post]"),
"should contain page link:\n{body}"
);
}
#[test]
fn test_llms_txt_contains_disallow_section() {
let dir = tempdir().expect("tempdir");
fs::write(
dir.path().join("robots.txt"),
"User-agent: *\nDisallow: /admin/\n",
)
.unwrap();
generate_llms_txt(dir.path(), None).unwrap();
let body = fs::read_to_string(dir.path().join("llms.txt")).unwrap();
assert!(
body.contains("## Disallow"),
"should have Disallow section:\n{body}"
);
assert!(
body.contains("- /admin/"),
"should contain disallow pattern:\n{body}"
);
}
#[test]
fn test_llms_txt_no_disallow_without_robots() {
let dir = tempdir().expect("tempdir");
generate_llms_txt(dir.path(), None).unwrap();
let body = fs::read_to_string(dir.path().join("llms.txt")).unwrap();
assert!(
!body.contains("## Disallow"),
"no robots.txt means no Disallow section:\n{body}"
);
}
#[test]
fn test_llms_full_txt_contains_body_content() {
let dir = tempdir().expect("tempdir");
write_page(dir.path(), "index.html", "Home", "Welcome home", "");
generate_llms_full_txt(dir.path(), None).unwrap();
let body =
fs::read_to_string(dir.path().join("llms-full.txt")).unwrap();
assert!(body.contains("# Site"), "header present:\n{body}");
assert!(body.contains("Language: en"), "language present:\n{body}");
assert!(body.contains("## [Home]"), "page title present:\n{body}");
assert!(body.contains("Welcome home"), "body text present:\n{body}");
}
#[test]
fn test_llms_full_txt_excludes_drafts() {
let dir = tempdir().expect("tempdir");
write_page(dir.path(), "ok.html", "Visible", "Content", "");
write_page(
dir.path(),
"hidden.html",
"Hidden",
"Secret",
r#""draft": true"#,
);
generate_llms_full_txt(dir.path(), None).unwrap();
let body =
fs::read_to_string(dir.path().join("llms-full.txt")).unwrap();
assert!(body.contains("Visible"), "published page present:\n{body}");
assert!(!body.contains("Hidden"), "draft excluded:\n{body}");
}
#[test]
fn test_llms_full_txt_excludes_404() {
let dir = tempdir().expect("tempdir");
write_page(dir.path(), "index.html", "Home", "Welcome", "");
write_page(dir.path(), "404.html", "Not Found", "Error", "");
generate_llms_full_txt(dir.path(), None).unwrap();
let body =
fs::read_to_string(dir.path().join("llms-full.txt")).unwrap();
assert!(!body.contains("Not Found"), "404 excluded:\n{body}");
}
#[test]
fn test_strip_html_tags() {
assert_eq!(strip_html_tags("<p>hello</p>"), "hello");
assert_eq!(strip_html_tags("<div><b>bold</b> text</div>"), "bold text");
assert_eq!(strip_html_tags("no tags"), "no tags");
assert_eq!(strip_html_tags(""), "");
}
#[test]
fn test_extract_body() {
let html =
"<html><head><title>T</title></head><body>Content</body></html>";
assert_eq!(extract_body(html), "Content");
}
#[test]
fn test_extract_body_with_attributes() {
let html = "<html><body class=\"main\">Content</body></html>";
assert_eq!(extract_body(html), "Content");
}
#[test]
fn test_extract_body_no_body_tag() {
let html = "<p>Just a fragment</p>";
assert_eq!(extract_body(html), html);
}
#[test]
fn test_collapse_whitespace() {
assert_eq!(collapse_whitespace(" hello world "), "hello world");
assert_eq!(collapse_whitespace("no extra"), "no extra");
assert_eq!(collapse_whitespace(""), "");
}
#[test]
fn test_titlecase_word() {
assert_eq!(titlecase_word("blog"), "Blog");
assert_eq!(titlecase_word("DOCS"), "Docs");
assert_eq!(titlecase_word(""), "");
assert_eq!(titlecase_word("a"), "A");
}
#[test]
fn after_compile_missing_site_dir_returns_ok_without_writing() {
let dir = tempdir().expect("tempdir");
let missing = dir.path().join("missing");
let ctx =
PluginContext::new(dir.path(), dir.path(), &missing, dir.path());
AiPlugin.after_compile(&ctx).expect("missing site is fine");
assert!(!missing.exists());
assert!(!dir.path().join("llms.txt").exists());
}
#[test]
fn after_compile_injects_max_snippet_meta_tag() {
let (_tmp, site, ctx) = make_site();
let html = "<html><head><title>X</title></head><body></body></html>";
fs::write(site.join("index.html"), html).unwrap();
AiPlugin.after_compile(&ctx).unwrap();
let output = fs::read_to_string(site.join("index.html")).unwrap();
assert!(output.contains("max-snippet"));
assert!(output.contains("max-image-preview:large"));
}
#[test]
fn after_compile_creates_llms_txt_in_site_root() {
let (_tmp, site, ctx) = make_site();
AiPlugin.after_compile(&ctx).unwrap();
assert!(site.join("llms.txt").exists());
}
#[test]
fn after_compile_creates_llms_full_txt_in_site_root() {
let (_tmp, site, ctx) = make_site();
AiPlugin.after_compile(&ctx).unwrap();
assert!(site.join("llms-full.txt").exists());
}
#[test]
fn after_compile_idempotent_does_not_duplicate_meta_tag() {
let (_tmp, site, ctx) = make_site();
let html = "<html><head><title>X</title></head><body></body></html>";
fs::write(site.join("index.html"), html).unwrap();
AiPlugin.after_compile(&ctx).unwrap();
AiPlugin.after_compile(&ctx).unwrap();
let output = fs::read_to_string(site.join("index.html")).unwrap();
assert_eq!(output.matches("max-snippet").count(), 1);
}
#[test]
fn after_compile_skips_html_files_without_head_tag() {
let (_tmp, site, ctx) = make_site();
fs::write(site.join("fragment.html"), "<p>just a fragment</p>")
.unwrap();
AiPlugin.after_compile(&ctx).unwrap();
let output = fs::read_to_string(site.join("fragment.html")).unwrap();
assert!(!output.contains("max-snippet"));
assert_eq!(output, "<p>just a fragment</p>");
}
#[test]
fn after_compile_processes_files_in_subdirectories() {
let (_tmp, site, ctx) = make_site();
let nested = site.join("blog");
fs::create_dir_all(&nested).unwrap();
fs::write(
nested.join("post.html"),
"<html><head></head><body></body></html>",
)
.unwrap();
AiPlugin.after_compile(&ctx).unwrap();
let output = fs::read_to_string(nested.join("post.html")).unwrap();
assert!(output.contains("max-snippet"));
}
#[test]
fn after_compile_logs_warning_for_pages_with_missing_alt() {
let (_tmp, site, ctx) = make_site();
fs::write(
site.join("bad.html"),
r#"<html><head></head><body><img src="a.jpg"></body></html>"#,
)
.unwrap();
fs::write(
site.join("worse.html"),
r#"<html><head></head><body><img src="a.jpg" alt=""></body></html>"#,
)
.unwrap();
AiPlugin.after_compile(&ctx).unwrap();
let bad = fs::read_to_string(site.join("bad.html")).unwrap();
assert!(bad.contains("max-snippet"));
}
/// A page that already carries a `max-snippet` directive must be left alone:
/// same content and same modification time, i.e. no write took place.
#[test]
fn after_compile_does_not_rewrite_unchanged_files() {
    let (_tmp, site, ctx) = make_site();
    let page = site.join("index.html");
    let html = "<html><head><meta name=\"robots\" content=\"max-snippet:-1\"></head><body></body></html>";
    fs::write(&page, html).unwrap();
    let original_mtime = fs::metadata(&page).unwrap().modified().unwrap();

    AiPlugin.after_compile(&ctx).unwrap();

    let after = fs::read_to_string(&page).unwrap();
    assert_eq!(after, html, "unchanged file body must be preserved");
    // Reading does not touch the mtime, so any difference means a rewrite.
    let final_mtime = fs::metadata(&page).unwrap().modified().unwrap();
    assert_eq!(
        final_mtime, original_mtime,
        "unchanged file must not be rewritten"
    );
}
#[test]
fn collect_html_files_returns_empty_for_missing_directory() {
let dir = tempdir().expect("tempdir");
let result = collect_html_files(&dir.path().join("missing")).unwrap();
assert!(result.is_empty());
}
#[test]
fn collect_html_files_filters_non_html_extensions() {
let dir = tempdir().expect("tempdir");
fs::write(dir.path().join("a.html"), "").unwrap();
fs::write(dir.path().join("b.css"), "").unwrap();
fs::write(dir.path().join("c.js"), "").unwrap();
let result = collect_html_files(dir.path()).unwrap();
assert_eq!(result.len(), 1);
}
#[test]
fn collect_html_files_recurses_into_nested_subdirectories() {
let dir = tempdir().expect("tempdir");
let nested = dir.path().join("a").join("b");
fs::create_dir_all(&nested).unwrap();
fs::write(dir.path().join("top.html"), "").unwrap();
fs::write(nested.join("deep.html"), "").unwrap();
let result = collect_html_files(dir.path()).unwrap();
assert_eq!(result.len(), 2);
}
#[test]
fn collect_html_files_returns_results_sorted() {
let dir = tempdir().expect("tempdir");
for name in ["zebra.html", "apple.html", "mango.html"] {
fs::write(dir.path().join(name), "").unwrap();
}
let result = collect_html_files(dir.path()).unwrap();
let names: Vec<_> = result
.iter()
.map(|p| p.file_name().unwrap().to_str().unwrap())
.collect();
assert_eq!(names, vec!["apple.html", "mango.html", "zebra.html"]);
}
}