use crate::parser::Document;
use crate::site::SiteConfig;
use chrono::Utc;
use std::collections::{HashMap, HashSet};
use std::fs::File;
use std::io::Write;
use std::path::Path;
/// Builds the sitemap for `documents` and writes it to `sitemap.xml` inside
/// `output_dir`.
///
/// # Errors
///
/// Returns an error if the XML cannot be generated, or if the output file
/// cannot be created or written.
pub fn generate_sitemap(
    documents: &[Document],
    site_config: &SiteConfig,
    output_dir: &Path,
) -> Result<(), Box<dyn std::error::Error>> {
    // Group language variants first so the XML builder can look them up by base name.
    let groups = group_documents_by_base_name(documents);
    let xml = generate_sitemap_xml(documents, &groups, site_config)?;

    let target = output_dir.join("sitemap.xml");
    File::create(&target)?.write_all(xml.as_bytes())?;
    Ok(())
}
/// Renders the complete `<urlset>` document as a string.
///
/// When `site_config.base_url` is set, a home-page entry is emitted first; its
/// `<lastmod>` is the newest date among the includable documents, or the
/// current UTC time when none of them carries a date. After that, one entry is
/// emitted per distinct base name, in the order the base names first appear in
/// `documents`.
///
/// # Errors
///
/// Propagates any error returned while rendering an individual group entry.
fn generate_sitemap_xml(
    documents: &[Document],
    document_groups: &HashMap<String, Vec<&Document>>,
    site_config: &SiteConfig,
) -> Result<String, Box<dyn std::error::Error>> {
    let mut xml = String::new();
    xml.push_str("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
    xml.push_str("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\" xmlns:xhtml=\"http://www.w3.org/1999/xhtml\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd\">\n");

    // Home-page entry, only when the site has a base URL to point at.
    if let Some(ref base_url) = site_config.base_url {
        let home_loc = format!(" <loc>{}</loc>\n", escape_xml_url(base_url));
        let newest = documents
            .iter()
            .filter_map(|doc| {
                if should_include_in_sitemap(doc) {
                    doc.front_matter.date
                } else {
                    None
                }
            })
            .max()
            .unwrap_or_else(Utc::now);
        let home_lastmod = format!(" <lastmod>{}</lastmod>\n", newest.format("%Y-%m-%d"));

        xml.push_str(" <url>\n");
        xml.push_str(&home_loc);
        xml.push_str(&home_lastmod);
        xml.push_str(" <changefreq>weekly</changefreq>\n");
        xml.push_str(" <priority>1.0</priority>\n");
        xml.push_str(" </url>\n");
    }

    // One entry per base name; `insert` returns false for names already handled.
    let mut seen: HashSet<String> = HashSet::new();
    for document in documents {
        if !should_include_in_sitemap(document) {
            continue;
        }
        if !seen.insert(document.base_name.clone()) {
            continue;
        }
        let variants = match document_groups.get(&document.base_name) {
            Some(variants) => variants,
            None => continue,
        };
        xml.push_str(&generate_sitemap_entry_for_group(variants, site_config)?);
    }

    xml.push_str("</urlset>\n");
    Ok(xml)
}
/// Buckets the includable documents by their `base_name`.
///
/// Documents rejected by `should_include_in_sitemap` are left out entirely, so
/// every bucket in the returned map holds at least one document. Within a
/// bucket, documents keep the order they had in `documents`.
fn group_documents_by_base_name(documents: &[Document]) -> HashMap<String, Vec<&Document>> {
    let mut by_base_name: HashMap<String, Vec<&Document>> = HashMap::new();
    documents
        .iter()
        .filter(|doc| should_include_in_sitemap(doc))
        .for_each(|doc| {
            by_base_name
                .entry(doc.base_name.clone())
                .or_default()
                .push(doc);
        });
    by_base_name
}
/// Renders a single `<url>` element for a group of documents that share a
/// base name (i.e. the language variants of one page).
///
/// * `<loc>` points at the variant whose language is `"en"`, or at the first
///   variant when there is no English one.
/// * `<lastmod>` is the newest front-matter date among the variants and is
///   omitted when none of them has a date.
/// * `<priority>` is `0.8` when the canonical document is a post and `0.6`
///   otherwise; `<changefreq>` is always `monthly`.
/// * `xhtml:link` alternates are emitted only when the group has more than one
///   variant.
///
/// # Errors
///
/// Returns an error when `language_variants` is empty, since there is no
/// document to derive a canonical URL from.
fn generate_sitemap_entry_for_group(
    language_variants: &[&Document],
    site_config: &SiteConfig,
) -> Result<String, Box<dyn std::error::Error>> {
    // `first()` instead of `[0]`: an empty group becomes an error, not a panic.
    let canonical_doc = language_variants
        .iter()
        .find(|doc| doc.language == "en")
        .or_else(|| language_variants.first())
        .ok_or("cannot generate a sitemap entry for an empty document group")?;

    let mut entry = String::new();
    entry.push_str(" <url>\n");

    let canonical_url = generate_document_url(canonical_doc, site_config);
    entry.push_str(&format!(
        " <loc>{}</loc>\n",
        escape_xml_url(&canonical_url)
    ));

    let most_recent_date = language_variants
        .iter()
        .filter_map(|doc| doc.front_matter.date)
        .max();
    if let Some(date) = most_recent_date {
        entry.push_str(&format!(
            " <lastmod>{}</lastmod>\n",
            date.format("%Y-%m-%d")
        ));
    }

    // Posts and other pages share the change frequency; only the priority differs.
    entry.push_str(" <changefreq>monthly</changefreq>\n");
    if is_post(canonical_doc) {
        entry.push_str(" <priority>0.8</priority>\n");
    } else {
        entry.push_str(" <priority>0.6</priority>\n");
    }

    // Alternates are only meaningful when there is more than one language.
    if language_variants.len() > 1 {
        for variant in language_variants {
            let variant_url = generate_document_url(variant, site_config);
            entry.push_str(&format!(
                " <xhtml:link rel=\"alternate\" hreflang=\"{}\" href=\"{}\" />\n",
                &variant.language,
                escape_xml_url(&variant_url)
            ));
        }
    }

    entry.push_str(" </url>\n");
    Ok(entry)
}
/// Builds the URL of the rendered HTML page for `document`.
///
/// The document's `file_path` has its extension swapped for `html`. When the
/// site has a base URL, the path is appended to it with exactly one `/`
/// between them (trailing slashes on the base URL are trimmed); otherwise the
/// bare path is returned.
fn generate_document_url(document: &Document, site_config: &SiteConfig) -> String {
    let html_path = std::path::PathBuf::from(&document.file_path).with_extension("html");
    let relative = html_path.to_string_lossy();

    match site_config.base_url {
        Some(ref base_url) => format!("{}/{}", base_url.trim_end_matches('/'), relative),
        None => relative.to_string(),
    }
}
/// Reports whether `document` belongs in the sitemap.
///
/// A document is excluded only when its front matter marks it as a draft; a
/// missing `draft` flag counts as "not a draft".
fn should_include_in_sitemap(document: &Document) -> bool {
    let is_draft = document.front_matter.draft.unwrap_or(false);
    !is_draft
}
/// Reports whether `document` is a post.
///
/// A document counts as a post when its front-matter `layout` entry is the
/// string `"post"`, or when its file path starts with `posts/`.
fn is_post(document: &Document) -> bool {
    let layout = document
        .front_matter
        .extra
        .get("layout")
        .and_then(|value| value.as_str());

    if layout == Some("post") {
        return true;
    }
    document.file_path.starts_with("posts/")
}
/// Entity-escapes `text` so it can be embedded in XML element content or in a
/// double- or single-quoted attribute value.
///
/// The five characters with special meaning in XML are replaced by their
/// predefined entities: `&` → `&amp;`, `<` → `&lt;`, `>` → `&gt;`,
/// `"` → `&quot;` and `'` → `&apos;`. Every other character is copied
/// unchanged. The input is treated as raw text: an `&` that already starts an
/// entity is escaped again.
fn escape_xml_url(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    // Single pass, so the `&` introduced by one replacement is never re-escaped.
    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&apos;"),
            other => escaped.push(other),
        }
    }
    escaped
}