systemprompt-generator 0.2.2

Static site generation, theme rendering, and asset bundling for systemprompt.io AI governance dashboards. Handlebars and Markdown pipeline for the MCP governance platform.
Documentation
use anyhow::{Context, Result, anyhow};
use chrono::Utc;
use std::path::Path;
use systemprompt_content::ContentRepository;
use systemprompt_database::DbPool;
use systemprompt_identifiers::SourceId;
use systemprompt_models::{AppPaths, Config, ContentConfigRaw, ContentSourceConfigRaw};
use tokio::fs;

use super::xml::{SitemapUrl, build_sitemap_index, build_sitemap_xml};

/// Chunk size used when splitting the collected URLs across sitemap files.
/// NOTE(review): 50,000 matches the per-file URL limit of the sitemaps.org protocol — confirm
/// that is the intent.
const MAX_URLS_PER_SITEMAP: usize = 50_000;
/// Token inside a source's `url_pattern` that is replaced with each content item's slug.
const SLUG_PLACEHOLDER: &str = "{slug}";

/// Everything a sitemap generation run needs, gathered once by `load_sitemap_context`.
struct SitemapContext {
    // Content configuration parsed from the content config YAML file.
    config: ContentConfigRaw,
    // Database pool handed to the content repository when fetching URLs.
    db_pool: DbPool,
    // Prefix prepended to every relative URL; taken from the global `api_external_url`.
    base_url: String,
    // Web dist directory that receives `sitemap.xml` (and `sitemaps/` when split).
    web_dir: std::path::PathBuf,
}

/// Builds the sitemap for every enabled content source and writes it into the web dist
/// directory.
///
/// # Errors
///
/// Propagates any failure from loading the context, collecting URLs from the database, or
/// writing the sitemap files.
pub async fn generate_sitemap(db_pool: DbPool) -> Result<()> {
    let context = load_sitemap_context(db_pool).await?;
    let sitemap_urls = collect_sitemap_urls(&context).await?;
    let total = sitemap_urls.len();

    write_sitemap_files(&context.web_dir, &sitemap_urls, &context.base_url).await?;

    tracing::info!(url_count = total, "Sitemap generation completed");
    Ok(())
}

/// Loads the configuration and paths required for a sitemap generation run.
///
/// Reads and parses the content config YAML, then resolves the web dist directory and the
/// external base URL. `AppPaths::get()` is resolved once and reused for both the config path
/// and the dist directory.
///
/// # Errors
///
/// Fails if the global config or application paths are unavailable, or if the content config
/// file cannot be read or parsed.
async fn load_sitemap_context(db_pool: DbPool) -> Result<SitemapContext> {
    let global_config = Config::get()?;
    let paths = AppPaths::get().map_err(|e| anyhow!("{}", e))?;
    let config_path = paths.system().content_config();

    let yaml_content = fs::read_to_string(&config_path)
        .await
        .context("Failed to read content config")?;

    let config: ContentConfigRaw =
        serde_yaml::from_str(&yaml_content).context("Failed to parse content config")?;

    // Reuse the already-resolved paths instead of looking them up a second time.
    let web_dir = paths.web().dist().to_path_buf();
    let base_url = global_config.api_external_url.clone();

    tracing::debug!(base_url = %base_url, "Using base URL");

    Ok(SitemapContext {
        config,
        db_pool,
        base_url,
        web_dir,
    })
}

/// Gathers the sitemap entries of every configured content source into one list.
///
/// Sources are processed one after another; the first failing source aborts the collection.
async fn collect_sitemap_urls(ctx: &SitemapContext) -> Result<Vec<SitemapUrl>> {
    let mut collected = Vec::new();

    for (name, source_cfg) in &ctx.config.content_sources {
        collected.extend(collect_source_urls(ctx, name, source_cfg).await?);
    }

    Ok(collected)
}

/// Collects the sitemap entries for a single content source.
///
/// Returns an empty list when the source or its sitemap section is disabled. Otherwise returns
/// one entry per content item found in the database, followed by the parent-route entry when
/// one is configured and enabled.
///
/// # Errors
///
/// Fails if the content for this source cannot be fetched; the error names the source.
async fn collect_source_urls(
    ctx: &SitemapContext,
    source_name: &str,
    source: &ContentSourceConfigRaw,
) -> Result<Vec<SitemapUrl>> {
    let Some(sitemap_config) = get_enabled_sitemap_config(source) else {
        return Ok(Vec::new());
    };

    tracing::debug!(source = %source_name, "Processing source");

    let mut urls = fetch_urls_from_database(FetchParams {
        db_pool: &ctx.db_pool,
        source_id: source.source_id.as_str(),
        url_pattern: &sitemap_config.url_pattern,
        priority: sitemap_config.priority,
        changefreq: &sitemap_config.changefreq,
        base_url: &ctx.base_url,
    })
    .await
    // Build the message lazily so the success path does not allocate a string.
    .with_context(|| format!("Failed to fetch URLs for {source_name}"))?;

    urls.extend(build_parent_url(sitemap_config, &ctx.base_url));
    Ok(urls)
}

/// Returns the source's sitemap configuration only when both the source itself and its
/// sitemap section are enabled; otherwise `None`.
fn get_enabled_sitemap_config(
    source: &ContentSourceConfigRaw,
) -> Option<&systemprompt_models::SitemapConfig> {
    if source.enabled {
        match source.sitemap.as_ref() {
            Some(cfg) if cfg.enabled => Some(cfg),
            _ => None,
        }
    } else {
        None
    }
}

/// Builds the sitemap entry for a source's parent route, if one is configured and enabled.
///
/// The entry's `lastmod` is today's date (UTC) at generation time.
fn build_parent_url(
    sitemap_config: &systemprompt_models::SitemapConfig,
    base_url: &str,
) -> Option<SitemapUrl> {
    sitemap_config
        .parent_route
        .as_ref()
        .filter(|route| route.enabled)
        .map(|route| SitemapUrl {
            loc: format!("{}{}", base_url, route.url),
            lastmod: Utc::now().format("%Y-%m-%d").to_string(),
            changefreq: route.changefreq.clone(),
            priority: route.priority,
        })
}

/// Writes the collected URLs to disk as one sitemap, or as numbered sitemaps plus an index.
///
/// When the URLs fit into one file (at most `MAX_URLS_PER_SITEMAP`, including the empty case)
/// a single `sitemap.xml` is written straight from the input slice. Larger sets are split into
/// chunks of `MAX_URLS_PER_SITEMAP` and written as numbered files with a sitemap index.
///
/// # Errors
///
/// Propagates any failure from the underlying file writes.
async fn write_sitemap_files(
    web_dir: &Path,
    all_urls: &[SitemapUrl],
    base_url: &str,
) -> Result<()> {
    // Deciding on the length (rather than the chunk count) keeps an empty URL list on the
    // single-file path instead of producing an empty index and a `sitemaps/` directory, and
    // avoids cloning every URL in the common case.
    if all_urls.len() <= MAX_URLS_PER_SITEMAP {
        return write_single_sitemap(web_dir, all_urls).await;
    }

    let sitemap_chunks: Vec<Vec<_>> = all_urls
        .chunks(MAX_URLS_PER_SITEMAP)
        .map(<[_]>::to_vec)
        .collect();

    write_multiple_sitemaps(web_dir, &sitemap_chunks, base_url).await
}

/// Renders `urls` as one sitemap document and writes it to `<web_dir>/sitemap.xml`.
///
/// # Errors
///
/// Fails if the file cannot be written; the error names the target path.
async fn write_single_sitemap(web_dir: &Path, urls: &[SitemapUrl]) -> Result<()> {
    let sitemap_xml = build_sitemap_xml(urls);
    let path = web_dir.join("sitemap.xml");
    fs::write(&path, sitemap_xml)
        .await
        .with_context(|| format!("Failed to write sitemap file {}", path.display()))?;

    tracing::debug!(url_count = urls.len(), "Generated sitemap.xml");
    Ok(())
}

/// Writes each chunk as a numbered sitemap under `<web_dir>/sitemaps/` and then writes the
/// sitemap index that references them.
///
/// # Errors
///
/// Fails if the `sitemaps` directory cannot be created (the error names the directory) or if
/// any sitemap or the index cannot be written.
async fn write_multiple_sitemaps(
    web_dir: &Path,
    chunks: &[Vec<SitemapUrl>],
    base_url: &str,
) -> Result<()> {
    let sitemap_dir = web_dir.join("sitemaps");
    fs::create_dir_all(&sitemap_dir).await.with_context(|| {
        format!(
            "Failed to create sitemap directory {}",
            sitemap_dir.display()
        )
    })?;

    for (idx, chunk) in chunks.iter().enumerate() {
        write_numbered_sitemap(&sitemap_dir, idx, chunk).await?;
    }

    write_sitemap_index(web_dir, chunks, base_url).await
}

/// Writes one chunk of URLs to `sitemap-<idx + 1>.xml` inside `sitemap_dir`.
///
/// `idx` is the zero-based chunk index; file names are numbered from 1.
///
/// # Errors
///
/// Fails if the file cannot be written; the error names the target path.
async fn write_numbered_sitemap(sitemap_dir: &Path, idx: usize, urls: &[SitemapUrl]) -> Result<()> {
    let filename = format!("sitemap-{}.xml", idx + 1);
    let sitemap_xml = build_sitemap_xml(urls);
    let path = sitemap_dir.join(&filename);
    fs::write(&path, sitemap_xml)
        .await
        .with_context(|| format!("Failed to write sitemap file {}", path.display()))?;
    tracing::debug!(filename = %filename, url_count = urls.len(), "Generated sitemap file");
    Ok(())
}

/// Renders the sitemap index for `chunks` and writes it to `<web_dir>/sitemap.xml`.
///
/// # Errors
///
/// Fails if the index file cannot be written; the error names the target path.
async fn write_sitemap_index(
    web_dir: &Path,
    chunks: &[Vec<SitemapUrl>],
    base_url: &str,
) -> Result<()> {
    let index_xml = build_sitemap_index(chunks, base_url);
    let path = web_dir.join("sitemap.xml");
    fs::write(&path, index_xml)
        .await
        .with_context(|| format!("Failed to write sitemap index {}", path.display()))?;
    tracing::debug!(file_count = chunks.len(), "Generated sitemap index");
    Ok(())
}

/// Borrowed inputs for turning one content source's database rows into sitemap entries.
struct FetchParams<'a> {
    // Pool used to construct the content repository.
    db_pool: &'a DbPool,
    // Identifier of the content source whose items are listed.
    source_id: &'a str,
    // Relative URL template; `SLUG_PLACEHOLDER` is replaced with each item's slug.
    url_pattern: &'a str,
    // Priority copied onto every generated entry.
    priority: f32,
    // Change frequency copied onto every generated entry.
    changefreq: &'a str,
    // Prefix prepended to the relative URL to form the absolute `loc`.
    base_url: &'a str,
}

/// Lists the content items of one source and maps each of them to a sitemap entry.
///
/// # Errors
///
/// Fails if the content repository cannot be created or the content listing query fails.
async fn fetch_urls_from_database(params: FetchParams<'_>) -> Result<Vec<SitemapUrl>> {
    let repository = ContentRepository::new(params.db_pool)
        .map_err(|e| anyhow!("{}", e))
        .context("Failed to create content repository")?;

    let items = repository
        .list_by_source(&SourceId::new(params.source_id))
        .await
        .context("Failed to fetch content for sitemap")?;

    let entries: Vec<SitemapUrl> = items
        .iter()
        .map(|item| build_sitemap_url_from_content(item, &params))
        .collect();

    Ok(entries)
}

/// Converts one content item into its sitemap entry.
///
/// The location is `base_url` followed by the URL pattern with the slug placeholder replaced
/// by the item's slug; `lastmod` is the item's publication date formatted as `YYYY-MM-DD`.
fn build_sitemap_url_from_content(
    content: &systemprompt_content::models::Content,
    params: &FetchParams<'_>,
) -> SitemapUrl {
    let slugged_path = params.url_pattern.replace(SLUG_PLACEHOLDER, &content.slug);

    SitemapUrl {
        loc: format!("{}{}", params.base_url, slugged_path),
        lastmod: content.published_at.format("%Y-%m-%d").to_string(),
        changefreq: params.changefreq.to_string(),
        priority: params.priority,
    }
}