// halldyll-core 0.1.0

// Core scraping engine for Halldyll - high-performance async web scraper for AI agents
// Documentation
//! Sitemap index - parsing of sitemap index documents and common sitemap locations

use quick_xml::events::Event;
use quick_xml::Reader;
use url::Url;

/// One `<sitemap>` entry read from a sitemap index document.
///
/// Only entries whose `<loc>` text parses as a [`Url`] are ever constructed
/// by [`SitemapIndex::parse`].
#[derive(Debug, Clone)]
pub struct SitemapIndexEntry {
    /// Location of the referenced sitemap, taken from the entry's `<loc>` text.
    pub loc: Url,
    /// Text of the entry's `<lastmod>` element, if one was present.
    ///
    /// Stored as the raw string; it is not parsed or validated as a date.
    pub lastmod: Option<String>,
}

/// Sitemap index parser.
///
/// A unit struct with no fields: it carries no configuration and keeps no
/// state between calls.
pub struct SitemapIndex;

impl Default for SitemapIndex {
    fn default() -> Self {
        Self::new()
    }
}

impl SitemapIndex {
    /// Creates a new parser.
    ///
    /// The parser holds no state, so a single instance can be reused for any
    /// number of documents.
    pub fn new() -> Self {
        Self
    }

    /// Parses a sitemap index document and returns its `<sitemap>` entries.
    ///
    /// Element names are compared by their local name, so namespace-prefixed
    /// documents (`<sm:sitemap>`, `<sm:loc>`, ...) are handled the same way as
    /// unprefixed ones.
    ///
    /// Parsing is best-effort:
    /// - an entry without a `<loc>`, or whose `<loc>` text is not a valid
    ///   [`Url`], is skipped;
    /// - `<lastmod>` is kept as raw text and is not validated;
    /// - only text events are read, so a value wrapped in a CDATA section is
    ///   not picked up;
    /// - on the first XML error, parsing stops and the entries collected so
    ///   far are returned.
    pub fn parse(&self, xml: &str) -> Vec<SitemapIndexEntry> {
        let mut entries = Vec::new();
        let mut reader = Reader::from_str(xml);
        reader.trim_text(true);

        // `Some` exactly while the reader is inside a `<sitemap>` element.
        let mut current_entry: Option<PartialIndexEntry> = None;
        // Local name of the most recently opened element that has not been
        // followed by an end tag yet; empty otherwise.
        let mut current_tag = String::new();

        loop {
            match reader.read_event() {
                Ok(Event::Start(ref e)) => {
                    current_tag = String::from_utf8_lossy(e.local_name().as_ref()).into_owned();
                    if current_tag == "sitemap" {
                        current_entry = Some(PartialIndexEntry::default());
                    }
                }
                Ok(Event::Text(e)) => {
                    if let Some(entry) = current_entry.as_mut() {
                        // A failed unescape yields an empty string, which later
                        // fails URL parsing and causes the entry to be skipped.
                        let text = e.unescape().unwrap_or_default().into_owned();
                        match current_tag.as_str() {
                            "loc" => entry.loc = Some(text),
                            "lastmod" => entry.lastmod = Some(text),
                            _ => {}
                        }
                    }
                }
                Ok(Event::End(ref e)) => {
                    // Forget the tag so stray text following a closing tag is
                    // not attributed to the element that just ended.
                    current_tag.clear();

                    if String::from_utf8_lossy(e.local_name().as_ref()) == "sitemap" {
                        if let Some(entry) = current_entry.take() {
                            let parsed = entry.loc.as_deref().and_then(|s| Url::parse(s).ok());
                            if let Some(loc) = parsed {
                                entries.push(SitemapIndexEntry {
                                    loc,
                                    lastmod: entry.lastmod,
                                });
                            }
                        }
                    }
                }
                Ok(Event::Eof) => break,
                // Malformed XML: keep whatever was parsed before the error.
                Err(_) => break,
                _ => {}
            }
        }

        entries
    }

    /// Builds the conventional `/sitemap.xml` URL for the site of `base_url`.
    ///
    /// The result keeps the scheme, host and any explicit (non-default) port
    /// of `base_url`; path, query, fragment and credentials are dropped.
    ///
    /// Returns `None` when `base_url` has no host or the assembled URL does
    /// not parse.
    pub fn default_sitemap_url(base_url: &Url) -> Option<Url> {
        let host = base_url.host_str()?;
        // `host_str` never includes the port, so it has to be re-attached;
        // `port` is `None` when the URL uses the scheme's default port.
        let authority = match base_url.port() {
            Some(port) => format!("{}:{}", host, port),
            None => host.to_string(),
        };
        let sitemap_url = format!("{}://{}/sitemap.xml", base_url.scheme(), authority);
        Url::parse(&sitemap_url).ok()
    }

    /// Returns absolute paths at which sites commonly publish a sitemap.
    ///
    /// The paths are returned in a fixed order, starting with `/sitemap.xml`.
    pub fn common_sitemap_paths() -> Vec<&'static str> {
        vec![
            "/sitemap.xml",
            "/sitemap_index.xml",
            "/sitemap-index.xml",
            "/sitemaps.xml",
            "/sitemap/sitemap.xml",
            "/wp-sitemap.xml",
            "/post-sitemap.xml",
            "/page-sitemap.xml",
        ]
    }
}

/// Accumulates the fields of one `<sitemap>` element while it is being parsed.
///
/// Both fields start as `None` (via `Default`) and hold raw, unvalidated text.
#[derive(Default)]
struct PartialIndexEntry {
    /// Text of `<loc>`; parsed into a `Url` once the `<sitemap>` element closes.
    loc: Option<String>,
    /// Text of `<lastmod>`, carried over to the final entry unchanged.
    lastmod: Option<String>,
}