halldyll-core 0.1.0

Core scraping engine for Halldyll - high-performance async web scraper for AI agents
Documentation
//! Parser - sitemap.xml parsing

use quick_xml::events::Event;
use quick_xml::Reader;
use url::Url;

/// Sitemap entry
///
/// One `<url>` record of a sitemap. `loc` is the only mandatory field: an
/// entry is only produced by [`SitemapParser::parse`] when its location
/// parses as a valid [`Url`].
#[derive(Debug, Clone)]
pub struct SitemapEntry {
    /// Page URL
    pub loc: Url,
    /// Last modification date, kept as the raw text found in the sitemap
    /// (the parser does not interpret or validate the date format)
    pub lastmod: Option<String>,
    /// Change frequency; `None` when absent or not a recognised keyword
    pub changefreq: Option<ChangeFreq>,
    /// Priority (0.0 - 1.0); `None` when absent or not parseable as `f32`.
    /// The parser does not enforce the documented range.
    pub priority: Option<f32>,
    /// Associated images (left empty by [`SitemapParser::parse`])
    pub images: Vec<SitemapImage>,
    /// Associated videos (left empty by [`SitemapParser::parse`])
    pub videos: Vec<SitemapVideo>,
    /// Associated news (left as `None` by [`SitemapParser::parse`])
    pub news: Option<SitemapNews>,
}

/// Change frequency
///
/// The update cadence a sitemap declares for a page through its
/// `<changefreq>` element.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ChangeFreq {
    /// Always changing
    Always,
    /// Hourly updates
    Hourly,
    /// Daily updates
    Daily,
    /// Weekly updates
    Weekly,
    /// Monthly updates
    Monthly,
    /// Yearly updates
    Yearly,
    /// Never changes
    Never,
}

impl ChangeFreq {
    /// Parses a `<changefreq>` keyword, ignoring ASCII case.
    ///
    /// Returns `None` when `s` is not exactly one of the seven keywords
    /// (`always`, `hourly`, `daily`, `weekly`, `monthly`, `yearly`, `never`).
    ///
    /// The comparison is deliberately ASCII-only: Unicode lowercasing would
    /// allocate a new `String` on every call and would also accept non-ASCII
    /// look-alikes (for example U+212A KELVIN SIGN lowercases to `k`).
    fn from_str(s: &str) -> Option<Self> {
        const KEYWORDS: [(&str, ChangeFreq); 7] = [
            ("always", ChangeFreq::Always),
            ("hourly", ChangeFreq::Hourly),
            ("daily", ChangeFreq::Daily),
            ("weekly", ChangeFreq::Weekly),
            ("monthly", ChangeFreq::Monthly),
            ("yearly", ChangeFreq::Yearly),
            ("never", ChangeFreq::Never),
        ];

        KEYWORDS
            .iter()
            .find(|(keyword, _)| s.eq_ignore_ascii_case(keyword))
            .map(|(_, freq)| freq.clone())
    }
}

/// Image in sitemap
///
/// Describes one image attached to a [`SitemapEntry`] through its `images`
/// list. Only the location is mandatory.
#[derive(Debug, Clone)]
pub struct SitemapImage {
    /// Image location URL
    pub loc: Url,
    /// Image title, if one was provided
    pub title: Option<String>,
    /// Image caption, if one was provided
    pub caption: Option<String>,
}

/// Video in sitemap
///
/// Describes one video attached to a [`SitemapEntry`] through its `videos`
/// list. Every field is optional.
#[derive(Debug, Clone)]
pub struct SitemapVideo {
    /// Video content URL
    pub content_loc: Option<Url>,
    /// Video player URL
    pub player_loc: Option<Url>,
    /// Video thumbnail URL
    pub thumbnail_loc: Option<Url>,
    /// Video title
    pub title: Option<String>,
    /// Video description
    pub description: Option<String>,
    /// Video duration in seconds
    pub duration: Option<u32>,
}

/// News in sitemap
///
/// News metadata attached to a [`SitemapEntry`] through its `news` field.
/// Every field is optional and stored as plain text.
#[derive(Debug, Clone)]
pub struct SitemapNews {
    /// Publication name
    pub publication_name: Option<String>,
    /// Publication language
    pub publication_language: Option<String>,
    /// Publication date, stored as a string (not parsed into a date type)
    pub publication_date: Option<String>,
    /// News title
    pub title: Option<String>,
}

/// Sitemap parser
///
/// A unit struct with no fields: it holds no configuration or state, so a
/// single value can be reused for any number of documents.
pub struct SitemapParser;

impl Default for SitemapParser {
    /// Builds a parser by delegating to [`SitemapParser::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl SitemapParser {
    /// New parser
    ///
    /// The parser has no fields, so construction does no work.
    pub fn new() -> Self {
        Self
    }

    /// Parse an XML sitemap
    ///
    /// Walks the document event by event and returns one [`SitemapEntry`]
    /// per `<url>` element whose `<loc>` is a valid absolute URL.
    ///
    /// Parsing is best-effort and never fails:
    /// - a `<url>` without `<loc>`, or with a `<loc>` that does not parse as
    ///   a URL, is skipped;
    /// - an unknown `<changefreq>` or an unparseable `<priority>` becomes `None`;
    /// - on the first XML syntax error parsing stops and the entries
    ///   collected so far are returned.
    ///
    /// Field values may be given as plain text or wrapped in a CDATA section.
    /// Only the un-prefixed `loc`, `lastmod`, `changefreq` and `priority`
    /// elements are read; `images`, `videos` and `news` are left empty.
    pub fn parse(&self, xml: &str) -> Vec<SitemapEntry> {
        /// Stores one chunk of character data into the field named by `tag`;
        /// data belonging to any other element is ignored.
        fn store_field(entry: &mut PartialEntry, tag: &str, text: String) {
            match tag {
                "loc" => entry.loc = Some(text),
                "lastmod" => entry.lastmod = Some(text),
                "changefreq" => entry.changefreq = Some(text),
                "priority" => entry.priority = text.parse().ok(),
                _ => {}
            }
        }

        let mut entries = Vec::new();
        let mut reader = Reader::from_str(xml);
        reader.trim_text(true);

        // `Some` exactly while the reader is inside a `<url>` element.
        let mut current_entry: Option<PartialEntry> = None;
        // Name of the most recently opened element that has not been followed
        // by an end tag yet; empty otherwise.
        let mut current_tag = String::new();

        loop {
            match reader.read_event() {
                Ok(Event::Start(ref e)) => {
                    current_tag = String::from_utf8_lossy(e.name().as_ref()).into_owned();
                    if current_tag == "url" {
                        current_entry = Some(PartialEntry::default());
                    }
                }
                Ok(Event::Text(e)) => {
                    if let Some(entry) = current_entry.as_mut() {
                        // An unescape failure yields an empty string, which
                        // later fails URL parsing for `loc`.
                        let text = e.unescape().unwrap_or_default().into_owned();
                        store_field(entry, &current_tag, text);
                    }
                }
                Ok(Event::CData(e)) => {
                    // CDATA content is literal: it must not be unescaped.
                    if let Some(entry) = current_entry.as_mut() {
                        let text = String::from_utf8_lossy(&e.into_inner()).into_owned();
                        store_field(entry, &current_tag, text);
                    }
                }
                Ok(Event::End(ref e)) => {
                    // Forget the element that just closed so stray character
                    // data following it cannot overwrite an already-read field.
                    current_tag.clear();

                    if String::from_utf8_lossy(e.name().as_ref()) == "url" {
                        if let Some(entry) = current_entry.take() {
                            let loc = entry
                                .loc
                                .as_deref()
                                .and_then(|raw| Url::parse(raw).ok());
                            if let Some(loc) = loc {
                                entries.push(SitemapEntry {
                                    loc,
                                    lastmod: entry.lastmod,
                                    changefreq: entry
                                        .changefreq
                                        .and_then(|s| ChangeFreq::from_str(&s)),
                                    priority: entry.priority,
                                    images: Vec::new(),
                                    videos: Vec::new(),
                                    news: None,
                                });
                            }
                        }
                    }
                }
                // Stop at end of input, or at the first syntax error
                // (best-effort: keep what was parsed so far).
                Ok(Event::Eof) | Err(_) => break,
                _ => {}
            }
        }

        entries
    }

    /// Heuristically detect a sitemap index document.
    ///
    /// Returns `true` when the raw text contains the literal `<sitemapindex`
    /// or the literal `<sitemap>`. This is a plain substring search; the
    /// input is not parsed as XML.
    pub fn is_sitemap_index(xml: &str) -> bool {
        const MARKERS: [&str; 2] = ["<sitemapindex", "<sitemap>"];
        MARKERS.iter().any(|marker| xml.contains(*marker))
    }
}

/// Partial entry during parsing
///
/// Accumulates the fields of one `<url>` element while it is being read.
/// It is turned into a [`SitemapEntry`] when the closing `</url>` is reached,
/// provided `loc` is present and parses as a URL.
#[derive(Default)]
struct PartialEntry {
    /// Raw `<loc>` text; validated with `Url::parse` only at `</url>`
    loc: Option<String>,
    /// Raw `<lastmod>` text, passed through unchanged
    lastmod: Option<String>,
    /// Raw `<changefreq>` text; converted with `ChangeFreq::from_str` at `</url>`
    changefreq: Option<String>,
    /// `<priority>` already parsed as `f32`; `None` if parsing failed
    priority: Option<f32>,
}