halldyll_core/sitemap/
parser.rs

1//! Parser - sitemap.xml parsing
2
3use quick_xml::events::Event;
4use quick_xml::Reader;
5use url::Url;
6
7/// Sitemap entry
8#[derive(Debug, Clone)]
9pub struct SitemapEntry {
10    /// Page URL
11    pub loc: Url,
12    /// Last modification date
13    pub lastmod: Option<String>,
14    /// Change frequency
15    pub changefreq: Option<ChangeFreq>,
16    /// Priority (0.0 - 1.0)
17    pub priority: Option<f32>,
18    /// Associated images
19    pub images: Vec<SitemapImage>,
20    /// Associated videos
21    pub videos: Vec<SitemapVideo>,
22    /// Associated news
23    pub news: Option<SitemapNews>,
24}
25
/// Change frequency of a page.
///
/// The type is a plain tag without payload, so it is `Copy` and can be used
/// as a hash-map key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ChangeFreq {
    /// Always changing
    Always,
    /// Hourly updates
    Hourly,
    /// Daily updates
    Daily,
    /// Weekly updates
    Weekly,
    /// Monthly updates
    Monthly,
    /// Yearly updates
    Yearly,
    /// Never changes
    Never,
}

impl ChangeFreq {
    /// Maps a change-frequency keyword to its variant.
    ///
    /// Leading/trailing whitespace is ignored and the comparison is ASCII
    /// case-insensitive, so `"daily"`, `"Daily"` and `" DAILY "` all yield
    /// [`ChangeFreq::Daily`]. Returns `None` for any other input, including
    /// the empty string.
    fn from_str(s: &str) -> Option<Self> {
        // Keyword table in declaration order. Comparing in place avoids the
        // temporary `String` that lower-casing the input would allocate.
        const KEYWORDS: [(&str, ChangeFreq); 7] = [
            ("always", ChangeFreq::Always),
            ("hourly", ChangeFreq::Hourly),
            ("daily", ChangeFreq::Daily),
            ("weekly", ChangeFreq::Weekly),
            ("monthly", ChangeFreq::Monthly),
            ("yearly", ChangeFreq::Yearly),
            ("never", ChangeFreq::Never),
        ];

        let keyword = s.trim();
        KEYWORDS
            .iter()
            .find(|(name, _)| name.eq_ignore_ascii_case(keyword))
            .map(|&(_, freq)| freq)
    }
}
59
60/// Image in sitemap
61#[derive(Debug, Clone)]
62pub struct SitemapImage {
63    /// Image location URL
64    pub loc: Url,
65    /// Image title
66    pub title: Option<String>,
67    /// Image caption
68    pub caption: Option<String>,
69}
70
71/// Video in sitemap
72#[derive(Debug, Clone)]
73pub struct SitemapVideo {
74    /// Video content URL
75    pub content_loc: Option<Url>,
76    /// Video player URL
77    pub player_loc: Option<Url>,
78    /// Video thumbnail URL
79    pub thumbnail_loc: Option<Url>,
80    /// Video title
81    pub title: Option<String>,
82    /// Video description
83    pub description: Option<String>,
84    /// Video duration in seconds
85    pub duration: Option<u32>,
86}
87
/// News in sitemap.
///
/// Every field is optional, so `SitemapNews::default()` is a record with
/// nothing set; it can be combined with struct-update syntax to fill in only
/// the known fields.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct SitemapNews {
    /// Publication name
    pub publication_name: Option<String>,
    /// Publication language
    pub publication_language: Option<String>,
    /// Publication date
    pub publication_date: Option<String>,
    /// News title
    pub title: Option<String>,
}
100
/// Sitemap parser.
///
/// The parser is a unit struct with no fields, so it is free to copy and
/// `SitemapParser::default()` yields the same value as `SitemapParser::new()`.
#[derive(Debug, Clone, Copy, Default)]
pub struct SitemapParser;
109
110impl SitemapParser {
111    /// New parser
112    pub fn new() -> Self {
113        Self
114    }
115
116    /// Parse an XML sitemap
117    pub fn parse(&self, xml: &str) -> Vec<SitemapEntry> {
118        let mut entries = Vec::new();
119        let mut reader = Reader::from_str(xml);
120        reader.trim_text(true);
121
122        let mut current_entry: Option<PartialEntry> = None;
123        let mut current_tag = String::new();
124        let mut in_url = false;
125
126        loop {
127            match reader.read_event() {
128                Ok(Event::Start(ref e)) => {
129                    let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
130                    current_tag = name.clone();
131
132                    if name == "url" {
133                        in_url = true;
134                        current_entry = Some(PartialEntry::default());
135                    }
136                }
137                Ok(Event::Text(e)) => {
138                    if in_url {
139                        if let Some(ref mut entry) = current_entry {
140                            let text = e.unescape().unwrap_or_default().to_string();
141                            match current_tag.as_str() {
142                                "loc" => entry.loc = Some(text),
143                                "lastmod" => entry.lastmod = Some(text),
144                                "changefreq" => entry.changefreq = Some(text),
145                                "priority" => entry.priority = text.parse().ok(),
146                                _ => {}
147                            }
148                        }
149                    }
150                }
151                Ok(Event::End(ref e)) => {
152                    let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
153                    if name == "url" {
154                        in_url = false;
155                        if let Some(entry) = current_entry.take() {
156                            if let Some(loc_str) = entry.loc {
157                                if let Ok(loc) = Url::parse(&loc_str) {
158                                    entries.push(SitemapEntry {
159                                        loc,
160                                        lastmod: entry.lastmod,
161                                        changefreq: entry.changefreq.and_then(|s| ChangeFreq::from_str(&s)),
162                                        priority: entry.priority,
163                                        images: Vec::new(),
164                                        videos: Vec::new(),
165                                        news: None,
166                                    });
167                                }
168                            }
169                        }
170                    }
171                }
172                Ok(Event::Eof) => break,
173                Err(_) => break,
174                _ => {}
175            }
176        }
177
178        entries
179    }
180
181    /// Check if it's a sitemap index
182    pub fn is_sitemap_index(xml: &str) -> bool {
183        xml.contains("<sitemapindex") || xml.contains("<sitemap>")
184    }
185}
186
/// Scratch record for a `<url>` element while it is still being read.
///
/// `PartialEntry::default()` starts with every field unset.
#[derive(Default)]
struct PartialEntry {
    /// Text of `<loc>`, not yet parsed into a URL.
    loc: Option<String>,
    /// Text of `<lastmod>`.
    lastmod: Option<String>,
    /// Text of `<changefreq>`, not yet mapped to a `ChangeFreq`.
    changefreq: Option<String>,
    /// Numeric value of `<priority>`.
    priority: Option<f32>,
}