halldyll_core/sitemap/
index.rs

//! Sitemap index: parsing of sitemap index documents and helpers for locating a site's sitemaps.
2
3use quick_xml::events::Event;
4use quick_xml::Reader;
5use url::Url;
6
7/// Sitemap index entry
8#[derive(Debug, Clone)]
9pub struct SitemapIndexEntry {
10    /// Sitemap URL
11    pub loc: Url,
12    /// Last modification date
13    pub lastmod: Option<String>,
14}
15
/// Sitemap index parser.
///
/// This is a stateless unit type: it carries no configuration, so every
/// value is interchangeable. `Default` is derived (equivalent to calling
/// `SitemapIndex::new()`), and `Debug`/`Clone`/`Copy` are provided so the
/// parser can be embedded in other derivable types and logged freely.
#[derive(Debug, Clone, Copy, Default)]
pub struct SitemapIndex;
24
25impl SitemapIndex {
26    /// New parser
27    pub fn new() -> Self {
28        Self
29    }
30
31    /// Parse a sitemap index
32    pub fn parse(&self, xml: &str) -> Vec<SitemapIndexEntry> {
33        let mut entries = Vec::new();
34        let mut reader = Reader::from_str(xml);
35        reader.trim_text(true);
36
37        let mut current_entry: Option<PartialIndexEntry> = None;
38        let mut current_tag = String::new();
39        let mut in_sitemap = false;
40
41        loop {
42            match reader.read_event() {
43                Ok(Event::Start(ref e)) => {
44                    let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
45                    current_tag = name.clone();
46
47                    if name == "sitemap" {
48                        in_sitemap = true;
49                        current_entry = Some(PartialIndexEntry::default());
50                    }
51                }
52                Ok(Event::Text(e)) => {
53                    if in_sitemap {
54                        if let Some(ref mut entry) = current_entry {
55                            let text = e.unescape().unwrap_or_default().to_string();
56                            match current_tag.as_str() {
57                                "loc" => entry.loc = Some(text),
58                                "lastmod" => entry.lastmod = Some(text),
59                                _ => {}
60                            }
61                        }
62                    }
63                }
64                Ok(Event::End(ref e)) => {
65                    let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
66                    if name == "sitemap" {
67                        in_sitemap = false;
68                        if let Some(entry) = current_entry.take() {
69                            if let Some(loc_str) = entry.loc {
70                                if let Ok(loc) = Url::parse(&loc_str) {
71                                    entries.push(SitemapIndexEntry {
72                                        loc,
73                                        lastmod: entry.lastmod,
74                                    });
75                                }
76                            }
77                        }
78                    }
79                }
80                Ok(Event::Eof) => break,
81                Err(_) => break,
82                _ => {}
83            }
84        }
85
86        entries
87    }
88
89    /// Default sitemap URL
90    pub fn default_sitemap_url(base_url: &Url) -> Option<Url> {
91        let sitemap_url = format!("{}://{}/sitemap.xml", base_url.scheme(), base_url.host_str()?);
92        Url::parse(&sitemap_url).ok()
93    }
94
95    /// Common sitemap URLs to try
96    pub fn common_sitemap_paths() -> Vec<&'static str> {
97        vec![
98            "/sitemap.xml",
99            "/sitemap_index.xml",
100            "/sitemap-index.xml",
101            "/sitemaps.xml",
102            "/sitemap/sitemap.xml",
103            "/wp-sitemap.xml",
104            "/post-sitemap.xml",
105            "/page-sitemap.xml",
106        ]
107    }
108}
109
/// Accumulator for a single `<sitemap>` element while it is being parsed.
///
/// Both fields start out as `None` (via `Default`) and are filled in as the
/// corresponding child elements are encountered.
#[derive(Default)]
struct PartialIndexEntry {
    /// Raw `<loc>` text; not yet validated as a URL.
    loc: Option<String>,
    /// Raw `<lastmod>` text.
    lastmod: Option<String>,
}