sitemap_iter/
lib.rs

1use log::{error, warn};
2use std::fmt::Debug;
3use std::str::FromStr;
4
/// The error returned when a string cannot be parsed as a [`Frequency`].
#[derive(Debug, PartialEq, Eq, Clone)]
pub enum FrequencyParseError {
    /// The input matched none of the accepted frequency keywords.
    InvalidFrequency,
}
impl std::fmt::Display for FrequencyParseError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::InvalidFrequency => f.write_str(
                "invalid change frequency: expected one of always, hourly, daily, weekly, \
                 monthly, yearly or never",
            ),
        }
    }
}
// Lets callers use `?` to convert this into `Box<dyn std::error::Error>` and similar types.
impl std::error::Error for FrequencyParseError {}
/// The frequency of change to a page.
///
/// This is the value of the `<changefreq>` tag. Use [`str::parse`] to get one from text;
/// the keywords are matched without regard to ASCII case.
///
/// `Hash` is derived together with `Eq` so the value can be used as a key in hash maps
/// and sets.
#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
pub enum Frequency {
    /// Written as `always`.
    Always,
    /// Written as `hourly`.
    Hourly,
    /// Written as `daily`.
    Daily,
    /// Written as `weekly`.
    Weekly,
    /// Written as `monthly`.
    Monthly,
    /// Written as `yearly`.
    Yearly,
    /// Written as `never`.
    Never,
}
20impl FromStr for Frequency {
21    type Err = FrequencyParseError;
22    fn from_str(s: &str) -> Result<Self, Self::Err> {
23        Ok(if s.eq_ignore_ascii_case("always") {
24            Self::Always
25        } else if s.eq_ignore_ascii_case("hourly") {
26            Self::Hourly
27        } else if s.eq_ignore_ascii_case("daily") {
28            Self::Daily
29        } else if s.eq_ignore_ascii_case("weekly") {
30            Self::Weekly
31        } else if s.eq_ignore_ascii_case("monthly") {
32            Self::Monthly
33        } else if s.eq_ignore_ascii_case("yearly") {
34            Self::Yearly
35        } else if s.eq_ignore_ascii_case("never") {
36            Self::Never
37        } else {
38            return Err(FrequencyParseError::InvalidFrequency);
39        })
40    }
41}
/// The data of an entry in the `urlset`.
///
/// The string fields borrow the text found in the XML document; they are passed on as
/// found, without trimming or validation.
///
/// See the [official spec](https://sitemaps.org/protocol.html) for more details.
#[derive(Debug, PartialEq, Clone, Copy)]
pub struct UrlEntry<'a> {
    /// The location of this entry.
    ///
    /// `<loc>`
    ///
    /// I recommend using `http::Uri` to parse this, then extract the `Uri::path()`.
    pub location: &'a str,
    /// The date of last modification.
    ///
    /// `<lastmod>`
    ///
    /// Format should be in [W3C Datetime](https://www.w3.org/TR/NOTE-datetime).
    /// The value is not checked against that format.
    pub last_modified: Option<&'a str>,
    /// The frequency of change in this resource.
    ///
    /// `<changefreq>`
    ///
    /// `None` if the tag is absent or its text is not a valid [`Frequency`].
    pub change_frequency: Option<Frequency>,
    /// The priority of this page compared to other pages.
    ///
    /// `<priority>`
    ///
    /// Ranges from `0.0` to `1.0`. `None` if the tag is absent, is not a number, or is
    /// outside that range.
    pub priority: Option<f32>,
}
70#[derive(Debug, PartialEq, Eq, Clone)]
71pub enum Error {
72    /// The mandatory `<urlset>` tag is missing.
73    ///
74    /// You maybe don't have a sitemap.
75    UrlsetMissing,
76    Parse(roxmltree::Error),
77}
78pub struct Document<'a> {
79    doc: roxmltree::Document<'a>,
80}
81impl<'a> Document<'a> {
82    /// Takes `xml_document` and parses it according to [the spec](https://sitemaps.org/protocol.html).
83    pub fn parse(xml_document: &'a str) -> Result<Self, Error> {
84        roxmltree::Document::parse(xml_document)
85            .map_err(Error::Parse)
86            .map(|doc| Self { doc })
87    }
88    /// Returns an iterator of [`UrlEntry`].
89    ///
90    /// Uses [`log`] for logging errors in the XML.
91    pub fn iterate(
92        &'a self,
93    ) -> Result<impl Iterator<Item = UrlEntry<'a>> + DoubleEndedIterator + Clone + Debug + 'a, Error>
94    {
95        self.doc
96            .root()
97            .children()
98            .find(|c| c.is_element())
99            .and_then(|node| {
100                if node.tag_name().name() == "urlset" {
101                    Some(node)
102                } else {
103                    error!("Expected <urlset> but got {:?}", node);
104                    None
105                }
106            })
107            .map(|node| {
108                node.children().filter_map(|c| {
109                    let children = c.children().filter(|c| c.is_element());
110                    let mut loc = None;
111                    let mut lastmod = None;
112                    let mut changefreq = None;
113                    let mut priority = None;
114                    for child in children {
115                        if let Some(text) = node_text_expected_name(&child, "loc") {
116                            if loc.is_none() {
117                                loc = Some(text);
118                            } else {
119                                error!("Multiple <loc> in entry.");
120                                return None;
121                            }
122                        } else if let Some(text) = node_text_expected_name(&child, "lastmod") {
123                            if lastmod.is_some() {
124                                warn!("Multiple <lastmod> in entry.");
125                            }
126                            lastmod = Some(text);
127                        } else if let Some(text) = node_text_expected_name(&child, "changefreq") {
128                            if changefreq.is_some() {
129                                warn!("Multiple <changefreq> in entry.");
130                            }
131                            if let Ok(frequency) = text.parse() {
132                                changefreq = Some(frequency);
133                            } else {
134                                warn!("<changefreq> has invalid format: {text:?}");
135                            }
136                        } else if let Some(text) = node_text_expected_name(&child, "priority") {
137                            if priority.is_some() {
138                                warn!("Multiple <priority> in entry.");
139                            }
140                            if let Ok(num) = text.parse() {
141                                if (0.0..=1.0).contains(&num) {
142                                    priority = Some(num)
143                                } else {
144                                    warn!("<priority> {num} is out of range",)
145                                }
146                            }else {
147                                warn!("<priority> has invalid format: {text:?}. Expected floating-point number.");
148                            }
149                        }
150                    }
151                    if let Some(loc) = loc {
152                        Some(UrlEntry::<'a> {
153                            location: loc,
154                            last_modified: lastmod,
155                            change_frequency: changefreq,
156                            priority,
157                        })
158                    } else {
159                        error!("Expected <loc>, but found none.");
160                        None
161                    }
162                })
163            })
164            .ok_or(Error::UrlsetMissing)
165    }
166}
167fn node_text_expected_name<'a>(
168    node: &roxmltree::Node<'a, 'a>,
169    expected_tag: &str,
170) -> Option<&'a str> {
171    if node.tag_name().name() == expected_tag {
172        if let Some(text) = node.text() {
173            return Some(text);
174        }
175    }
176    None
177}