use quick_xml::events::Event;
use quick_xml::Reader;
use url::Url;
/// One `<url>` record extracted from a sitemap document.
///
/// Values of this type are produced by `SitemapParser::parse` in this file;
/// the per-field notes describe how that function derives each value.
#[derive(Debug, Clone)]
pub struct SitemapEntry {
/// Page address, obtained by running `Url::parse` on the text of `<loc>`.
/// Records whose `<loc>` is missing or fails to parse are never emitted.
pub loc: Url,
/// Text of the `<lastmod>` element, stored as-is (it is not parsed or
/// validated as a date here).
pub lastmod: Option<String>,
/// Change-frequency hint from `<changefreq>`; `None` when the element is
/// missing or its text is not one of the recognised keywords.
pub changefreq: Option<ChangeFreq>,
/// Numeric value of `<priority>`; `None` when the element is missing or its
/// text does not parse as a number.
pub priority: Option<f32>,
/// Image records attached to this page.
/// NOTE(review): `SitemapParser::parse` in this file always leaves this empty.
pub images: Vec<SitemapImage>,
/// Video records attached to this page.
/// NOTE(review): `SitemapParser::parse` in this file always leaves this empty.
pub videos: Vec<SitemapVideo>,
/// News metadata attached to this page.
/// NOTE(review): `SitemapParser::parse` in this file always sets this to `None`.
pub news: Option<SitemapNews>,
}
/// Change-frequency hint carried by a sitemap record's `<changefreq>` element.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ChangeFreq {
    Always,
    Hourly,
    Daily,
    Weekly,
    Monthly,
    Yearly,
    Never,
}

impl ChangeFreq {
    /// Looks up the variant named by `s`, ignoring letter case.
    ///
    /// Returns `None` when `s` is not one of the seven known keywords. The
    /// input is not trimmed, so surrounding whitespace makes the lookup fail.
    fn from_str(s: &str) -> Option<Self> {
        // Keyword table in the same order as the enum declaration.
        const KEYWORDS: [(&str, ChangeFreq); 7] = [
            ("always", ChangeFreq::Always),
            ("hourly", ChangeFreq::Hourly),
            ("daily", ChangeFreq::Daily),
            ("weekly", ChangeFreq::Weekly),
            ("monthly", ChangeFreq::Monthly),
            ("yearly", ChangeFreq::Yearly),
            ("never", ChangeFreq::Never),
        ];

        let wanted = s.to_lowercase();
        KEYWORDS
            .iter()
            .find(|(keyword, _)| *keyword == wanted)
            .map(|(_, freq)| freq.clone())
    }
}
/// Image metadata that can be attached to a [`SitemapEntry`] through its
/// `images` field.
///
/// NOTE(review): nothing in this file constructs this type. The field names
/// suggest the image sitemap extension (`<image:image>`) — confirm against
/// the code that actually fills it.
#[derive(Debug, Clone)]
pub struct SitemapImage {
/// Address of the image (presumably from `<image:loc>` — TODO confirm).
pub loc: Url,
/// Optional image title (presumably from `<image:title>` — TODO confirm).
pub title: Option<String>,
/// Optional image caption (presumably from `<image:caption>` — TODO confirm).
pub caption: Option<String>,
}
/// Video metadata that can be attached to a [`SitemapEntry`] through its
/// `videos` field.
///
/// NOTE(review): nothing in this file constructs this type. The field names
/// suggest the video sitemap extension (`<video:video>`) — confirm against
/// the code that actually fills it.
#[derive(Debug, Clone)]
pub struct SitemapVideo {
/// Address of the video file itself, when given.
pub content_loc: Option<Url>,
/// Address of a player for the video, when given.
pub player_loc: Option<Url>,
/// Address of a thumbnail image, when given.
pub thumbnail_loc: Option<Url>,
/// Optional video title.
pub title: Option<String>,
/// Optional video description.
pub description: Option<String>,
/// Optional video length (presumably in seconds — TODO confirm the unit).
pub duration: Option<u32>,
}
/// News metadata that can be attached to a [`SitemapEntry`] through its
/// `news` field.
///
/// NOTE(review): nothing in this file constructs this type. The field names
/// suggest the news sitemap extension (`<news:news>`) — confirm against the
/// code that actually fills it.
#[derive(Debug, Clone)]
pub struct SitemapNews {
/// Name of the publication, when given.
pub publication_name: Option<String>,
/// Language of the publication, kept as text (presumably a language code —
/// TODO confirm the expected format).
pub publication_language: Option<String>,
/// Publication date, kept as text (not parsed into a date type here).
pub publication_date: Option<String>,
/// Optional article title.
pub title: Option<String>,
}
/// Parser for sitemap XML documents.
///
/// This is a unit struct: it carries no configuration or state.
pub struct SitemapParser;

impl Default for SitemapParser {
    /// Builds the parser through [`SitemapParser::new`], so both construction
    /// paths stay in sync.
    fn default() -> Self {
        SitemapParser::new()
    }
}
impl SitemapParser {
    /// Creates a parser. The type holds no state, so all instances behave alike.
    pub fn new() -> Self {
        Self
    }

    /// Extracts every `<url>` record from a sitemap document.
    ///
    /// Parsing is best-effort and never fails:
    ///
    /// - Only the core child elements `<loc>`, `<lastmod>`, `<changefreq>` and
    ///   `<priority>` are read. They are matched by their exact qualified
    ///   name, so prefixed extension elements such as `<image:loc>` are never
    ///   mistaken for the page's own `<loc>`.
    /// - Element content may be plain text or a CDATA section.
    /// - A record is dropped when its `<loc>` is missing or is rejected by
    ///   `Url::parse`.
    /// - An unrecognised `<changefreq>` becomes `None`; a `<priority>` that is
    ///   not a number in `0.0..=1.0` becomes `None`.
    /// - An XML syntax error stops parsing; records completed before the
    ///   error are still returned.
    ///
    /// The `images`, `videos` and `news` fields of the returned entries are
    /// always empty / `None`: extension elements are not parsed here.
    pub fn parse(&self, xml: &str) -> Vec<SitemapEntry> {
        let mut entries = Vec::new();
        let mut reader = Reader::from_str(xml);
        reader.trim_text(true);

        // `Some` exactly while the reader is between `<url>` and `</url>`.
        let mut current_entry: Option<PartialEntry> = None;
        // Qualified name of the innermost element that is still open, or
        // empty right after any closing tag.
        let mut current_tag = String::new();

        loop {
            match reader.read_event() {
                Ok(Event::Start(e)) => {
                    let name = e.name();
                    let raw: &[u8] = name.as_ref();
                    if raw == b"url" {
                        current_entry = Some(PartialEntry::default());
                    }
                    current_tag = String::from_utf8_lossy(raw).into_owned();
                }
                Ok(Event::Text(e)) => {
                    if let Some(entry) = current_entry.as_mut() {
                        // A bare `&` (common in hand-written sitemaps) makes
                        // unescaping fail; keep the raw text instead of
                        // replacing the value with an empty string.
                        let text = match e.unescape() {
                            Ok(decoded) => decoded.into_owned(),
                            Err(_) => String::from_utf8_lossy(&e).into_owned(),
                        };
                        Self::assign_field(entry, &current_tag, text);
                    }
                }
                Ok(Event::CData(e)) => {
                    if let Some(entry) = current_entry.as_mut() {
                        // CDATA is literal (no entity decoding) and is not
                        // covered by `trim_text`, so trim it here.
                        let text = String::from_utf8_lossy(&e).trim().to_owned();
                        Self::assign_field(entry, &current_tag, text);
                    }
                }
                Ok(Event::End(e)) => {
                    // Forget the open tag so that stray text following a
                    // closing tag cannot overwrite the field just read.
                    current_tag.clear();
                    let name = e.name();
                    let raw: &[u8] = name.as_ref();
                    if raw == b"url" {
                        entries.extend(current_entry.take().and_then(Self::finish_entry));
                    }
                }
                // Malformed XML ends parsing but keeps what was read so far.
                Ok(Event::Eof) | Err(_) => break,
                _ => {}
            }
        }

        entries
    }

    /// Heuristically reports whether `xml` is a sitemap index rather than a
    /// URL set, by looking for a `<sitemapindex` or `<sitemap>` tag anywhere
    /// in the raw text. The document is not parsed.
    pub fn is_sitemap_index(xml: &str) -> bool {
        xml.contains("<sitemapindex") || xml.contains("<sitemap>")
    }

    /// Stores `text` in the field of `entry` selected by the open element
    /// `tag`; content of any other element is ignored.
    fn assign_field(entry: &mut PartialEntry, tag: &str, text: String) {
        match tag {
            "loc" => entry.loc = Some(text),
            "lastmod" => entry.lastmod = Some(text),
            "changefreq" => entry.changefreq = Some(text),
            "priority" => {
                // The range check also rejects `NaN` and infinities, which
                // `f32::from_str` would otherwise accept.
                entry.priority = text
                    .trim()
                    .parse::<f32>()
                    .ok()
                    .filter(|value| (0.0..=1.0).contains(value));
            }
            _ => {}
        }
    }

    /// Converts a completed `<url>` accumulator into a public entry, or
    /// `None` when it has no usable `<loc>`.
    fn finish_entry(partial: PartialEntry) -> Option<SitemapEntry> {
        let loc_text = partial.loc?;
        let loc = Url::parse(&loc_text).ok()?;
        Some(SitemapEntry {
            loc,
            lastmod: partial.lastmod,
            changefreq: partial
                .changefreq
                .and_then(|keyword| ChangeFreq::from_str(&keyword)),
            priority: partial.priority,
            images: Vec::new(),
            videos: Vec::new(),
            news: None,
        })
    }
}
/// Scratch accumulator for the `<url>` element that `SitemapParser::parse`
/// is currently reading.
///
/// A fresh, all-`None` value is created when `<url>` opens; fields are filled
/// as child elements are read, and the value is converted into a
/// [`SitemapEntry`] when `</url>` is reached.
#[derive(Default)]
struct PartialEntry {
// Text of `<loc>`; turned into a `Url` only when the record is finished.
loc: Option<String>,
// Text of `<lastmod>`, kept verbatim.
lastmod: Option<String>,
// Text of `<changefreq>`; mapped to `ChangeFreq` when the record is finished.
changefreq: Option<String>,
// `<priority>` already parsed to a number at the time its text is read.
priority: Option<f32>,
}