use quick_xml::events::Event;
use quick_xml::Reader;
use url::Url;
/// One `<sitemap>` entry extracted from a sitemap index document.
#[derive(Debug, Clone)]
pub struct SitemapIndexEntry {
/// Location from the entry's `<loc>` element, already parsed as a URL.
pub loc: Url,
/// Raw text of the entry's `<lastmod>` element, if one was present.
/// The value is stored as found; no date parsing or validation is applied.
pub lastmod: Option<String>,
}
/// Parser for sitemap index documents.
///
/// The type is a field-less unit struct, so it carries no state and every
/// value is interchangeable. `Debug`, `Clone` and `Copy` are derived so it can
/// be logged and embedded in other derived types without friction.
#[derive(Debug, Clone, Copy)]
pub struct SitemapIndex;
impl Default for SitemapIndex {
fn default() -> Self {
Self::new()
}
}
impl SitemapIndex {
pub fn new() -> Self {
Self
}
pub fn parse(&self, xml: &str) -> Vec<SitemapIndexEntry> {
let mut entries = Vec::new();
let mut reader = Reader::from_str(xml);
reader.trim_text(true);
let mut current_entry: Option<PartialIndexEntry> = None;
let mut current_tag = String::new();
let mut in_sitemap = false;
loop {
match reader.read_event() {
Ok(Event::Start(ref e)) => {
let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
current_tag = name.clone();
if name == "sitemap" {
in_sitemap = true;
current_entry = Some(PartialIndexEntry::default());
}
}
Ok(Event::Text(e)) => {
if in_sitemap {
if let Some(ref mut entry) = current_entry {
let text = e.unescape().unwrap_or_default().to_string();
match current_tag.as_str() {
"loc" => entry.loc = Some(text),
"lastmod" => entry.lastmod = Some(text),
_ => {}
}
}
}
}
Ok(Event::End(ref e)) => {
let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
if name == "sitemap" {
in_sitemap = false;
if let Some(entry) = current_entry.take() {
if let Some(loc_str) = entry.loc {
if let Ok(loc) = Url::parse(&loc_str) {
entries.push(SitemapIndexEntry {
loc,
lastmod: entry.lastmod,
});
}
}
}
}
}
Ok(Event::Eof) => break,
Err(_) => break,
_ => {}
}
}
entries
}
pub fn default_sitemap_url(base_url: &Url) -> Option<Url> {
let sitemap_url = format!("{}://{}/sitemap.xml", base_url.scheme(), base_url.host_str()?);
Url::parse(&sitemap_url).ok()
}
pub fn common_sitemap_paths() -> Vec<&'static str> {
vec![
"/sitemap.xml",
"/sitemap_index.xml",
"/sitemap-index.xml",
"/sitemaps.xml",
"/sitemap/sitemap.xml",
"/wp-sitemap.xml",
"/post-sitemap.xml",
"/page-sitemap.xml",
]
}
}
/// Accumulator for the fields of one `<sitemap>` element while it is parsed.
///
/// Both fields start as `None` (via `Default`) and hold raw element text;
/// `loc` is only turned into a `Url` once the enclosing element closes.
#[derive(Default)]
struct PartialIndexEntry {
/// Raw text of the `<loc>` child, not yet validated as a URL.
loc: Option<String>,
/// Raw text of the `<lastmod>` child.
lastmod: Option<String>,
}